In [1]:
import os,sys,re,time,math


from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn import cluster
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from keras.callbacks import EarlyStopping



import matplotlib as mpl
import numpy as np
import pandas as pd

import sklearn
from matplotlib import pyplot as plt


from keras import backend as K

from keras.optimizers import Adam
from keras.models import *
from keras.layers import *
from keras.utils.np_utils import to_categorical

from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
import tensorflow as tf
import keras.backend.tensorflow_backend as KTF


# Make only the first GPU (device 0) visible to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
config = tf.ConfigProto() 
# Do not reserve all GPU memory up front; let the allocation grow on demand.
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

# Register the session configured above as the one the Keras backend uses.
KTF.set_session(sess)

In [3]:
def read_svm(file):
    """Load a LIBSVM-style text file into a feature matrix and a label vector.

    Every non-blank line is expected to look like ``<label> <idx>:<value> ...``.
    The ``<idx>:`` prefixes are stripped, so values are taken in the order they
    appear on the line rather than being placed by their index.

    Parameters
    ----------
    file : str
        Path of the text file to read.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        Float array of encodings (one row per record) and int array of labels.

    Raises
    ------
    ValueError
        If a label or a feature value cannot be converted to a number.
    """
    encodings = []
    labels = []
    with open(file) as f:
        for line in f:
            # Raw string: '\d' inside a plain literal is an invalid escape.
            fields = re.sub(r'\d+:', '', line).split()
            if not fields:
                # Blank line (e.g. a trailing empty line): skip it instead of
                # indexing into None, which raised TypeError before.
                continue
            encodings.append(fields[1:])
            labels.append(int(fields[0]))

    return np.array(encodings).astype(float), np.array(labels).astype(int)

In [4]:
def draw_ROC_curve(y_test,y_predict,savepath=None):
    """Draw a ROC curve and either save it to disk or show it.

    Parameters
    ----------
    y_test : array-like
        True labels, passed to ``roc_curve`` as its first argument.
    y_predict : array-like
        Predicted scores, passed to ``roc_curve`` as its second argument.
    savepath : str or None
        File the figure is written to. When None the figure is shown with
        ``plt.show()`` instead of being saved.
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(y_test, y_predict)
    roc_auc = auc(false_positive_rate, true_positive_rate)

    # Draw on a dedicated figure so repeated calls never share axes.
    fig, ax = plt.subplots()
    try:
        ax.set_title('ROC')
        ax.plot(false_positive_rate, true_positive_rate, 'b',
                label='AUC = %0.3f' % roc_auc)
        ax.plot([0, 1], [0, 1], 'r--')  # chance diagonal (unlabelled)
        ax.legend(loc='lower right')
        ax.set_ylabel('TPR')
        ax.set_xlabel('FPR')
        if savepath is not None:
            fig.savefig(savepath)
        else:
            plt.show()
    finally:
        # Close the figure that was actually drawn; the previous plt.close(0)
        # addressed figure number 0 and left this one open, so successive
        # curves piled up on the same axes.
        plt.close(fig)

In [5]:
def performance(labelArr, predictArr):
    """Count binary confusion-matrix cells and derive summary metrics.

    Parameters
    ----------
    labelArr : sequence
        Actual values; only entries equal to 0 or 1 are counted.
    predictArr : sequence
        Predicted values, indexed in parallel with ``labelArr``; only entries
        equal to 0 or 1 are counted.

    Returns
    -------
    tuple
        ``(precision, recall, SN, SP, GM, TP, TN, FP, FN)`` where SN is
        sensitivity, SP is specificity and GM is ``sqrt(recall * SP)``.
        Any ratio whose denominator is zero is reported as 0.
    """
    def _ratio(numerator, denominator):
        # A zero denominator yields 0 instead of raising ZeroDivisionError.
        return 0 if denominator == 0 else numerator / denominator

    true_pos = 0.
    true_neg = 0.
    false_pos = 0.
    false_neg = 0.
    for idx in range(len(labelArr)):
        actual = labelArr[idx]
        # The prediction is looked up only once the label is known to be 0/1.
        if actual == 1:
            guess = predictArr[idx]
            if guess == 1:
                true_pos += 1.
            if guess == 0:
                false_neg += 1.
        if actual == 0:
            guess = predictArr[idx]
            if guess == 1:
                false_pos += 1.
            if guess == 0:
                true_neg += 1.

    sensitivity = _ratio(true_pos, true_pos + false_neg)   # TP / P
    specificity = _ratio(true_neg, false_pos + true_neg)   # TN / N
    precision = _ratio(true_pos, true_pos + false_pos)
    recall = _ratio(true_pos, true_pos + false_neg)
    g_mean = math.sqrt(recall * specificity)
    return (precision, recall, sensitivity, specificity, g_mean,
            true_pos, true_neg, false_pos, false_neg)

In [6]:
# Trim each sequence to a central window and integer-encode its residues.
# A residue's code is its position in AA.
AA = 'GAVLIFWYDNEKQMSTCPHR'
# Residue -> code lookup, built once so encoding does not rescan AA per residue.
_AA_INDEX = {residue: code for code, residue in enumerate(AA)}


def pep(path, seq_len):
    """Read sequences from a text file and encode them as integer arrays.

    Each non-blank line is split on whitespace: the first field is the
    sequence and the last field is an integer label. The same number of
    residues (``cut``) is removed from both ends of every sequence, where
    ``cut`` is derived from the length of the first record's sequence.

    Parameters
    ----------
    path : str
        File to read.
    seq_len : int
        Target window size used to derive ``cut``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        Encoded sequences and labels. Both arrays are empty when the file
        holds no non-blank line.

    Notes
    -----
    Residues not present in ``AA`` are encoded as 0, the same code as 'G'.
    """
    # `with` closes the handle; blank lines are dropped before anything is
    # indexed, so an empty file or a leading blank line no longer raises.
    with open(path) as handle:
        records = [line.split() for line in handle if line.strip() != '']
    if not records:
        return np.array([]), np.array([])

    # NOTE(review): with this formula the kept window is seq_len + 1 or
    # seq_len + 2 residues long, never seq_len -- confirm the -1 is intended.
    cut = (len(records[0][0]) - 1 - seq_len) // 2

    X = []
    for fields in records:
        sequence = fields[0]
        # Trim only for a positive cut; a negative cut (seq_len longer than
        # the sequence) used to produce meaningless slices.
        window = sequence[cut:-cut] if cut > 0 else sequence
        X.append([_AA_INDEX.get(res.upper(), 0) for res in window])
    y = [int(fields[-1]) for fields in records]
    return np.array(X), np.array(y)


In [7]:
def create_cnn_model2(input_length=29,dropout=0.4, shape=(130, 1)):
    """Build and compile a 1-D CNN that starts with an embedding layer.

    Layout: ``Embedding(22, 32)`` -> three repeats of
    ``Conv1D(128, 8, relu, same)`` / ``MaxPooling1D(2)`` / ``Dropout`` ->
    ``Flatten`` -> one sigmoid unit. Compiled with binary cross-entropy,
    the 'adam' optimizer and the accuracy metric.

    Parameters
    ----------
    input_length : int
        ``input_length`` given to the embedding layer.
    dropout : float
        Rate used by every Dropout layer.
    shape : tuple
        Forwarded as ``input_shape`` to the first Conv1D layer.
        NOTE(review): that layer is not the first in the model, so this
        value presumably has no effect on the built network -- confirm.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    n_conv_blocks = 3

    model = Sequential()
    model.add(Embedding(22, 32, input_length=input_length))

    for block_no in range(n_conv_blocks):
        # Only the first convolution receives the input_shape keyword.
        extra = {'input_shape': shape} if block_no == 0 else {}
        model.add(Conv1D(128, 8, activation='relu', padding='same', **extra))
        model.add(MaxPooling1D(2))
        model.add(Dropout(dropout))

    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [8]:
def create_cnn_model3(shape, dropout=0.4):
    """Build and compile a 1-D CNN that consumes feature arrays directly.

    Layout: three repeats of ``Conv1D(128, 8, relu, same)`` /
    ``MaxPooling1D(2)`` / ``Dropout`` -> ``Flatten`` -> one sigmoid unit.
    Compiled with binary cross-entropy, the 'adam' optimizer and the
    accuracy metric.

    Parameters
    ----------
    shape : tuple
        Per-sample input shape, passed as ``input_shape`` to the first layer.
    dropout : float
        Rate used by every Dropout layer.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    n_conv_blocks = 3

    model = Sequential()
    for block_no in range(n_conv_blocks):
        # The first convolution is the model's input layer and carries the shape.
        extra = {'input_shape': shape} if block_no == 0 else {}
        model.add(Conv1D(128, 8, activation='relu', padding='same', **extra))
        model.add(MaxPooling1D(2))
        model.add(Dropout(dropout))

    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [9]:
def write_score(path,pre,label):
    """Write one ``<score>\\t<label>`` line per sample to ``path``.

    Parameters
    ----------
    path : str
        Output file; it is created or overwritten.
    pre : sequence
        Predicted scores. Each entry is rendered with ``str`` and any square
        brackets are removed, so a one-element row such as ``[0.5]`` is
        written as ``0.5``.
    label : sequence
        Labels, indexed in parallel with ``pre``.

    Raises
    ------
    IndexError
        If ``label`` has fewer entries than ``pre``.
    """
    # The context manager closes the file even when a write or lookup fails.
    with open(path, 'w') as fw:
        for i in range(len(pre)):
            score_text = str(pre[i]).replace('[', '').replace(']', '')
            fw.write(score_text + '\t' + str(label[i]) + '\n')

In [11]:
# For every window size of one encoding: run stratified k-fold training of the
# Keras CNN, then write the cross-validation scores and the independent-test
# scores to disk.
name = 'EAAC'
gap = '_gap5'
q = [21,23,25,27,29,31,35,37]  # window sizes to evaluate

base_dir = 'C:/Users/Crow/Desktop/human_data_12.12/'  # single place to relocate the data
n_splits = 10

for t in q :
    cv_dir = base_dir + 'Step_11_CV/' + name + '/'
    ind_dir = base_dir + 'Step_11_IND/' + name + '/'
    tag = str(t) + '_' + name + gap

    path_train = cv_dir + 'Train_' + tag + '.txt'
    path_test = ind_dir + 'Test_' + tag + '.txt'
    train = read_svm(path_train)
    test = read_svm(path_test)

    x_train, y_train = train
    x_test, y_test = test
    # Append a trailing axis so each sample matches the Conv1D input layout.
    x_train = np.expand_dims(x_train, axis=2)
    x_test = np.expand_dims(x_test, axis=2)
    shape = x_train.shape[1:]

    # k-fold
    kf = StratifiedKFold(n_splits = n_splits)

    # Previously only the model from the last fold was kept and it scored the
    # whole training set, 90% of which it had been fitted on. Now each training
    # sample is scored by the one fold model that did not train on it, and the
    # independent test set is scored by the average of all fold models.
    pre = None
    fold_test_scores = []

    for train_index, test_index in kf.split(x_train, y_train):
        x_train3, x_test3 = x_train[train_index], x_train[test_index]
        y_train3, y_test3 = y_train[train_index], y_train[test_index]

        model = create_cnn_model3(shape=shape)

        # NOTE: the held-out fold also drives early stopping, so the
        # out-of-fold scores remain mildly optimistic.
        early_stopping = EarlyStopping(monitor='val_loss', patience=3)
        callbacks_list = [early_stopping]
        model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 20, batch_size = 256,
                  shuffle=True,callbacks=callbacks_list, verbose=0)

        fold_scores = model.predict(x_test3)
        if pre is None:
            # Allocate lazily so the buffer matches the model's output
            # shape and dtype (keeps the written number format unchanged).
            pre = np.zeros((x_train.shape[0],) + fold_scores.shape[1:],
                           dtype=fold_scores.dtype)
        pre[test_index] = fold_scores
        fold_test_scores.append(model.predict(x_test))

    print(t)
    write_score(cv_dir + 'Train_keras_' + tag + '_result.txt', pre, y_train)

    pre1 = np.mean(fold_test_scores, axis=0)
    write_score(ind_dir + 'Test_keras_' + tag + '_result.txt', pre1, y_test)

    

21
23
25
27
29
31
35
37
