In [1]:
import os,sys,re,time,math


from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn import cluster
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from keras.callbacks import EarlyStopping



import matplotlib as mpl
import numpy as np
import pandas as pd

import sklearn
from matplotlib import pyplot as plt


from keras import backend as K

from keras.optimizers import Adam
from keras.models import *
from keras.layers import *
from keras.utils.np_utils import to_categorical

from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
import tensorflow as tf
import keras.backend.tensorflow_backend as KTF
 

# Expose only the first GPU (device 0) to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
config = tf.ConfigProto() 
# Do not reserve all GPU memory up front; let the allocation grow on demand.
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)

# Register the configured session as the one the Keras backend uses.
KTF.set_session(sess)

In [3]:
def read_svm(file):
    """Read a LIBSVM-style text file into a feature matrix and a label vector.

    Every non-blank line is expected to look like ``<label> 1:<v1> 2:<v2> ...``.
    The ``<index>:`` prefixes are stripped, so values are kept in the order
    they appear on the line. Blank lines are skipped.

    Parameters
    ----------
    file : str
        Path of the text file to read.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Feature values converted to float and labels converted to int.
    """
    encodings = []
    labels = []
    with open(file) as f:
        for line in f:
            # Raw string: '\d' inside a plain literal is an invalid escape sequence.
            fields = re.sub(r'\d+:', '', line).split()
            if not fields:
                # Blank (or whitespace-only) line: nothing to parse.
                continue
            labels.append(int(fields[0]))
            encodings.append(fields[1:])

    return np.array(encodings).astype(float), np.array(labels).astype(int)

In [4]:
def draw_ROC_curve(y_test,y_predict,savepath=None):
    '''
    Plot the ROC curve of a binary classifier, with its AUC in the legend.

    y_test    -- true binary labels
    y_predict -- predicted scores for the positive class
    savepath  -- file the figure is saved to; when None the figure is shown
                 instead of saved
    '''
    false_positive_rate,true_positive_rate,thresholds=roc_curve(y_test, y_predict)
    roc_auc=auc(false_positive_rate, true_positive_rate)
    # Draw on a fresh figure so curves from earlier calls cannot leak into this one.
    fig = plt.figure()
    plt.title('ROC')
    plt.plot(false_positive_rate, true_positive_rate,'b',label='AUC = %0.3f'% roc_auc)
    plt.legend(loc='lower right')
    # Diagonal reference line (TPR == FPR).
    plt.plot([0,1],[0,1],'r--')
    plt.ylabel('TPR')
    plt.xlabel('FPR')
    if savepath is not None:
        plt.savefig(savepath)
    else:
        plt.show()
    # Close the figure that was just drawn; closing figure number 0 would leave it open.
    plt.close(fig)

In [5]:
# Approximate ROC AUC for a binary classifier, usable as a Keras metric.
def auc_1(y_true, y_pred):
    """Sum true-alert rate times false-alert-rate bin width over 1000 thresholds in [0, 1]."""
    cutoffs = np.linspace(0, 1, 1000)
    true_alert_rates = tf.stack(
        [binary_PTA(y_true, y_pred, cutoff) for cutoff in cutoffs], axis=0)
    false_alert_rates = tf.stack(
        [binary_PFA(y_true, y_pred, cutoff) for cutoff in cutoffs], axis=0)
    # Prepend a false-alert rate of 1 so every threshold has a left neighbour.
    padded = tf.concat([tf.ones((1,)), false_alert_rates], axis=0)
    # Width of each bin: drop in false-alert rate from one threshold to the next.
    bin_widths = -(padded[1:] - padded[:-1])
    weighted = true_alert_rates * bin_widths
    return K.sum(weighted, axis=0)

# PFA: probability of a false alert for a binary classifier.
def binary_PFA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Return (false alerts) / (negative labels) after thresholding ``y_pred``."""
    # 1.0 where the score reaches the threshold, 0.0 elsewhere.
    alerts = K.cast(y_pred >= threshold, 'float32')
    # Alerts raised on negative labels: all alerts minus those on positives.
    false_alerts = K.sum(alerts - alerts * y_true)
    # Total number of negative labels.
    negatives = K.sum(1 - y_true)
    return false_alerts / negatives

# PTA: probability of a true alert for a binary classifier.
def binary_PTA(y_true, y_pred, threshold=K.variable(value=0.5)):
    """Return (correct alerts) / (positive labels) after thresholding ``y_pred``."""
    # 1.0 where the score reaches the threshold, 0.0 elsewhere.
    alerts = K.cast(y_pred >= threshold, 'float32')
    # Alerts raised on positive labels.
    true_alerts = K.sum(alerts * y_true)
    # Total number of positive labels.
    positives = K.sum(y_true)
    return true_alerts / positives
 


In [6]:
def performance(labelArr, predictArr):
    """Compute confusion-matrix statistics for binary labels and predictions.

    ``labelArr[i]`` is the actual class and ``predictArr[i]`` the predicted
    class of sample ``i``; only the values 0 and 1 are counted.

    Returns the tuple
    ``(precision, recall, SN, SP, GM, TP, TN, FP, FN)`` where SN is
    sensitivity, SP is specificity and GM is ``sqrt(recall * SP)``.
    Any ratio whose denominator is zero is reported as 0.
    """
    def _ratio(numerator, denominator):
        # A zero denominator yields 0 instead of raising ZeroDivisionError.
        if denominator == 0:
            return 0
        return numerator / denominator

    TP = TN = FP = FN = 0.
    for idx in range(len(labelArr)):
        actual = labelArr[idx]
        if actual == 1:
            if predictArr[idx] == 1:
                TP += 1.
            elif predictArr[idx] == 0:
                FN += 1.
        elif actual == 0:
            if predictArr[idx] == 1:
                FP += 1.
            elif predictArr[idx] == 0:
                TN += 1.

    SN = _ratio(TP, TP + FN)         # sensitivity = TP / P, with P = TP + FN
    SP = _ratio(TN, FP + TN)         # specificity = TN / N, with N = TN + FP
    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    GM = math.sqrt(recall * SP)
    return precision, recall, SN, SP, GM, TP, TN, FP, FN

In [7]:
# Trim each peptide to its central window and integer-encode the residues.
AA = 'GAVLIFWYDNEKQMSTCPHR'
def pep(path, seq_len):
    """Load peptides from a whitespace-separated text file and encode them.

    Each non-blank line holds the peptide in its first field and an integer
    label in its last field. ``cut = (len(first peptide) - 1 - seq_len) // 2``
    residues are removed from both ends of every peptide (nothing is removed
    when ``cut`` is 0). Each remaining residue is mapped to its index in
    ``AA``, case-insensitively.

    Parameters
    ----------
    path : str
        File to read.
    seq_len : int
        Window parameter used in the ``cut`` formula above.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Encoded peptides and their labels; two empty arrays when the file
        contains no non-blank line.
    """
    # Close the file deterministically and drop blank lines once, up front.
    with open(path) as handle:
        records = [line.split() for line in handle if line.strip() != '']
    if not records:
        return np.array([]), np.array([])

    # The trim width is derived from the first real record.
    cut = (len(records[0][0]) - 1 - seq_len) // 2
    # NOTE(review): residues missing from AA are encoded as 0, the same code
    # as 'G' -- confirm this collision is intended.
    X = [[AA.index(res.upper()) if res.upper() in AA else 0
          for res in (fields[0][cut:-cut] if cut != 0 else fields[0])]
         for fields in records]
    y = [int(fields[-1]) for fields in records]
    return np.array(X), np.array(y)


In [None]:
# Data paths.
# NOTE(review): these are absolute paths on one workstation; adjust them
# before running the notebook elsewhere.
path1 = 'C:/Users/Crow/Desktop/human_data/Step_11_CV/Train.txt'
path2 = 'C:/Users/Crow/Desktop/human_data/Step_11_IND/Independent.txt'
path_train =  'C:/Users/Crow/Desktop/human_data/Step_11_CV/Train_29_EGAAC_cv.txt'
# Drive separator fixed: this path previously started with 'C;' instead of 'C:'.
path_train2 = 'C:/Users/Crow/Desktop/human_data/Step_11_CV/Train_29_EGAAC_gap4.txt'
path_test =  'C:/Users/Crow/Desktop/human_data/Step_11_IND/Test_29_EGAAC_gap4.txt'


plant_train = 'C:/Users/Crow/Desktop/plant_data/Step_11_CV/Train.txt'
plant_test = 'C:/Users/Crow/Desktop/plant_data/Step_11_IND/Independent.txt'

In [None]:
# Rebind path_train / path_test to the EGAAC files under human_data_12.12;
# whatever these names held before this cell is replaced.
path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_CV/EGAAC/Train_29_EGAAC_gap4.txt'
path_test =  'C:/Users/Crow/Desktop/human_data_12.12/Step_11_IND/EGAAC/Test_29_EGAAC_gap4.txt'

In [None]:
# Load and integer-encode the plant training and independent test peptides.
x_plant_train,y_plant_train = pep(plant_train,27)
x_plant_test,y_plant_test = pep(plant_test,27)

In [None]:
# Feature matrices and labels read from the files named by path_train / path_test.
train = read_svm(path_train)
test = read_svm(path_test)

x_train = train[0]
y_train = train[1]

x_test = test[0]
y_test = test[1]

# Integer-encoded peptides. The training and evaluation cells below use
# x_train2 / y_train2 / x_test2 / y_test2, so they must be defined here for
# the notebook to run top to bottom on a fresh kernel.
x_train2,y_train2 = pep(path1,27)
x_test2,y_test2 = pep(path2,27)

In [8]:
def create_cnn_model(neurons=130,window=32,dropout=0.3,input_length=29):
    """Build and compile a four-stage 1D CNN binary classifier.

    Layout: Embedding(22 -> 32) -> 4 x [Conv1D(neurons, window, relu, same)
    -> MaxPooling1D(2) -> Dropout] -> Flatten -> Dense(neurons, relu)
    -> Dropout -> Dense(1, sigmoid). Compiled with binary cross-entropy,
    the Adam optimizer and the accuracy metric.
    """
    net = Sequential()
    net.add(Embedding(22, 32, input_length = input_length))

    # Four identical convolution / pooling / dropout stages.
    for _ in range(4):
        net.add(Conv1D(neurons, window, activation='relu', padding='same'))
        net.add(MaxPooling1D(2))
        net.add(Dropout(dropout))

    # Classifier head.
    net.add(Flatten())
    net.add(Dense(neurons, activation='relu'))
    net.add(Dropout(dropout))
    net.add(Dense(1, activation='sigmoid'))

    net.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    return net




In [9]:
def create_1dcnn_model(neurons=130,window=32,dropout=0.4,input_length=29):
    """Build and compile a single-convolution 1D CNN binary classifier.

    Layout: Embedding(22 -> 32) -> Conv1D(neurons, window, relu, same)
    -> Activation(relu) -> Dropout -> Flatten -> Dropout -> Dense(1, sigmoid).
    Compiled with binary cross-entropy, Adam and the accuracy metric.
    """
    stack = [
        Embedding(22, 32, input_length = input_length),
        Conv1D(neurons, window, activation='relu', padding='same'),
        Activation('relu'),
        Dropout(dropout),
        Flatten(),
        Dropout(dropout),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential()
    for layer in stack:
        net.add(layer)
    net.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    return net

In [10]:
def create_lstm(input_length=29):
    """Build and compile an LSTM binary classifier.

    Layout: Embedding(1024 -> 32) -> LSTM(256) -> Dropout(0.5)
    -> Dense(1, sigmoid). Compiled with binary cross-entropy, Adam and the
    accuracy metric.
    """
    stack = [
        Embedding(1024, 32, input_length = input_length),
        LSTM(256),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential()
    for layer in stack:
        net.add(layer)

    net.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
    return net

In [11]:
def create_bi_lstm(input_length=29):
    """Build and compile a bidirectional-LSTM binary classifier.

    Layout: Embedding(23 -> 32) -> Dropout(0.4)
    -> Bidirectional(LSTM(32, return_sequences=True)) -> Dropout(0.4)
    -> Flatten -> Dense(1, sigmoid). Compiled with binary cross-entropy,
    RMSprop and the accuracy metric.
    """
    stack = [
        Embedding(23, 32, input_length = input_length),
        Dropout(0.4),
        Bidirectional(LSTM(32,return_sequences=True)),
        Dropout(0.4),
        Flatten(),
        Dense(1, activation='sigmoid'),
    ]
    net = Sequential()
    for layer in stack:
        net.add(layer)

    net.compile(loss='binary_crossentropy',
                optimizer='rmsprop',
                metrics=['accuracy'])
    return net

In [12]:
def create_cnn_model2(input_length=29,dropout=0.4, shape=(130, 1)):
    """Build and compile a three-stage 1D CNN binary classifier.

    Layout: Embedding(22 -> 32) -> 3 x [Conv1D(128, 8, relu, same)
    -> MaxPooling1D(2) -> Dropout] -> Flatten -> Dense(1, sigmoid).
    ``shape`` is forwarded as ``input_shape`` to the first Conv1D.
    NOTE(review): that Conv1D follows the Embedding, so the argument
    presumably has no effect on the built model -- confirm.
    """
    net = Sequential()
    net.add(Embedding(22, 32, input_length = input_length))

    for stage in range(3):
        # Only the first convolution receives the input_shape argument.
        extra = {'input_shape': shape} if stage == 0 else {}
        net.add(Conv1D(128, 8, activation='relu', padding='same', **extra))
        net.add(MaxPooling1D(2))
        net.add(Dropout(dropout))

    net.add(Flatten())
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    return net


In [13]:
def create_cnn_model4(input_length=29,dropout=0.4, shape=(130, 1)):
    """Build and compile a three-stage 1D CNN with a small embedding.

    Layout: Embedding(21 -> 5) -> 3 x [Conv1D(128, 8, relu, same)
    -> MaxPooling1D(2) -> Dropout] -> Flatten -> Dense(1, sigmoid).
    ``shape`` is forwarded as ``input_shape`` to the first Conv1D.
    NOTE(review): that Conv1D follows the Embedding, so the argument
    presumably has no effect on the built model -- confirm.
    """
    net = Sequential()
    net.add(Embedding(21, 5, input_length = input_length))

    for stage in range(3):
        # Only the first convolution receives the input_shape argument.
        extra = {'input_shape': shape} if stage == 0 else {}
        net.add(Conv1D(128, 8, activation='relu', padding='same', **extra))
        net.add(MaxPooling1D(2))
        net.add(Dropout(dropout))

    net.add(Flatten())
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    return net


In [17]:
def create_cnn_model5(input_length=29,dropout=0.4):
    """Build and compile a three-stage 1D CNN with a small embedding.

    Layout: Embedding(21 -> 5) -> 3 x [Conv1D(128, 8, relu, same)
    -> MaxPooling1D(2) -> Dropout] -> Flatten -> Dense(1, sigmoid).
    Compiled with binary cross-entropy, Adam and the accuracy metric.
    """
    net = Sequential()
    net.add(Embedding(21, 5, input_length = input_length))

    # Three identical convolution / pooling / dropout stages.
    for _ in range(3):
        net.add(Conv1D(128, 8, activation='relu', padding='same'))
        net.add(MaxPooling1D(2))
        net.add(Dropout(dropout))

    net.add(Flatten())
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    return net


In [16]:
def create_cnn_model6(input_length=29,dropout=0.4):
    """Build and compile a three-stage 1D CNN with shrinking kernels.

    Layout: Embedding(21 -> 5) -> Conv1D stages with kernel sizes 16, 8
    and 4 (128 filters, relu, same padding), each followed by
    MaxPooling1D(2) and Dropout -> Flatten -> Dense(1, sigmoid).
    Compiled with binary cross-entropy, Adam and the accuracy metric.
    """
    net = Sequential()
    net.add(Embedding(21, 5, input_length = input_length))

    for kernel_size in (16, 8, 4):
        net.add(Conv1D(128, kernel_size, activation='relu', padding='same'))
        net.add(MaxPooling1D(2))
        net.add(Dropout(dropout))

    net.add(Flatten())
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    return net


In [None]:
def create_cnn_model3(shape, dropout=0.4):
    """Build and compile a three-stage 1D CNN that has no embedding layer.

    ``shape`` is the ``input_shape`` of the first Conv1D, which is the first
    layer of the model. Layout: 3 x [Conv1D(128, 8, relu, same)
    -> MaxPooling1D(2) -> Dropout] -> Flatten -> Dense(1, sigmoid).
    Compiled with binary cross-entropy, Adam and the accuracy metric.
    """
    net = Sequential()

    for stage in range(3):
        # The first convolution declares the model's input shape.
        extra = {'input_shape': shape} if stage == 0 else {}
        net.add(Conv1D(128, 8, activation='relu', padding='same', **extra))
        net.add(MaxPooling1D(2))
        net.add(Dropout(dropout))

    net.add(Flatten())
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    return net


In [19]:
def write_score(path,pre,label):
    """Write one ``<score>\\t<label>`` line per sample to ``path``.

    ``pre[i]`` is converted with ``str`` and any square brackets are removed,
    so a one-element list or array such as ``[0.73]`` is written as ``0.73``.
    The file is overwritten and is closed even if writing fails.
    """
    with open(path, 'w') as fw:
        for i in range(len(pre)):
            score = str(pre[i]).replace('[','').replace(']','')
            fw.write(score + '\t' + str(label[i]) + '\n')

In [18]:
def mean(a):
    """Return the arithmetic mean of the values in ``a`` (sum divided by length)."""
    total = sum(a)
    count = len(a)
    return total / count

In [None]:
# Baseline run: train the single-convolution model on the encoded human
# peptides, holding out 20% of the training data for validation.
# Requires x_train2 / y_train2 / x_test2 / y_test2 to be loaded already.
model = create_1dcnn_model()
# Early stopping watches the validation loss.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
callbacks_list = [early_stopping]

model.fit(x_train2, y_train2,
                   epochs = 25,
                   batch_size = 32,
                   shuffle=True,validation_split = 0.2,
         callbacks=callbacks_list, verbose=1)
# Report the compiled loss/metrics and the ROC AUC on the independent test set.
print(model.evaluate(x_test2, y_test2, batch_size=256))
pre = model.predict(x_test2)
fpr, tpr, thresholds = roc_curve(y_test2,pre,pos_label=1)
print(sklearn.metrics.auc(fpr, tpr))


In [None]:
## k-fold 
# human_data
# 10-fold cross-validation of create_cnn_model on the encoded human peptides.
# A fresh model is built for every fold, so after the loop `human_model` is
# the model trained on the last fold only.

kf = KFold(n_splits = 10)

for train_index, test_index in kf.split(x_train2):

    x_train3, x_test3 = x_train2[train_index], x_train2[test_index]
    y_train3, y_test3 = y_train2[train_index], y_train2[test_index]
    
    human_model = create_cnn_model(input_length=29)
    #filepath="C:/Users/Crow/Desktop/human_data/CNN/checkpoint-{epoch:02d}e-val_acc_{val_acc:.2f}.hdf5"

    #checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True,mode='max', period=4)
    # The held-out fold is the validation data that early stopping monitors.
    early_stopping = EarlyStopping(monitor='val_loss', patience=4)
    callbacks_list = [early_stopping]

    human_model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 20, batch_size = 512,shuffle=True,
         callbacks=callbacks_list, verbose=1)

In [None]:
# Score the current `human_model` on the independent test set: compiled
# loss/metrics, ROC AUC, and a ROC plot saved to disk.
print(human_model.evaluate(x_test2, y_test2, batch_size=256))
pre = human_model.predict(x_test2)
fpr, tpr, thresholds = roc_curve(y_test2,pre,pos_label=1)
print(sklearn.metrics.auc(fpr, tpr))
draw_ROC_curve(y_test2,pre,savepath='C:/Users/Crow/Desktop/human_data/CNN_k-fold_12.4.png')

In [None]:
# plant_data
# 10-fold cross-validation of create_cnn_model on the encoded plant peptides.
# A fresh model is built for every fold, so after the loop `plant_model` is
# the model trained on the last fold only.

kf = KFold(n_splits = 10)

for train_index, test_index in kf.split(x_plant_train):

    x_train3, x_test3 = x_plant_train[train_index], x_plant_train[test_index]
    y_train3, y_test3 = y_plant_train[train_index], y_plant_train[test_index]

    # Size the input layer from the encoded data instead of hard-coding it,
    # so it always matches the window width that pep() produced.
    plant_model = create_cnn_model(input_length=x_plant_train.shape[1])

    # The held-out fold is the validation data that early stopping monitors.
    early_stopping = EarlyStopping(monitor='val_loss', patience=4)
    callbacks_list = [early_stopping]

    # Train the plant model itself (this call previously fitted human_model,
    # leaving plant_model untrained).
    plant_model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 20, batch_size = 256,shuffle=True,
         callbacks=callbacks_list, verbose=1)


# Score the last fold's model on the independent plant test set.
print(plant_model.evaluate(x_plant_test, y_plant_test, batch_size=256))
pre = plant_model.predict(x_plant_test)
fpr, tpr, thresholds = roc_curve(y_plant_test,pre,pos_label=1)
print(sklearn.metrics.auc(fpr, tpr))
draw_ROC_curve(y_plant_test,pre,savepath='C:/Users/Crow/Desktop/plant_data/CNN_10-fold.png')

In [None]:
## k-fold 
# human_data
# 10-fold cross-validation of create_lstm on the encoded human peptides.
# A fresh model is built for every fold, so after the loop `human_model` is
# the model trained on the last fold only.

kf = KFold(n_splits = 10)

for train_index, test_index in kf.split(x_train2):

    x_train3, x_test3 = x_train2[train_index], x_train2[test_index]
    y_train3, y_test3 = y_train2[train_index], y_train2[test_index]
    
    human_model = create_lstm(input_length=29)
    #filepath="C:/Users/Crow/Desktop/human_data/CNN/checkpoint-{epoch:02d}e-val_acc_{val_acc:.2f}.hdf5"

    #checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True,mode='max', period=4)
    # The held-out fold is the validation data that early stopping monitors.
    early_stopping = EarlyStopping(monitor='val_loss', patience=4)
    callbacks_list = [early_stopping]

    human_model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 20, batch_size = 512,shuffle=True,
         callbacks=callbacks_list, verbose=1)

In [None]:
# Train the bidirectional-LSTM model on the encoded human peptides with a 20%
# validation split, then score it on the independent test set.
human_model = create_bi_lstm(input_length=29)
#human_model = create_lstm(input_length=29)
#filepath="C:/Users/Crow/Desktop/human_data/CNN/checkpoint-{epoch:02d}e-val_acc_{val_acc:.2f}.hdf5"

#checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True,mode='max', period=4)
# Early stopping watches the validation loss.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
callbacks_list = [early_stopping]

human_model.fit(x_train2, y_train2,
                   epochs = 25,
                   batch_size = 32,
                   shuffle=True,validation_split = 0.2,
         callbacks=callbacks_list, verbose=1)
# Compiled loss/metrics, ROC AUC and a saved ROC plot for the test set.
print(human_model.evaluate(x_test2, y_test2, batch_size=256))
pre = human_model.predict(x_test2)
fpr, tpr, thresholds = roc_curve(y_test2,pre,pos_label=1)
print(sklearn.metrics.auc(fpr, tpr))
draw_ROC_curve(y_test2,pre,savepath='C:/Users/Crow/Desktop/human_data/BiLSTM_12.4.png')

In [None]:
# Small bidirectional-LSTM classifier on the encoded human peptides.
# The integer residue codes need an Embedding in front of the LSTM (an LSTM
# cannot consume the 2-D integer matrix directly), and the head must emit one
# probability per peptide to match the one-label-per-peptide targets; a
# TimeDistributed head would emit one value per residue instead.
model = Sequential()
model.add(Embedding(22, 32, input_length = x_train2.shape[1]))
model.add(Bidirectional(LSTM(20)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



# Early stopping watches the validation loss.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
callbacks_list = [early_stopping]

model.fit(x_train2, y_train2,
                   epochs = 25,
                   batch_size = 512,
                   shuffle=True,validation_split = 0.2,
         callbacks=callbacks_list, verbose=1)
# Compiled loss/metrics, ROC AUC and a saved ROC plot for the test set.
# NOTE(review): the plot path is the same one the create_bi_lstm run writes
# to, so this overwrites that figure -- confirm that is intended.
print(model.evaluate(x_test2, y_test2, batch_size=256))
pre = model.predict(x_test2)
fpr, tpr, thresholds = roc_curve(y_test2,pre,pos_label=1)
print(sklearn.metrics.auc(fpr, tpr))
draw_ROC_curve(y_test2,pre,savepath='C:/Users/Crow/Desktop/human_data/BiLSTM_12.4.png')

In [None]:
dropout=0.4

# Three-stage CNN (Embedding(22 -> 32), 3 x Conv1D(128, 8), Dense(1, sigmoid)).
# This is exactly the architecture create_cnn_model2 builds and compiles, so
# reuse the helper instead of repeating the layer stack inline.
model = create_cnn_model2(input_length=29, dropout=dropout)

# Early stopping watches the validation loss.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
callbacks_list = [early_stopping]

model.fit(x_train2, y_train2,
                   epochs = 8,
                   batch_size = 512,
                   shuffle=True,validation_split = 0.2,
         callbacks=callbacks_list, verbose=1)
# Compiled loss/metrics, ROC AUC and a saved ROC plot for the test set.
print(model.evaluate(x_test2, y_test2, batch_size=256))
pre = model.predict(x_test2)
fpr, tpr, thresholds = roc_curve(y_test2,pre,pos_label=1)
print(sklearn.metrics.auc(fpr, tpr))
draw_ROC_curve(y_test2,pre,savepath='C:/Users/Crow/Desktop/human_data/CNN/CNN_12.5.png')



In [None]:
# Persist whichever network is currently bound to `model`.
model.save('C:/Users/Crow/Desktop/human_data/CNN/128_8_0.4_3conv1d_12.5.hdf5')

In [None]:
# Load and integer-encode the plant peptides.
x_plant_train,y_plant_train = pep(plant_train,27)
x_plant_test,y_plant_test = pep(plant_test,27)


dropout=0.5

# Three-stage CNN for the plant data, with a 256-dimensional embedding.
model = Sequential()

model.add(Embedding(22, 256, input_length = 29))
model.add(Conv1D(128, 8, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Dropout(dropout))

model.add(Conv1D(128, 8, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Dropout(dropout))

model.add(Conv1D(128, 8, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Dropout(dropout))

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

#model.add(GlobalAveragePooling1D())

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# Early stopping watches the validation loss.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
callbacks_list = [early_stopping]




# Train with a 20% validation split.
model.fit(x_plant_train, y_plant_train,
                   epochs = 10,
                   batch_size = 512,
                   shuffle=True,validation_split = 0.2,
         callbacks=callbacks_list, verbose=1)
# Compiled loss/metrics, ROC AUC and a saved ROC plot for the plant test set.
print(model.evaluate(x_plant_test, y_plant_test, batch_size=256))
pre = model.predict(x_plant_test)
fpr, tpr, thresholds = roc_curve(y_plant_test,pre,pos_label=1)
print(sklearn.metrics.auc(fpr, tpr))
draw_ROC_curve(y_plant_test,pre,savepath='C:/Users/Crow/Desktop/plant_data/CNN/CNN_12.7.png')



In [None]:
# Print the layer-by-layer summary of whichever network is bound to `model`.
model.summary()


In [None]:
# Persist whichever network is currently bound to `model`.
# NOTE(review): the file name says "0.4" -- confirm it matches the dropout
# rate the saved model was actually built with.
model.save('C:/Users/Crow/Desktop/plant_data/CNN/128_8_0.4_3conv1d_12.5.hdf5')

In [None]:
dropout=0.4

# x_train / x_test are real-valued feature vectors loaded by read_svm().
# An Embedding layer treats its input as integer indices, which would
# discard these values, so the features go straight into the convolution as
# a one-channel sequence of shape (n_features, 1).
x_train_seq = np.expand_dims(x_train, axis=-1)
x_test_seq = np.expand_dims(x_test, axis=-1)

# create_cnn_model3 builds the same 3 x Conv1D(128, 8) stack, without an
# embedding, and takes its input shape from the data.
model = create_cnn_model3(shape=x_train_seq.shape[1:], dropout=dropout)

# Early stopping watches the validation loss.
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
callbacks_list = [early_stopping]

model.fit(x_train_seq, y_train,
                   epochs = 30,
                   batch_size = 512,
                   shuffle=True,validation_split = 0.2,
         callbacks=callbacks_list, verbose=1)
# Compiled loss/metrics, ROC AUC and a saved ROC plot for the test set.
print(model.evaluate(x_test_seq, y_test, batch_size=256))
pre = model.predict(x_test_seq)
fpr, tpr, thresholds = roc_curve(y_test,pre,pos_label=1)
print(sklearn.metrics.auc(fpr, tpr))
draw_ROC_curve(y_test,pre,savepath='C:/Users/Crow/Desktop/human_data/CNN/CNN_12.10.png')



In [None]:
import keras
seed=13
np.random.seed(seed)

# Create a 1-D vector of 25 samples and add a leading batch axis and a
# trailing channel axis, giving data_1d the shape (1, 25, 1) Keras expects.
data_1d = np.random.normal(size=25)
data_1d = np.expand_dims(data_1d, 0)
data_1d = np.expand_dims(data_1d, 2)

# Convolution layer.
filters = 1 # one convolution kernel
kernel_size = 5 # kernel width of 5
convolution_1d_layer = Conv1D(filters, kernel_size, strides=1, padding='valid', input_shape=(25, 1), activation="relu", name="convolution_1d_layer")

# Max-pooling layer.
max_pooling_layer = MaxPool1D(pool_size=5, strides=1, padding="valid", name="max_pooling_layer")

# Flatten layer: reshapes the output to fit the fully connected layer.
reshape_layer = Flatten(name="reshape_layer")

# Fully connected layer.
full_connect_layer = Dense(5, kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.1, seed=seed), bias_initializer="random_normal", use_bias=True, name="full_connect_layer")

# Assemble the model.
model = Sequential()
model.add(convolution_1d_layer)
model.add(max_pooling_layer)
model.add(reshape_layer)
model.add(full_connect_layer)

# Print the output of the full_connect_layer layer.
output = Model(inputs=model.input, outputs=model.get_layer('full_connect_layer').output).predict(data_1d)
print(output)

# Print the network structure. summary() prints by itself and returns None,
# so it is not wrapped in print().
model.summary()

In [None]:
# 10-fold cross-validation of create_cnn_model2 on the encoded human peptides.
# A fresh model is built for every fold, so after the loop `human_model` is
# the model trained on the last fold only.
kf = KFold(n_splits = 10)

for train_index, test_index in kf.split(x_train2):

    x_train3, x_test3 = x_train2[train_index], x_train2[test_index]
    y_train3, y_test3 = y_train2[train_index], y_train2[test_index]
    
    human_model = create_cnn_model2(input_length=29)
    #filepath="C:/Users/Crow/Desktop/human_data/CNN/checkpoint-{epoch:02d}e-val_acc_{val_acc:.2f}.hdf5"

    #checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True,mode='max', period=4)
    # The held-out fold is the validation data that early stopping monitors.
    early_stopping = EarlyStopping(monitor='val_loss', patience=4)
    callbacks_list = [early_stopping]

    human_model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 20, batch_size = 512,shuffle=True,
         callbacks=callbacks_list, verbose=1)

In [None]:
# Score the current `human_model` on the independent test set: compiled
# loss/metrics, ROC AUC, and a ROC plot saved to disk.
print(human_model.evaluate(x_test2, y_test2, batch_size=256))
pre = human_model.predict(x_test2)
fpr, tpr, thresholds = roc_curve(y_test2,pre,pos_label=1)
print(sklearn.metrics.auc(fpr, tpr))
draw_ROC_curve(y_test2,pre,savepath='C:/Users/Crow/Desktop/human_data/CNN_k-fold_12.10.png')

In [None]:
# Load the human_test training, independent and small test sets.
x_train0,y_train0 = pep('C:/Users/Crow/Desktop/human_test/Train.txt',27)
x_test0,y_test0 = pep('C:/Users/Crow/Desktop/human_test/Independent.txt',27)

x_test_small,y_test_small = pep('C:/Users/Crow/Desktop/human_test/human_small.txt',27)


# 10-fold cross-validation of create_cnn_model2.
# NOTE(review): the folds are drawn from x_train2 / y_train2; the x_train0 /
# y_train0 loaded above are not used in this cell -- confirm that is intended.
kf = KFold(n_splits = 10)

for train_index, test_index in kf.split(x_train2):

    x_train3, x_test3 = x_train2[train_index], x_train2[test_index]
    y_train3, y_test3 = y_train2[train_index], y_train2[test_index]
    
    human_model = create_cnn_model2(input_length=29)
    #filepath="C:/Users/Crow/Desktop/human_data/CNN/checkpoint-{epoch:02d}e-val_acc_{val_acc:.2f}.hdf5"

    #checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True,mode='max', period=4)
    # The held-out fold is the validation data that early stopping monitors.
    early_stopping = EarlyStopping(monitor='val_loss', patience=4)
    callbacks_list = [early_stopping]

    human_model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 20, batch_size = 256,shuffle=True,
         callbacks=callbacks_list, verbose=1)
    
# Score the last fold's model on the human_test independent set ...
print(human_model.evaluate(x_test0, y_test0, batch_size=256))
pre = human_model.predict(x_test0)
fpr, tpr, thresholds = roc_curve(y_test0,pre,pos_label=1)
print(sklearn.metrics.auc(fpr, tpr))

# ... and on the small test set.
print(human_model.evaluate(x_test_small, y_test_small, batch_size=256))
pre = human_model.predict(x_test_small)
fpr, tpr, thresholds = roc_curve(y_test_small,pre,pos_label=1)
print(sklearn.metrics.auc(fpr, tpr))



In [None]:
# (Re)load every encoded human data set used by the cells that follow:
# the path1 / path2 train and independent sets, plus the human_test train,
# independent and small sets.
x_train2,y_train2 = pep(path1,27)
x_test2,y_test2 = pep(path2,27)
x_train0,y_train0 = pep('C:/Users/Crow/Desktop/human_test/Train.txt',27)
x_test0,y_test0 = pep('C:/Users/Crow/Desktop/human_test/Independent.txt',27)

x_test_small,y_test_small = pep('C:/Users/Crow/Desktop/human_test/human_small.txt',27)



In [None]:

# model = Sequential()


# #     model.add(Embedding(22, 32, input_length = input_length))
# #     model.add(Conv1D(128, 8, activation='relu', padding='same',  input_shape=(130, 1)))
# #     model.add(MaxPooling1D(2))
# #     model.add(Dropout(dropout))


# model.add(Embedding(22, 32, input_length=29))
# model.add(Convolution1D(256, 8, padding='same',activation='relu'))
# model.add(MaxPool1D(2))
# model.add(Convolution1D(128, 8, padding='same',activation='relu'))
# model.add(MaxPool1D(2))
# model.add(Convolution1D(64, 3, padding='same',activation='relu'))
# model.add(Flatten())
# model.add(Dropout(0.1))
# model.add(BatchNormalization()) # (批)规范化层
# model.add(Dense(256,activation='relu'))
# model.add(Dropout(0.1))
# model.add(Dense(1,activation='softmax'))


# model.compile(loss='binary_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy'])


# model = Sequential()

# model.add(Embedding(22, 32, input_length = 29))
# model.add(Conv1D(128, 8, activation='relu', padding='same'))
# model.add(MaxPooling1D(2))
# model.add(Dropout(dropout))

# model.add(Conv1D(128, 8, activation='relu', padding='same'))
# model.add(MaxPooling1D(2))
# model.add(Dropout(dropout))

# model.add(Conv1D(128, 8, activation='relu', padding='same'))
# model.add(MaxPooling1D(2))
# model.add(Dropout(dropout))

# model.add(Flatten())
# model.add(Dense(1, activation='sigmoid'))
# model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])








# early_stopping = EarlyStopping(monitor='val_loss', patience=5)
# callbacks_list = [early_stopping]




# model.fit(x_train0, y_train0,
#                    epochs = 30,
#                    batch_size = 256,
#                    shuffle=True,validation_split = 0.2,
#          callbacks=callbacks_list, verbose=1)













# 10-fold cross-validation of create_cnn_model4 on the encoded human peptides.
# A fresh model is built for every fold, so after the loop `model` is the
# model trained on the last fold only.
kf = KFold(n_splits = 10)

for train_index, test_index in kf.split(x_train2):

    x_train3, x_test3 = x_train2[train_index], x_train2[test_index]
    y_train3, y_test3 = y_train2[train_index], y_train2[test_index]

    model = create_cnn_model4(input_length=29)

    # The held-out fold is the validation data that early stopping monitors.
    early_stopping = EarlyStopping(monitor='val_loss', patience=4)
    callbacks_list = [early_stopping]

    model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 20, batch_size = 256,shuffle=True,
         callbacks=callbacks_list, verbose=1)


# Report the same metrics for every data set with one loop instead of four
# hand-copied blocks. The small test set is last, so after the loop `pre`,
# `pre1`, `fpr` and `tpr` refer to it, exactly as before.
# ('合并' in the labels means "merged".)
evaluation_sets = [
    ('human 合并 train', x_train2, y_train2),
    ('human 合并test', x_test2, y_test2),
    ('human large test', x_test0, y_test0),
    ('human small', x_test_small, y_test_small),
]

for set_name, x_eval, y_eval in evaluation_sets:
    print(model.evaluate(x_eval, y_eval, batch_size=256))
    # Probabilities feed the ROC curve; hard class labels feed ACC and MCC.
    pre = model.predict(x_eval)
    fpr, tpr, thresholds = roc_curve(y_eval,pre,pos_label=1)
    pre1 = model.predict_classes(x_eval)
    print(set_name)
    print( "ACC:  %f "  %accuracy_score(y_eval,pre1))
    print("AUC: %f" % sklearn.metrics.auc(fpr, tpr))
    print("MCC: %f " %matthews_corrcoef(y_eval,pre1))


In [None]:
# Print the layer-by-layer summary of whichever network is bound to `model`.
model.summary()

In [None]:
# Confusion-matrix statistics for the small test set.
# Assumes `pre1` currently holds class predictions for x_test_small -- verify
# which cell assigned it last.
performance(y_test_small,pre1)

In [None]:
# Persist whichever network is currently bound to `model`.
model.save('c:/Users/Crow/Desktop/human_test/human_large_train.hdf5')

In [None]:
# Training-set metrics for `human_model`.
print(human_model.evaluate(x_train2, y_train2, batch_size=256))
pre = human_model.predict(x_train2)
fpr, tpr, thresholds = roc_curve(y_train2,pre,pos_label=1)
print(sklearn.metrics.auc(fpr, tpr))

# Take the class predictions from the same network that produced `pre`, so
# that AUC, MCC and ACC all describe one model (this call previously used
# `model`, a different network).
pre1 = human_model.predict_classes(x_train2)
print("MCC: %f " %matthews_corrcoef(y_train2,pre1))
print( "ACC:  %f "  %accuracy_score(y_train2,pre1))
sklearn.metrics.accuracy_score(y_train2,pre1)

In [None]:
# Every network in this notebook ends in Dense(1, sigmoid), so `pre` has a
# single probability column and an argmax over axis 1 would always return 0.
# Turn the probabilities into class labels by thresholding at 0.5 instead.
pre1 = (pre > 0.5).astype('int32')

In [None]:
# Display the current class predictions.
pre1

In [None]:
# Matthews correlation coefficient and accuracy of `pre1` against the
# training labels.
print("MCC: %f " %matthews_corrcoef(y_train2,pre1))


print( "ACC:  %f "  %accuracy_score(y_train2,pre1))

In [None]:
# Confusion-matrix statistics for the small test set.
# NOTE(review): the neighbouring cells score `pre1` against y_train2; confirm
# that `pre1` really holds predictions for x_test_small at this point.
performance(y_test_small,pre1)

In [None]:
# Compiled loss/metrics of `human_model` on the human_test independent set.
human_model.evaluate(x_test0, y_test0, batch_size=256)


In [None]:
#y_cat = np.argmax(y_train2, axis = 1) # convert one hot array to integers
# Stratified 10-fold cross-validation of create_cnn_model2: the folds are
# split on both the features and the labels. A fresh model is built for
# every fold, so after the loop `model` is the model from the last fold only.
kf = StratifiedKFold(n_splits = 10)

for train_index, test_index in kf.split(x_train2, y_train2):

   
    x_train3, x_test3 = x_train2[train_index], x_train2[test_index]
    y_train3, y_test3 = y_train2[train_index], y_train2[test_index]
    model = create_cnn_model2(input_length=29)
    
    # The held-out fold is the validation data that early stopping monitors.
    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    callbacks_list = [early_stopping]
    #model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 1, batch_size = 64)
    model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 20, batch_size = 256,shuffle=True,
         callbacks=callbacks_list, verbose=1)
    
    
    

# --- Training-set report ('合并' in the labels means "merged") ---
# Probabilities feed the ROC curve; hard class labels feed the other metrics.
pre = model.predict(x_train2)
fpr, tpr, thresholds = roc_curve(y_train2,pre,pos_label=1)
pre1 = model.predict_classes(x_train2)
precision,recall,SN,SP,GM,TP,TN,FP,FN = performance(y_train2,pre1)
print('human 合并 train')
print(model.evaluate(x_train2, y_train2, batch_size=256))
print("ACC:  %f "  %accuracy_score(y_train2,pre1))
print("Sn: %f" %SN)
print("Sp: %f" %SP)
print("MCC: %f " %matthews_corrcoef(y_train2,pre1))
print("AUC: %f" % auc(fpr, tpr))

# Save the ROC plot (the third argument is the save path) and the raw scores.
draw_ROC_curve(y_train2,pre,'human 合并 train')
write_score('C:/Users/Crow/Desktop/human_data_12.12/Step_11_CV/Train_keras_result.txt',pre,y_train2)



# --- Independent-test-set report ---
pre = model.predict(x_test2)
fpr, tpr, thresholds = roc_curve(y_test2,pre,pos_label=1)
pre2 = model.predict_classes(x_test2)
precision,recall,SN,SP,GM,TP,TN,FP,FN = performance(y_test2,pre2)
print('human 合并test')
print(model.evaluate(x_test2, y_test2, batch_size=256))
print("ACC:  %f "  %accuracy_score(y_test2,pre2))
print("Sn: %f" %SN)
print("Sp: %f" %SP)
print("MCC: %f " %matthews_corrcoef(y_test2,pre2))
print("AUC: %f" % auc(fpr, tpr))

# Save the ROC plot (the third argument is the save path) and the raw scores.
draw_ROC_curve(y_test2,pre,'human 合并test')
write_score('C:/Users/Crow/Desktop/human_data_12.12/Step_11_IND/Test_keras_result.txt',pre,y_test2)

# print(model.evaluate(x_test0, y_test0, batch_size=256))
# pre = model.predict(x_test0)
# fpr, tpr, thresholds = roc_curve(y_test0,pre,pos_label=1)
# pre3 = model.predict_classes(x_test0)
# print('human large test')
# print( "ACC:  %f "  %accuracy_score(y_test0,pre3))
# print("AUC: %f" % sklearn.metrics.auc(fpr, tpr))
# print("MCC: %f " %matthews_corrcoef(y_test0,pre3))
# #draw_ROC_curve(y_test0,pre3,'human large test')

# print(model.evaluate(x_test_small, y_test_small, batch_size=256))
# pre = model.predict(x_test_small)
# fpr, tpr, thresholds = roc_curve(y_test_small,pre,pos_label=1)
# pre4 = model.predict_classes(x_test_small)
# print('human mall')
# print( "ACC:  %f "  %accuracy_score(y_test_small,pre4))
# print("AUC: %f" % sklearn.metrics.auc(fpr, tpr))
# print("MCC: %f " %matthews_corrcoef(y_test_small,pre4))
#draw_ROC_curve(y_test_small,pre4,'human mall')

In [None]:
# Evaluate the current `model` on (x_test_small, y_test_small) and print ACC / AUC / MCC.
# Relies on `model`, `x_test_small` and `y_test_small` already existing in the kernel.
print(model.evaluate(x_test_small, y_test_small, batch_size=256))
pre = model.predict(x_test_small)  # raw model outputs, used as ROC scores
fpr, tpr, thresholds = roc_curve(y_test_small,pre,pos_label=1)
pre4 = model.predict_classes(x_test_small)  # predicted class labels, used for ACC / MCC
print('human mall')
print( "ACC:  %f "  %accuracy_score(y_test_small,pre4))
print("AUC: %f" % sklearn.metrics.auc(fpr, tpr))
print("MCC: %f " %matthews_corrcoef(y_test_small,pre4))
# The third argument of draw_ROC_curve is the path the figure is saved to.
draw_ROC_curve(y_test_small,pre,'human mall')

In [None]:
# Peek at the first five raw model outputs for x_test_small.
model.predict(x_test_small)[0:5]

In [None]:
# Display the predicted class labels for x_test_small.
model.predict_classes(x_test_small)

In [None]:
# Metrics for the x_test_small predictions.
# `pre4` holds predict_classes(x_test_small); `pre1` is computed on a different
# input set by an earlier cell, so it does not line up with y_test_small.
precision, recall, SN, SP, GM, TP, TN, FP, FN = performance(y_test_small,pre4)

In [None]:
# Display whatever is currently stored in `pre1` (set by an earlier cell).
pre1

In [None]:
# Display the full metric tuple for the x_test_small predictions.
# `pre4` (not `pre1`) is the predict_classes output that matches y_test_small.
performance(y_test_small,pre4)

In [None]:
from itertools import cycle
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


def plot_roc(y_pred, y_true, classes=None, title=None, savefile=None):
    """Plot one ROC curve per class and return the per-class AUC values.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Predicted scores; 1-D for a single class, or 2-D with one column per class.
    y_true : numpy.ndarray
        Ground-truth indicators laid out like ``y_pred``.
    classes : sequence of str, optional
        Legend label for each column. Defaults to ``class1``, ``class2``, ...
    title : str, optional
        Figure title. Defaults to ``'ROC curves for all classes'``.
    savefile : str, optional
        When given, the figure is also written to this path at 300 dpi.

    Returns
    -------
    dict
        Maps the column index of each class to its area under the ROC curve.

    Raises
    ------
    ValueError
        If ``classes`` is given but its length differs from the number of columns.
    """
    # Promote 1-D inputs to a single column so the per-column loop below applies.
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape(y_pred.shape + (1,))
    if y_true.ndim == 1:
        y_true = y_true.reshape(y_true.shape + (1,))

    n_classes = y_pred.shape[1]

    if classes is None:
        legends = ['class' + str(j + 1) for j in range(n_classes)]
    elif len(classes) == n_classes:
        legends = classes
    else:
        raise ValueError("Number of classes doesn't match labels")

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # The palette repeats if there are more classes than colours.
    colors = cycle(['darkorange', 'cornflowerblue', 'navy', 'aqua'])

    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color,
                 label='ROC curve of {0} (area = {1:0.4f})'
                 ''.format(legends[i], roc_auc[i]))
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if title:
        plt.title(title)
    else:
        plt.title('ROC curves for all classes')
    plt.legend(loc="lower right")
    if savefile:
        plt.savefig(savefile, dpi=300)
    # Return the computed AUCs, not the sklearn `roc_auc_score` function object.
    return roc_auc

In [None]:
# ROC curve for x_test_small, saved to the file 's'.
# Uses `pre` (the model scores for x_test_small); `pre1` holds class labels
# predicted for a different input set and does not match y_test_small.
draw_ROC_curve(y_test_small,pre,'s')

In [None]:
# Test run: train create_cnn_model2 on (x_train2, y_train2) and score it on (x_test2, y_test2).

model = create_cnn_model2(input_length=29)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# Stop training once val_loss has gone 4 epochs without improving.
early_stopping = EarlyStopping(monitor='val_loss', patience=4)
callbacks_list = [early_stopping]
#model.fit(x_train, y_train, validation_data = (x_test, y_test), epochs = 1, batch_size = 64)
# validation_split=0.2 makes Keras hold out 20% of the training data for validation.
model.fit(x_train2, y_train2,
                   epochs = 8,
                   batch_size = 512,
                   shuffle=True,validation_split = 0.2,
         callbacks=callbacks_list, verbose=1)

print(model.evaluate(x_test2, y_test2, batch_size=256))
pre = model.predict(x_test2)  # raw model outputs, used as ROC scores
pre2 = model.predict_classes(x_test2)  # predicted class labels
fpr, tpr, thresholds = roc_curve(y_test2,pre,pos_label=1)
# NOTE(review): the AUC is printed twice (here and in the labelled line below).
print(sklearn.metrics.auc(fpr, tpr))
# Only SN and SP are kept from the 9-tuple returned by performance().
_,_,SN,SP,_,_,_,_,_ = performance(y_test2,pre2)
print("ACC:  %f "  %accuracy_score(y_test2,pre2))
print("Sn: %f" %SN)
print("Sp: %f" %SP)
print("MCC: %f " %matthews_corrcoef(y_test2,pre2))
print("AUC: %f" % auc(fpr, tpr))


In [None]:
# Display the scores currently stored in `pre`.
pre

In [None]:
# Dump one "<score>\t<label>" line per sample of (pre, y_test2).
# The `with` block closes the file even if a write fails part-way through.
with open('C:/Users/Crow/Desktop/human_data_12.12/record.txt', 'w') as fw:
    for i in range(0,len(pre)):
        # str() of a one-element array looks like "[0.123]"; strip the brackets.
        fw.write(str(pre[i]).replace('[','').replace(']',''))
        fw.write('\t')
        fw.write(str(y_test2[i]))
        fw.write('\n')

In [None]:
# Print small floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [None]:
import matplotlib
import numpy as np  
import matplotlib.pyplot as plt  
# Precision-recall curve for the latest test-set scores.
# The earlier version plotted `Sn` against `Sp`: that is sensitivity vs.
# specificity (not precision), and two scalars draw no visible line.
# NOTE(review): assumes `pre` still holds model.predict(x_test2) so that it lines
# up with y_test2, as in the record-dump cell above - confirm before relying on it.
Precison, Recall, pr_thresholds = precision_recall_curve(y_test2, np.ravel(pre))
plt.figure()
plt.ylim(0,1.1)
plt.xlabel("Recall")
plt.xlim(0,1.1)
plt.ylabel("Precision")
plt.plot(Recall,Precison)
plt.show()

In [None]:
# Feature files for the training and independent-test sets
# (EAAC encoding, "29" and "gap5" per the file names).
path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_CV/EAAC/Train_29_EAAC_gap5.txt'
path_test =  'C:/Users/Crow/Desktop/human_data_12.12/Step_11_IND/EAAC/Test_29_EAAC_gap5.txt'

# read_svm returns a (features, labels) pair of numpy arrays.
train = read_svm(path_train)
test = read_svm(path_test)

x_train = train[0]
y_train = train[1]

x_test = test[0]
y_test = test[1]


In [None]:
# Append a trailing length-1 axis to both feature arrays.
# NOTE(review): the bare `x_train.shape` is not the cell's last expression, so nothing is displayed.
# NOTE(review): not idempotent - re-running this cell adds yet another axis.
x_train.shape
x_train = np.expand_dims(x_train, axis=2) 
x_test = np.expand_dims(x_test, axis=2) 


In [None]:
# Per-sample shape of x_train2 (all axes after the first).
x_train2.shape[1:]

In [None]:
# Encoding test: three Conv1D -> MaxPooling1D -> Dropout stages followed by a sigmoid output unit,
# trained on (x_train, y_train) and scored on (x_test, y_test).

dropout=0.4

model = Sequential()

#model.add(Embedding(22, 32, input_length = 500))
# NOTE(review): input_shape is hard-coded to (500,1); presumably x_train has 500 features per sample - confirm.
model.add(Conv1D(128, 8, activation='relu', padding='same',input_shape=(500,1)))
model.add(MaxPooling1D(2))
model.add(Dropout(dropout))

model.add(Conv1D(128, 8, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Dropout(dropout))

model.add(Conv1D(128, 8, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Dropout(dropout))

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])


# Stop training once val_loss has gone 4 epochs without improving.
early_stopping = EarlyStopping(monitor='val_loss', patience=4)
callbacks_list = [early_stopping]




model.fit(x_train, y_train,
                   epochs = 20,
                   batch_size = 512,
                   shuffle=True,validation_split = 0.2,
         callbacks=callbacks_list, verbose=1)


pre = model.predict(x_test)  # raw model outputs, used as ROC scores
fpr, tpr, thresholds = roc_curve(y_test,pre,pos_label=1)
pre1 = model.predict_classes(x_test)  # predicted class labels
precision,recall,SN,SP,GM,TP,TN,FP,FN = performance(y_test,pre1)
# NOTE(review): the label (and the ROC save path below) say "train", but the metrics are
# computed on x_test / y_test - confirm the intended label and output file.
print('human 合并 train')
print(model.evaluate(x_test, y_test, batch_size=256))
print("ACC:  %f "  %accuracy_score(y_test,pre1))
print("Sn: %f" %SN)
print("Sp: %f" %SP)
print("MCC: %f " %matthews_corrcoef(y_test,pre1))
print("AUC: %f" % auc(fpr, tpr))

draw_ROC_curve(y_test,pre,'human 合并 train')




In [None]:
# Scratch array of shape (10, 8, 16) filled with 1..1280 as float32.
# NOTE(review): the name `input` shadows the Python builtin for the rest of the kernel session.
input = np.array(np.arange(1, 1+10*8*16).reshape([10, 8, 16]), dtype=np.float32)
print(input.shape)

In [None]:
# Keras: loop over window sizes for one encoding and write the prediction scores to result files.
name = 'ZSCALE'
gap = ''
#q = [21,23,25]
#q = [21,23,25,27,29,31,35,37]
q=[29]
for t in q :
    path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_CV/'+ name +'/Train_'+str(t)+'_'+ name + gap +'.txt'
    path_test = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_IND/'+ name +'/Test_'+str(t)+'_'+ name + gap +'.txt'
    train = read_svm(path_train)
    test = read_svm(path_test)


    x_train = train[0]
    y_train = train[1]

    x_test = test[0]
    y_test = test[1]
    # Append a trailing length-1 axis; `shape` is the per-sample shape passed to the model builder.
    x_train = np.expand_dims(x_train, axis=2) 
    x_test = np.expand_dims(x_test, axis=2) 
    shape = x_train.shape[1:]
    
    # k-fold
    kf = StratifiedKFold(n_splits = 10)

    for train_index, test_index in kf.split(x_train, y_train):
        x_train3, x_test3 = x_train[train_index], x_train[test_index]
        y_train3, y_test3 = y_train[train_index], y_train[test_index]
        
        # A fresh model is built for every fold; the held-out fold is used as validation data.
        model = create_cnn_model3(shape=shape)
    
        early_stopping = EarlyStopping(monitor='val_loss', patience=3)
        callbacks_list = [early_stopping]
        model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 20, batch_size = 256,
                  shuffle=True,callbacks=callbacks_list, verbose=0)
    
    # NOTE(review): `model` here is only the model from the LAST fold; the other nine are discarded,
    # and the "train" scores below come from a model that was fitted on most of x_train.
    print(t)
    pre = model.predict(x_train)
    write_score('C:/Users/Crow/Desktop/human_data_12.12/Step_11_CV/'+ name +'/Train_keras_'+ str(t) + '_'+ name + gap+'_result.txt',pre,y_train)



    pre1 = model.predict(x_test)
    write_score('C:/Users/Crow/Desktop/human_data_12.12/Step_11_IND/'+ name +'/Test_keras_'+ str(t) + '_'+ name+ gap+'_result.txt',pre1,y_test)

    

In [None]:
# Keras: loop over window sizes for one encoding, then report metrics at fixed score cutoffs.
name = 'EGAAC'
gap = '_gap4'
#q = [21,23,25]
q = [21,23,25,27,29,31,35,37]
# One cutoff per entry of `q` (same order); `c` indexes into both lists.
test_cutoff = [0.267451,0.257024,0.240165,0.25257,0.230415,0.207153,0.23201,0.243988]
train_cutoff = [0.260379,0.254653,0.235919,0.24859,0.225346,0.198662,0.224679,0.239181]
c = 0
for t in q :
    path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_CV/'+ name +'/Train_'+str(t)+'_'+ name + gap +'.txt'
    path_test = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_IND/'+ name +'/Test_'+str(t)+'_'+ name + gap +'.txt'
    train = read_svm(path_train)
    test = read_svm(path_test)


    x_train = train[0]
    y_train = train[1]

    x_test = test[0]
    y_test = test[1]
    x_train = np.expand_dims(x_train, axis=2) 
    x_test = np.expand_dims(x_test, axis=2) 
    shape = x_train.shape[1:]
    
    # k-fold
    kf = StratifiedKFold(n_splits = 10)

    for train_index, test_index in kf.split(x_train, y_train):
        x_train3, x_test3 = x_train[train_index], x_train[test_index]
        y_train3, y_test3 = y_train[train_index], y_train[test_index]
        
        model = create_cnn_model3(shape=shape)
    
        early_stopping = EarlyStopping(monitor='val_loss', patience=3)
        callbacks_list = [early_stopping]
        model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 20, batch_size = 256,
                  shuffle=True,callbacks=callbacks_list, verbose=0)
    
    
    # NOTE(review): from here on `model` is the model of the last fold only.

    pre1 = model.predict(x_test)
    # predict_classes only supplies an array of the right shape; every entry is overwritten below.
    test_pred = model.predict_classes(x_test)
    for i in range(0,len(pre1)):
        if pre1[i] > test_cutoff[c]:       
            test_pred[i] = 1
        else:
            test_pred[i] = 0
    fpr, tpr, thresholds = roc_curve(y_test,pre1,pos_label=1)
    precision,recall,SN,SP,GM,TP,TN,FP,FN = performance(y_test,test_pred)
    print(t)
    print('test:')
    print("ACC:  %f "  %accuracy_score(y_test,test_pred))
    print("Sn: %f" %SN)
    print("Sp: %f" %SP)
    print("MCC: %f " %matthews_corrcoef(y_test,test_pred))
    # The AUC is printed twice, computed two different ways.
    print("AUC: %f" % auc(fpr, tpr))
    print("AUC: %f" % roc_auc_score(y_test,pre1))
    
    
    # Same report for the training set, using the training cutoff.
    pre = model.predict(x_train)
    train_pred = model.predict_classes(x_train)
    for i in range(0,len(pre)):
        if pre[i] > train_cutoff[c]:       
            train_pred[i] = 1
        else:
            train_pred[i] = 0
    fpr, tpr, thresholds = roc_curve(y_train,pre,pos_label=1)
    precision,recall,SN,SP,GM,TP,TN,FP,FN = performance(y_train,train_pred)
    print(t)
    print('train:')
    print("ACC:  %f "  %accuracy_score(y_train,train_pred))
    print("Sn: %f" %SN)
    print("Sp: %f" %SP)
    print("MCC: %f " %matthews_corrcoef(y_train,train_pred))
    print("AUC: %f" % auc(fpr, tpr))
    print("AUC: %f" % roc_auc_score(y_train,pre))
    
    c+=1
    

In [None]:
# Display the scores currently stored in `pre`.
pre

In [None]:
# Re-score previously written ZSCALE result files at fixed cutoffs (no training in this cell).
name = 'ZSCALE'
gap = ''
#q = [21,23,25]
q = [21,23,25,27,29,31,35,37]
# One cutoff per entry of `q` (same order); `c` indexes into both lists.
test_cutoff = [0.277751,0.316778,0.264698,0.259903,0.3181,0.287056,0.285303,0.280947]
train_cutoff = [0.269478,0.306824,0.255314,0.250627,0.306797,0.28015,0.27627,0.271199]


c = 0
for t in q :
    path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_CV/'+ name +'/Train_'+str(t)+'_'+ name + gap +'.txt'
    path_test = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_IND/'+ name +'/Test_'+str(t)+'_'+ name + gap +'.txt'
    train = read_svm(path_train)
    test = read_svm(path_test)


    # Only the labels (y_train / y_test) are used below; the features are loaded but not read.
    x_train = train[0]
    y_train = train[1]

    x_test = test[0]
    y_test = test[1]
    
    path_test_result = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_IND/'+ name +'/Test_keras_'+ str(t) + '_'+ name+ gap+'_result.txt'
    path_train_result = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_CV/'+ name +'/Train_keras_'+ str(t) + '_'+ name + gap+'_result.txt'
    
    test_result = pd.read_table(path_test_result, header=None)

    train_result = pd.read_table(path_train_result, header=None)
    
    # Column 0 is used as the score; column 1 only seeds an array that is overwritten below.
    test_result_score = test_result[0]
    test_pred = np.array(test_result[1]) 
    
    train_result_score = train_result[0]
    train_pred = np.array(train_result[1])
    
    
    # Threshold the test scores at this window size's cutoff.
    for i in range(0,len(test_result_score)):
        if test_result_score[i] > test_cutoff[c]:       
            test_pred[i] = 1
        else:
            test_pred[i] = 0
    fpr, tpr, thresholds = roc_curve(y_test,test_result_score,pos_label=1)
    precision,recall,SN,SP,GM,TP,TN,FP,FN = performance(y_test,test_pred)
    print(t)
    print('test:')
    print("ACC:  %f "  %accuracy_score(y_test,test_pred))
    print("Sn: %f" %SN)
    print("Sp: %f" %SP)
    print("MCC: %f " %matthews_corrcoef(y_test,test_pred))
    print("AUC: %f" % auc(fpr, tpr))
    print("AUC: %f" % roc_auc_score(y_test,test_result_score))
    
    
    # Same report for the training scores.
    for i in range(0,len(train_result_score)):
        if train_result_score[i] > train_cutoff[c]:       
            train_pred[i] = 1
        else:
            train_pred[i] = 0
    fpr, tpr, thresholds = roc_curve(y_train,train_result_score,pos_label=1)
    precision,recall,SN,SP,GM,TP,TN,FP,FN = performance(y_train,train_pred)
    print(t)
    print('train:')
    print("ACC:  %f "  %accuracy_score(y_train,train_pred))
    print("Sn: %f" %SN)
    print("Sp: %f" %SP)
    print("MCC: %f " %matthews_corrcoef(y_train,train_pred))
    print("AUC: %f" % auc(fpr, tpr))
    print("AUC: %f" % roc_auc_score(y_train,train_result_score))
    
    c+=1

In [None]:
# Instantiate create_cnn_model4 with its default arguments (rebinds the global `model`).
model = create_cnn_model4()

In [None]:
# Print the layer-by-layer summary of the current `model`.
model.summary()

In [None]:
# Word embedding, sliding windows.
# Keras: loop over window sizes and write the prediction scores to result files.
name = 'Embedding'
gap = ''
#q = [21,23,25]
q = [21,23,25,27,29,31,35,37]
#q=[29]
for t in q :
    path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Train.txt'
    path_test =  'C:/Users/Crow/Desktop/human_data_12.12/Independent.txt'

    # pep() is called with t-2 while the model is built with input_length=t.
    x_train,y_train = pep(path_train,t-2)
    x_test,y_test = pep(path_test,t-2)


    
    # NOTE(review): `shape` is computed but not used in this cell.
    shape = x_train.shape[1:]
    
    # k-fold
    kf = StratifiedKFold(n_splits = 10)

    for train_index, test_index in kf.split(x_train, y_train):
        x_train3, x_test3 = x_train[train_index], x_train[test_index]
        y_train3, y_test3 = y_train[train_index], y_train[test_index]
        
        model = create_cnn_model5(input_length=t)
    
        early_stopping = EarlyStopping(monitor='val_loss', patience=3)
        callbacks_list = [early_stopping]
        model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 20, batch_size = 256,
                  shuffle=True,callbacks=callbacks_list, verbose=0)
    
    # NOTE(review): `model` here is only the model from the LAST fold; the other nine are discarded.
    print(t)
    pre = model.predict(x_train)
    write_score('C:/Users/Crow/Desktop/human_data_12.12/Step_11_CV/'+ name +'/Train_keras_'+ str(t) + '_'+ name + gap+'_result.txt',pre,y_train)


    pre1 = model.predict(x_test)
    write_score('C:/Users/Crow/Desktop/human_data_12.12/Step_11_IND/'+ name +'/Test_keras_'+ str(t) + '_'+ name+ gap+'_result.txt',pre1,y_test)

    

In [None]:
# Word embedding, sliding windows.
# Re-score previously written Embedding result files at fixed cutoffs (no training in this cell).
name = 'Embedding'
gap = ''
q = [21,23,25,27,29,31,35,37]
# One cutoff per entry of `q` (same order); `c` indexes into both lists.
test_cutoff = [0.273422,0.268753,0.210156,0.272848,0.258505,0.300157,0.310499,0.201006]
train_cutoff = [0.26536,0.25023,0.203575,0.250609,0.235935,0.27466,0.287076,0.184949]

c = 0
for t in q :
    path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Train.txt'
    path_test =  'C:/Users/Crow/Desktop/human_data_12.12/Independent.txt'

    # Only the labels (y_train / y_test) are used below.
    x_train,y_train = pep(path_train,t-2)
    x_test,y_test = pep(path_test,t-2)




    
    path_train_result = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_CV/'+ name +'/Train_keras_'+ str(t) + '_'+ name + gap+'_result.txt'
    path_test_result = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_IND/'+ name +'/Test_keras_'+ str(t) + '_'+ name+ gap+'_result.txt'
    
    test_result = pd.read_table(path_test_result, header=None)

    train_result = pd.read_table(path_train_result, header=None)
    
    # Column 0 is used as the score; column 1 only seeds an array that is overwritten below.
    test_result_score = test_result[0]
    test_pred = np.array(test_result[1]) 
    
    train_result_score = train_result[0]
    train_pred = np.array(train_result[1])
    
    
    # Threshold the test scores at this window size's cutoff.
    for i in range(0,len(test_result_score)):
        if test_result_score[i] > test_cutoff[c]:       
            test_pred[i] = 1
        else:
            test_pred[i] = 0
    fpr, tpr, thresholds = roc_curve(y_test,test_result_score,pos_label=1)
    precision,recall,SN,SP,GM,TP,TN,FP,FN = performance(y_test,test_pred)
    print(t)
    print('test:')
    print("ACC:  %f "  %accuracy_score(y_test,test_pred))
    print("Sn: %f" %SN)
    print("Sp: %f" %SP)
    print("MCC: %f " %matthews_corrcoef(y_test,test_pred))
    print("AUC: %f" % auc(fpr, tpr))
    print("AUC: %f" % roc_auc_score(y_test,test_result_score))
    
    
    # Same report for the training scores.
    for i in range(0,len(train_result_score)):
        if train_result_score[i] > train_cutoff[c]:       
            train_pred[i] = 1
        else:
            train_pred[i] = 0
    fpr, tpr, thresholds = roc_curve(y_train,train_result_score,pos_label=1)
    precision,recall,SN,SP,GM,TP,TN,FP,FN = performance(y_train,train_pred)
    print(t)
    print('train:')
    print("ACC:  %f "  %accuracy_score(y_train,train_pred))
    print("Sn: %f" %SN)
    print("Sp: %f" %SP)
    print("MCC: %f " %matthews_corrcoef(y_train,train_pred))
    print("AUC: %f" % auc(fpr, tpr))
    print("AUC: %f" % roc_auc_score(y_train,train_result_score))
    
    c+=1

In [None]:
# 10-fold cross-validation of create_cnn_model5 on the Embedding input.
# Each fold writes its held-out scores to a file; after the last fold the model
# is saved and scored on the independent test set.
name = 'Embedding'
gap = ''

# Load the data.
path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Train.txt'
path_test =  'C:/Users/Crow/Desktop/human_data_12.12/Independent.txt'

x_train,y_train = pep(path_train,29-2)
x_test,y_test = pep(path_test,29-2)

shape = x_train.shape[1:]

kf = KFold(n_splits = 10,random_state=5,shuffle=True)
j = 1
for train_index, test_index in kf.split(x_train):
    x_train3, x_test3 = x_train[train_index], x_train[test_index]
    y_train3, y_test3 = y_train[train_index], y_train[test_index]

    # A fresh model per fold; the held-out fold doubles as validation data.
    model = create_cnn_model5(input_length=29)

    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    callbacks_list = [early_stopping]
    model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 20, batch_size = 256,
              shuffle=True,callbacks=callbacks_list, verbose=0)

    test_pred_proba = model.predict(x_test3)
    fpr, tpr, thresholds = roc_curve(y_test3,test_pred_proba,pos_label=1)
    print("AUC: %f" % auc(fpr, tpr))

    # One "<score>\t<label>" line per held-out sample; `with` closes the file on error too.
    with open('C:/Users/Crow/Desktop/result/29_kfold_CNN_'+ name + gap+'_result_'+ str(j) +'.txt','w') as fw:
        for t in range(0,len(test_pred_proba)):
            fw.write(str(test_pred_proba[t][0]))
            fw.write('\t')
            fw.write(str(y_test3[t]))
            fw.write('\n')

    j+=1

# After the loop `model` is the model of the final fold. This replaces the former
# hard-coded `if j == 10:` check, so it stays correct if n_splits changes.
model.save('C:/Users/Crow/Desktop/result/model/CNN_kfold_'+ name + gap +'.h5')

test_pred_proba = model.predict(x_test)
fpr, tpr, thresholds = roc_curve(y_test,test_pred_proba,pos_label=1)
print("总AUC: %f" % auc(fpr, tpr))
with open('C:/Users/Crow/Desktop/result/29_kfold_CNN_'+ name + gap +'_result.txt','w') as fw:
    for t in range(0,len(test_pred_proba)):
        fw.write(str(test_pred_proba[t][0]))
        fw.write('\t')
        fw.write(str(y_test[t]))
        fw.write('\n')
    

In [None]:
from keras.utils.vis_utils import plot_model
# Build create_cnn_model4 with input_length=29 and render its layer graph (with shapes) to a PNG.
# NOTE(review): this rebinds the global `model`, replacing any trained model from earlier cells.
model = create_cnn_model4(input_length=29)
plot_model(model, to_file='C://Users/Crow/Desktop/model1.png',show_shapes=True)

In [None]:
# Histone training dataset: load the sheet of positive samples.
histone_data_p = pd.read_excel('C:/Users/Crow/Desktop/human_data_12.12/histone_train.xlsx',sheet_name='组蛋白正样本')

In [None]:
# Histone training dataset: load the sheet of negative samples.
histone_data_n = pd.read_excel('C:/Users/Crow/Desktop/human_data_12.12/histone_train.xlsx',sheet_name='组蛋白负样本')


In [None]:
# Stack the positive and negative histone samples into one frame, fix the column order and export.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
histone_data = pd.concat([histone_data_p, histone_data_n])
order = ['Sequences', 'Protein ID', 'Site', 'Label']
histone_data = histone_data[order]
histone_data.to_csv('C:/Users/Crow/Desktop/1111.csv')

In [None]:
# Load the saved model used to predict on the histone data.
model = load_model('C:/Users/Crow/Desktop/result/model/CNN_kfold_Embedding.h5')

In [None]:
# Score the current `model` on the histone file and print ACC / AUC.
path_test =  'C:/Users/Crow/Desktop/histone_train.txt'
x_test,y_test = pep(path_test,29-2)
test_pred_proba = model.predict(x_test)  # raw model outputs, used as ROC scores
test_pred = model.predict_classes(x_test)  # predicted class labels, used for ACC
fpr, tpr, thresholds = roc_curve(y_test,test_pred_proba,pos_label=1)
print("ACC:  %f "  %accuracy_score(y_test,test_pred))
print("AUC: %f" % auc(fpr, tpr))


In [None]:
# Histone vs. non-histone training data: load the tables needed to find duplicates and write them to a file.
path_test =  'C:/Users/Crow/Desktop/histone_train.txt'
histone_data = pd.read_table(path_test,names= ['Sequences', 'Protein ID', 'Site', 'Label'])

path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Train.txt'
path_test2 =  'C:/Users/Crow/Desktop/human_data_12.12/Independent.txt'
human_train = pd.read_table(path_train,names= ['Sequences', 'Protein ID', 'Site', 'Label'])
human_test = pd.read_table(path_test2,names= ['Sequences', 'Protein ID', 'Site', 'Label'])
# Write the duplicated rows to a file (disabled):
# a = human_train.append(histone_data)
# #a = histone_data.append(human_train)
# dIndex = a.duplicated(['Protein ID', 'Site'])
# b = a[dIndex]
# b.to_excel('C:/Users/Crow/Desktop/hhh.xlsx')

In [None]:
# Drop the 50 histone records that duplicate non-histone records, then add the remaining
# histone records to the non-histone dataset for training/testing.
# human_train comes first, so drop_duplicates keeps its copy of any (Protein ID, Site) pair.
# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
a = pd.concat([human_train, histone_data])
#a = histone_data.append(human_train)
newDF=a.drop_duplicates(['Protein ID', 'Site'])
newDF.to_excel('C:/Users/Crow/Desktop/histone_no-histone_train.xlsx')

In [None]:
# Train the model on the combined histone / non-histone training set.
path_train1 = 'C:/Users/Crow/Desktop/no-histone.txt'
path_train2 = 'C:/Users/Crow/Desktop/histone.txt'
#path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Train.txt'
path_test =  'C:/Users/Crow/Desktop/human_data_12.12/Independent.txt'

# NOTE(review): x_train2 / y_train2 are rebound here; earlier cells use these names for other data.
x_train1,y_train1 = pep(path_train1,29-2)
x_train2,y_train2 = pep(path_train2,29-2)
# Concatenate the numpy arrays (non-histone rows first, then histone rows).
x_train = np.concatenate((x_train1, x_train2))
y_train = np.concatenate((y_train1, y_train2))
x_test,y_test = pep(path_test,29-2)

In [None]:
# 10-fold cross-validation of create_cnn_model5 on the current (x_train, y_train); prints the AUC of each fold.
kf = KFold(n_splits = 10,random_state=5,shuffle=True)
# NOTE(review): `j` is set but never read or incremented in this cell.
j = 1 
for train_index, test_index in kf.split(x_train):
    x_train3, x_test3 = x_train[train_index], x_train[test_index]
    y_train3, y_test3 = y_train[train_index], y_train[test_index]
    
    # A fresh model per fold; the held-out fold doubles as validation data.
    model = create_cnn_model5(input_length=29)
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=5)
    callbacks_list = [early_stopping]
    model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 20, batch_size = 256,shuffle=True,
         callbacks=callbacks_list, verbose=1)
    
    test_pred_proba = model.predict(x_test3)
    fpr, tpr, thresholds = roc_curve(y_test3,test_pred_proba,pos_label=1)
    #print("ACC:  %f "  %accuracy_score(y_test3,test_pred))
    print("AUC: %f" % auc(fpr, tpr))
    

In [None]:
# Score the current `model` on the histone file and print ACC / AUC.
# NOTE(review): after a k-fold cell, `model` is whatever model the last fold left behind.
path_test =  'C:/Users/Crow/Desktop/histone_train.txt'
x_test,y_test = pep(path_test,29-2)
test_pred_proba = model.predict(x_test)  # raw model outputs, used as ROC scores
test_pred = model.predict_classes(x_test)  # predicted class labels, used for ACC
fpr, tpr, thresholds = roc_curve(y_test,test_pred_proba,pos_label=1)
print("ACC:  %f "  %accuracy_score(y_test,test_pred))
print("AUC: %f" % auc(fpr, tpr))

In [None]:
# Score the current `model` on the independent test file and print ACC / AUC.
path_test =  'C:/Users/Crow/Desktop/human_data_12.12/Independent.txt'

x_test,y_test = pep(path_test,29-2)
test_pred_proba = model.predict(x_test)  # raw model outputs, used as ROC scores
test_pred = model.predict_classes(x_test)  # predicted class labels, used for ACC
fpr, tpr, thresholds = roc_curve(y_test,test_pred_proba,pos_label=1)
print("ACC:  %f "  %accuracy_score(y_test,test_pred))
print("AUC: %f" % auc(fpr, tpr))


In [None]:
# Score the current `model` on whichever file `path_test` currently points to.
# NOTE(review): `path_test` is not set in this cell, so the result depends on which cell ran last.
x_test,y_test = pep(path_test,29-2)
test_pred_proba = model.predict(x_test)  # raw model outputs, used as ROC scores
test_pred = model.predict_classes(x_test)  # predicted class labels, used for ACC
fpr, tpr, thresholds = roc_curve(y_test,test_pred_proba,pos_label=1)
print("ACC:  %f "  %accuracy_score(y_test,test_pred))
print("AUC: %f" % auc(fpr, tpr))


In [None]:
# Retrain the word-embedding CNN (create_cnn_model5, dropout 0.5) with 10-fold cross-validation.
# Each fold writes its held-out scores to a file; the mean fold AUC is printed at the end.

name = 'Embedding'
gap = ''
# Per-fold validation AUCs. Created here so the summary works on a fresh kernel;
# it was previously read without ever being defined or filled in this cell.
auc_mean = []

path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Train.txt'
path_test =  'C:/Users/Crow/Desktop/human_data_12.12/Independent.txt'

x_train,y_train = pep(path_train,29-2)
x_test,y_test = pep(path_test,29-2)

kf = KFold(n_splits = 10,random_state=5,shuffle=True)
j = 1
for train_index, test_index in kf.split(x_train):
    x_train3, x_test3 = x_train[train_index], x_train[test_index]
    y_train3, y_test3 = y_train[train_index], y_train[test_index]

    # A fresh model per fold; the held-out fold doubles as validation data.
    model = create_cnn_model5(input_length=29,dropout=0.5)

    # Keras fills the {val_loss}/{epoch}/{val_acc} placeholders when it writes a checkpoint.
    filepath='C:/Users/Crow/Desktop/result/re_CNN_5/model/dropout0.5_checkpoint'+ str(j) +'-{val_loss:.2f}-{epoch:02d}e-val_acc_{val_acc:.2f}.hdf5'

    #filepath="C:/Users/Crow/Desktop/result/re_CNN/model/weights.best.hdf5"
    # save_best_only=False with period=10: a checkpoint is written every 10 epochs regardless of val_loss.
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=False,mode='auto', period=10)

    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    callbacks_list = [early_stopping,checkpoint]
    model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 200, batch_size = 256,
              shuffle=True,callbacks=callbacks_list, verbose=1)

    test_pred_proba = model.predict(x_test3)
    fpr, tpr, thresholds = roc_curve(y_test3,test_pred_proba,pos_label=1)
    fold_auc = auc(fpr, tpr)
    print("AUC: %f" % fold_auc)
    auc_mean.append(fold_auc)

    # One "<score>\t<label>" line per held-out sample; `with` closes the file on error too.
    with open('C:/Users/Crow/Desktop/result/re_CNN_5/29_kfold_CNN_'+ name + gap+'_result_'+ str(j) +'.txt','w') as fw:
        for t in range(0,len(test_pred_proba)):
            fw.write(str(test_pred_proba[t][0]))
            fw.write('\t')
            fw.write(str(y_test3[t]))
            fw.write('\n')

    j+=1

# Mean AUC over all folds (np.mean: no bare `mean` name is imported by this notebook's header).
print("总AUC: %f" % np.mean(auc_mean))

In [None]:

# Display the fourth-from-last training sample.
x_train[-4]


In [21]:
# Retrain the word-embedding CNN (create_cnn_model6, dropout 0.5) with 10-fold cross-validation.
# Each fold writes its held-out scores and labels to files; the mean fold AUC is printed at the end.

name = 'Embedding'
gap = ''
auc_mean=[]  # per-fold validation AUCs
path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Train.txt'
path_test =  'C:/Users/Crow/Desktop/human_data_12.12/Independent.txt'

x_train,y_train = pep(path_train,29-2)
x_test,y_test = pep(path_test,29-2)

kf = KFold(n_splits = 10,random_state=5,shuffle=True)
j = 1
for train_index, test_index in kf.split(x_train):
    x_train3, x_test3 = x_train[train_index], x_train[test_index]
    y_train3, y_test3 = y_train[train_index], y_train[test_index]

    # A fresh model per fold; the held-out fold doubles as validation data.
    model = create_cnn_model6(input_length=29,dropout=0.5)

    filepath='C:/Users/Crow/Desktop/new_result/CNN6/model/29_kfold_CNN_'+ name + gap+'_'+ str(j) +'.hdf5'

    #filepath="C:/Users/Crow/Desktop/result/re_CNN/model/weights.best.hdf5"
    # save_best_only=False with period=50: the file is rewritten every 50 epochs regardless of val_acc.
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=False,mode='auto', period=50)

    early_stopping = EarlyStopping(monitor='val_loss', patience=50)
    callbacks_list = [early_stopping,checkpoint]
    model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 2000, batch_size = 256,
              shuffle=True,callbacks=callbacks_list, verbose=1)

    test_pred_proba = model.predict(x_test3)
    fpr, tpr, thresholds = roc_curve(y_test3,test_pred_proba,pos_label=1)
    fold_auc = auc(fpr, tpr)
    print("AUC: %f" % fold_auc)
    auc_mean.append(fold_auc)

    # One "<score>\t<label>" line per held-out sample; `with` closes the file on error too.
    with open('C:/Users/Crow/Desktop/new_result/CNN6/29_kfold_CNN_'+ name + gap+'_result_'+ str(j) +'.txt','w') as fw:
        for t in range(0,len(test_pred_proba)):
            fw.write(str(test_pred_proba[t][0]))
            fw.write('\t')
            fw.write(str(y_test3[t]))
            fw.write('\n')

    # The held-out labels on their own, one per line.
    with open('C:/Users/Crow/Desktop/new_result/CNN6/29_kfold_CNN_'+ name + gap+'_test_'+ str(j) +'.txt','w') as fw:
        for t in range(0,len(y_test3)):
            fw.write(str(y_test3[t]))
            fw.write('\n')

    j+=1

# Summary after the last fold. Uses np.mean (no bare `mean` name is imported by this
# notebook's header) and a single print; the nested print() also emitted a stray "None".
print(auc_mean)
print("CV AUC: %f" % np.mean(auc_mean))

Train on 66413 samples, validate on 7380 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000

Epoch 00050: saving model to C:/Users/Crow/Desktop/new_result/CNN6/model/29_kfold_CNN_Embedding_1.hdf5
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000


Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
AUC: 0.850693
Train on 66413 samples, validate on 7380 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000

Epoch 00050: saving model to C:/Users/Crow/Desktop/new_result/CNN6/model/29_kfold_CNN_Embedding_2.hdf5
Epoch 51/2000
Epoch 52/2000


Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
AUC: 0.834875
Train on 66413 samples, validate on 7380 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000


Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000

Epoch 00050: saving model to C:/Users/Crow/Desktop/new_result/CNN6/model/29_kfold_CNN_Embedding_3.hdf5
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
AUC: 0.846401
Train on 66414 samples, validate on 7379 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000


Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000

Epoch 00050: saving model to C:/Users/Crow/Desktop/new_result/CNN6/model/29_kfold_CNN_Embedding_4.hdf5
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
AUC: 0.846892
Train on 66414 samples, validate on 7379 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000


Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000

Epoch 00050: saving model to C:/Users/Crow/Desktop/new_result/CNN6/model/29_kfold_CNN_Embedding_5.hdf5
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
AUC: 0.855829
Train on 66414 samples, validate on 7379 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000


Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000

Epoch 00050: saving model to C:/Users/Crow/Desktop/new_result/CNN6/model/29_kfold_CNN_Embedding_6.hdf5
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
AUC: 0.835227
Train on 66414 samples, validate on 7379 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000


Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000

Epoch 00050: saving model to C:/Users/Crow/Desktop/new_result/CNN6/model/29_kfold_CNN_Embedding_7.hdf5
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
AUC: 0.851594
Train on 66414 samples, validate on 7379 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000


Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000

Epoch 00050: saving model to C:/Users/Crow/Desktop/new_result/CNN6/model/29_kfold_CNN_Embedding_8.hdf5
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000


Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
Epoch 73/2000
Epoch 74/2000
Epoch 75/2000
Epoch 76/2000
AUC: 0.851434
Train on 66414 samples, validate on 7379 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000


Epoch 49/2000
Epoch 50/2000

Epoch 00050: saving model to C:/Users/Crow/Desktop/new_result/CNN6/model/29_kfold_CNN_Embedding_9.hdf5
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
AUC: 0.855499
Train on 66414 samples, validate on 7379 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000


Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000

Epoch 00050: saving model to C:/Users/Crow/Desktop/new_result/CNN6/model/29_kfold_CNN_Embedding_10.hdf5
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
AUC: 0.838664
[0.8506930812870355, 0.8348745285797352, 0.846400560535623, 0.8468916244763691, 0.8558294355313357, 0.8352272596641743, 0.8515938626339442, 0.8514344448206197, 0.8554987682627918, 0.8386644514105983]
CV AUC: 0.846711
None


In [None]:
# CNN over the five feature encodings: 10-fold cross-validation on one encoding.
# (Original note: retrain the CNN word embedding.)

name = 'BINARY'
gap = ''
auc_mean = []  # one AUC value per fold

path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_CV/'+ name +'/Train_29_'+ name + gap +'.txt'
path_test = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_IND/'+ name +'/Test_29_'+ name + gap +'.txt'

# read_svm returns an (encodings, labels) pair of numpy arrays.
train = read_svm(path_train)
test = read_svm(path_test)

x_train = train[0]
y_train = train[1]

# The independent test set is loaded here but not used by the loop below.
x_test = test[0]
y_test = test[1]

# Append a trailing axis of size 1 to the feature matrices.
x_train = np.expand_dims(x_train, axis=2)
x_test = np.expand_dims(x_test, axis=2)

shape = x_train.shape[1:]

kf = KFold(n_splits = 10,random_state=5,shuffle=True)
j = 1  # 1-based fold counter used in the output file names
for train_index, test_index in kf.split(x_train):
    x_train3, x_test3 = x_train[train_index], x_train[test_index]
    y_train3, y_test3 = y_train[train_index], y_train[test_index]

    # create_cnn_model3 is defined elsewhere in the notebook.
    model = create_cnn_model3(shape=shape,dropout=0.6)

    filepath='C:/Users/Crow/Desktop/new_result/CNN/model/29_kfold_CNN_'+ name + gap+'_'+ str(j) +'.hdf5'
    # save_best_only=False with period=50: a snapshot is written every 50 epochs.
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=False,mode='auto', period=50)

    # NOTE(review): early stopping monitors the same held-out fold that the AUC below
    # is computed on, so the reported fold AUC may be optimistic.
    early_stopping = EarlyStopping(monitor='val_loss', patience=50)
    callbacks_list = [early_stopping,checkpoint]
    model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 1000, batch_size = 256,
              shuffle=True,callbacks=callbacks_list, verbose=1)

    test_pred_proba = model.predict(x_test3)
    fpr, tpr, thresholds = roc_curve(y_test3,test_pred_proba,pos_label=1)
    fold_auc = auc(fpr, tpr)
    print("AUC: %f" % fold_auc)
    auc_mean.append(fold_auc)

    # Per-fold predictions: "<probability>\t<true label>" per line.
    with open('C:/Users/Crow/Desktop/new_result/CNN/29_kfold_CNN_'+ name + gap+'_result_'+ str(j) +'.txt','w') as fw:
        for t in range(0,len(test_pred_proba)):
            fw.write(str(test_pred_proba[t][0]))
            fw.write('\t')
            fw.write(str(y_test3[t]))
            fw.write('\n')

    # Per-fold true labels, one per line.
    with open('C:/Users/Crow/Desktop/new_result/CNN/29_kfold_CNN_'+ name + gap+'_test_'+ str(j) +'.txt','w') as fw:
        for t in range(0,len(y_test3)):
            fw.write(str(y_test3[t]))
            fw.write('\n')

    j+=1

# Summary after the last fold. The original wrapped this in print(print(...)),
# which also emitted a stray "None" line.
print(auc_mean)
print("CV AUC: %f" % np.mean(auc_mean))

In [None]:
# CNN over the five feature encodings: 10-fold cross-validation on one encoding.
# (Original note: retrain the CNN word embedding.)

name = 'EAAC'
gap = '_gap5'
auc_mean = []  # one AUC value per fold

path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_CV/'+ name +'/Train_29_'+ name + gap +'.txt'
path_test = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_IND/'+ name +'/Test_29_'+ name + gap +'.txt'

# read_svm returns an (encodings, labels) pair of numpy arrays.
train = read_svm(path_train)
test = read_svm(path_test)

x_train = train[0]
y_train = train[1]

# The independent test set is loaded here but not used by the loop below.
x_test = test[0]
y_test = test[1]

# Append a trailing axis of size 1 to the feature matrices.
x_train = np.expand_dims(x_train, axis=2)
x_test = np.expand_dims(x_test, axis=2)

shape = x_train.shape[1:]

kf = KFold(n_splits = 10,random_state=5,shuffle=True)
j = 1  # 1-based fold counter used in the output file names
for train_index, test_index in kf.split(x_train):
    x_train3, x_test3 = x_train[train_index], x_train[test_index]
    y_train3, y_test3 = y_train[train_index], y_train[test_index]

    # create_cnn_model3 is defined elsewhere in the notebook.
    model = create_cnn_model3(shape=shape,dropout=0.6)

    filepath='C:/Users/Crow/Desktop/new_result/CNN/model/29_kfold_CNN_'+ name + gap+'_'+ str(j) +'.hdf5'
    # save_best_only=False with period=50: a snapshot is written every 50 epochs.
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=False,mode='auto', period=50)

    # NOTE(review): early stopping monitors the same held-out fold that the AUC below
    # is computed on, so the reported fold AUC may be optimistic.
    early_stopping = EarlyStopping(monitor='val_loss', patience=50)
    callbacks_list = [early_stopping,checkpoint]
    model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 1000, batch_size = 256,
              shuffle=True,callbacks=callbacks_list, verbose=1)

    test_pred_proba = model.predict(x_test3)
    fpr, tpr, thresholds = roc_curve(y_test3,test_pred_proba,pos_label=1)
    fold_auc = auc(fpr, tpr)
    print("AUC: %f" % fold_auc)
    auc_mean.append(fold_auc)

    # Per-fold predictions: "<probability>\t<true label>" per line.
    with open('C:/Users/Crow/Desktop/new_result/CNN/29_kfold_CNN_'+ name + gap+'_result_'+ str(j) +'.txt','w') as fw:
        for t in range(0,len(test_pred_proba)):
            fw.write(str(test_pred_proba[t][0]))
            fw.write('\t')
            fw.write(str(y_test3[t]))
            fw.write('\n')

    # Per-fold true labels, one per line.
    with open('C:/Users/Crow/Desktop/new_result/CNN/29_kfold_CNN_'+ name + gap+'_test_'+ str(j) +'.txt','w') as fw:
        for t in range(0,len(y_test3)):
            fw.write(str(y_test3[t]))
            fw.write('\n')

    j+=1

# Summary after the last fold. The original wrapped this in print(print(...)),
# which also emitted a stray "None" line.
print(auc_mean)
print("CV AUC: %f" % np.mean(auc_mean))

In [None]:
# Display the per-fold AUC list left in `auc_mean` by whichever cross-validation cell ran last.
auc_mean

In [None]:
# Load the independent test file as index-encoded sequences and labels.
# NOTE(review): `pep` is defined in a later cell of this notebook, so that cell must be run first.
path_test =  'C:/Users/Crow/Desktop/human_data_12.12/Independent.txt'
x_test,y_test = pep(path_test,29-2)

In [None]:
# Load a previously saved checkpoint and score it on whatever x_test / y_test
# currently hold in the notebook namespace (set by an earlier cell).
model = load_model('C:/Users/Crow/Desktop/result/re_CNN/model/5checkpoint-0.23-50e-val_acc_0.90.hdf5')

# Predicted scores (used for the ROC curve) and hard class predictions (used for accuracy).
test_pred_proba = model.predict(x_test)
test_pred = model.predict_classes(x_test)
fpr, tpr, thresholds = roc_curve(y_test,test_pred_proba,pos_label=1)
print("ACC:  %f "  %accuracy_score(y_test,test_pred))
print("AUC: %f" % auc(fpr, tpr))

In [None]:
# Read the AUC / AUC01 values back from the saved result files.
# For every window size in `q`, the first two lines of the result file are
# expected to look like "<label>:<value>"; the part after the first ':' is printed.
name = 'AAC'
gap = ''
# Optional tag inserted into the file name. The original stored this in `model`,
# which overwrote the Keras model object held under that notebook-global name.
model_tag = ''
q = [21,23,25,27,29,31,35,37]
#q = [29]
for t in q :
    path_test='C:/Users/Crow/Desktop/human_data_12.12/result/'+ name +'/Test'+ model_tag +'_' +str(t)+'_'+ name + gap +'_ROC01_result.txt'
    print('test: %i'%t)
    # The context manager closes the file even if a line is missing or malformed.
    with open(path_test,'r') as fr:
        for i in range(2):
            print(fr.readline().split(':')[1])


for t in q :
    path_train='C:/Users/Crow/Desktop/human_data_12.12/result/'+ name +'/Train'+ model_tag +'_'+str(t)+'_'+ name +  gap +'_ROC01_result.txt'
    print('train:  %i'%t)
    print('')
    with open(path_train,'r') as fr:
        for i in range(2):
            print(fr.readline().split(':')[1])
    

In [None]:
# Train a model built by create_cnn_model5 (defined elsewhere in the notebook) on the
# index-encoded training file, then evaluate it on the independent test file.
# NOTE(review): `pep` is defined in a later cell of this notebook; run that cell first.
name = 'Embedding'


path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Train.txt'
path_test =  'C:/Users/Crow/Desktop/human_data_12.12/Independent.txt'

x_train,y_train = pep(path_train,27)
x_test,y_test = pep(path_test,27)

model = create_cnn_model5(input_length=29,dropout=0.6)
model.summary()
# Early stopping is disabled below, so fit runs for the full `epochs` count.
#early_stopping = EarlyStopping(monitor='val_loss', patience=50)
#callbacks_list = [early_stopping]
#model.fit(x_train, y_train, epochs = 20, batch_size = 256,callbacks=callbacks_list, verbose=1)
# 20% of the training data is held out for validation (validation_split = 0.2).
hit = model.fit(x_train, y_train, epochs = 10000,batch_size = 256,validation_split = 0.2, verbose=1, shuffle=True)


print(model.evaluate(x_test, y_test, batch_size=256))
# `pre` holds predicted scores (for the ROC curve); `pre2` holds hard class predictions.
pre = model.predict(x_test)
pre2 = model.predict_classes(x_test)
fpr, tpr, thresholds = roc_curve(y_test, pre,pos_label=1)
print(sklearn.metrics.auc(fpr, tpr))
# `performance` is defined elsewhere; only the 3rd and 4th of its nine return values are used here.
_,_,SN,SP,_,_,_,_,_ = performance(y_test,pre2)
print("ACC:  %f "  %accuracy_score(y_test,pre2))
print("Sn: %f" %SN) 
print("Sp: %f" %SP)
print("MCC: %f " %matthews_corrcoef(y_test,pre2))
print("AUC: %f" % auc(fpr, tpr))

In [None]:
# Split each record and index-encode its sequence.
AA = '_GAVLIFWYDNEKQMSTCPHR'
def pep(path, seq_len):
    """Read a sequence file and index-encode each sequence against ``AA``.

    Each non-blank line holds whitespace-separated fields: the first is the
    sequence and the last is an integer label. Every residue is upper-cased and
    mapped to its position in ``AA``; residues not in ``AA`` map to 0.

    The number of residues trimmed from each end is
    ``(len(first_sequence) - 1 - seq_len) // 2``, computed from the first
    non-blank record and applied to all records. A trim of zero or less keeps
    the full sequence.

    Parameters
    ----------
    path : str
        Path of the text file to read.
    seq_len : int
        Length parameter used in the trim formula above.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Encoded sequences and integer labels. Both arrays are empty when the
        file contains no non-blank lines.
    """
    # Close the file deterministically and drop blank lines before anything is
    # indexed, so a leading blank line or an empty file no longer crashes.
    with open(path) as handle:
        records = [line for line in handle if line.strip() != '']
    if not records:
        return np.array([]), np.array([])

    cut = (len(records[0].split()[0]) - 1 - seq_len) // 2
    index_of = {res: idx for idx, res in enumerate(AA)}

    X = []
    y = []
    for record in records:
        fields = record.split()
        seq = fields[0]
        # A non-positive cut means there is nothing to trim; slicing with a
        # negative cut would silently produce a wrong window.
        window = seq[cut:-cut] if cut > 0 else seq
        X.append([index_of.get(res.upper(), 0) for res in window])
        y.append(int(fields[-1]))
    return np.array(X), np.array(y)


In [None]:
# Show the index that the '_' symbol has in the AA alphabet string.
AA.index('_')

In [None]:
def trans(str1):
    """Map each character of ``str1`` to its integer code.

    The twenty upper-case amino-acid letters map to 1-20 and '-' maps to 21.
    Any other character yields ``None`` at that position.

    Returns a list with one entry per character of ``str1``.
    """
    codes = {
        'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7,
        'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14,
        'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20, '-': 21,
    }
    return [codes.get(symbol) for symbol in str1]




In [None]:
# Quick check of trans on a short string that includes the '-' symbol.
trans('ACKL--')

In [None]:
def createTrainData(str1):
    """Read a comma-separated file and return encoded sequences with labels.

    Each non-blank line must contain exactly three comma-separated fields:
    protein id, sequence, label. The sequence is encoded with ``trans`` and the
    label is converted to ``int``. The protein id is not returned.

    Parameters
    ----------
    str1 : str
        Path of the file to read.

    Returns
    -------
    (list, list)
        Encoded sequences (one list of codes per record) and integer labels.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly three fields, or its label is
        not an integer.
    """
    sequence_num = []
    label_num = []
    # The context manager closes the file; the original left the handle open.
    with open(str1) as handle:
        for line in handle:
            # Skip blank lines, which would otherwise fail the 3-way unpacking.
            if not line.strip():
                continue
            _protein_id, sequence, label = line.split(",")
            sequence_num.append(trans(sequence.strip(' \t\r\n')))
            label_num.append(int(label.strip(' \t\r\n')))

    return sequence_num,label_num

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn import cluster
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

from sklearn.externals import joblib

from sklearn.ensemble import RandomForestClassifier


In [None]:
# Effect of training-set size on performance: 10-fold cross-validation (AUC, AUC01).
# Random forest on the EGAAC _gap4 encoding.
# (Original note: kept here temporarily.)


name = 'EGAAC'
gap = '_gap4'

# Read the data.
path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_CV/'+ name +'/Train_29_'+ name + gap +'.txt'
path_test = 'C:/Users/Crow/Desktop/human_data_12.12/Step_11_IND/'+ name +'/Test_29_'+ name + gap +'.txt'

# read_svm returns an (encodings, labels) pair of numpy arrays.
train = read_svm(path_train)
test = read_svm(path_test)


x_train_ori = train[0]
y_train_ori = train[1]
# Keep a stratified `size` fraction of the training data; the rest is discarded.
size = 0.0625
size2 = 1-size
x_train, _, y_train, _ =train_test_split (x_train_ori,y_train_ori,test_size = size2,random_state=5, shuffle=True, stratify=y_train_ori)

# The independent test set is loaded here but not used by the loop below.
x_test = test[0]
y_test = test[1]


kf = KFold(n_splits = 10,random_state=5,shuffle=True)
j = 1  # 1-based fold counter used in the output file names
auc_mean = []  # one AUC value per fold
for train_index, test_index in kf.split(x_train):
    x_train3, x_test3 = x_train[train_index], x_train[test_index]
    y_train3, y_test3 = y_train[train_index], y_train[test_index]

    clf = RandomForestClassifier(n_estimators=1600,oob_score=True,n_jobs=3,
                                 random_state=50,max_depth=11,min_samples_split=30,max_features=19,min_samples_leaf=20)
    clf.fit(x_train3, y_train3)
    test_pred = clf.predict(x_test3)
    # Column 1 of predict_proba is used as the positive-class score.
    test_pred_proba = clf.predict_proba(x_test3)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test3,test_pred_proba,pos_label=1)
    fold_auc = auc(fpr, tpr)
    print("AUC: %f" % fold_auc)

    auc_mean.append(fold_auc)
    joblib.dump(clf, 'C:/Users/Crow/Desktop/new_result/datasize/model/'+ str(j) +'_'+str(size)+'_clf_kfold_'+ name + gap +'.pkl')

    # Per-fold predictions: "<probability>\t<true label>" per line.
    with open('C:/Users/Crow/Desktop/new_result/datasize/29_kfold_'+str(size)+'_'+ name + gap+'_result_'+ str(j) +'.txt','w') as fw:
        for t in range(0,len(test_pred_proba)):
            fw.write(str(test_pred_proba[t]))
            fw.write('\t')
            fw.write(str(y_test3[t]))
            fw.write('\n')

    j+=1

# Summary after the last fold. The original wrapped this in print(print(...)),
# which also emitted a stray "None" line.
print(auc_mean)
print("CV AUC: %f" % np.mean(auc_mean))
        

In [None]:
# Number of rows in the full encoded training set returned by read_svm.
len(train[0])

In [None]:
# Number of rows in the current x_train (the sub-sampled set, if a sub-sampling cell ran last).
len(x_train)

In [None]:
# Effect of training-set size on performance: 10-fold cross-validation (AUC, AUC01).
# Deep-learning model on index-encoded sequences ("Embedding").
# (Original note: kept here temporarily.)
name = 'Embedding'
gap = ''
auc_mean=[]  # one AUC value per fold

path_train = 'C:/Users/Crow/Desktop/human_data_12.12/Train.txt'
path_test =  'C:/Users/Crow/Desktop/human_data_12.12/Independent.txt'

# NOTE(review): `pep` is defined in a different cell of this notebook; run that cell first.
x_train_ori,y_train_ori = pep(path_train,29-2)
# Keep a stratified `size` fraction of the training data; the rest is discarded.
size = 0.0625
size2 = 1-size
x_train, _, y_train, _ =train_test_split (x_train_ori,y_train_ori,test_size = size2,random_state=5, shuffle=True, stratify=y_train_ori)

shape = x_train.shape[1:]

# The independent test set is loaded here but not used by the loop below.
x_test,y_test = pep(path_test,29-2)


kf = KFold(n_splits = 10,random_state=5,shuffle=True)
j = 1  # 1-based fold counter used in the output file names
for train_index, test_index in kf.split(x_train):
    x_train3, x_test3 = x_train[train_index], x_train[test_index]
    y_train3, y_test3 = y_train[train_index], y_train[test_index]

    # create_cnn_model6 is defined elsewhere in the notebook.
    model = create_cnn_model6(input_length=29,dropout=0.5)

    # This assignment was commented out in the original, so ModelCheckpoint picked up
    # whatever `filepath` an earlier cell had left behind (or failed on a fresh kernel).
    filepath='C:/Users/Crow/Desktop/new_result/datasize/CNN/model/29_kfold_CNN_'+'_'+str(size)+'_'+ name + gap+'_'+ str(j) +'.hdf5'
    # save_best_only=False with period=50: a snapshot is written every 50 epochs.
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=False,mode='auto', period=50)

    # NOTE(review): early stopping monitors the same held-out fold that the AUC below
    # is computed on, so the reported fold AUC may be optimistic.
    early_stopping = EarlyStopping(monitor='val_loss', patience=50)
    callbacks_list = [early_stopping,checkpoint]
    model.fit(x_train3, y_train3, validation_data = (x_test3, y_test3), epochs = 2000, batch_size = 256,
              shuffle=True,callbacks=callbacks_list, verbose=1)

    test_pred_proba = model.predict(x_test3)
    fpr, tpr, thresholds = roc_curve(y_test3,test_pred_proba,pos_label=1)
    fold_auc = auc(fpr, tpr)
    print("AUC: %f" % fold_auc)
    auc_mean.append(fold_auc)

    # Per-fold predictions: "<probability>\t<true label>" per line.
    with open('C:/Users/Crow/Desktop/new_result/datasize/CNN/29_kfold_CNN_'+'_'+str(size)+'_'+ name + gap+'_result_'+ str(j) +'.txt','w') as fw:
        for t in range(0,len(test_pred_proba)):
            fw.write(str(test_pred_proba[t][0]))
            fw.write('\t')
            fw.write(str(y_test3[t]))
            fw.write('\n')

    # Per-fold true labels, one per line.
    with open('C:/Users/Crow/Desktop/new_result/datasize/CNN/29_kfold_CNN_'+'_'+str(size)+'_'+ name + gap+'_test_'+ str(j) +'.txt','w') as fw:
        for t in range(0,len(y_test3)):
            fw.write(str(y_test3[t]))
            fw.write('\n')

    j+=1

# Summary after the last fold. The original wrapped this in print(print(...)),
# which also emitted a stray "None" line.
print(auc_mean)
print("CV AUC: %f" % np.mean(auc_mean))

In [None]:
# Re-draw a stratified sub-sample of the training data with size = 0.125 and show how many rows it has.
# Relies on `path_train` and `pep` already being defined by earlier-run cells.
x_train_ori,y_train_ori = pep(path_train,29-2)
size = 0.125
size2 = 1-size
# Passing test_size = 1 - size keeps a `size` fraction as x_train / y_train; the remainder is discarded.
x_train, _, y_train, _ =train_test_split (x_train_ori,y_train_ori,test_size = size2,random_state=5, shuffle=True, stratify=y_train_ori)


len(x_train)

In [None]:
# Random forest, EGAAC encoding ("hap4" in the original note is presumably a typo for gap4).
# Hand-recorded metric lists, ten values each.
# NOTE(review): the suffixes _1_16 / _1_8 / _1_4 / _1_2 presumably denote the training-set
# fraction, and each list presumably holds one value per CV fold - confirm against the
# result files these numbers were copied from.
AUC_1_16 = [0.7337822979659153, 0.7124106652006597, 0.688857006217121, 0.7232804232804232, 0.7363413819962168, 0.7151188140396774, 0.7185363247863248, 0.6952214452214452, 0.6517357222844344, 0.6957026713124274]
AUC01_1_16 = [0.014431,0.0141561,0.0133907,0.0202381,0.00806721,0.0218552,0.0142094,0.00728438,0.012007,0.0149245]

AUC_1_8 = [0.7756614074029635, 0.7099938418228204, 0.7381056069678825, 0.7371937942297224, 0.7329593483439637, 0.7822943949711891, 0.7330908551068883, 0.7419570267131242, 0.7281856333115326, 0.801059878745174]
AUC01_1_8 = [0.0225718,0.0123897,0.0193658,0.0187534,0.0170291,0.0222106,0.0201306,0.0204704,0.0183807,0.0258511]

AUC_1_4 = [0.7935586221554249, 0.7521843738128403, 0.7691161930687349, 0.755496899348068, 0.7406853296857027, 0.7740323881438553, 0.7755122655122655, 0.7821445289847478, 0.7853042479908152, 0.7712202429295937]
AUC01_1_4 = [0.0234587,0.0213649,0.0221818,0.0202894,0.0176967,0.0222927,0.0210786,0.0197692,0.0186064,0.0229718]

AUC_1_2 = [0.7668826151560179, 0.8126446838063074, 0.7966258773354422, 0.7765518076373588, 0.7807456852211682, 0.7566508126399111, 0.8010958963919467, 0.785541560304506, 0.7886318932583471, 0.7762957504298698]
AUC01_1_2 = [0.0185594,0.0237894,0.0221814,0.0190358,0.0225772,0.0198215,0.0225137,0.0226929,0.0248152,0.0228411]

In [None]:
# CNN on the "Embedding" input. Hand-recorded metric lists, ten values each.
# NOTE(review): these assignments reuse the same variable names as the random-forest lists
# in the preceding cell, so running this cell overwrites those values.
# The suffixes _1_16 / _1_8 / _1_4 / _1_2 presumably denote the training-set fraction - confirm.
AUC_1_16 = [0.7035459043430456, 0.7205882352941176, 0.7131516021042564, 0.7584656084656084, 0.6444308445532435, 0.618759537824286, 0.7250534188034188, 0.6939831002331003, 0.6928580316038323, 0.7059233449477352]
AUC01_1_16 = [0.0186229,0.0164239,0.0241989,0.0185185,0.00723267,0.0134075,0.0213675,0.0178467,0.0204678,0.0170151]

AUC_1_8 = [0.8114463975043494, 0.7565467288349316, 0.6903375068045727, 0.7692161132280892, 0.7759778682855606, 0.7646804609743321, 0.7917013064133016, 0.7632984901277585, 0.8010546108567691, 0.7904464115324202]
AUC01_1_8 = [0.0256614,0.0175655,0.0100572,0.023571,0.0208714,0.0219094,0.0272565,0.0216899,0.0273872,0.0240601]

AUC_1_4 = [0.819647492826889, 0.7984915157654806, 0.8049506285391441, 0.8291302273811416, 0.7943505766779618, 0.813339947042744, 0.8088023088023087, 0.8210400553354712, 0.8091883776384351, 0.8198927933293627]
AUC01_1_4 = [0.0208817,0.0225837,0.0188851,0.026179,0.0246565,0.0303862,0.0248449,0.0248101,0.0280456,0.0219467]

AUC_1_2 = [0.8178964453080353, 0.8528667819598018, 0.8418421925971953, 0.8442124125239122, 0.8417263687278547, 0.8308962498756591, 0.842994285839032, 0.8578390004490363, 0.8314427023418971, 0.839707265011562]
AUC01_1_2 =[0.0246401,0.0312474,0.0297923,0.0278558,0.0274966,0.0246649,0.0278959,0.0330277,0.0263588,0.0278793]

In [None]:
# Average of whichever AUC_1_16 list was assigned last.
# NOTE(review): `mean` is not imported in any cell visible here - confirm where it comes from.
mean(AUC_1_16)

In [None]:
# Hand-recorded summary metrics for the random forest (RF) and CNN models, four values per list.
# NOTE(review): presumably one value per training-set size; neither the ordering nor the
# source of these numbers is shown in this notebook section - confirm before plotting.
RF_datasize_AUC = [0.7480501787615261,0.7699255091632049,0.7841666582180875,0.791109499999999]
RF_datasize_AUC01 = [0.019715319999999998,0.02097102,0.021882759999999998,0.02216211]

CNN_datasize_AUC = [0.7714705894562085,0.811883392333894,0.8401423704633986,0.863449099999999]
CNN_datasize_AUC01 = [0.022002960000000002,0.024321950000000002,0.028085879999999997,0.03232128]

In [None]:

# Collect the values stored on the first two lines of each per-fold ROC01 result file.
# Each of those lines is expected to look like "<label>:<value>"; the first line's value
# is appended to `a`, the second line's value to `b`.
name='EGAAC'
gap='_gap4'
size = 0.0625
a = []
b = []
for i in range(1,11):
    path = 'C:/Users/Crow/Desktop/new_result/datasize/result/29_kfold_'+str(size)+'_'+name+gap+'_result_ROC01_result_'+str(i)+'.txt'
    # Read the file once and close it deterministically; the original opened it twice.
    with open(path,'r') as fr:
        lines = fr.readlines()
    # Slicing (rather than indexing) keeps the original tolerance for files
    # that have fewer than two lines.
    for line in lines[0:1]:
        value = line.split(':')[1].split('\n')[0]
        print(value)
        a.append(float(value))
        print(',')
    for line in lines[1:2]:
        value = line.split(':')[1].split('\n')[0]
        print(value)
        b.append(float(value))
        print(',')

In [None]:
# Display the second-line values collected into `b` by the result-reading cell.
b

In [None]:
# Effect of training-set size on independent-test-set performance: 10-fold cross-validation, AUC and AUC01
# CNN Embedding


In [None]:
# NOTE(review): sys.getsizeof requires an object argument; called with none, as written here,
# it raises TypeError. The intended argument is not recoverable from this cell.
sys.getsizeof()

In [None]:
def CNN(x, filter_nr=64, filter_size=3, conv_kern_reg=None, conv_bias_reg=None):
    """Apply two Conv1D -> BatchNormalization -> PReLU stages to ``x``.

    The original body read ``filter_nr``, ``filter_size``, ``conv_kern_reg`` and
    ``conv_bias_reg`` as free variables, but in this notebook they are only
    assigned as locals inside ``DPCNN``, so they were not visible here. It also
    passed the bare names ``same`` and ``linear`` where Keras expects the
    strings ``'same'`` and ``'linear'``. Both problems are fixed; the new
    keyword defaults mirror the constants that ``DPCNN`` assigns, so existing
    ``CNN(x)`` calls keep working.

    Parameters
    ----------
    x : tensor
        Input tensor fed to the first convolution.
    filter_nr : int, optional
        Number of convolution filters (default 64).
    filter_size : int, optional
        Convolution kernel size (default 3).
    conv_kern_reg, conv_bias_reg : regularizer or None, optional
        Kernel / bias regularizers shared by both convolutions. ``None`` selects
        ``regularizers.l2(0.00001)``.

    Returns
    -------
    tensor
        Output of the second PReLU activation.
    """
    # Local import so the block does not depend on `regularizers` having been
    # pulled in by one of the notebook's star imports.
    from keras import regularizers

    if conv_kern_reg is None:
        conv_kern_reg = regularizers.l2(0.00001)
    if conv_bias_reg is None:
        conv_bias_reg = regularizers.l2(0.00001)

    block = x
    for _ in range(2):
        # 'linear' activation here; the non-linearity is the PReLU applied after batch norm.
        block = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear',
                       kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block)
        block = BatchNormalization()(block)
        block = PReLU()(block)
    return block

def DPCNN():
    filter_nr = 64 #滤波器通道个数
    filter_size = 3 #卷积核
    max_pool_size = 3 #池化层的pooling_size
    max_pool_strides = 2 #池化层的步长
    dense_nr = 256 #全连接层
    spatial_dropout = 0.2
    dense_dropout = 0.5
    train_embed = False
    conv_kern_reg = regularizers.l2(0.00001)
    conv_bias_reg = regularizers.l2(0.00001)

    comment = Input(shape=(maxlen,))
    emb_comment = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=train_embed)(comment)
    emb_comment = SpatialDropout1D(spatial_dropout)(emb_comment)

    #region embedding层
    resize_emb = Conv1D(filter_nr, kernel_size=1, padding=same, activation=linear, 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(emb_comment)
    resize_emb = PReLU()(resize_emb)
    #第一层
    block1 = CNN(emb_comment)
    block1_output = add([block1, resize_emb])
    block1_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block1_output)
    #第二层
    block2 = CNN(block1_output)
    block2_output = add([block2, block1_output])
    block2_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block2_output)
    #第三层
    block3 = CNN(block2_output)
    block3_output = add([block3, block2_output])
    block3_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block3_output)  
    #第四层
    block4 = CNN(block3_output) 
    block4_output = add([block4, block3_output])
    block4_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block4_output)
    #第五层
    block5 = CNN(block4_output) 
    block5_output = add([block5, block4_output])
    block5_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block5_output)
    #第六层
    block6 = CNN(block5_output) 
    block6_output = add([block6, block5_output])
    block6_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block6_output)
    #第七层
    block7 = CNN(block6_output) 
    block7_output = add([block7, block6_output])
    output = GlobalMaxPooling1D()(block7_output)
    #全连接层
    output = Dense(dense_nr, activation=linear)(output)
    output = BatchNormalization()(output)
    output = PReLU()(output)
    output = Dropout(dense_dropout)(output)
    output = Dense(6, activation=sigmoid)(output)

    model = Model(comment, output)
    model.summary()
    model.compile(loss=binary_crossentropy, 
                optimizer=optimizers.Adam(),
                metrics=[accuracy])
    return model