In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
df_aaindex = pd.read_csv('../data/aaindex/df_aaindex19.csv')
print(df_aaindex.shape)
df_aaindex.head(1)
tmp = df_aaindex.drop('Unnamed: 0',axis=1).T
aa2val = dict()
for aa, val in zip(tmp.index, tmp.values):
    aa2val[aa]=val

(19, 21)


In [3]:
df_detect_peptide_train = pd.read_csv('../data/df_detect_peptide_train.csv')
test = pd.read_csv('../data/df_detect_peptide_test.csv')
train, val = train_test_split(df_detect_peptide_train, test_size=0.2, random_state=7)

In [4]:
train.head(1)

Unnamed: 0,peptide,En,Ec,E1,E2,protein,PEP,ID
595411,K.QELNEPPKQSTSFLVLQEILESEEKGDPNK.P,VYKMLQEKQELNEPP,EEKGDPNKPSGFRSV,QELNEPPKQSTSFLV,EILESEEKGDPNKPS,sp|O00151|PDLI1_HUMAN,QELNEPPKQSTSFLVLQEILESEEKGDPNK,0


# Train

In [6]:
from tensorflow.python.client import device_lib

import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*4)])
    except RuntimeError as e:
        print(e)

In [7]:
def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_'+metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_'+metric])

# high param

In [18]:
# define model

# Embedding from ESM
pep_embed = tf.keras.layers.Input(shape=((30, 1280,)), name='pep_embed')
meta = tf.keras.layers.Input(shape=((39,)))  # peptide info
n_embed = tf.keras.layers.Input(shape=((15, 1280,)), name='n_embed')
c_embed = tf.keras.layers.Input(shape=((15, 1280,)), name='c_embed')
m1_embed = tf.keras.layers.Input(shape=((15, 1280,)), name='m1_embed')
m2_embed = tf.keras.layers.Input(shape=((15, 1280,)), name='m2_embed')

# LSTM
net_meta = tf.keras.layers.Dense(32, activation='relu', name='meta')(meta)
net_meta = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(net_meta)

ts_lstm1 = tf.keras.layers.LSTM(64)
# ts_lstm2 = tf.keras.layers.LSTM(64)
# ts_lstm3 = tf.keras.layers.LSTM(64)
pep_lstm1 = tf.keras.layers.LSTM(128)
# pep_lstm2 = tf.keras.layers.LSTM(128)
# pep_lstm3 = tf.keras.layers.LSTM(128, name='pep_lstm3')

# tf.keras.layers.Bidirectional()
n_lstm = ts_lstm1(n_embed)
n_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(n_lstm)
# n_lstm = ts_lstm2(n_lstm)
# n_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(n_lstm)
# n_lstm = ts_lstm3(n_lstm)
# n_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(n_lstm)

c_lstm = ts_lstm1(c_embed)
c_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(c_lstm)
# c_lstm = ts_lstm2(c_lstm)
# c_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(c_lstm)
# c_lstm = ts_lstm3(c_lstm)
# c_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(c_lstm)

m1_lstm = ts_lstm1(m1_embed)
m1_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(m1_lstm)
# m1_lstm = ts_lstm2(m1_lstm)
# m1_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(m1_lstm)
# m1_lstm = ts_lstm3(m1_lstm)
# m1_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(m1_lstm)

m2_lstm = ts_lstm1(m2_embed)
m2_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(m2_lstm)
# m2_lstm = ts_lstm2(m2_lstm)
# m2_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(m2_lstm)
# m2_lstm = ts_lstm3(m2_lstm)
# m2_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(m2_lstm)

pep_lstm = pep_lstm1(pep_embed)
pep_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(pep_lstm)
# pep_lstm = pep_lstm2(pep_lstm)
# pep_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(pep_lstm)
# pep_lstm = pep_lstm3(pep_lstm)
# pep_lstm = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(pep_lstm)


# Dense
merge = tf.keras.layers.concatenate([pep_lstm, 
#                                      net_meta,
                                     n_lstm,
                                     c_lstm,
                                     m1_lstm,
                                     m2_lstm
                                    ], name='merge')

net_merge = tf.keras.layers.Dense(128, activation='relu', name='fc1')(merge)
net_merge = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(net_merge)
# net_merge = tf.keras.layers.Dense(128, activation='relu', name='fc2')(net_merge)
# net_merge = tf.keras.layers.Dropout(np.random.uniform(0, 0.2))(net_merge)

output = tf.keras.layers.Dense(1, activation = 'sigmoid', name='out')(net_merge)

model_high = tf.keras.Model(inputs=[pep_embed, n_embed, c_embed, m1_embed, m2_embed],  # meta, 
                            outputs=[output])

model_high.summary()

model_high.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-3),
              metrics=['accuracy'])

es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                      mode='min', 
                                      verbose=1,
                                      patience=50)

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
pep_embed (InputLayer)          [(None, 30, 1280)]   0                                            
__________________________________________________________________________________________________
n_embed (InputLayer)            [(None, 15, 1280)]   0                                            
__________________________________________________________________________________________________
c_embed (InputLayer)            [(None, 15, 1280)]   0                                            
__________________________________________________________________________________________________
m1_embed (InputLayer)           [(None, 15, 1280)]   0                                            
____________________________________________________________________________________________

In [12]:
model_high.count_params()

8837889

# ESM embedding vector

In [7]:
df = pd.concat([train, val, test], axis=0).reset_index(drop=True)

train_idx = df.iloc[:len(train), :].index
val_idx = df.iloc[len(train):len(train)+len(val), :].index
test_idx = df.iloc[len(train)+len(val):, :].index

In [8]:
import sys
PATH_TO_REPO = "/home/bis/2021_AIhub/esm/"
sys.path.append(PATH_TO_REPO)

import torch
import esm

In [40]:
import os  #for accessing the file system of the system
import random
from tensorflow import keras
import torch


# data generator class
class DataGenerator(keras.utils.Sequence):
    def __init__(self, ids, vecs_dir, labels_dir, batch_size=128, emb_size=1280, n_classes=1, EMB_LAYER=33, shuffle=True):
        self.id_names = ids
        self.indexes = np.arange(len(self.id_names))
        self.vecs_dir = vecs_dir
        self.labels_dir = labels_dir
        self.batch_size = batch_size
        self.emb_size = emb_size
        self.n_classes = n_classes
        self.EMB_LAYER = EMB_LAYER
        self.shuffle = shuffle
        self.on_epoch_end()

        
    # for printing the statistics of the function
    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.id_names))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

            
    def __data_generation__(self, id_name):
        'Generates data containing batch_size samples'
        # Initialization
        lab_name = id_name.split('.')[0]
        label_path = f'{self.labels_dir}{lab_name}'  # idx.pt file to idx
        lab = int(open(label_path, 'r').readlines()[0])
        
        pep_path = self.vecs_dir + 'PEP/'
        en_path = self.vecs_dir + 'En/'
        ec_path = self.vecs_dir + 'Ec/'
        m1_path = self.vecs_dir + 'E1/'
        m2_path = self.vecs_dir + 'E2/'
        
        pep_fn = f'{pep_path}{id_name}'  # vector 1개 경로
        en_fn = f'{en_path}{id_name}'
        ec_fn = f'{ec_path}{id_name}'
        m1_fn = f'{m1_path}{id_name}'
        m2_fn = f'{m2_path}{id_name}'
        
        embs = torch.load(pep_fn)['representations'][self.EMB_LAYER]
        pep_zp = torch.nn.ZeroPad2d((0, 0, 30-len(embs), 0))  # zero padding on top
        pep_embed = pep_zp(embs).numpy()
        en_embed = torch.load(en_fn)['representations'][self.EMB_LAYER].numpy()
        ec_embed = torch.load(ec_fn)['representations'][self.EMB_LAYER].numpy()
        m1_embed = torch.load(m1_fn)['representations'][self.EMB_LAYER].numpy()
        if len(m1_embed)==1:
            m1_embed = np.zeros((15, 1280))
        m2_embed = torch.load(m2_fn)['representations'][self.EMB_LAYER].numpy()
        if len(m2_embed)==1:
            m2_embed = np.zeros((15, 1280))

        return pep_embed, en_embed, ec_embed, m1_embed, m2_embed, lab

    
    def __len__(self):
        "Denotes the number of batches per epoch"
        # self.id_names: 존재하는 총 vector 개수를 의미합니다.
        # self.batch_size: 배치사이즈를 의미합니다.
        return int(np.floor(len(self.id_names) / self.batch_size))


    def __getitem__(self, index):  # index : batch no.
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        batch_ids = [self.id_names[k] for k in indexes]

        pep_embeds = list()
        en_embeds = list()
        ec_embeds = list()
        m1_embeds = list()
        m2_embeds = list()
        labels = list()

        for id_name in batch_ids:
            pep_embed, en_embed, ec_embed, m1_embed, m2_embed, lab = self.__data_generation__(id_name)
            pep_embeds.append(pep_embed)
            en_embeds.append(en_embed)
            ec_embeds.append(ec_embed)
            m1_embeds.append(m1_embed)
            m2_embeds.append(m2_embed)
            labels.append(lab)

        pep_embeds = np.array(pep_embeds)
        en_embeds = np.array(en_embeds)
        ec_embeds = np.array(ec_embeds)
        m1_embeds = np.array(m1_embeds)
        m2_embeds = np.array(m2_embeds)
        labels = np.array(labels)

        return [pep_embeds, en_embeds, ec_embeds, m1_embeds, m2_embeds], labels # return batch

In [41]:
# hyperparameter
emb_size = 1280
train_path = '/data/211129_SJH_ESM/ProtStructureEmbedding_emb_esm1b/'  # 벡터 파일들의 경로
idname_path = '/data/211129_SJH_ESM/ProtStructureEmbedding_emb_esm1b/PEP'  # 벡터 파일들의 경로
labels_path = '/data/211129_SJH_ESM/ProtStructureEmbedding_emb_esm1b/LABEL/'  # 라벨 파일들의 경로
epochs = 300  # number of time we need to train dataset
lr = 1e-4
batch_size = 256  # tarining batch size

# train path
train_ids = np.array(os.listdir(idname_path))  #[1.pt, 2.pt .....]
valid_ids = train_ids[val_idx]
train_ids = train_ids[train_idx]

# train, validation Datagenerator 클래스를  각각 생성합니다.
train_gen = DataGenerator(train_ids, train_path, labels_path, emb_size=emb_size, batch_size=batch_size)
valid_gen = DataGenerator(valid_ids, train_path, labels_path, emb_size=emb_size, batch_size=batch_size)
# 여기서 ids, train_path는 f'{train_path}{id}이런 식으로 경로로 결합하여, 최종적인 임베딩 벡터의 경로가 됩니다.
# 이 경로는 앞서 구현한 DataGenerator클래스에서 벡터 불러들이는데 사용됩니다.

print("total training batches: ", len(train_gen))
print("total validaton batches: ", len(valid_gen))
train_steps = len(train_ids) // batch_size
valid_steps = len(valid_ids) // batch_size

total training batches:  2123
total validaton batches:  530


In [42]:
import time
s = time.time()

tg = train_gen.__getitem__(256)
vg = valid_gen.__getitem__(256)

e = time.time()
print(round(e-s, 2), 'sec')  # read에 10분

16.53 sec


In [43]:
round(e-s, 2)*(train_steps + valid_steps) / 3600  # 1epoch의 read data에 12시간 .. 

12.181691666666667

In [44]:
[p, en, ec, m1, m2], lab = tg
print(p.shape, en.shape, ec.shape, m1.shape, m2.shape, lab.shape)
[p, en, ec, m1, m2], lab = vg
print(p.shape, en.shape, ec.shape, m1.shape, m2.shape, lab.shape)

(256, 30, 1280) (256, 15, 1280) (256, 15, 1280) (256, 15, 1280) (256, 15, 1280) (256,)
(256, 30, 1280) (256, 15, 1280) (256, 15, 1280) (256, 15, 1280) (256, 15, 1280) (256,)


In [None]:
history = model_high.fit_generator(generator=train_gen, validation_data=valid_gen,
                                   steps_per_epoch=train_steps, validation_steps=valid_steps,
                                   epochs=256,
                                   callbacks=[es]
                                   )

Epoch 1/256
  69/2123 [..............................] - ETA: 7:40:15 - loss: 0.6263 - accuracy: 0.6793

In [None]:
plt.figure(figsize=(16,2))
plt.subplot(1,2,1)
plot_graphs(history, 'accuracy')
plt.subplot(1,2,2)
plot_graphs(history, 'loss')

# eval

In [None]:
test_loss, test_acc = model_high.evaluate([pep_test, aa_test, en_test, ec_test, e1_test, e2_test], y_test)
print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

# prediction
y_pred = [1 if i>=0.5 else 0 for i in model_high.predict([pep_test, aa_test, en_test, ec_test, e1_test, e2_test])]
print(classification_report(y_test, y_pred))
# AUC
probs = model_high.predict([pep_test, aa_test, en_test, ec_test, e1_test, e2_test])
rf_auc = roc_auc_score(y_test, probs)
print('rf auc : {}'.format(rf_auc))
# plot the roc curve for the model_high
rf_fpr, rf_tpr, _ = roc_curve(y_test, probs)
plt.figure(figsize=(4, 4))
plt.plot(rf_fpr, rf_tpr, marker='.', label='RF AUC = {:.4f}'.format(rf_auc), color='orange')
plt.title('ROC curve')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()

cf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(3, 3))
sns.heatmap(cf_matrix, annot=True, fmt=',.0f')
plt.show()
plt.figure(figsize=(3, 3))
sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True, 
            fmt='.2%', cmap='Blues')
plt.show()