In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [3]:
# Load the AAindex property table; the printed shape is (rows, columns) of the raw CSV.
df_aaindex = pd.read_csv('data/aaindex/df_aaindex19.csv')
print(df_aaindex.shape)
df_aaindex.head(1)

# Drop the saved index column and transpose so that every remaining column label
# becomes a row; aa2val then maps that label to its vector of per-row values.
# (Downstream code looks residues up by single-letter code, so the column labels
# are presumably the amino-acid letters -- verify against the CSV header.)
tmp = df_aaindex.drop(columns='Unnamed: 0').T
aa2val = dict(zip(tmp.index, tmp.values))

(19, 21)


In [4]:
# The held-out test set is read as-is; the labelled training file is split 80/20
# into train/validation with a fixed seed so the split is repeatable.
test = pd.read_csv('data/df_detect_peptide_test.csv')
df_detect_peptide_train = pd.read_csv('data/df_detect_peptide_train.csv')
train, val = train_test_split(
    df_detect_peptide_train,
    test_size=0.2,
    random_state=7,
)

In [5]:
# Show one training row to confirm the columns consumed later (PEP, En, Ec, E1, E2, ID) are present.
train.head(1)

Unnamed: 0,peptide,En,Ec,E1,E2,protein,PEP,ID
595411,K.QELNEPPKQSTSFLVLQEILESEEKGDPNK.P,VYKMLQEKQELNEPP,EEKGDPNKPSGFRSV,QELNEPPKQSTSFLV,EILESEEKGDPNKPS,sp|O00151|PDLI1_HUMAN,QELNEPPKQSTSFLVLQEILESEEKGDPNK,0


In [6]:
def get_data_labelEnc(df, pep_maxlen=30, site_len=15, aa_index=None):
    """Encode a peptide table into the numeric arrays consumed by the model.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose the columns ``PEP``, ``En``, ``Ec``, ``E1``, ``E2`` and ``ID``.
    pep_maxlen : int, default 30
        Length every encoded peptide is right-padded to (padding code 0).
    site_len : int, default 15
        Length of the all-zero vector substituted for a missing (``'-'``)
        ``E1``/``E2`` window.
    aa_index : mapping, optional
        Residue letter -> 1-D numeric property vector. When omitted, the
        module-level ``aa2val`` table is used (the original behaviour).

    Returns
    -------
    tuple of numpy.ndarray
        ``(aa_data, pep_data, en_data, ec_data, e1_data, e2_data, labels)``:
        ``aa_data`` holds, per peptide, the 20 residue counts followed by the
        element-wise sum of the residues' property vectors; the next five
        arrays are integer label encodings; ``labels`` is ``df.ID`` as an array.

    Raises
    ------
    ValueError
        If any peptide is longer than ``pep_maxlen`` (it could not be padded
        and would otherwise produce a ragged array).
    KeyError
        If a sequence contains a character outside the 20-letter alphabet
        plus ``'Z'`` (or, for ``aa_data``, a residue missing from ``aa_index``).
    """
    alphabet = 'ARNDCQEGHILKMFPSTWYV'
    # Codes 1..20 for the residues; 'Z' shares code 0 with the padding value.
    label_enc = {aa: code for code, aa in enumerate(alphabet, start=1)}
    label_enc['Z'] = 0

    if aa_index is None:
        aa_index = aa2val

    def encode(seq):
        """Label-encode one sequence, residue by residue."""
        return [label_enc[aa] for aa in seq]

    def encode_optional(seq):
        """Like ``encode`` but maps the '-' placeholder to an all-zero window."""
        return encode(seq) if seq != '-' else [0] * site_len

    peptides = df.PEP.values
    too_long = [seq for seq in peptides if len(seq) > pep_maxlen]
    if too_long:
        raise ValueError(
            '{} peptide(s) exceed pep_maxlen={} (first: {!r})'.format(
                len(too_long), pep_maxlen, too_long[0]))

    aa_data = [
        np.array([seq.count(aa) for aa in alphabet]
                 + list(np.array([aa_index[aa] for aa in seq]).sum(axis=0)))
        for seq in peptides
    ]
    pep_data = [encode(seq) + [0] * (pep_maxlen - len(seq)) for seq in peptides]
    en_data = [encode(seq) for seq in df.En.values]
    ec_data = [encode(seq) for seq in df.Ec.values]
    e1_data = [encode_optional(seq) for seq in df.E1.values]
    e2_data = [encode_optional(seq) for seq in df.E2.values]

    return (np.array(aa_data), np.array(pep_data), np.array(en_data),
            np.array(ec_data), np.array(e1_data), np.array(e2_data),
            np.array(df.ID.values))

In [34]:
# Encode each split into (composition/property features, peptide tokens,
# four site-window token arrays, labels).
aa_train, pep_train, en_train, ec_train, e1_train, e2_train, y_train = get_data_labelEnc(train)
aa_val, pep_val, en_val, ec_val, e1_val, e2_val, y_val = get_data_labelEnc(val)
aa_test, pep_test, en_test, ec_test, e1_test, e2_test, y_test = get_data_labelEnc(test)

# One line of shapes per split, in train / val / test order.
for split_arrays in (
    (aa_train, pep_train, en_train, ec_train, e1_train, e2_train, y_train),
    (aa_val, pep_val, en_val, ec_val, e1_val, e2_val, y_val),
    (aa_test, pep_test, en_test, ec_test, e1_test, e2_test, y_test),
):
    print(*(arr.shape for arr in split_arrays))

(543516, 39) (543516, 30) (543516, 15) (543516, 15) (543516, 15) (543516, 15) (543516,)
(135880, 39) (135880, 30) (135880, 15) (135880, 15) (135880, 15) (135880, 15) (135880,)
(133992, 39) (133992, 30) (133992, 15) (133992, 15) (133992, 15) (133992, 15) (133992,)


In [8]:
def _append_fixed_digestibility_targets(labels):
    """Return an (N, 5) array whose rows are ``[1, 1, 0, 0, label]``.

    The first four columns are constants; the last column is the original
    per-peptide label. If ``labels`` is already 2-D it is returned unchanged,
    so re-running this cell is a no-op instead of nesting arrays inside rows.
    """
    labels = np.asarray(labels)
    if labels.ndim == 2:
        return labels
    fixed = np.tile([1, 1, 0, 0], (len(labels), 1))
    return np.column_stack([fixed, labels])


y_train = _append_fixed_digestibility_targets(y_train)
y_val = _append_fixed_digestibility_targets(y_val)
y_test = _append_fixed_digestibility_targets(y_test)

In [9]:
# Sanity check: after the expansion above each label array should have five columns.
print(y_train.shape, y_val.shape, y_test.shape)

(543516, 5) (135880, 5) (133992, 5)


# Train

In [10]:
from tensorflow.python.client import device_lib

import tensorflow as tf
# Cap TensorFlow's allocation on the first visible GPU (value passed as
# memory_limit; 1024*8). Nothing is configured when no GPU is present.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    first_gpu = gpus[0]
    try:
        memory_cap = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*8)
        tf.config.experimental.set_virtual_device_configuration(first_gpu, [memory_cap])
    except RuntimeError as e:
        # Report the RuntimeError instead of aborting the notebook.
        print(e)

In [11]:
def plot_graphs(history, metric):
    """Draw the training and validation curves of ``metric`` on the current axes.

    ``history`` is the object returned by ``model.fit``; its ``history`` dict
    must contain both ``metric`` and ``'val_' + metric``.
    """
    val_metric = 'val_' + metric
    curves = history.history
    plt.plot(curves[metric])
    plt.plot(curves[val_metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])

# High-parameter model

In [12]:
from tensorflow.keras import layers
from tensorflow import keras

In [13]:
class TransformerBlock(layers.Layer):
    """Self-attention block followed by a two-layer feed-forward network.

    Each sub-layer's output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.

    Parameters
    ----------
    embed_dim : int
        Width of the incoming token vectors; also used as the attention
        ``key_dim`` and as the output width of the feed-forward network.
    num_heads : int
        Number of attention heads.
    ff_dim : int
        Hidden width of the feed-forward network.
    rate : float, default 0.1
        Dropout rate applied after attention and after the feed-forward network.
    **kwargs
        Forwarded to ``layers.Layer`` (e.g. ``name``); needed so the layer can
        be rebuilt from ``get_config()``.

    NOTE(review): no attention mask is passed, so zero-padded positions take
    part in attention like any other token.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can serialise the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply attention and feed-forward sub-layers; ``training`` toggles dropout."""
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config
    
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Parameters
    ----------
    maxlen : int
        Size of the position-embedding table; inputs must not be longer than
        this along their last axis.
    vocab_size : int
        Size of the token-embedding table (number of distinct token ids).
    embed_dim : int
        Width of both embeddings.
    **kwargs
        Forwarded to ``layers.Layer`` (e.g. ``name``); needed so the layer can
        be rebuilt from ``get_config()``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can serialise the layer.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position index."""
        # Positions are derived from the actual length of the last axis.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

In [14]:
# Token vocabulary: 20 amino acids plus index 0 for zero padding.
vocab_size = 21

# Input lengths: peptide sequence and each 15-residue site window.
maxlen_pep, maxlen_ts = 30, 15

# Peptide branch: embedding size per token, attention heads,
# hidden size of the transformer's feed-forward network.
pep_embed_dim, pep_num_heads, pep_ff_dim = 128, 4, 64

# Site-window branch: same three settings, at half the width.
ts_embed_dim, ts_num_heads, ts_ff_dim = 64, 4, 32

In [65]:
# Dropout rates are still drawn uniformly from [0, 0.2), but from a dedicated
# seeded generator so the architecture is identical on every Restart & Run All
# (the unseeded np.random.uniform calls used before gave a different network each run).
dropout_rng = np.random.default_rng(7)


def _sample_dropout_rate():
    """Draw the next dropout rate in [0, 0.2) from the seeded generator."""
    return float(dropout_rng.uniform(0, 0.2))


embedding_layer_pep = TokenAndPositionEmbedding(maxlen_pep, vocab_size, pep_embed_dim)
transformer_block_pep = TransformerBlock(pep_embed_dim, pep_num_heads, pep_ff_dim)
# A single embedding + transformer pair is shared by all four site windows.
embedding_layer_ts = TokenAndPositionEmbedding(maxlen_ts, vocab_size, ts_embed_dim)
transformer_block_ts = TransformerBlock(ts_embed_dim, ts_num_heads, ts_ff_dim)

# peptide embedding
pep_input = layers.Input(shape=(maxlen_pep,), name='pep_input')
pep_embed = embedding_layer_pep(pep_input)
pep_embed = transformer_block_pep(pep_embed)

# peptide info: 39 features per peptide (width of the first array returned by get_data_labelEnc)
meta_input = tf.keras.layers.Input(shape=((39,)))

# cleavage site embedding
n_input = layers.Input(shape=(maxlen_ts,), name='n_input')
n_embed = embedding_layer_ts(n_input)
n_embed = transformer_block_ts(n_embed)

c_input = layers.Input(shape=(maxlen_ts,), name='c_input')
c_embed = embedding_layer_ts(c_input)
c_embed = transformer_block_ts(c_embed)

m1_input = layers.Input(shape=(maxlen_ts,), name='m1_input')
m1_embed = embedding_layer_ts(m1_input)
m1_embed = transformer_block_ts(m1_embed)

m2_input = layers.Input(shape=(maxlen_ts,), name='m2_input')
m2_embed = embedding_layer_ts(m2_input)
m2_embed = transformer_block_ts(m2_embed)

# peptide branch: bidirectional LSTM -> dense -> dropout
p_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16), name='pep_lstm')
pep_lstm = p_lstm(pep_embed)
pep_lstm = tf.keras.layers.Dense(32, activation='relu', name='pep_dense')(pep_lstm)
pep_lstm = tf.keras.layers.Dropout(_sample_dropout_rate())(pep_lstm)

# meta info
meta = tf.keras.layers.Dense(32, activation='relu', name='meta_dense')(meta_input)
meta = tf.keras.layers.Dropout(_sample_dropout_rate())(meta)

# cleavage site branch: the same LSTM / dense / dropout layers are reused for every window
ts_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16), name='ts_lstm')
ts_flat = tf.keras.layers.Dense(16, activation='relu', name='ts_flat')
ts_drop = tf.keras.layers.Dropout(_sample_dropout_rate())


def _encode_cleavage_site(site_embed):
    """Run one embedded site window through the shared LSTM, dense and dropout layers."""
    return ts_drop(ts_flat(ts_lstm(site_embed)))


n_lstm = _encode_cleavage_site(n_embed)
c_lstm = _encode_cleavage_site(c_embed)
m1_lstm = _encode_cleavage_site(m1_embed)
m2_lstm = _encode_cleavage_site(m2_embed)

# Digestibility: one sigmoid output per site window, all produced by the same dense layer
output_array = []
metrics_array = {}
loss_array = {}
digestibility_flat = tf.keras.layers.Dense(1, activation='sigmoid', name="digestibility_output")

for i, dense_layer in enumerate([n_lstm, c_lstm, m1_lstm, m2_lstm]):
    layer = tf.keras.layers.concatenate([pep_lstm, dense_layer])
    digestibility_output = digestibility_flat(layer)
    output_array.append(digestibility_output)
    # Keys follow the pattern <layer name>, <layer name>_1, <layer name>_2, ...
    output_name = "digestibility_output" if i == 0 else "digestibility_output_" + str(i)
    metrics_array[output_name] = 'binary_accuracy'
    loss_array[output_name] = 'binary_crossentropy'


# Detectability: every branch concatenated, then two dense+dropout stages and a sigmoid
detectability_output = tf.keras.layers.concatenate([pep_lstm,
                                                     meta,
                                                     n_lstm,
                                                     c_lstm,
                                                     m1_lstm,
                                                     m2_lstm])

detectability_output = tf.keras.layers.Dense(64, activation='relu', name='detect_out1')(detectability_output)
detectability_output = tf.keras.layers.Dropout(_sample_dropout_rate(), name='detect_drop1')(detectability_output)
detectability_output = tf.keras.layers.Dense(32, activation='relu', name='detect_out2')(detectability_output)
detectability_output = tf.keras.layers.Dropout(_sample_dropout_rate(), name='detect_drop2')(detectability_output)
detectability_output = tf.keras.layers.Dense(1, activation = 'sigmoid', name='detectability_output')(detectability_output)

output_array.append(detectability_output)

metrics_array['detectability_output'] = 'binary_accuracy'
loss_array['detectability_output'] = 'binary_crossentropy'

In [66]:
# Display the per-output metric and loss dictionaries together with the list of output tensors.
metrics_array, loss_array, output_array

({'digestibility_output': 'binary_accuracy',
  'digestibility_output_1': 'binary_accuracy',
  'digestibility_output_2': 'binary_accuracy',
  'digestibility_output_3': 'binary_accuracy',
  'detectability_output': 'binary_accuracy'},
 {'digestibility_output': 'binary_crossentropy',
  'digestibility_output_1': 'binary_crossentropy',
  'digestibility_output_2': 'binary_crossentropy',
  'digestibility_output_3': 'binary_crossentropy',
  'detectability_output': 'binary_crossentropy'},
 [<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'digestibility_output')>,
  <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'digestibility_output')>,
  <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'digestibility_output')>,
  <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'digestibility_output')>,
  <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'detectability_output')>])

In [67]:
# Six inputs (peptide tokens, peptide features, four site windows) -> the five
# outputs collected in output_array.
model_inputs = [pep_input, meta_input, n_input, c_input, m1_input, m2_input]
model_high = tf.keras.Model(inputs=model_inputs, outputs=output_array)
model_high.summary()

# Losses and metrics are looked up per output name from the two dictionaries.
model_high.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss=loss_array,
    metrics=metrics_array,
)

# Early-stopping callback on the validation loss; it only takes effect if it is
# passed to fit() via callbacks=[es].
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=50,
)

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
pep_input (InputLayer)          [(None, 30)]         0                                            
__________________________________________________________________________________________________
n_input (InputLayer)            [(None, 15)]         0                                            
__________________________________________________________________________________________________
c_input (InputLayer)            [(None, 15)]         0                                            
__________________________________________________________________________________________________
m1_input (InputLayer)           [(None, 15)]         0                                            
____________________________________________________________________________________________

In [68]:
def _detectability_labels(labels):
    """Return the 1-D detectability labels.

    ``labels`` may be the raw 1-D label vector or the expanded (N, 5) array
    whose last column is the label; accepting both keeps this cell correct
    regardless of which earlier cells were (re-)run.
    """
    labels = np.asarray(labels)
    return labels[:, -1] if labels.ndim == 2 else labels


def _multitask_targets(labels):
    """Build the five target arrays in model-output order.

    The four digestibility outputs get constant targets (1, 1, 0, 0); the
    detectability output gets the per-peptide label.
    """
    detect = _detectability_labels(labels)
    ones = np.ones(len(detect), dtype=int)
    zeros = np.zeros(len(detect), dtype=int)
    return [ones, ones, zeros, zeros, detect]


history = model_high.fit(
    [pep_train, aa_train, en_train, ec_train, e1_train, e2_train],
    _multitask_targets(y_train),
    epochs=100,
    batch_size=256,
    validation_data=(
        [pep_val, aa_val, en_val, ec_val, e1_val, e2_val],
        _multitask_targets(y_val),
    ),
    # Early stopping is left disabled, as before; enable with callbacks=[es].
)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300


Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300


Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300


Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300


Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300


Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300


Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300


Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300


Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300


Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300


Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300


Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300


Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300


Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300


Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300


Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300

KeyboardInterrupt: 

In [None]:
# The model has several named outputs and is compiled with 'binary_accuracy',
# so fit() logs accuracy per output as '<output name>_binary_accuracy'; a plain
# 'accuracy' key does not exist in history.history. Plot the detectability output.
plt.figure(figsize=(16,2))
plt.subplot(1,2,1)
plot_graphs(history, 'detectability_output_binary_accuracy')
plt.subplot(1,2,2)
plot_graphs(history, 'loss')

# Evaluation

In [None]:
# The model has five outputs, so evaluate() needs five target arrays and
# predict() returns five arrays; only the last one (detectability) is scored here.
test_inputs = [pep_test, aa_test, en_test, ec_test, e1_test, e2_test]

# Accept either the raw 1-D labels or the expanded (N, 5) array (label in the last column).
y_test_detect = np.asarray(y_test)
if y_test_detect.ndim == 2:
    y_test_detect = y_test_detect[:, -1]

n_test = len(y_test_detect)
test_targets = [
    np.ones(n_test, dtype=int),
    np.ones(n_test, dtype=int),
    np.zeros(n_test, dtype=int),
    np.zeros(n_test, dtype=int),
    y_test_detect,
]

# return_dict=True gives named results instead of a positional list of all losses/metrics.
test_results = model_high.evaluate(test_inputs, test_targets, return_dict=True)
test_loss = test_results['detectability_output_loss']
test_acc = test_results['detectability_output_binary_accuracy']
print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

# prediction: run the network once and reuse the probabilities for every report
probs = model_high.predict(test_inputs)[-1].ravel()
y_pred = (probs >= 0.5).astype(int)
print(classification_report(y_test_detect, y_pred))
# AUC
# NOTE(review): the 'rf'/'RF' wording in the messages below is kept verbatim,
# although the scored model is model_high.
rf_auc = roc_auc_score(y_test_detect, probs)
print('rf auc : {}'.format(rf_auc))
# plot the roc curve for the model_high
rf_fpr, rf_tpr, _ = roc_curve(y_test_detect, probs)
plt.figure(figsize=(4, 4))
plt.plot(rf_fpr, rf_tpr, marker='.', label='RF AUC = {:.4f}'.format(rf_auc), color='orange')
plt.title('ROC curve')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()

# confusion matrix: absolute counts, then as a share of all test peptides
cf_matrix = confusion_matrix(y_test_detect, y_pred)
plt.figure(figsize=(3, 3))
sns.heatmap(cf_matrix, annot=True, fmt=',.0f')
plt.show()
plt.figure(figsize=(3, 3))
sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True,
            fmt='.2%', cmap='Blues')
plt.show()