In [1]:
import os
import sys
import numpy as np
import keras
# from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
seed = 273
kinase = 'tpk_lck'
model_file = 'rnn_%s.h5' % (kinase)

In [3]:
#import data
smiles_lines = [line.strip().split(',') for line in open(os.path.join('data', kinase+'_smiles.csv'))]
smiles = [line[1] for line in smiles_lines]
fingerprint_lines = [line.strip().split(',') for line in open(os.path.join('data', kinase+'_fingerprints.csv'))]
fingerprints = [line[2:] for line in fingerprint_lines]
X_fingerprints = np.asarray(fingerprints, dtype=np.int16)
y = np.asarray([int(line[2]) for line in smiles_lines], dtype=np.int8)

In [4]:
#pad smiles with '!' to ensure equal length
max_smiles_len = max(len(s) for s in smiles)
smiles = [s + '!'*(max_smiles_len + 1 - len(s)) for s in smiles]

In [5]:
#one-hot vector representation of smiles
char_set = set()
for s in smiles:
    for c in s:
        char_set.add(c)
char_set = list(char_set)
char_to_index = {char_set[i]: i for i in range(len(char_set))}

X_smiles = np.zeros((len(smiles), max_smiles_len + 1, len(char_set)))
for i in range(len(smiles)):
    code = smiles[i]
    for j in range(len(code)):
        char = code[j]
        X_smiles[i, j, char_to_index[char]] = 1        

In [6]:
print(X_fingerprints.shape)
print(X_smiles.shape)
print(y.shape)

(1809, 6117)
(1809, 268, 40)
(1809,)


In [7]:
#train val test split
n_tot = X_smiles.shape[0]
n_test = round(n_tot*0.15)
n_val = round(n_tot*0.15)
n_train  = n_tot - n_test - n_val
indices = np.arange(n_tot, dtype=int)
np.random.seed(seed)
np.random.shuffle(indices)
indices_train = indices[:n_train]
indices_val = indices[n_train:n_train+n_val]
indices_test = indices[n_train+n_val:]
X_fingerprints_train = X_fingerprints[indices_train]
print(X_fingerprints_train.shape)
X_smiles_train = X_smiles[indices_train]
print(X_smiles_train.shape)
y_train = y[indices_train]
print(y_train.shape)
X_fingerprints_val = X_fingerprints[indices_val]
print(X_fingerprints_val.shape)
X_smiles_val = X_smiles[indices_val]
print(X_smiles_val.shape)
y_val = y[indices_val]
print(y_val.shape)
X_fingerprints_test = X_fingerprints[indices_test]
print(X_fingerprints_test.shape)
X_smiles_test = X_smiles[indices_test]
print(X_smiles_test.shape)
y_test = y[indices_test]
print(y_test.shape)
smiles_input_shape = X_smiles_train.shape[1:]
fingerprints_input_shape = X_fingerprints_train.shape[1:]

(1267, 6117)
(1267, 268, 40)
(1267,)
(271, 6117)
(271, 268, 40)
(271,)
(271, 6117)
(271, 268, 40)
(271,)


In [8]:
#check GPU presence
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

print(get_available_gpus())

[]


In [9]:
#keras imports
from keras.models import Sequential
from keras.layers import TimeDistributed, GlobalAveragePooling2D, Activation, Dense, Input, Bidirectional
from keras.layers import BatchNormalization, Conv2D, MaxPooling2D, AveragePooling2D, GlobalAveragePooling2D
from keras.layers import GlobalMaxPooling2D
from keras.layers.recurrent import LSTM
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, Callback, EarlyStopping
from keras.layers import Dropout, Flatten
from keras.layers import concatenate
from keras import regularizers
from keras import initializers
from keras import constraints
from keras.models import Model
# Backend
from keras import backend as K
# Utils
from keras.utils.layer_utils import convert_all_kernels_in_model
from keras.utils.data_utils import get_file
from keras import metrics

In [10]:
#F1 score computation
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [11]:
#basic RNN model
model = Sequential()
model.add(Bidirectional(LSTM(units=128, return_sequences=False), input_shape=smiles_input_shape))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

In [12]:
#compile model
adam = keras.optimizers.Adam(lr=0.001, decay=0.0, clipnorm=5.)
stop = EarlyStopping(patience=20, verbose=1)
model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=[metrics.sparse_categorical_accuracy, f1])
checkpoint = ModelCheckpoint(model_file, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

callbacks_list = [checkpoint, stop]

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 256)               173056    
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
Total params: 206,210
Trainable params: 206,210
Non-trainable params: 0
_________________________________________________________________


In [13]:
#training
model.fit(x=X_smiles_train, 
          y=y_train, 
          batch_size=32, 
          epochs=1000, 
          verbose=1, 
          callbacks=callbacks_list, 
          validation_data=(X_smiles_val, y_val), 
          shuffle=True)

Train on 1267 samples, validate on 271 samples
Epoch 1/1000

Epoch 00001: val_loss improved from inf to 0.55896, saving model to rnn_tpk_lck.h5
Epoch 2/1000

Epoch 00002: val_loss improved from 0.55896 to 0.52213, saving model to rnn_tpk_lck.h5
Epoch 3/1000

Epoch 00003: val_loss improved from 0.52213 to 0.47130, saving model to rnn_tpk_lck.h5
Epoch 4/1000

Epoch 00004: val_loss improved from 0.47130 to 0.47040, saving model to rnn_tpk_lck.h5
Epoch 5/1000

Epoch 00005: val_loss did not improve from 0.47040
Epoch 6/1000

Epoch 00006: val_loss improved from 0.47040 to 0.45769, saving model to rnn_tpk_lck.h5
Epoch 7/1000

Epoch 00007: val_loss did not improve from 0.45769
Epoch 8/1000

Epoch 00008: val_loss did not improve from 0.45769
Epoch 9/1000

Epoch 00009: val_loss did not improve from 0.45769
Epoch 10/1000

Epoch 00010: val_loss improved from 0.45769 to 0.44856, saving model to rnn_tpk_lck.h5
Epoch 11/1000

Epoch 00011: val_loss did not improve from 0.44856
Epoch 12/1000

Epoch 000


Epoch 00030: val_loss did not improve from 0.40668
Epoch 31/1000

Epoch 00031: val_loss did not improve from 0.40668
Epoch 32/1000

Epoch 00032: val_loss did not improve from 0.40668
Epoch 33/1000

Epoch 00033: val_loss did not improve from 0.40668
Epoch 34/1000

Epoch 00034: val_loss did not improve from 0.40668
Epoch 35/1000

Epoch 00035: val_loss did not improve from 0.40668
Epoch 36/1000

Epoch 00036: val_loss did not improve from 0.40668
Epoch 37/1000

Epoch 00037: val_loss did not improve from 0.40668
Epoch 38/1000

Epoch 00038: val_loss did not improve from 0.40668
Epoch 39/1000

Epoch 00039: val_loss did not improve from 0.40668
Epoch 40/1000

Epoch 00040: val_loss did not improve from 0.40668
Epoch 41/1000

KeyboardInterrupt: 

In [14]:
#testing
from keras.models import load_model
trained_model = load_model(model_file, custom_objects={'f1': f1})
print(trained_model.metrics_names)

['loss', 'sparse_categorical_accuracy', 'f1']


In [15]:
trained_model.evaluate(x=X_smiles_train, y=y_train)



[0.3261936140182739, 0.8508287290465484, 0.8324411510950764]

In [16]:
trained_model.evaluate(x=X_smiles_val, y=y_val)



[0.4066754659823386, 0.8154981556413798, 0.8352863498279529]

In [17]:
trained_model.evaluate(x=X_smiles_test, y=y_test)



[0.3603327609736101, 0.8487084873048142, 0.836762313693212]

In [18]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
pred_train = trained_model.predict(x=X_smiles_train)[:,1]
pred_val = trained_model.predict(x=X_smiles_val)[:,1]
pred_test = trained_model.predict(x=X_smiles_test)[:,1]
np.savez(model_file[:-2] + 'npz', pred_train=pred_train, pred_val=pred_val, pred_test=pred_test)