In [1]:
import os
import sys
import numpy as np
import keras
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
seed = 273
kinase = 'cdk2'
model_file = 'basic_RNN_%s.h5' % (kinase)

In [3]:
#import data
lines = [line.strip().split(',') for line in open(os.path.join('data', kinase+'_smiles.csv'))]
smiles = [line[1] for line in lines]
y = np.asarray([int(line[2]) for line in lines], dtype=np.int8)

In [4]:
#pad smiles with '!' to ensure equal length
max_smiles_len = max(len(s) for s in smiles)
smiles = [s + '!'*(max_smiles_len + 1 - len(s)) for s in smiles]

In [5]:
#one-hot vector representation of smiles
char_set = set()
for s in smiles:
    for c in s:
        char_set.add(c)
char_set = list(char_set)
char_to_index = {char_set[i]: i for i in range(len(char_set))}

X = np.zeros((len(smiles), max_smiles_len + 1, len(char_set)))
for i in range(len(smiles)):
    code = smiles[i]
    for j in range(len(code)):
        char = code[j]
        X[i, j, char_to_index[char]] = 1        

In [6]:
#train val test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=seed)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

input_shape = X_train.shape[1:]

(1322, 84, 36) (1322,)
(283, 84, 36) (283,)
(284, 84, 36) (284,)


In [7]:
#check GPU presence
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

print(get_available_gpus())

[]


In [8]:
#keras imports
from keras.models import Sequential
from keras.layers import TimeDistributed, GlobalAveragePooling2D, Activation, Dense, Input, Bidirectional
from keras.layers import BatchNormalization, Conv2D, MaxPooling2D, AveragePooling2D, GlobalAveragePooling2D
from keras.layers import GlobalMaxPooling2D
from keras.layers.recurrent import LSTM
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, Callback, EarlyStopping
from keras.layers import Dropout, Flatten
from keras.layers import concatenate
from keras import regularizers
from keras import initializers
from keras import constraints
from keras.models import Model
# Backend
from keras import backend as K
# Utils
from keras.utils.layer_utils import convert_all_kernels_in_model
from keras.utils.data_utils import get_file
from keras import metrics

In [9]:
#F1 score computation
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [10]:
#basic RNN model
model = Sequential()
model.add(Bidirectional(LSTM(units=64, return_sequences=False), input_shape=input_shape))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

In [11]:
#compile model
adam = keras.optimizers.Adam(lr=0.001, decay=0.0, clipnorm=5.)
stop = EarlyStopping(patience=20, verbose=1)
model.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=[metrics.sparse_categorical_accuracy, f1])
checkpoint = ModelCheckpoint(model_file, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

callbacks_list = [checkpoint, stop]

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 128)               51712     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 60,098
Trainable params: 60,098
Non-trainable params: 0
_________________________________________________________________


In [12]:
#training
model.fit(x=X_train, 
          y=y_train, 
          batch_size=32, 
          epochs=1000, 
          verbose=1, 
          callbacks=callbacks_list, 
          validation_data=(X_val, y_val), 
          shuffle=True)

Train on 1322 samples, validate on 283 samples
Epoch 1/1000

Epoch 00001: val_loss improved from inf to 0.57523, saving model to basic_RNN.h5
Epoch 2/1000

Epoch 00002: val_loss improved from 0.57523 to 0.54964, saving model to basic_RNN.h5
Epoch 3/1000

Epoch 00003: val_loss improved from 0.54964 to 0.49114, saving model to basic_RNN.h5
Epoch 4/1000

Epoch 00004: val_loss did not improve from 0.49114
Epoch 5/1000

Epoch 00005: val_loss improved from 0.49114 to 0.48360, saving model to basic_RNN.h5
Epoch 6/1000

Epoch 00006: val_loss improved from 0.48360 to 0.47029, saving model to basic_RNN.h5
Epoch 7/1000

Epoch 00007: val_loss did not improve from 0.47029
Epoch 8/1000

Epoch 00008: val_loss improved from 0.47029 to 0.45231, saving model to basic_RNN.h5
Epoch 9/1000

Epoch 00009: val_loss did not improve from 0.45231
Epoch 10/1000

Epoch 00010: val_loss did not improve from 0.45231
Epoch 11/1000

Epoch 00011: val_loss did not improve from 0.45231
Epoch 12/1000

Epoch 00012: val_loss


Epoch 00030: val_loss did not improve from 0.40886
Epoch 31/1000

Epoch 00031: val_loss did not improve from 0.40886
Epoch 32/1000

Epoch 00032: val_loss did not improve from 0.40886
Epoch 33/1000

Epoch 00033: val_loss did not improve from 0.40886
Epoch 34/1000

Epoch 00034: val_loss did not improve from 0.40886
Epoch 35/1000

Epoch 00035: val_loss did not improve from 0.40886
Epoch 36/1000

Epoch 00036: val_loss did not improve from 0.40886
Epoch 37/1000

Epoch 00037: val_loss did not improve from 0.40886
Epoch 38/1000

Epoch 00038: val_loss did not improve from 0.40886
Epoch 39/1000

Epoch 00039: val_loss did not improve from 0.40886
Epoch 40/1000

Epoch 00040: val_loss did not improve from 0.40886
Epoch 41/1000

Epoch 00041: val_loss did not improve from 0.40886
Epoch 42/1000

Epoch 00042: val_loss did not improve from 0.40886
Epoch 43/1000

Epoch 00043: val_loss did not improve from 0.40886
Epoch 44/1000

Epoch 00044: val_loss did not improve from 0.40886
Epoch 45/1000

Epoch 000

<keras.callbacks.History at 0x7feaf2473748>

In [13]:
#testing
from keras.models import load_model
trained_model = load_model(model_file, custom_objects={'f1': f1})
print(trained_model.metrics_names)

['loss', 'sparse_categorical_accuracy', 'f1']


In [14]:
trained_model.evaluate(x=X_train, y=y_train)



[0.29255126698475203, 0.8691376702868452, 0.7911279401692608]

In [15]:
trained_model.evaluate(x=X_val, y=y_val)



[0.4088619075478598, 0.830388690262717, 0.8199131587790095]

In [16]:
trained_model.evaluate(x=X_test, y=y_test)



[0.40860423319776296, 0.8204225326927614, 0.825893199779618]