In [1]:
import os
import sys
import numpy as np
import keras
# from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Run configuration: everything below is keyed on these three values.
kinase = 'tpk_lck'                  # dataset name; used to build the data/split file names
seed = 273                          # fed to np.random.seed before shuffling the rows
model_file = 'mlp_%s.h5' % kinase   # file name derived from the kinase name

In [3]:
#import data
# Read both CSV files inside `with` blocks so the file handles are closed as
# soon as the rows have been parsed (the bare open() calls leaked them).
with open(os.path.join('data', kinase+'_smiles.csv')) as f:
    smiles_lines = [line.strip().split(',') for line in f]
with open(os.path.join('data', kinase+'_fingerprints.csv')) as f:
    fingerprint_lines = [line.strip().split(',') for line in f]

# Column 1 of each smiles row is kept as the SMILES string and column 2 is
# parsed as the integer label.
smiles = [line[1] for line in smiles_lines]
y = np.asarray([int(line[2]) for line in smiles_lines], dtype=np.int8)

# Fingerprint values start at column 2 of each fingerprint row; the first two
# columns are skipped.
fingerprints = [line[2:] for line in fingerprint_lines]
X_fingerprints = np.asarray(fingerprints, dtype=np.int16)

In [4]:
#pad smiles with '!' to ensure equal length
# Drop padding left behind by an earlier run of this cell first. Without this,
# re-running the cell measures the already-padded strings and grows
# max_smiles_len (and the X_smiles shape built from it) by one on every run.
# NOTE(review): assumes no raw SMILES string ends with '!' -- confirm.
smiles = [s.rstrip('!') for s in smiles]
max_smiles_len = max(len(s) for s in smiles)
# Pad to max_smiles_len + 1 so that every string, including the longest one,
# ends with at least one '!'.
smiles = [s + '!'*(max_smiles_len + 1 - len(s)) for s in smiles]

In [5]:
#one-hot vector representation of smiles
# Build the character vocabulary in sorted order. Iteration order of a set of
# strings can differ between interpreter sessions (string hashes are
# randomised per process), which would give every character a different index
# after a kernel restart; sorting makes the char -> index mapping reproducible.
char_set = sorted({c for s in smiles for c in s})
char_to_index = {c: i for i, c in enumerate(char_set)}

# X_smiles[i, j, k] == 1 iff position j of string i holds the character whose
# index is k. Shape: (number of strings, padded length, vocabulary size).
X_smiles = np.zeros((len(smiles), max_smiles_len + 1, len(char_set)))
for i, code in enumerate(smiles):
    for j, char in enumerate(code):
        X_smiles[i, j, char_to_index[char]] = 1

In [6]:
# Sanity check: show the shapes of the two feature arrays and the label array,
# one per line, in that order.
for _arr in (X_fingerprints, X_smiles, y):
    print(_arr.shape)

(1809, 4096)
(1809, 268, 40)
(1809,)


In [7]:
#train val test split
# 15% of the rows go to validation, 15% to test, and whatever is left to train.
n_tot = X_smiles.shape[0]
n_val = round(n_tot*0.15)
n_test = round(n_tot*0.15)
n_train = n_tot - n_test - n_val

# Shuffle the row indices with a fixed seed so the split is the same each run.
np.random.seed(seed)
indices = np.arange(n_tot, dtype=int)
np.random.shuffle(indices)

# Carve the shuffled indices into three consecutive slices.
val_end = n_train + n_val
indices_train = indices[:n_train]
indices_val = indices[n_train:val_end]
indices_test = indices[val_end:]

# Apply the same index sets to both feature arrays and to the labels so the
# rows stay aligned across the three arrays.
X_fingerprints_train = X_fingerprints[indices_train]
X_smiles_train = X_smiles[indices_train]
y_train = y[indices_train]

X_fingerprints_val = X_fingerprints[indices_val]
X_smiles_val = X_smiles[indices_val]
y_val = y[indices_val]

X_fingerprints_test = X_fingerprints[indices_test]
X_smiles_test = X_smiles[indices_test]
y_test = y[indices_test]

# Report shapes: fingerprints, smiles, labels -- for train, then val, then test.
for _subset in (
    (X_fingerprints_train, X_smiles_train, y_train),
    (X_fingerprints_val, X_smiles_val, y_val),
    (X_fingerprints_test, X_smiles_test, y_test),
):
    for _arr in _subset:
        print(_arr.shape)

# Per-sample input shapes (the leading sample axis is dropped).
fingerprints_input_shape = X_fingerprints_train.shape[1:]
smiles_input_shape = X_smiles_train.shape[1:]

(1267, 4096)
(1267, 268, 40)
(1267,)
(271, 4096)
(271, 268, 40)
(271,)
(271, 4096)
(271, 268, 40)
(271,)


In [8]:
# Pick the parsed CSV rows belonging to each subset, in shuffled-index order,
# so they line up row-for-row with the train/val/test arrays.
smiles_lines_train, smiles_lines_val, smiles_lines_test = (
    [smiles_lines[row] for row in subset]
    for subset in (indices_train, indices_val, indices_test)
)

In [9]:
# Write each subset's rows back out as CSV under split/<kinase>/.
split_dir = os.path.join('split', kinase)
# Create the output directory if needed; open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError on a fresh
# checkout.
os.makedirs(split_dir, exist_ok=True)
for prefix, lines in (
    ('train_', smiles_lines_train),
    ('val_', smiles_lines_val),
    ('test_', smiles_lines_test),
):
    with open(os.path.join(split_dir, prefix + kinase + '.csv'), 'w') as f:
        for line in lines:
            # Rows were split on ',' when loaded, so joining on ',' restores
            # the comma-separated layout.
            f.write(','.join(line) + '\n')