# Building a RNN on SMILES data

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import pickle


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score



#### Sample
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

## Read in data

In [2]:
df = pd.read_csv("../data/CID_properties.csv")

In [5]:
# lowercase column names
df.columns = df.columns.str.lower()

In [34]:
df.head()

Unnamed: 0,cid,featurehydrophobecount3d,hbondacceptorcount,hbonddonorcount,isomericsmiles,molecularweight,drug_class
0,134812791,,16,13,C1C[C@@H](N(C1)C(=O)C2CSSC[C@H](C(=O)NC(C(=O)N...,1056.225,multi
1,134694981,3.0,4,3,CC#CCC(C)[C@@H](/C=C/C1[C@@H](C[C@H]2[C@@H]1CC...,360.494,multi
2,134692713,0.0,8,2,COC1=C(C=C(C=C1)S(=O)(=O)N2[C@H]([C@](C3=C2C=C...,620.498,hematologic
3,134692601,,16,13,C1C[C@H](N(C1)C(=O)C2CSSC[C@@H](C(=O)NC(C(=O)N...,1056.225,multi
4,134692167,1.0,10,3,C[C@@H]1C(=O)O[C@@H]2[C@]1(C34C(=O)O[C@H]5C3([...,424.402,multi


In [42]:
def eda(df):
    nulls = pd.DataFrame(df.isnull().sum()).T # Check for nulls
    datatypes = pd.DataFrame(df.dtypes).T # Check datatypes
    summary = pd.concat([nulls, datatypes], keys = ["nulls", "datatypes"]) # Create pandas dataframe, because I think it's easier to read
    return summary

In [43]:
eda(df)

Unnamed: 0,Unnamed: 1,cid,hbondacceptorcount,hbonddonorcount,isomericsmiles,molecularweight,drug_class
nulls,0,0,0,0,0,0,0
datatypes,0,int64,int64,int64,object,float64,object


In [39]:
# dropping the feature hydrophobecount3d column because it has too many mising values

df.drop(columns = "featurehydrophobecount3d", inplace = True)

## Encoding the data

In [68]:
all_characters = ""

for row in df["isomericsmiles"]:
    all_characters += row

In [84]:
char_cols = list(set(all_characters))
char_cols.sort()
# char_cols.index('C')

20

In [88]:
char_dicts = []
for smile in df['isomericsmiles']:
    smile_dict = {}
    for char in char_cols:
        smile_dict[char] = smile.count(char)
        
    char_dicts.append(smile_dict)
        
    

In [96]:
df_char = pd.DataFrame(char_dicts)

In [97]:
# Double check that it's accurate
# for col_char in char_cols:
  #  print(f"{col_char} count: {df['isomericsmiles'][0].count(col_char)}")

## Set up X and y variables

In [98]:
X = df_char
y = df["drug_class"]

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify = y)

# Build model

In [112]:
model = Sequential()
model.add(Embedding(53,32))
model.add(SimpleRNN(32))
model.add(Dense(13, activation = "softmax"))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 32)          1696      
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 13)                429       
Total params: 4,205
Trainable params: 4,205
Non-trainable params: 0
_________________________________________________________________


In [113]:
model.compile(optimizer = "adam", loss =  "sparse_categorical_crossentropy",
              metrics=['accuracy'])

In [114]:
history = model.fit(X_train, y_train,
                   epochs = 10,
                   batch_size = 128,
                   validation_split = 0.2)

Train on 7236 samples, validate on 1809 samples
Epoch 1/10
 128/7236 [..............................] - ETA: 44s

UnimplementedError:  Cast string to float is not supported
	 [[node metrics/accuracy/Cast (defined at <ipython-input-114-612c9e77d8cf>:4) ]] [Op:__inference_distributed_function_3626]

Function call stack:
distributed_function


In [116]:
svc = SVC(C=2,gamma='scale', kernel='linear', probability=True)

svc.fit(X_train, y_train)

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [117]:
y_preds = svc.predict(X_test)

In [123]:
# Measure performance based on accuracy.
accuracy_score(y_test, y_preds)

0.383289124668435

In [125]:
# Generate confusion matrix. From lesson 4.03
cm_c = confusion_matrix(y_test, # True values.
                 y_preds)  # Predicted values.
cm_c = pd.DataFrame(cm_c, columns=['pred no help', 'pred needs help'],
                   index=['actual no help', 'actual needs help'])
cm_c

ValueError: Shape of passed values is (13, 13), indices imply (2, 2)

In [120]:
pred_df = pd.DataFrame({'true_values':y_test,
                         'predicted_values':y_preds})
pred_df

Unnamed: 0,true_values,predicted_values
2172,cardio,multi
9144,antineoplastic,multi
6548,antiinfective,multi
3508,cns,multi
7504,antiinfective,multi
...,...,...
10609,multi,multi
8840,antineoplastic,multi
4196,cns,multi
9155,multi,antiinfective


In [127]:
pickle.dump(y_preds, open("y_preds.pkl", "wb"))