In [32]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras import backend as K
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GRU, LSTM, Dense, Activation
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard

In [3]:
# Load the raw dataset from the working directory; later cells slice this
# frame by column position (column 0 and columns 1-12).
df = pd.read_csv(filepath_or_buffer="./data.csv")

In [18]:
# Sentinel written into target cells that have no value in the CSV.
MISSING_LABEL = 2
# Fixed seed so the train/test partition is identical on every run.
SPLIT_SEED = 42

# Column 0 is the model input (presumably SMILES strings, going by the
# variable name - confirm against data.csv); columns 1..12 are the targets.
smiles = df.iloc[:, 0]

# Build a new frame instead of calling fillna(inplace=True) on a slice of
# `df`: the in-place form can raise SettingWithCopyWarning and, when the slice
# is a view, silently rewrite `df` itself.
targets = df.iloc[:, 1:13].fillna(MISSING_LABEL)

# Hold out 20% of the rows for the final evaluation.
smiles_train, smiles_test, targets_train, targets_test = train_test_split(
    smiles,
    targets,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [26]:
batch_size = 32

# Character-level tokenizer: no characters are filtered and case is preserved,
# so every distinct character in the input strings gets its own integer id.
# It is fitted on the full `smiles` series, so no character seen in either
# split is out of vocabulary.
tokenizer = Tokenizer(filters='', lower=False, char_level=True)
tokenizer.fit_on_texts(smiles.values)

# Integer-encode each split (lists of variable-length id sequences).
seqs_train = tokenizer.texts_to_sequences(smiles_train.values)
seqs_test = tokenizer.texts_to_sequences(smiles_test.values)

# Pad both splits to ONE shared length. Without an explicit `maxlen`,
# pad_sequences pads each call to the longest sequence in that call, so the
# train and test matrices could end up with different widths and the test set
# would no longer match the input length the model is built with.
max_len = max(len(seq) for seq in seqs_train + seqs_test)

# NOTE: despite the names these are integer id matrices, not one-hot tensors;
# the names are kept because later cells reference them.
one_hot_train = pad_sequences(seqs_train, maxlen=max_len, padding='post')
one_hot_test = pad_sequences(seqs_test, maxlen=max_len, padding='post')


In [33]:
# Sentinel that the data-preparation cell writes into targets with no value
# (`fillna(2)`); such entries carry no label and must not drive training.
MISSING_LABEL = 2


def masked_binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy averaged over the observed labels of each sample.

    The network emits 12 independent sigmoid probabilities, so each output is
    a separate binary problem. Entries of `y_true` equal to MISSING_LABEL are
    excluded from the loss.

    Args:
        y_true: tensor of shape (batch, n_labels) holding 0, 1 or MISSING_LABEL.
        y_pred: tensor of shape (batch, n_labels) with sigmoid probabilities.

    Returns:
        Tensor of shape (batch,) with the mean loss over observed labels
        (0 for a sample with no observed label).
    """
    y_true = K.cast(y_true, K.floatx())
    observed = K.cast(K.not_equal(y_true, MISSING_LABEL), K.floatx())
    # Replace the sentinel by 0 before the element-wise BCE so the masked
    # positions hold a valid target; they are multiplied by 0 afterwards.
    per_label = K.binary_crossentropy(y_true * observed, y_pred)
    n_observed = K.maximum(K.sum(observed, axis=-1), 1.0)
    return K.sum(per_label * observed, axis=-1) / n_observed


def masked_binary_accuracy(y_true, y_pred):
    """Fraction of observed labels per sample predicted correctly at 0.5.

    Args:
        y_true: tensor of shape (batch, n_labels) holding 0, 1 or MISSING_LABEL.
        y_pred: tensor of shape (batch, n_labels) with sigmoid probabilities.

    Returns:
        Tensor of shape (batch,) with the per-sample accuracy over observed
        labels (0 for a sample with no observed label).
    """
    y_true = K.cast(y_true, K.floatx())
    observed = K.cast(K.not_equal(y_true, MISSING_LABEL), K.floatx())
    correct = K.cast(K.equal(y_true, K.round(y_pred)), K.floatx())
    n_observed = K.maximum(K.sum(observed, axis=-1), 1.0)
    return K.sum(correct * observed, axis=-1) / n_observed


# Tokenizer ids start at 1; id 0 is the padding value, hence the +1.
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=one_hot_train.shape[1]))
model.add(Conv1D(filters=192, kernel_size=5, activation='relu'))
# NOTE(review): ReLU inside the LSTM layers is kept from the original
# architecture; confirm it is intentional (the Keras default is tanh).
model.add(LSTM(units=224, return_sequences=True, activation='relu'))
model.add(LSTM(units=384, activation='relu'))
model.add(Dense(12))
model.add(Activation('sigmoid'))

# Multi-label setup: sigmoid outputs need a per-label binary loss.
# 'categorical_crossentropy' assumes a single one-hot class per sample and
# 'accuracy' would resolve to argmax accuracy, neither of which fits 12
# independent labels with missing entries.
# Because the loss/metric are custom functions, reloading the saved model
# needs `custom_objects={...}` (or `compile=False`) in `load_model`.
model.compile(
    optimizer='rmsprop',
    loss=masked_binary_crossentropy,
    metrics=[masked_binary_accuracy],
)
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 342, 50)           2800      
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 338, 192)          48192     
_________________________________________________________________
lstm_3 (LSTM)                (None, 338, 224)          373632    
_________________________________________________________________
lstm_4 (LSTM)                (None, 384)               935424    
_________________________________________________________________
dense_9 (Dense)              (None, 12)                4620      
_________________________________________________________________
activation_1 (Activation)    (None, 12)                0         
Total params: 1,364,668
Trainable params: 1,364,668
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Stream training curves to TensorBoard (default log directory).
tensorboardCallback = TensorBoard()

# `batch_size` is passed explicitly so the constant defined with the
# tokenizer settings actually controls training; previously it was never
# used and Keras silently fell back to its own default.
# validation_split holds out the last 20% of the training arrays for
# per-epoch validation.
model.fit(
    one_hot_train,
    targets_train,
    batch_size=batch_size,
    epochs=100,
    validation_split=0.2,
    callbacks=[tensorboardCallback],
)

# Final check on the held-out test split; `score` is [loss, metric].
score = model.evaluate(one_hot_test, targets_test, batch_size=batch_size)
print(score)

# Persist architecture, weights and optimizer state to a single HDF5 file.
model.save('my_model.h5')