In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# Load the compound table and keep only the first row for each distinct
# SMILES string, so every molecule is represented once.
df = pd.read_csv('aspirin_like.csv').drop_duplicates(subset='Smiles')
df.shape

(269, 12)

In [3]:
# Build the list of SMILES strings, appending a newline to each one
# (presumably as an end-of-molecule marker for the character model -- confirm).
smiles = [smi + '\n' for smi in df['Smiles'].to_list()]
smiles[1]

'C1=CC=C(C(=C1)C(=O)O)O[N+](=O)[O-]\n'

In [4]:
# Longest and shortest string lengths in `smiles`; the counts include the
# trailing newline appended to every entry.
smi_lengths = [len(smi) for smi in smiles]
max_smi, min_smi = max(smi_lengths), min(smi_lengths)
max_smi, min_smi

(38, 18)

In [5]:
smi_len = 10  # number of context characters taken from each SMILES window

features = []
labels = []

# Slide a window over every SMILES string: each example is `smi_len`
# consecutive characters, and its label is the single character that follows.
for smi in smiles:
    window_starts = range(len(smi) - smi_len)
    features.extend(smi[start:start + smi_len] for start in window_starts)
    labels.extend(smi[start + smi_len] for start in window_starts)

features[:5]

['CC(=O)OC1=', 'C(=O)OC1=C', '(=O)OC1=CC', '=O)OC1=CC=', 'O)OC1=CC=C']

In [6]:
# Wrap the list of SMILES strings in a tf.data.Dataset (one string per
# element); it is the corpus the TextVectorization layer is adapted on.
raw_dataset = tf.data.Dataset.from_tensor_slices(smiles)

In [7]:
# Character-level tokenizer: no standardization (case and punctuation are
# left untouched) and one token per character. The vocabulary is learned
# from the SMILES corpus in `raw_dataset`.
preprocess = layers.TextVectorization(
    split='character',
    standardize=None,
)
preprocess.adapt(raw_dataset)

In [8]:
# Show the learned token list together with its size. Per the output below,
# index 0 is the empty padding token '' and index 1 is '[UNK]'.
preprocess.get_vocabulary(), preprocess.vocabulary_size()

(['',
  '[UNK]',
  'C',
  '=',
  'O',
  ')',
  '(',
  '1',
  '\n',
  'N',
  'F',
  ']',
  '[',
  '-',
  'S',
  '+',
  'l',
  '2',
  'r',
  'B',
  'I',
  'H',
  '#'],
 23)

In [9]:
# Sanity-check the tokenizer on two SMILES strings of different lengths.
# In the output, '/' is encoded as 1 ([UNK]) because it is absent from the
# adapted vocabulary, and the shorter string is right-padded with 0.
input_data = [
    'C1=CC=C(C=C1)CCOC(=O)/C=C/C2=CC(=C(C=C2)O)O',
    'CC(=O)NC1=CC=C(C=C1)O',
]
preprocess(input_data)

<tf.Tensor: shape=(2, 43), dtype=int64, numpy=
array([[ 2,  7,  3,  2,  2,  3,  2,  6,  2,  3,  2,  7,  5,  2,  2,  4,
         2,  6,  3,  4,  5,  1,  2,  3,  2,  1,  2, 17,  3,  2,  2,  6,
         3,  2,  6,  2,  3,  2, 17,  5,  4,  5,  4],
       [ 2,  2,  6,  3,  4,  5,  9,  2,  7,  3,  2,  2,  3,  2,  6,  2,
         3,  2,  7,  5,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]], dtype=int64)>

In [10]:
# Turn the parallel `features` / `labels` lists into two tf.data.Datasets,
# features first, then labels.
features_ds, labels_ds = (
    tf.data.Dataset.from_tensor_slices(column)
    for column in (features, labels)
)

In [11]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
# Next-character model: raw strings -> character ids -> embedding -> LSTM ->
# dense head with a softmax over the whole vocabulary.
vocab_size = len(preprocess.get_vocabulary())  # includes '' and '[UNK]'
model = tf.keras.models.Sequential([
    preprocess,
    Embedding(input_dim=vocab_size, output_dim=100),
    LSTM(64, dropout=0.1, recurrent_dropout=0.1),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(vocab_size, activation='softmax'),
])
# NOTE(review): 'categorical_crossentropy' expects one-hot encoded targets,
# while `labels` built earlier are raw characters -- confirm they are
# vectorized and one-hot encoded before calling fit.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)