In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Fix TensorFlow's global random seed so repeated runs of the notebook are comparable.
tf.random.set_seed(1)

In [2]:
def ds_prep(smi_list):
    """Split every SMILES string into a list of its individual characters.

    Args:
        smi_list: iterable of SMILES strings.

    Returns:
        A list with one entry per input string; each entry is the list of
        single-character strings that make up that SMILES, in order.
    """
    return [list(smi) for smi in smi_list]

In [3]:
# Load the molecule table and keep only the first row for each distinct SMILES string.
df = pd.read_csv('aspirin_like.csv').drop_duplicates(subset='Smiles')
df.shape

(269, 12)

In [4]:
# Terminate every SMILES string with a newline character.
smiles = [smi + '\n' for smi in df['Smiles'].to_list()]
smiles[1]

'C1=CC=C(C(=C1)C(=O)O)O[N+](=O)[O-]\n'

In [5]:
# Longest and shortest entry in `smiles`, measured in characters.
smi_lengths = [len(smi) for smi in smiles]
max_smi, min_smi = max(smi_lengths), min(smi_lengths)
max_smi, min_smi

(38, 18)

In [6]:
smi_len = 10 # characters from Smiles

features, labels = [], []

# Slide a window of smi_len + 1 characters over every SMILES string: the first
# smi_len characters are the input, the same span shifted right by one is the target.
for smi in smiles:
    for start in range(len(smi) - smi_len):
        stop = start + smi_len
        features.append(smi[start:stop])
        labels.append(smi[start + 1:stop + 1])

features[:5], labels[:5]

(['CC(=O)OC1=', 'C(=O)OC1=C', '(=O)OC1=CC', '=O)OC1=CC=', 'O)OC1=CC=C'],
 ['C(=O)OC1=C', '(=O)OC1=CC', '=O)OC1=CC=', 'O)OC1=CC=C', ')OC1=CC=CC'])

In [7]:
# Wrap the SMILES strings in a tf.data.Dataset, one string per element.
raw_dataset = tf.data.Dataset.from_tensor_slices(smiles)

In [8]:
# Character-level tokenizer: standardization is disabled and the text is split
# per character, so each symbol of a SMILES string maps to its own integer id.
# The output length is tied to smi_len (the training window size) instead of
# repeating the literal 10, so the two can never drift apart.
preprocess = layers.TextVectorization(
    standardize=None,
    split='character',
    output_mode='int',
    output_sequence_length=smi_len,
)
# Build the vocabulary from the SMILES strings.
preprocess.adapt(raw_dataset)

In [9]:
# Show the adapted vocabulary (list position = integer id) together with its size.
preprocess.get_vocabulary(), preprocess.vocabulary_size()

(['',
  '[UNK]',
  'C',
  '=',
  'O',
  ')',
  '(',
  '1',
  '\n',
  'N',
  'F',
  ']',
  '[',
  '-',
  'S',
  '+',
  'l',
  '2',
  'r',
  'B',
  'I',
  'H',
  '#'],
 23)

In [10]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# One embedding row and one output logit per entry of the vectorizer's vocabulary.
vocab_size = len(preprocess.get_vocabulary())

# Embedding -> LSTM (a prediction at every time step) -> Dropout -> per-step logits.
model = tf.keras.Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100))
model.add(LSTM(64, dropout=0.1, recurrent_dropout=0.1, return_sequences=True))
model.add(Dropout(0.5))
model.add(Dense(vocab_size))

# The Dense layer has no activation, so the loss is told to expect raw logits.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [11]:
# Encode the input windows and the shifted target windows as integer id tensors.
trn_features, trn_labels = (preprocess(windows) for windows in (features, labels))
trn_features

<tf.Tensor: shape=(4994, 10), dtype=int64, numpy=
array([[2, 2, 6, ..., 2, 7, 3],
       [2, 6, 3, ..., 7, 3, 2],
       [6, 3, 4, ..., 3, 2, 2],
       ...,
       [6, 3, 4, ..., 6, 3, 4],
       [3, 4, 5, ..., 3, 4, 5],
       [4, 5, 4, ..., 4, 5, 4]], dtype=int64)>

In [12]:
# Display the encoded target windows for comparison with the inputs.
trn_labels

<tf.Tensor: shape=(4994, 10), dtype=int64, numpy=
array([[2, 6, 3, ..., 7, 3, 2],
       [6, 3, 4, ..., 3, 2, 2],
       [3, 4, 5, ..., 2, 2, 3],
       ...,
       [3, 4, 5, ..., 3, 4, 5],
       [4, 5, 4, ..., 4, 5, 4],
       [5, 4, 5, ..., 5, 4, 8]], dtype=int64)>

In [13]:
# Train for 10 epochs; the target at each position is the character that follows it.
model.fit(trn_features, trn_labels, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x15b25bc9b90>

In [31]:
def gen_mol(smi_str, model, text_vect, gen_length, context_len=None):
    """Extend a seed SMILES string by sampling characters from the model.

    At every step the most recent characters of the running string are
    vectorized and fed to the model, and the next character is sampled from
    the logits at the last real (non-padding) position.

    The vectorizer truncates to the *first* ``output_sequence_length`` tokens,
    so passing the whole growing string would always encode the original seed
    and the generated characters would never influence later predictions.
    Only the trailing window is therefore vectorized.

    Args:
        smi_str: seed text to start from.
        model: model mapping a (1, time) id tensor to (1, time, vocab) logits.
        text_vect: adapted vectorization layer; assumed to be character-level
            so that one character corresponds to one token.
        gen_length: number of sampling steps (one token appended per step).
        context_len: number of trailing characters fed to the model. When
            None, the vectorizer's configured ``output_sequence_length`` is
            used; if that is also unset the whole string is fed.

    Returns:
        The seed followed by the sampled characters.
    """
    # Clear recurrent state left over from earlier calls (relevant for stateful RNNs).
    model.reset_states()

    # The vocabulary does not change during generation, so fetch it once.
    vocab = text_vect.get_vocabulary()

    # Leading reserved entries ('' for padding, '[UNK]' for unknown symbols) are
    # excluded from sampling: they are not SMILES characters, and appending
    # '[UNK]' would insert that literal text into the output.
    n_reserved = 0
    for token in vocab[:2]:
        if token not in ('', '[UNK]'):
            break
        n_reserved += 1

    if context_len is None:
        context_len = text_vect.get_config().get('output_sequence_length')

    start_str = smi_str

    for _ in range(gen_length):
        # Keep only the trailing window; see the truncation note in the docstring.
        context = start_str[-context_len:] if context_len else start_str
        start_vect = text_vect([context])

        logits = model(start_vect)
        logits = tf.squeeze(logits, 0)  # drop the batch axis -> (time, vocab)

        # Padding is id 0 and is appended after the real tokens, so the last
        # real position is (number of non-zero ids - 1). Reading [-1] instead
        # would use the prediction at a padded position for short seeds.
        n_tokens = int(tf.math.count_nonzero(start_vect[0]).numpy())
        last_step = max(n_tokens - 1, 0)

        # Sample only from the needed time step and only among real characters.
        step_logits = logits[last_step:last_step + 1, n_reserved:]
        sampled = tf.random.categorical(step_logits, num_samples=1)
        new_char_index = int(sampled[0, 0].numpy()) + n_reserved

        start_str += vocab[new_char_index]

    return start_str

In [35]:
# Seed text for generation. It is exactly 10 characters long, matching the
# vectorizer's output_sequence_length, so the first model input needs no padding.
# TODO: look into how shorter (padded) seeds behave.
input_text = 'C1CCCCCCC1'
gen_length = 300  # number of sampling steps performed by gen_mol
predictions = gen_mol(input_text,model,preprocess,gen_length)

In [36]:
# Show the seed together with everything that was generated after it.
predictions

