In [None]:
import selfies as sf
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit import RDLogger
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
from rdkit.Chem.Draw import IPythonConsole
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from functools import partial
from torch import nn

In [None]:
import myModel

In [None]:
train_data = pd.read_csv("moses_train.csv")
test_data = pd.read_csv("moses_test.csv")

raw_data = pd.concat([train_data, test_data])

In [None]:
def fingerprint_from_smile(smile=str()):
  mol = Chem.MolFromSmiles(smile)
  fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) #could 2048 bits be a hyperparams
  return fp

In [None]:
def selfies_alphabet(selfies_list=list()):

    #  Define selfies Alphabet with padding symbol included 
    all_selfies_symbols = sf.get_alphabet_from_selfies(selfies_list)
    all_selfies_symbols.add('[nop]') # [nop] is the canonical padding symbol supported by the selfies library 
    selfies_alphabet = list(sorted(all_selfies_symbols))

    return selfies_alphabet

In [None]:
def tanimotoLoss(outputs, labels):

    zipped = list(zip(outputs, labels))
    
    total_loss = 0
    for output, label in zipped:
        #specifying the batch size
        #output = output > 0.5
        
        N_AB = torch.dot(output, label)
        N_A = torch.sum(output)
        N_B = torch.sum(label)
        coeff = N_AB / (N_A + N_B - N_AB)
        loss = 1-coeff
        total_loss += loss
    
    return total_loss/len(zipped)

In [None]:
max_len = max(sf.len_selfies(s) for s in raw_data["selfies"])

alphabet = selfies_alphabet(raw_data["selfies"])

vocab_stoi = {symbol: idx for idx, symbol in enumerate(alphabet)}

vocab_itos = {idx: symbol for symbol, idx in vocab_stoi.items()}

In [None]:
alpha_len = len(vocab_stoi)

In [None]:
from sklearn.model_selection import train_test_split

selfies = list(raw_data["selfies"])
smiles = list(raw_data["smiles"])

selfies_train, selfies_test, smiles_train, smiles_test = train_test_split(selfies, smiles, test_size=0.33, random_state=42)
selfies_train, selfies_val, smiles_train, smiles_val = train_test_split(selfies_train, smiles_train, test_size=0.20, random_state=42)

train_data = list(zip(selfies_train, smiles_train))
val_data = list(zip(selfies_val, smiles_val))
test_data = list(zip(selfies_test, smiles_test))

In [None]:
import torch
from torch.utils.data import DataLoader
from rdkit.Chem import DataStructs


train_loader = DataLoader(train_data, batch_size=1000, shuffle=True)
val_loader = DataLoader(val_data, batch_size=1000, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1000, shuffle=True)

net = myModel.Net(max_len*alpha_len, 1600, 1800, 2048)

#hyperparam
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)


In [19]:
from torch.autograd import Variable

def train_one_epoch(epoch_index):
    running_loss = 0.
    last_loss = 0.

    for i, sample in enumerate(train_loader):
        selfies, smiles = sample
        
        inputs = sf.batch_selfies_to_flat_hot(selfies, vocab_stoi, pad_to_len=max_len)
        inputs = torch.tensor(inputs).float()
        
        labels = []
        for smile in smiles:
            fp = fingerprint_from_smile(smile)
            fp = fingerprint_from_smile(smile)
            fp_arr = np.zeros((0,), dtype=np.float32)
            DataStructs.ConvertToNumpyArray(fp,fp_arr)
            labels.append(fp_arr)
        labels = torch.tensor(labels)
        
        optimizer.zero_grad()       
        outputs = net(inputs)
        #loss = tanimotoLoss(outputs, labels)
        #loss = Variable(loss, requires_grad = True)
        otherloss = nn.BCELoss()
        loss = otherloss(outputs, labels)
        loss.backward()
        optimizer.step()


        #reporting performance
        print(loss.item())
        running_loss += loss.item()
        if i % 500 == 499:
            last_loss = running_loss / 500 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.0
    return last_loss


In [20]:

NUM_EPOCHS = 5
best_vloss = 1,000,000.

epoch_number = 0

for epoch in range(NUM_EPOCHS):
    print('EPOCH {}:'.format(epoch + 1))
    net.train(True)
    avg_loss = train_one_epoch(epoch)

    net.train(False)
    
    running_vloss = 0.0
    for i, sample in enumerate(val_loader):
        seflies, smiles = sample

        inputs = sf.batch_selfies_to_flat_hot(selfies, vocab_stoi, pad_to_len=max_len)
        inputs = torch.tensor(inputs).float()
        
        labels = []
        for smile in smiles:
            fp = fingerprint_from_smile(smile)
            fp = fingerprint_from_smile(smile)
            fp_arr = np.zeros((0,), dtype=np.float32)
            DataStructs.ConvertToNumpyArray(fp,fp_arr)
            labels.append(fp_arr)
        labels = torch.tensor(labels)

        pred_outputs = net(inputs)
        tuples = list(zip(pred_outputs, labels))
        losses = tanimotoLoss(pred_outputs, labels)
        running_vloss +=losses

EPOCH 1:
0.7286798357963562
0.32938316464424133
0.0980440154671669
0.20179259777069092
0.12327368557453156
0.0802442654967308
0.0921555608510971
0.10937304049730301
0.1180245578289032
0.11828535795211792
0.109224334359169
0.09933117032051086
0.09044301509857178


KeyboardInterrupt: 