In [1]:
import selfies as sf
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit import RDLogger
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
from rdkit.Chem.Draw import IPythonConsole
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from multiprocessing import Pool
from functools import partial
from torch import nn

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import myModel

In [4]:
train_data = pd.read_csv("moses_train.csv")
test_data = pd.read_csv("moses_test.csv")

raw_data = pd.concat([train_data, test_data])

In [5]:
def fingerprint_from_smile(smile=str()):

  mol = Chem.MolFromSmiles(smile)
  fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048) #could 2048 bits be a hyperparam?

  return fp

In [6]:
def selfies_alphabet(selfies_list=list()):

    #  Define selfies Alphabet with padding symbol included 
    all_selfies_symbols = sf.get_alphabet_from_selfies(selfies_list)
    all_selfies_symbols.add('[nop]') # [nop] is the canonical padding symbol supported by the selfies library 
    selfies_alphabet = list(sorted(all_selfies_symbols))

    return selfies_alphabet

In [7]:
def tanimotoLoss(output, label):
    #specifying the batch size
    output = output > 0.5
    N_AB = torch.bitwise_and(output, label)
    N_A = torch.sum(output)
    N_B = torch.sum(label)
    coeff = N_AB / (N_A + N_B - N_AB)
    loss = 1-coeff
    return -loss

In [8]:
max_len = max(sf.len_selfies(s) for s in raw_data["selfies"])

alphabet = selfies_alphabet(raw_data["selfies"])

vocab_stoi = {symbol: idx for idx, symbol in enumerate(alphabet)}

vocab_itos = {idx: symbol for symbol, idx in vocab_stoi.items()}

In [9]:
alpha_len = len(vocab_stoi)

In [10]:
from sklearn.model_selection import train_test_split

selfies = list(raw_data["selfies"])
smiles = list(raw_data["smiles"])

selfies_train, selfies_test, smiles_train, smiles_test = train_test_split(selfies, smiles, test_size=0.33, random_state=42)
selfies_train, selfies_val, smiles_train, smiles_val = train_test_split(selfies_train, smiles_train, test_size=0.20, random_state=42)

train_data = list(zip(selfies_train, smiles_train))
val_data = list(zip(selfies_val, smiles_val))
test_data = list(zip(selfies_test, smiles_test))

In [29]:
print(str(55*24))

1320


In [60]:
import torch
from torch.utils.data import DataLoader
from rdkit.Chem import DataStructs


train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=True)

net = myModel.Net(max_len*alpha_len, 1600, 1800, 2048)

#hyperparam
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)


In [86]:
from torch.autograd import Variable

def train_one_epoch(epoch_index):
    running_loss = 0.
    last_loss = 0.

    for i, sample in enumerate(train_loader):
        selfies, smiles = sample
        
        inputs = sf.batch_selfies_to_flat_hot(selfies, vocab_stoi, pad_to_len=max_len)
        inputs = torch.tensor(inputs).float()
        
        labels = []
        for smile in smiles:
            fp = fingerprint_from_smile(smile)
            fp = fingerprint_from_smile(smile)
            fp_arr = np.zeros((0,), dtype=np.int8)
            DataStructs.ConvertToNumpyArray(fp,fp_arr)
            labels.append(fp_arr)
        labels = torch.tensor(labels)
        
        optimizer.zero_grad()       
        outputs = net(inputs)
        loss = tanimotoLoss(outputs, labels)
        loss = Variable(loss, requires_grad = True)
        loss.sum().backward()
        optimizer.step()

        #reporting performance
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            #tb_x = epoch_index * len(train_loader) + i + 1
            running_loss = 0.0
    return last_loss


In [87]:

NUM_EPOCHS = 5
best_vloss = 1_000_000.

epoch_number = 0

for epoch in range(NUM_EPOCHS):
    print('EPOCH {}:'.format(epoch + 1))
    net.train(True)
    avg_loss = train_one_epoch(epoch)

    net.train(False)
    
    running_vloss = 0.0
    for i, sample in enumerate(val_loader):
        inputs, labels = sample
        pred_outputs = net(inputs)
        tuples = list(zip(pred_outputs, labels))
        losses = tanimotoLoss(pred_outputs, labels)
        running_vloss +=losses

EPOCH 1:


ValueError: only one element tensors can be converted to Python scalars

In [63]:
pool=Pool(3)
processed_data = pool.map(partial(sf.selfies_to_encoding, vocab_stoi, pad_to_len=max_len),selfies)
pool.close()
pool.join()

AttributeError: 'dict' object has no attribute 'count'