In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
from pprint import pprint
import tqdm

from scipy.stats import chi2_contingency

In [3]:
from rdkit import Chem

## Read data

In [18]:
dframe = pd.read_csv("data/smiles_data.csv")
smiles_to_check = dframe["smiles"].values

In [5]:
# dframe = pd.read_csv("data/processed_DILI.csv")
# dframe = dframe[["COMPOUND_NAME", "DILI_CONCERN", "SMILES"]].dropna()
# dframe = dframe[dframe["SMILES"].str.len()<510].reset_index(drop=True)
# smiles_to_check = dframe["SMILES"].values

### Neural network

In [19]:
from molbart.decoder import DecodeSampler
from molbart.tokeniser import MolEncTokeniser
from molbart.models.pre_train import BARTModel

import molbart.util as util
import torch
from molbart.data.datamodules import MoleculeDataModule

from rdkit import Chem

In [20]:
smiles_to_check = [i.split()[0].strip() for i in smiles_to_check]

In [8]:
class Config:
    model_path="./weights/mask/version_16/checkpoints/epoch=479-step=54719.ckpt"

tokeniser = util.load_tokeniser("/home/wwydmanski/Chemformer/bart_vocab.txt", util.DEFAULT_CHEM_TOKEN_START)
sampler = DecodeSampler(tokeniser, util.DEFAULT_MAX_SEQ_LEN)
model = util.load_bart(Config, sampler)
model = model.cuda()

In [9]:
def resample_molecule(smiles):
    if type(smiles) is str:
        smiles = [smiles]
        
    token_output = tokeniser.tokenise(smiles, pad=True)    
    enc_token_ids = tokeniser.convert_tokens_to_ids(token_output['original_tokens'])

    token_output["encoder_input"] = torch.tensor(enc_token_ids).cuda().T
    token_output["encoder_pad_mask"] = torch.tensor(token_output["original_pad_masks"]).cuda().T
    token_output["decoder_input"] = torch.tensor(enc_token_ids).cuda().T
    token_output["decoder_pad_mask"] = torch.tensor(token_output["original_pad_masks"]).cuda().T
    model.num_beams = 15
    smiles_batch, log_lhs_batch = model.sample_molecules(token_output, "beam")
    
    return smiles_batch

In [22]:
batch_size = 1

matching = 0
data = []

for i in tqdm.trange(0, len(smiles_to_check), batch_size):
    smiles = smiles_to_check[i:(i+1)*batch_size]
    token_output = tokeniser.tokenise(smiles, pad=True)    
    enc_token_ids = tokeniser.convert_tokens_to_ids(token_output['original_tokens'])

    token_output["encoder_input"] = torch.tensor(enc_token_ids).cuda().T
    token_output["encoder_pad_mask"] = torch.tensor(token_output["original_pad_masks"]).cuda().T
    token_output["decoder_input"] = torch.tensor(enc_token_ids).cuda().T
    token_output["decoder_pad_mask"] = torch.tensor(token_output["original_pad_masks"]).cuda().T
    
    encoder_input = token_output["encoder_input"]
    encoder_pad_mask = token_output["encoder_pad_mask"].transpose(0, 1)
    encoder_embs = model._construct_input(encoder_input)
    
    batch = model.encoder(encoder_embs)
    
    del token_output
    del encoder_embs
    
    data.append(batch.detach().cpu().numpy()[-1, 0])
    break

  0%|                                                                                         | 0/71222 [00:00<?, ?it/s]


In [23]:
smiles

['c1ccccc1N(CNC2=O)C23CCN(CC3)C(=O)OCc4cccc(c4)Oc5ccccc5']

In [24]:
data

[array([ 2.06473321e-02, -1.02492990e-02,  9.93479192e-02,  3.19700055e-02,
         3.52170616e-02, -5.97350001e-02,  1.21315187e-02, -9.39642265e-03,
        -6.60165399e-02, -3.63618508e-02,  3.70051824e-02, -8.04902837e-02,
         1.14619769e-02, -5.19984215e-02, -6.30246773e-02,  3.13120596e-02,
         6.21142685e-02, -6.46388158e-02, -5.95768467e-02, -7.02935830e-02,
         5.13490513e-02,  1.26066431e-02, -4.52264361e-02, -4.94012348e-02,
        -1.10522863e-02,  6.39816523e-02, -2.10183915e-02,  1.82645880e-02,
         4.56356816e-02, -2.21182797e-02,  3.09556946e-02,  7.65324850e-03,
         1.80826709e-02,  3.54681090e-02,  5.51509671e-02,  4.70067112e-04,
        -8.01686794e-02,  1.68720018e-02,  6.54568300e-02,  9.73046497e-02,
         1.36657311e-02,  1.34002836e-02, -7.79754296e-03, -1.06223756e-02,
        -4.38270383e-02, -3.78769152e-02, -1.03812804e-02,  8.56769644e-03,
         3.68353054e-02, -9.49761197e-02, -1.08982055e-02,  1.56069919e-02,
        -1.8

In [11]:
data = [row.tolist() for row in data]

In [12]:
dframe["embeddings"] = [str(row) for row in data]

ValueError: Length of values (1) does not match length of index (389)

In [46]:
# dframe.to_csv("data/mu_embeddings.csv")

In [45]:
val_set_frac = 0.1
val_idx = np.random.randint(len(dframe), size=int(len(dframe)*val_set_frac))

dframe["set"] = "train"
dframe.loc[val_idx, "set"] = "val"

dframe.to_csv("data/DILI_embeddings.csv")