In [1]:
import json
import torch
from torch.utils.data import IterableDataset
import pandas as pd
import numpy as np
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pickle
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the raw dataset. The context manager closes the file handle as soon as
# parsing finishes instead of leaving it open until garbage collection.
with open('data/dataset.json') as dataset_file:
    data_dict = json.load(dataset_file)

# Build the working dataframe from the parsed JSON and preview the first rows.
df = pd.DataFrame(data_dict)
df.head()

Unnamed: 0,protein_target,smiles_1,smiles_0
0,MPHEPHEPLTPPFSALPDPAGAPSRRQSRQRPQLSSDSPSAFRASR...,"[[CCN1C(=CC(C)=O)Sc2ccc(OC)cc21, 1], [CC12OC(C...","[[CC(C)N1NC(=C2C=c3cc(O)ccc3=N2)c2c(N)ncnc21, ..."
1,MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKP...,"[[CSc1nc2ccccc2n1CC(=O)c1ccc(S(N)(=O)=O)cc1, 1...","[[CNC(=O)c1ccc(S(N)(=O)=O)cc1, 0], [Nc1ccc(CC(..."
2,MDPLNLSWYDDDLERQNWSRPFNGSEGKADRPHYNYYAMLLTLLIF...,[[CC(C)CC1C(=O)N2CCCC2C2(O)OC(NC(=O)C3C=C4c5cc...,"[[Oc1cc2c(cc1O)C1c3ccccc3CNC1CC2, 0], [CCCN1Cc..."
3,MDRSKENCISGPVKATAPVGGPKRVLVTQQFPCQNPLPVNSGQAQR...,[[Cc1cc(Nc2nc(Sc3ccc(NC(=O)CN4CCC(O)C4)cc3)nn3...,[[CC(Oc1cc(-c2cnn(C3CCNCC3)c2)cnc1N)c1c(Cl)ccc...
4,MRVVVIGAGVIGLSTALCIHERYHSVLQPLDIKVYADRFTPLTTTD...,"[[O=c1[nH]c2ccc(F)cc2cc1O, 1], [O=c1[nH]c2ccc(...","[[O=C(O)c1cc(CCc2ccc(Cl)cc2)c[nH]1, 0], [O=c1o..."


In [None]:
# Training split: a random 80% sample of the rows of the original dataframe.
# The fixed seed makes the split (and the files derived from it) reproducible
# across kernel restarts.
train_df = df.sample(frac=0.8, random_state=42)

# Test split: the remaining 20% of the rows, i.e. every row whose index label
# was not drawn into the training sample.
# NOTE(review): relies on df having unique index labels - confirm.
test_df = df.drop(train_df.index)

In [None]:
# Persist only the leading rows of each split: the first 100 training rows
# and the first 10 test rows.
train_df.iloc[:100].to_json('data/train.json')
test_df.iloc[:10].to_json('data/test.json')

### Precompute SMILES embeddings

In [3]:
# Gather every distinct SMILES string that occurs in either column.
# Each cell holds a list of entries whose first element is the SMILES string;
# only that first element is kept.
all_smiles = set()

for smiles_column in (df.smiles_0, df.smiles_1):
    for entries in smiles_column:
        all_smiles.update(entry[0] for entry in entries)

In [4]:
# The tokenizer and the masked-LM model are loaded from the same checkpoint.
chemberta_checkpoint = "seyonec/ChemBERTa-zinc-base-v1"
smiles_tokenizer = AutoTokenizer.from_pretrained(chemberta_checkpoint)
smiles_model = AutoModelForMaskedLM.from_pretrained(chemberta_checkpoint)

Downloading: 100%|██████████| 166/166 [00:00<00:00, 154kB/s]
Downloading: 100%|██████████| 501/501 [00:00<00:00, 483kB/s]
Downloading: 100%|██████████| 9.43k/9.43k [00:00<00:00, 6.24MB/s]
Downloading: 100%|██████████| 3.21k/3.21k [00:00<00:00, 2.85MB/s]
Downloading: 100%|██████████| 150/150 [00:00<00:00, 146kB/s]
Downloading: 100%|██████████| 179M/179M [00:04<00:00, 36.6MB/s] 


In [5]:
def get_smiles_embeddings(smiles_inputs, tokenizer, model):
    """
    Mean-pool the model's output logits over the non-padding tokens.

    Args:
        smiles_inputs: SMILES strings handed to `tokenizer` as one batch.
        tokenizer: Callable invoked with padding=True, truncation=True and
            return_tensors="pt". Its result must provide an 'attention_mask'
            entry and be unpackable as keyword arguments for `model`.
        model: Callable whose result exposes a `.logits` tensor.

    Returns:
        A tensor with one row per input: the logits summed over dim 1 after
        padded positions are zeroed and each sequence is divided by its
        number of attended tokens.
    """
    encoded = tokenizer(smiles_inputs, padding=True, truncation=True, return_tensors="pt")
    attention_mask = encoded['attention_mask']

    # Zero the logits at padded positions so they add nothing to the pooled sum.
    is_padding = attention_mask.unsqueeze(2) == 0
    token_logits = model(**encoded).logits.masked_fill(is_padding, 0)

    # Scale every token by 1 / (number of attended tokens in its sequence),
    # then sum along the sequence axis to obtain the masked mean.
    token_counts = attention_mask.sum(dim=1).reshape((-1, 1, 1))
    return (token_logits / token_counts).sum(dim=1)

In [6]:
# Precompute one embedding per distinct SMILES string, keyed by the string.
smiles_to_embeddings = {}

# Inference only: disabling autograd stops each forward pass from building a
# computation graph that would be thrown away immediately, which saves memory
# and time. The resulting tensors do not require grad, so they convert to
# NumPy directly.
with torch.no_grad():
    for smiles in tqdm(all_smiles):
        # Single-item batch, so no padding is involved; take the only row.
        embed = get_smiles_embeddings([smiles], smiles_tokenizer, smiles_model)
        smiles_to_embeddings[smiles] = embed[0].numpy()

  5%|▍         | 390/7862 [00:13<03:23, 36.78it/s]

In [None]:
# Cache the SMILES -> embedding mapping on disk using the newest pickle protocol.
with open('data/smiles_to_embeddings.pickle', mode='wb') as pickle_file:
    pickle.dump(smiles_to_embeddings, pickle_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the cached mapping back from disk.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('data/smiles_to_embeddings.pickle', mode='rb') as pickle_file:
    loaded = pickle.load(pickle_file)