In [9]:
# Step up one directory so that the relative paths used in later cells
# (e.g. 'data/...') resolve; the output below shows the resulting directory.
# NOTE: this cell is not idempotent -- re-running it moves up another level.
%cd ..

/home/adam/Projects/jointformer


In [10]:
# List the current working directory to confirm it is the project root.
%ls

[0m[01;34mconfigs[0m/              [01;34mexperiments[0m/                  [01;34mnotebooks[0m/  [01;34mscripts[0m/
[01;34mdata[0m/                 [01;34mHybrid_Transformer.egg-info[0m/  README.md   setup.py
env_with_history.yml  [01;34mjointformer[0m/                  [01;34mresults[0m/    [01;34mvocabularies[0m/
env.yml               LICENSE                       [01;34msampled[0m/    [01;34mwandb[0m/


In [3]:
from jointformer.utils.datasets.smiles.base import SMILESDataset
from jointformer.utils.transforms.permute import PermuteSMILES
from jointformer.utils.transforms.smiles_enumerator import SmilesEnumerator

import torchvision.transforms as transforms

# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell is executed, so edits to the project's .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Settings forwarded to SMILESDataset in the next cell.
# Path is relative to the working directory set by the `%cd ..` cell.
PATH_TO_TXT_DATA = 'data/guacamol/test/smiles.txt'
# Passed as `num_samples`; presumably caps how many SMILES are loaded -- TODO confirm.
NUM_SAMPLES = 1000
# Passed as `validate`; presumably toggles SMILES validity checking -- TODO confirm.
VALIDATE = False

In [8]:

# Single-step transform pipeline: every sample goes through SmilesEnumerator.
transform = transforms.Compose([SmilesEnumerator()])

# No tokenizer at this stage; the name is bound to a real tokenizer in a later cell.
tokenizer = None

# Build the dataset from the settings defined in the configuration cell.
dataset = SMILESDataset(
    file_path=PATH_TO_TXT_DATA,
    num_samples=NUM_SAMPLES,
    validate=VALIDATE,
    transform=transform,
)

In [9]:
# Look at the first item; with the transform above it comes back as a SMILES string.
dataset[0]

'c1c(C)c2c(oc(CCCC#N)c2)c2c1C(=O)c1c(c(O)ccc1)C2=O'

In [None]:
class SMILESData:
    """Placeholder for a per-sample container (not implemented yet).

    Idea: make this a dataclass with fields such as orig_smiles, inputs, props,
    mask, etc. TODO: decide whether datasets should return a plain vector or an
    instance of this class.
    """
    
    

In [49]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from typing import List
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
import numpy as np

def get_all_descriptor_names() -> List[str]:
    """
    Return the names of all RDKit physchem descriptors in sorted order.

    The names are the first element of every entry in ``Descriptors._descList``.
    A custom subset of these names can be used wherever a list of descriptors
    is expected.
    """
    descriptor_names = [entry[0] for entry in Descriptors._descList]
    descriptor_names.sort()
    return descriptor_names

def rdkit_dense_array_to_np(dense_fp, dtype=np.int32):
    """
    Turn a dense fingerprint into a flat numpy array of the requested dtype.

    Args:
        dense_fp (ExplicitBitVect or np.ndarray): fingerprint, either 1D with
            shape (fp_size,) or 2D with shape (1, fp_size)
        dtype: dtype of the returned array

    Returns:
        A new 1D numpy array with shape (fp_len,)

    Raises:
        ValueError: if the converted input is neither 1D nor a single-row 2D array
    """
    fingerprint = np.array(dense_fp, dtype=dtype)

    is_flat = fingerprint.ndim == 1
    is_single_row = fingerprint.ndim == 2 and fingerprint.shape[0] == 1

    # Anything other than (fp_size,) or (1, fp_size) is rejected up front.
    if not (is_flat or is_single_row):
        raise ValueError("Input matrix should either have shape of (fp_size, ) or (1, fp_size).")

    if is_single_row:
        # Drop the leading singleton axis.
        fingerprint = fingerprint[0]

    # Copy so the result never shares memory with an intermediate view.
    return np.array(fingerprint)

In [51]:
# Featurize one reference molecule with every available RDKit physchem descriptor.
default_smiles = 'CCN(CC)C(=O)C1CN(C2CC3=CNC4=CC=CC(=C34)C2=C1)C'
molecule = Chem.MolFromSmiles(default_smiles)

list_of_descriptors = get_all_descriptor_names()
molecular_descriptor_calculator = MolecularDescriptorCalculator(list_of_descriptors)

# Compute the descriptors and zero out every non-finite entry (NaN / +-inf).
fp = np.array(molecular_descriptor_calculator.CalcDescriptors(molecule))
mask = np.isfinite(fp)
fp[np.logical_not(mask)] = 0

# Normalise to a flat float array.
fp = rdkit_dense_array_to_np(fp, dtype=float)


In [53]:
# Check the length of the descriptor vector (one entry per descriptor name).
fp.shape

(211,)

In [12]:
import torch
from torch.utils.data import Dataset
import numpy as np
import re

class SmilesTokenizer(Dataset):
    """Torch ``Dataset`` that encodes SMILES strings as fixed-length token-id tensors.

    Every item is split into tokens with a regular expression, cut or
    right-padded with ``'<'`` to a fixed number of tokens, mapped to integer ids
    through ``stoi`` and returned as a shifted (input, target) pair together
    with the item's property value and its encoded scaffold.

    Args:
        args: object exposing a ``debug`` attribute (stored as ``self.debug``).
        data: indexable collection of SMILES strings.
        content: iterable whose sorted unique elements become the vocabulary.
            It has to contain every token the regex can emit for the data,
            including the ``'<'`` padding token, otherwise ``__getitem__``
            raises ``KeyError``.
        block_size: number of tokens each encoded SMILES is cut/padded to.
        aug_prob: probability of replacing a SMILES with
            ``SmilesEnumerator().randomize_smiles(smiles)`` when it is accessed.
        prop: indexable collection of per-item property values. The default is
            ``None``, but ``__getitem__`` indexes it, so it must be provided.
        scaffold: indexable collection of scaffold SMILES strings; same caveat
            as ``prop``.
        scaffold_maxlen: number of tokens each encoded scaffold is cut/padded
            to; must be provided for ``__getitem__`` to work.
    """

    # Same pattern as before, written as a raw string (the previous non-raw
    # literal relied on invalid escape sequences such as "\[" and "\(") and
    # compiled once instead of on every __getitem__ call.
    _TOKEN_REGEX = re.compile(
        r"(\[[^\]]+]|<|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    )
    _PAD_TOKEN = '<'

    def __init__(self, args, data, content, block_size, aug_prob = 0.5, prop = None, scaffold = None, scaffold_maxlen = None):
        chars = sorted(set(content))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d smiles, %d unique characters.' % (data_size, vocab_size))

        # Forward and reverse vocabulary lookups.
        self.stoi = { ch:i for i,ch in enumerate(chars) }
        self.itos = { i:ch for i,ch in enumerate(chars) }
        self.max_len = block_size
        self.vocab_size = vocab_size
        self.data = data
        self.prop = prop
        self.sca = scaffold
        self.scaf_max_len = scaffold_maxlen
        self.debug = args.debug
        self.tfm = SmilesEnumerator()
        self.aug_prob = aug_prob

    def __len__(self):
        """Return the number of SMILES strings in the dataset."""
        return len(self.data)

    def _encode(self, text, length):
        """Tokenise ``text`` and return exactly ``length`` token ids.

        Truncation happens on the token list, not on the raw string, so a
        multi-character token such as ``[NH+]`` or ``Cl`` is never cut in half
        and the result always has ``length`` entries.
        """
        tokens = self._TOKEN_REGEX.findall(text)[:length]
        tokens.extend([self._PAD_TOKEN] * (length - len(tokens)))
        return [self.stoi[token] for token in tokens]

    def __getitem__(self, idx):
        """Return ``(x, y, prop, sca_tensor)`` for the item at ``idx``.

        ``x`` and ``y`` are the encoded SMILES without its last / first token
        (long tensors of ``max_len - 1`` ids), ``prop`` is a float tensor
        wrapping the property value and ``sca_tensor`` is the encoded scaffold
        (long tensor of ``scaf_max_len`` ids).
        """
        # For multiple properties one could use self.prop.iloc[idx, :].values instead.
        smiles, prop, scaffold = self.data[idx], self.prop[idx], self.sca[idx]
        smiles = smiles.strip()
        scaffold = scaffold.strip()

        # Randomised-SMILES augmentation, applied with probability aug_prob.
        p = np.random.uniform()
        if p < self.aug_prob:
            smiles = self.tfm.randomize_smiles(smiles)

        dix = self._encode(smiles, self.max_len)
        sca_dix = self._encode(scaffold, self.scaf_max_len)

        sca_tensor = torch.tensor(sca_dix, dtype=torch.long)
        # Shift by one token: the model input is dix[:-1], the target is dix[1:].
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        prop = torch.tensor([prop], dtype = torch.float)
        return x, y, prop, sca_tensor

In [6]:
from jointformer.utils.tokenizers.smiles.regex import SMILESTokenizer

# NOTE(review): autoreload is already enabled by the first import cell of this
# notebook, so this repeat is redundant and only produces the
# "already loaded" notice shown in the output below.
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Vocabulary file for the tokenizer, relative to the project root.
VOCABULARY_PATH = 'jointformer/utils/tokenizers/smiles/molgpt_vocabulary.txt'

# Example molecule used to exercise the tokenizer (same SMILES as `default_smiles`).
smiles = 'CCN(CC)C(=O)C1CN(C2CC3=CNC4=CC=CC(=C34)C2=C1)C'

# Rebinds `tokenizer`, which an earlier cell set to None.
tokenizer = SMILESTokenizer(path_to_vocabulary=VOCABULARY_PATH)

In [13]:
# Number of entries in the loaded vocabulary.
len(tokenizer.vocabulary)

101

In [181]:
# Encode the example SMILES; the result is displayed two cells below.
tokenized = tokenizer.tokenize(smiles)

In [182]:
# Round-trip check: decoding should give back the original `smiles` string.
tokenizer.detokenize(tokenized)

'CCN(CC)C(=O)C1CN(C2CC3=CNC4=CC=CC(=C34)C2=C1)C'

In [183]:
# Show the encoded ids: the sequence opens with 1, closes with 2 and is then
# filled with 0s ([BOS], [EOS] and [PAD] in the mapping printed further down).
tokenized

tensor([ 1, 27, 27, 31, 11, 27, 27, 12, 27, 11, 24, 32, 12, 27, 14, 27, 31, 11,
        27, 15, 27, 27, 16, 24, 27, 31, 27, 17, 24, 27, 27, 24, 27, 27, 11, 24,
        27, 16, 17, 12, 27, 15, 24, 27, 14, 12, 27,  2,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0])

In [131]:
# Dump the token -> id mapping; the special tokens ([PAD], [BOS], [EOS], ...)
# occupy the lowest ids, followed by the SMILES tokens.
tokenizer.token_to_index

{'[PAD]': 0,
 '[BOS]': 1,
 '[EOS]': 2,
 '[MASK]': 3,
 '[UNK]': 4,
 '[REC]': 5,
 '[PRED]': 6,
 '#': 7,
 '%10': 8,
 '%11': 9,
 '%12': 10,
 '(': 11,
 ')': 12,
 '-': 13,
 '1': 14,
 '2': 15,
 '3': 16,
 '4': 17,
 '5': 18,
 '6': 19,
 '7': 20,
 '8': 21,
 '9': 22,
 '<': 23,
 '=': 24,
 'B': 25,
 'Br': 26,
 'C': 27,
 'Cl': 28,
 'F': 29,
 'I': 30,
 'N': 31,
 'O': 32,
 'P': 33,
 'S': 34,
 '[B-]': 35,
 '[BH-]': 36,
 '[BH2-]': 37,
 '[BH3-]': 38,
 '[B]': 39,
 '[C+]': 40,
 '[C-]': 41,
 '[CH+]': 42,
 '[CH-]': 43,
 '[CH2+]': 44,
 '[CH2]': 45,
 '[CH]': 46,
 '[F+]': 47,
 '[H]': 48,
 '[I+]': 49,
 '[IH2]': 50,
 '[IH]': 51,
 '[N+]': 52,
 '[N-]': 53,
 '[NH+]': 54,
 '[NH-]': 55,
 '[NH2+]': 56,
 '[NH3+]': 57,
 '[N]': 58,
 '[O+]': 59,
 '[O-]': 60,
 '[OH+]': 61,
 '[O]': 62,
 '[P+]': 63,
 '[PH+]': 64,
 '[PH2+]': 65,
 '[PH]': 66,
 '[S+]': 67,
 '[S-]': 68,
 '[SH+]': 69,
 '[SH]': 70,
 '[Se+]': 71,
 '[SeH+]': 72,
 '[SeH]': 73,
 '[Se]': 74,
 '[Si-]': 75,
 '[SiH-]': 76,
 '[SiH2]': 77,
 '[SiH]': 78,
 '[Si]': 79,
 '[b-]': 

In [None]:
# NOTE(review): `SmileDataset`, `args`, `whole_string`, `max_len`, `prop`,
# `scaffold` and `scaffold_max_len` are not defined in the visible cells; the
# Dataset class defined in this notebook is named `SmilesTokenizer` -- confirm
# which one is intended before running this cell.
train_dataset = SmileDataset(
    args,
    smiles,
    whole_string,
    max_len,
    prop=prop,
    aug_prob=0,
    scaffold=scaffold,
    scaffold_maxlen=scaffold_max_len,
)