In [1]:
# conda env: st (Python 3.12.2)
import os
import sys
from datacat4ml.const import DATA_DIR, FETCH_DATA_DIR, FETCH_FIG_DIR, FEATURIZE_DATA_DIR, FEATURIZE_FIG_DIR

import os
import pandas as pd

# Functions

In [2]:
import re

# Load the data

In [3]:
# load the dataset
or_chembl_id = ['CHEMBL233', 'CHEMBL237', 'CHEMBL236', 'CHEMBL2014']
gpcr_ki = pd.read_csv(os.path.join(DATA_DIR, 'data_prep', '1_data_fetch', 'ki_maxcur_8_data.csv'))
print(f"The shape of gpcr_ki is: {gpcr_ki.shape}")
# extract the rows where the 'target_chembl_id' is one of the elements in the list OR_chembl_id
or_ki = gpcr_ki[gpcr_ki['target_chembl_id'].isin(or_chembl_id)]
print(f"The shape of or_ki is: {or_ki.shape}")

The shape of gpcr_ki is: (139416, 28)
The shape of or_ki is: (13533, 28)


  gpcr_ki = pd.read_csv(os.path.join(DATA_DIR, 'data_prep', '1_data_fetch', 'ki_maxcur_8_data.csv'))


In [4]:
gpcr_smis = gpcr_ki['canonical_smiles'].values
or_smis = or_ki['canonical_smiles'].values

# Tokenize SMILES

deepchem

- Molecule Tokenizers
    - SmilesTokenizer
    - BasicSmilesTokenizer
    - HuggingFaceFeaturizer
- Other Featurizers
    - BertFeaturizer
    - RobertaFeaturizer
    - RxnFeaturizer
    - UserDefinedFeaturizer


## Tokenizer shared by Markus

- shared by Markus
- return: encode --> token_ids
- return: encode --> list

In [5]:
class SMILESTokenizer:
    def __init__(self):
        self.pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\!|\$|\%[0-9]{2}|[0-9])"
        self.vocab = {}
        self.inv_vocab = {}
        self.pad_token = '<PAD>'
        self.unk_token = '<UNK>'
        self.start_token = '<START>'
        self.end_token = '<END>'
        self.max_len = None
    def tokenize(self, smiles):
        """Tokenizes a SMILES string using the predefined regular expression."""
        return re.findall(self.pattern, smiles)
    def build_vocab(self, smiles_list):
        """Builds vocabulary from a list of SMILES strings."""
        all_tokens = set()
        for smiles in smiles_list:
            tokens = self.tokenize(smiles)
            all_tokens.update(tokens)
        tokens = [self.pad_token, self.unk_token, self.start_token, self.end_token]
        all_tokens = sorted(all_tokens)
        all_tokens = tokens + all_tokens
        self.vocab = {token: idx for idx, token in enumerate(all_tokens)}
        self.inv_vocab = {idx: token for token, idx in self.vocab.items()}
    def encode(self, smiles, max_len=None):
        """Encodes a SMILES string into a list of token indices, optionally padding to max_len."""
        tokens = self.tokenize(smiles)
        tokens = [self.start_token] + tokens + [self.end_token]
        token_ids = [self.vocab.get(token, self.vocab[self.unk_token]) for token in tokens]
        if max_len:
            token_ids = token_ids[:max_len] + [self.vocab[self.pad_token]] * max(0, max_len - len(token_ids))
        return token_ids
    def decode(self, token_ids):
        """Decodes a list of token indices back into a SMILES string."""
        tokens = [self.inv_vocab.get(token_id, self.unk_token) for token_id in token_ids]
        tokens = [token for token in tokens if token not in [self.start_token, self.end_token, self.pad_token]]
        return ''.join(tokens)
    def vocab_size(self):
        """Returns the size of the vocabulary."""
        return len(self.vocab)
    def pad_sequence(self, sequence, max_len):
        """Pads a sequence to the maximum length."""
        return sequence[:max_len] + [self.vocab[self.pad_token]] * max(0, max_len - len(sequence))

In [6]:
# build the vocabulary from the GPCR SMILES
tokenizer = SMILESTokenizer()
tokenizer.build_vocab(gpcr_smis)
print(f"The size of the vocabulary is: {tokenizer.vocab_size()}")
tokenized_gpcr_smis = [tokenizer.tokenize(smiles) for smiles in gpcr_smis]
print(f"The first tokenized SMILES is: {tokenized_gpcr_smis[0]}")
encode_gpcr_smis = [tokenizer.encode(smiles) for smiles in gpcr_smis]
print(f"The first encoded SMILES is: {encode_gpcr_smis[0]}")

The size of the vocabulary is: 79
The first tokenized SMILES is: ['C', 'N', '1', 'C', 'C', '[C@]', '2', '3', 'c', '4', 'c', '5', 'c', 'c', 'c', '(', 'O', ')', 'c', '4', 'O', '[C@H]', '2', '[C@@H]', '(', 'N', 'C', '(', '=', 'O', ')', 'C', 'N', 'C', '(', '=', 'O', ')', 'C', 'C', 'C', '(', '=', 'O', ')', 'N', 'C', 'C', '(', '=', 'O', ')', 'N', '[C@H]', '2', 'C', 'C', '[C@@]', '4', '(', 'O', ')', '[C@H]', '6', 'C', 'c', '7', 'c', 'c', 'c', '(', 'O', ')', 'c', '8', 'c', '7', '[C@@]', '4', '(', 'C', 'C', 'N', '6', 'C', ')', '[C@H]', '2', 'O', '8', ')', 'C', 'C', '[C@@]', '3', '(', 'O', ')', '[C@H]', '1', 'C', '5']
The first encoded SMILES is: [2, 21, 25, 10, 21, 21, 46, 11, 12, 75, 13, 75, 14, 75, 75, 75, 5, 26, 6, 75, 13, 26, 45, 11, 43, 5, 25, 21, 5, 19, 26, 6, 21, 25, 21, 5, 19, 26, 6, 21, 21, 21, 5, 19, 26, 6, 25, 21, 21, 5, 19, 26, 6, 25, 45, 11, 21, 21, 44, 13, 5, 26, 6, 45, 15, 21, 75, 16, 75, 75, 75, 5, 26, 6, 75, 17, 75, 16, 44, 13, 5, 21, 21, 25, 15, 21, 6, 45, 11, 26, 17, 6, 21, 2

In [26]:
smi_tokenizer_M = SMILESTokenizer()

print(f"The size of the vocabulary is: {smi_tokenizer_M.vocab_size()}")

tokenized_gpcr_smis_M = [smi_tokenizer_M.tokenize(smi) for smi in gpcr_smis]
# encode_gpcr_smis_M = [smi_tokenizer_M.encode(smi) for smi in gpcr_smis] --> KeyError: '<UNK>' 
print(f'The length of tokenized_gpcr_smis is {len(tokenized_gpcr_smis_M)}')
print(f"The first element of tokenized_gpcr_smis is:\n {tokenized_gpcr_smis_M[0]}")
#print(f"The length of encode_gpcr_smis is {len(encode_gpcr_smis_M)}")

tokenized_or_smis_M = [smi_tokenizer_M.tokenize(smi) for smi in or_smis]
print(f'The length of tokenized_or_smis is {len(tokenized_or_smis_M)}')
print(f"The first element of tokenized_or_smis is:\n {tokenized_or_smis_M[0]}")

The size of the vocabulary is: 0
The length of tokenized_gpcr_smis is 139416
The first element of tokenized_gpcr_smis is:
 ['C', 'N', '1', 'C', 'C', '[C@]', '2', '3', 'c', '4', 'c', '5', 'c', 'c', 'c', '(', 'O', ')', 'c', '4', 'O', '[C@H]', '2', '[C@@H]', '(', 'N', 'C', '(', '=', 'O', ')', 'C', 'N', 'C', '(', '=', 'O', ')', 'C', 'C', 'C', '(', '=', 'O', ')', 'N', 'C', 'C', '(', '=', 'O', ')', 'N', '[C@H]', '2', 'C', 'C', '[C@@]', '4', '(', 'O', ')', '[C@H]', '6', 'C', 'c', '7', 'c', 'c', 'c', '(', 'O', ')', 'c', '8', 'c', '7', '[C@@]', '4', '(', 'C', 'C', 'N', '6', 'C', ')', '[C@H]', '2', 'O', '8', ')', 'C', 'C', '[C@@]', '3', '(', 'O', ')', '[C@H]', '1', 'C', '5']
The length of tokenized_or_smis is 13533
The first element of tokenized_or_smis is:
 ['C', 'N', '1', 'C', 'C', '[C@]', '2', '3', 'c', '4', 'c', '5', 'c', 'c', 'c', '(', 'O', ')', 'c', '4', 'O', '[C@H]', '2', '[C@@H]', '(', 'N', 'C', '(', '=', 'O', ')', 'C', 'N', 'C', '(', '=', 'O', ')', 'C', 'C', 'C', '(', '=', 'O', ')', 'N'

## Basic Smiles Tokenizer
- a regex tokenization pattern to tokenise SMILES strings
- This tokenizer is to be used when a tokenizer that does not require the transformers library by HuggingFace is required.
- firstly developed in MolecularTransformer (Schwaller et. al), and used in OpenNMT-py
- return: tokens
- return type: str


In [8]:
def smi_tokenizer(smi):
    """
    Tokenize a SMILES molecule or reaction
    """
    import re
    pattern =  "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
    regex = re.compile(pattern)
    tokens = [token for token in regex.findall(smi)]
    assert smi == ''.join(tokens)
    return ' '.join(tokens)

  pattern =  "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"


In [9]:
tokenized_gpcr_smis_B = [smi_tokenizer(smi) for smi in gpcr_smis]
print(f'The first element of tokenized_gpcr_smis_B is:\n {tokenized_gpcr_smis_B[0]}')

The first element of tokenized_gpcr_smis_B is:
 C N 1 C C [C@] 2 3 c 4 c 5 c c c ( O ) c 4 O [C@H] 2 [C@@H] ( N C ( = O ) C N C ( = O ) C C C ( = O ) N C C ( = O ) N [C@H] 2 C C [C@@] 4 ( O ) [C@H] 6 C c 7 c c c ( O ) c 8 c 7 [C@@] 4 ( C C N 6 C ) [C@H] 2 O 8 ) C C [C@@] 3 ( O ) [C@H] 1 C 5


## From ChemBerta Transformer

- used in MoleculeACE, also available in deepchem
- retruns: A numpy arrray containing a featurized representation of datapoints
- return type: np.ndarray

In [11]:
def chemberta_tokenizer(smi:str, max_smi_len: int=200, padding: bool=True, truncation: bool=True, 
                        auto_tokenizer: str = 'seyonec/PubChem10M_SMILES_BPE_450k'):
    """
    Tokenize SMILES for a ChemBerta Transformer
    
    :param smi: (str)SMILES string
    :param max_smi_len: (int) maximum SMILES length
    :param padding: (bool) padding
    :param truncation: (bool) allow truncation (you will need this for heterogenous SMILES strings)
    :param auto_tokenizer: (str) tokenizer name provided by HuggingFace
    """

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(auto_tokenizer)
    tokens = tokenizer(smi, return_tensors='pt', padding=padding, truncation=truncation, max_length=max_smi_len)

    return tokens

In [12]:
tokenized_smis_C = [chemberta_tokenizer(smi) for smi in gpcr_smis[0]]
print(f'The first element of tokenized_smis_C is:\n {tokenized_smis_C[0]}')

  from .autonotebook import tqdm as notebook_tqdm


The first element of tokenized_smis_C is:
 {'input_ids': tensor([[ 0, 39,  2]]), 'attention_mask': tensor([[1, 1, 1]])}


# Tokenize assay-related data

## assay_id

In [13]:
gpcr_assay_ids = gpcr_ki['assay_id'].values.tolist()
gpcr_assay_ids[:5]

[147162, 957, 1732, 1360, 65644]

## assay_desc

In [14]:
gpcr_assay_descs = gpcr_ki['assay_desc'].values.tolist()
print(f"The data type of gpcr_assay_descs is {type(gpcr_assay_descs)}")
# Create a temporary file to store assay descriptions
with open('gpcr_assay_descs.txt', 'w') as f:
    for desc in gpcr_assay_descs:
        f.write(desc + '\n')
gpcr_assay_descs[:5]

The data type of gpcr_assay_descs is <class 'list'>


['Displacement of [3H]EK from Opioid receptor delta 1 in guinea pig brain membrane',
 'Affinity for 5-hydroxytryptamine 1A receptor subtype',
 'Affinity for 5-hydroxytryptamine 1D receptor subtype',
 'Affinity for 5-hydroxytryptamine 1B receptor subtype',
 'Binding affinity towards human ETA receptor expressed in CHO-K1 cells in the presence of 0.05 nM [125I]-labeled endothelin 1']

In [16]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

# initialize a tokenizer
tokenizer = Tokenizer(models.BPE())

# customize pre-tokenizer and decoder
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

# train tokenizer
trainer = trainers.BpeTrainer(vocab_size=9000, min_frequency=2, limit_alphabet=55, special_tokens=['affinity', 'displacement', '3H', '125I', 'camp', 'gtp', 'calcium', 'ca2+', 'IP1', 'IP3', 'arrest', 'agonist'])
tokenizer.train(['gpcr_assay_descs.txt'], trainer=trainer) #? shall I remove duplicates?






In [17]:
def assay_desc_tokenizer(sentence):
    '''Tokenize a sentense, optimized for assay description'''
    encoded = tokenizer.encode(sentence)
    my_list = [item for item in encoded.tokens] 
    my_list = ' '.join(my_list)

    return my_list

def enzyme_sentence_tokenizer(sentence):
    '''
    Tokenize a sentenze, optimized for enzyme-like descriptions & names
    '''
    encoded = tokenizer.encode(sentence)
    my_list = [item for item in encoded.tokens if 'Ġ' != item]
    my_list = [item.replace('Ġ', '_') for item in my_list]
    my_list = ' '.join(my_list)
    
    return my_list

In [19]:
tokenized_assay_descs = [enzyme_sentence_tokenizer(assay_desc) for assay_desc in gpcr_assay_descs]
tokenized_assay_descs[:5]

['_Displacement _of _[ 3H ] EK _from _Opioid _receptor _delta _1 _in _guinea _pig _brain _membrane',
 '_Affinity _for _5 - hydroxytryptamine _1 A _receptor _subtype',
 '_Affinity _for _5 - hydroxytryptamine _1 D _receptor _subtype',
 '_Affinity _for _5 - hydroxytryptamine _1 B _receptor _subtype',
 '_Binding affinity _towards _human _ETA _receptor _expressed _in _CHO - K 1 _cells _in _the _presence _of _0 . 05 _nM _[ 125I ]- labeled _endothelin _1']

## identifiers in assay table

# Concatenate the tokenized SMILES and tokenized assay-related data

## The method in Enzymatic Transformer

## SMILES + assay_id

## SMILES + assay-desc

## SMILES + assay-related info