In [1]:
# inner module import
import sys
sys.path.append("/storage/homefs/yc24j783/datacat4ml/datacat4ml")
from const import FETCH_DATA_DIR, FETCH_FIG_DIR, FEATURIZE_DATA_DIR, FEATURIZE_FIG_DIR

import os
import pandas as pd
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

In [2]:
# Read the featurized DataFrame pickle stored under FEATURIZE_DATA_DIR
# and preview its first row.
# NOTE(review): unpickling executes code embedded in the file; only load
# pickles that come from a trusted source.
ki_maxcur_df = pd.read_pickle(
    os.path.join(FEATURIZE_DATA_DIR, 'ki_maxcur_8_fp.pkl')
)
ki_maxcur_df.iloc[:1]

Unnamed: 0,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,pchembl_value,assay_type,assay_category,assay_organism,assay_tax_id,...,relationship_type,aidx,confidence_score,molregno,compound_chembl_id,canonical_smiles,assay_info_hash,ecfp4,map4c,activity
0,147162,CHEMBL753852,136,CHEMBL236,Ki,6.96,B,,Cavia porcellus,10141,...,H,CLD0,8,1798744,CHEMBL3350133,CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](NC(=O)CNC...,d5fdf976cc6fd98f7656c177bcab9fc2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2421162, 248707, 1555374, 3026370, 1608673, 1...",intermediate


In [3]:
# Plain-text preview of the first entry of the assay_info_hash column.
print(ki_maxcur_df['assay_info_hash'].head(1))

0    d5fdf976cc6fd98f7656c177bcab9fc2
Name: assay_info_hash, dtype: object


In [4]:
# Subset of rows whose target is CHEMBL233
# (presumably the mu-opioid receptor, judging by the `mor` prefix; confirm).
mor_ki_maxcur_df = ki_maxcur_df.loc[ki_maxcur_df['target_chembl_id'].eq('CHEMBL233')]

# Records per assay and overall shape: full frame first, then the subset.
print(f"gpcr: {ki_maxcur_df['assay_id'].value_counts()}")
print(f"gpcr shape: {ki_maxcur_df.shape}")
print(f"mor: {mor_ki_maxcur_df['assay_id'].value_counts()}")
print(f"mor shape: {mor_ki_maxcur_df.shape}")

gpcr: 1642196    121
596087     112
1468240    110
1613477    107
447747     106
          ... 
1688606      1
700881       1
1688609      1
1688610      1
3051         1
Name: assay_id, Length: 14223, dtype: int64
gpcr shape: (139416, 31)
mor: 1642108    91
1641066    90
1536477    79
540743     65
443967     54
           ..
439345      1
939900      1
447457      1
878643      1
1869947     1
Name: assay_id, Length: 459, dtype: int64
mor shape: (4487, 31)


# Tokenize SMILES

## From OpenNMT-py

In [5]:
import re

# Compiled once when the cell runs instead of on every call.
# Written as a raw string so that regex escapes such as \[ \( \. \+ are passed
# to `re` untouched instead of being (invalid) Python string escapes.
# Alternatives, in order: bracket atoms "[...]", B/Br, C/Cl, other organic-subset
# atoms, aromatic atoms, branches, bond/charge/stereo/reaction symbols,
# two-digit ring closures "%NN", single-digit ring closures.
_SMI_TOKEN_REGEX = re.compile(
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
)


def smi_tokenizer(smi):
    """
    Tokenize a SMILES molecule or reaction.

    :param smi: (str) SMILES string
    :return: (str) the tokens of `smi`, separated by single spaces
    :raises AssertionError: if joining the tokens does not reproduce `smi`,
        i.e. `smi` contains a character the token pattern does not match
    """
    tokens = _SMI_TOKEN_REGEX.findall(smi)
    # Round-trip check: findall silently skips unmatched characters, so a
    # mismatch here means part of the input would otherwise be dropped.
    assert smi == ''.join(tokens), f"SMILES contains untokenizable characters: {smi!r}"
    return ' '.join(tokens)

In [6]:
# Regex-tokenize every canonical SMILES in the frame and preview the first five.
smis = ki_maxcur_df['canonical_smiles'].to_numpy()
tokenized_smis = list(map(smi_tokenizer, smis))
tokenized_smis[:5]

['C N 1 C C [C@] 2 3 c 4 c 5 c c c ( O ) c 4 O [C@H] 2 [C@@H] ( N C ( = O ) C N C ( = O ) C C C ( = O ) N C C ( = O ) N [C@H] 2 C C [C@@] 4 ( O ) [C@H] 6 C c 7 c c c ( O ) c 8 c 7 [C@@] 4 ( C C N 6 C ) [C@H] 2 O 8 ) C C [C@@] 3 ( O ) [C@H] 1 C 5',
 'C O c 1 c c c ( N C ( = O ) c 2 c c c ( - c 3 c c c ( - c 4 n o c ( C ) n 4 ) c c 3 C ) c c 2 ) c c 1 N 1 C C N ( C ) C C 1',
 'C O c 1 c c c ( N C ( = O ) c 2 c c c ( - c 3 c c c ( - c 4 n o c ( C ) n 4 ) c c 3 C ) c c 2 ) c c 1 N 1 C C N ( C ) C C 1',
 'C O c 1 c c c ( N C ( = O ) c 2 c c c ( - c 3 c c c ( - c 4 n o c ( C ) n 4 ) c c 3 C ) c c 2 ) c c 1 N 1 C C N ( C ) C C 1',
 'C C C c 1 n c ( C C ) c ( C ( = O ) N C ) n 1 C c 1 c c c ( - c 2 c c c c c 2 S ( = O ) ( = O ) N c 2 o n c ( C ) c 2 C ) c ( C ) c 1']

## From ChemBERTa Transformer

In [3]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_auto_tokenizer(auto_tokenizer: str):
    """Load the HuggingFace tokenizer `auto_tokenizer` once and reuse it on later calls."""
    # Imported lazily so that defining these functions does not require transformers.
    from transformers import AutoTokenizer
    return AutoTokenizer.from_pretrained(auto_tokenizer)


def chemberta_tokenizer(smi:str, max_smi_len: int=200, padding: bool=True, truncation: bool=True,
                        auto_tokenizer: str = 'seyonec/PubChem10M_SMILES_BPE_450k'):
    """
    Tokenize SMILES for a ChemBERTa Transformer.

    The tokenizer object is loaded once per `auto_tokenizer` name and cached, so
    calling this function in a loop over many SMILES does not reload it each time.

    :param smi: (str) SMILES string
    :param max_smi_len: (int) maximum SMILES length, passed to the tokenizer as `max_length`
    :param padding: (bool) padding, passed through to the tokenizer
    :param truncation: (bool) allow truncation (you will need this for heterogeneous SMILES strings)
    :param auto_tokenizer: (str) tokenizer name provided by HuggingFace
    :return: the tokenizer output for `smi`, requested with `return_tensors='pt'`

    NOTE(review): for a single string, `padding=True` presumably pads only to the
    longest sequence in the call (i.e. adds nothing); confirm against the
    HuggingFace padding docs whether `padding='max_length'` is what is wanted.
    """
    hf_tokenizer = _load_auto_tokenizer(auto_tokenizer)
    tokens = hf_tokenizer(smi, return_tensors='pt', padding=padding, truncation=truncation, max_length=max_smi_len)

    return tokens

In [4]:
# Tokenize every canonical SMILES with chemberta_tokenizer, one call per string,
# and preview the first five encodings.
# NOTE(review): this rebinds `smis` and `tokenized_smis`, replacing the
# regex-tokenized list built in the OpenNMT-py section above.
smis = ki_maxcur_df['canonical_smiles'].to_numpy()
tokenized_smis = list(map(chemberta_tokenizer, smis))
tokenized_smis[:5]

2024-09-09 13:36:24.730795: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading tokenizer_config.json:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/515 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/101k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

# Tokenize assay-related data

## assay_id

In [7]:
# Plain Python list of the assay_id column (one entry per row); preview the first five.
assay_ids = ki_maxcur_df['assay_id'].to_numpy().tolist()
assay_ids[:5]

[147162, 957, 1732, 1360, 65644]

## assay_desc

In [8]:
assay_descs = ki_maxcur_df['assay_desc'].values.tolist()
print(f"The data type of assay_descs is {type(assay_descs)}")
# Write one assay description per line to 'assay_descs.txt' in the working
# directory; the BPE trainer reads this file as its training corpus.
# The file is not removed afterwards.
# UTF-8 is set explicitly so the write does not depend on the platform locale
# (a C/POSIX locale would fail on non-ASCII characters in the descriptions).
with open('assay_descs.txt', 'w', encoding='utf-8') as f:
    for desc in assay_descs:
        f.write(desc + '\n')
assay_descs[:5]

The data type of assay_descs is <class 'list'>


['Displacement of [3H]EK from Opioid receptor delta 1 in guinea pig brain membrane',
 'Affinity for 5-hydroxytryptamine 1A receptor subtype',
 'Affinity for 5-hydroxytryptamine 1D receptor subtype',
 'Affinity for 5-hydroxytryptamine 1B receptor subtype',
 'Binding affinity towards human ETA receptor expressed in CHO-K1 cells in the presence of 0.05 nM [125I]-labeled endothelin 1']

In [9]:
# Trainer configuration for the assay-description BPE vocabulary.
# NOTE(review): ordinary domain words are registered as special tokens here; the
# tokenized preview further down shows 'Binding affinity' with no '_' in front of
# 'affinity'. Confirm this matching behaviour is intended.
trainer = trainers.BpeTrainer(
    vocab_size=9000,
    min_frequency=2,
    limit_alphabet=55,
    special_tokens=[
        'affinity', 'displacement', '3H', '125I', 'camp', 'gtp',
        'calcium', 'ca2+', 'IP1', 'IP3', 'arrest', 'agonist',
    ],
)

# BPE tokenizer with byte-level pre-tokenization, decoding and post-processing.
tokenizer = Tokenizer(models.BPE())
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)

# Learn the vocabulary from the assay-description corpus file.
tokenizer.train(['assay_descs.txt'], trainer=trainer)






In [10]:
def assay_desc_tokenizer(sentence):
    """Encode `sentence` with the module-level `tokenizer` and return its
    tokens joined by single spaces (tokens are left exactly as emitted)."""
    return ' '.join(tokenizer.encode(sentence).tokens)

def enzyme_sentence_tokenizer(sentence):
    '''
    Tokenize a sentence, optimized for enzyme-like descriptions & names.

    Encodes `sentence` with the module-level `tokenizer`, drops tokens that are
    exactly 'Ġ', rewrites every remaining 'Ġ' as '_', and returns the result
    joined by single spaces.
    '''
    pieces = []
    for token in tokenizer.encode(sentence).tokens:
        if token == 'Ġ':
            # A bare marker carries no text of its own; skip it.
            continue
        pieces.append(token.replace('Ġ', '_'))

    return ' '.join(pieces)

In [11]:
# Tokenize every assay description with enzyme_sentence_tokenizer and preview the first five.
tokenized_assay_descs = list(map(enzyme_sentence_tokenizer, assay_descs))
tokenized_assay_descs[:5]

['_Displacement _of _[ 3H ] EK _from _Opioid _receptor _delta _1 _in _guinea _pig _brain _membrane',
 '_Affinity _for _5 - hydroxytryptamine _1 A _receptor _subtype',
 '_Affinity _for _5 - hydroxytryptamine _1 D _receptor _subtype',
 '_Affinity _for _5 - hydroxytryptamine _1 B _receptor _subtype',
 '_Binding affinity _towards _human _ETA _receptor _expressed _in _CHO - K 1 _cells _in _the _presence _of _0 . 05 _nM _[ 125I ]- labeled _endothelin _1']

## identifiers in assay table

# Concatenate the tokenized SMILES and tokenized assay-related data

## SMILES + assay_id

## SMILES + assay_desc

## SMILES + assay-related info

# Implement the transformer

In [None]:
## Split the data into training, validation, and testing sets
from sklearn.model_selection import train_test_split