In [1]:
# conda env: st(Python 3.12.2)
import os
import sys
from datacat4ml.const import DATA_DIR

import pandas as pd

from typing import List, Tuple

# Load the dataset

In [4]:
or_chembl_id = ['CHEMBL233', 'CHEMBL237', 'CHEMBL236', 'CHEMBL2014']
gpcr_ki = pd.read_csv(os.path.join(DATA_DIR, 'data_prep', 'data_fetch', 'Ki_gpcr_maxcur_8_data.csv'))
print(f"The shape of gpcr_ki is: {gpcr_ki.shape}")
gpcr_ic50 = pd.read_csv(os.path.join(DATA_DIR, 'data_prep', 'data_fetch', 'IC50_gpcr_maxcur_8_data.csv'))
print(f"The shape of gpcr_ic50 is: {gpcr_ic50.shape}")
gpcr_ec50 = pd.read_csv(os.path.join(DATA_DIR, 'data_prep', 'data_fetch', 'EC50_gpcr_maxcur_8_data.csv'))
print(f"The shape of gpcr_ec50 is: {gpcr_ec50.shape}")
# extract the rows where the 'target_chembl_id' is one of the elements in the list OR_chembl_id
or_ki = gpcr_ki[gpcr_ki['target_chembl_id'].isin(or_chembl_id)]
print(f"The shape of or_ki is: {or_ki.shape}")

The shape of gpcr_ki is: (176449, 31)
The shape of gpcr_ic50 is: (122257, 31)
The shape of gpcr_ec50 is: (79012, 31)
The shape of or_ki is: (17072, 31)


  gpcr_ec50 = pd.read_csv(os.path.join(DATA_DIR, 'data_prep', 'data_fetch', 'EC50_gpcr_maxcur_8_data.csv'))


## assay_desc

In [5]:
gpcr_ki_assay_descs = gpcr_ki['assay_desc'].unique()
print(f"The number of unique assay_desc in gpcr_ki is: {len(gpcr_ki_assay_descs)}")
gpcr_ic50_assay_descs = gpcr_ic50['assay_desc'].unique()
print(f"The number of unique assay_desc in gpcr_ic50 is: {len(gpcr_ic50_assay_descs)}")
gpcr_ec50_assay_descs = gpcr_ec50['assay_desc'].unique()
print(f"The number of unique assay_desc in gpcr_ec50 is: {len(gpcr_ec50_assay_descs)}")

The number of unique assay_desc in gpcr_ki is: 14161
The number of unique assay_desc in gpcr_ic50 is: 9942
The number of unique assay_desc in gpcr_ec50 is: 6376


In [6]:
# combine gpcr_ki_assay_desc, gpcr_ic50_assay_desc, gpcr_ec50_assay_desc into a single list
gpcr_assay_descs = gpcr_ki_assay_descs.tolist() + gpcr_ic50_assay_descs.tolist() + gpcr_ec50_assay_descs.tolist()
print(f"The number of unique assay_desc in gpcr_assay_descs is: {len(gpcr_assay_descs)}")
print(f"The data type of gpcr_assay_descs is: {type(gpcr_assay_descs)}")

# save the list to a file
with open('gpcr_assay_descs.txt', 'w') as f:
    for item in gpcr_assay_descs:
        f.write("%s\n" % item)

The number of unique assay_desc in gpcr_assay_descs is: 30479
The data type of gpcr_assay_descs is: <class 'list'>


## compound smiles

In [7]:
gpcr_ki_smis = gpcr_ki['canonical_smiles'].unique()
print(f"The number of unique canonical_smiles in gpcr_ki is: {len(gpcr_ki_smis)}")
gpcr_ic50_smis = gpcr_ic50['canonical_smiles'].unique()
print(f"The number of unique canonical_smiles in gpcr_ic50 is: {len(gpcr_ic50_smis)}")
gpcr_ec50_smis = gpcr_ec50['canonical_smiles'].unique()
print(f"The number of unique canonical_smiles in gpcr_ec50 is: {len(gpcr_ec50_smis)}")

The number of unique canonical_smiles in gpcr_ki is: 76880
The number of unique canonical_smiles in gpcr_ic50 is: 77556
The number of unique canonical_smiles in gpcr_ec50 is: 43539


In [8]:
# combine gpcr_ki_smis, gpcr_ic50_smis, gpcr_ec50_smis into a single list
gpcr_smis = gpcr_ki_smis.tolist() + gpcr_ic50_smis.tolist() + gpcr_ec50_smis.tolist()
print(f"The number of unique assay_desc in gpcr_assay_smis is: {len(gpcr_smis)}")
print(f"The data type of gpcr_assay_smis is: {type(gpcr_smis)}")

# save the list to a file
with open('gpcr_smis.txt', 'w') as f:
    for item in gpcr_smis:
        f.write("%s\n" % item)

The number of unique assay_desc in gpcr_assay_smis is: 197975
The data type of gpcr_assay_smis is: <class 'list'>


# Compound SMILES

In [9]:
import re

class SMILESTokenizer:

    """
    Shared by Markus.
    
    """

    def __init__(self):
        self.pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\!|\$|\%[0-9]{2}|[0-9])"
        self.vocab = {}
        self.inv_vocab = {}
        self.pad_token = '<PAD>'
        self.unk_token = '<UNK>'
        self.start_token = '<START>'
        self.end_token = '<END>'
        self.max_len = None
    def tokenize(self, smiles):
        """Tokenizes a SMILES string using the predefined regular expression."""
        return re.findall(self.pattern, smiles)
    def build_vocab(self, smiles_list):
        """Builds vocabulary from a list of SMILES strings."""
        all_tokens = set()
        for smiles in smiles_list:
            tokens = self.tokenize(smiles)
            all_tokens.update(tokens)
        tokens = [self.pad_token, self.unk_token, self.start_token, self.end_token]
        all_tokens = sorted(all_tokens)
        all_tokens = tokens + all_tokens
        self.vocab = {token: idx for idx, token in enumerate(all_tokens)}
        self.inv_vocab = {idx: token for token, idx in self.vocab.items()}
    def encode(self, smiles, max_len=None):
        """Encodes a SMILES string into a list of token indices, optionally padding to max_len."""
        tokens = self.tokenize(smiles)
        tokens = [self.start_token] + tokens + [self.end_token]
        token_ids = [self.vocab.get(token, self.vocab[self.unk_token]) for token in tokens]
        if max_len:
            token_ids = token_ids[:max_len] + [self.vocab[self.pad_token]] * max(0, max_len - len(token_ids))
        return token_ids
    def decode(self, token_ids):
        """Decodes a list of token indices back into a SMILES string."""
        tokens = [self.inv_vocab.get(token_id, self.unk_token) for token_id in token_ids]
        tokens = [token for token in tokens if token not in [self.start_token, self.end_token, self.pad_token]]
        return ''.join(tokens)
    def vocab_size(self):
        """Returns the size of the vocabulary."""
        return len(self.vocab)
    def pad_sequence(self, sequence, max_len):
        """Pads a sequence to the maximum length."""
        return sequence[:max_len] + [self.vocab[self.pad_token]] * max(0, max_len - len(sequence))

In [14]:
# build the vocabulary from the GPCR SMILES
tokenizer = SMILESTokenizer()
tokenizer.build_vocab(gpcr_smis)
print(f"The size of the vocabulary is: {tokenizer.vocab_size()}")
#tokenized_gpcr_smi_1 = tokenizer.tokenize(gpcr_smis[4]) 
#print(f"The first tokenized SMILES is: {tokenized_gpcr_smi_1}")
#encode_gpcr_smi_1 = tokenizer.encode(gpcr_smis[0])
#print(f"The first encoded SMILES is: {encode_gpcr_smi_1}")

morphine_smi = 'CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](O)C=C[C@H]3[C@H]1C5'
morphine_tokenized = tokenizer.tokenize(morphine_smi)
print(f"The tokenized morphine is: \n{morphine_tokenized}")

The size of the vocabulary is: 101
The tokenized morphine is: 
['C', 'N', '1', 'C', 'C', '[C@]', '2', '3', 'c', '4', 'c', '5', 'c', 'c', 'c', '(', 'O', ')', 'c', '4', 'O', '[C@H]', '2', '[C@@H]', '(', 'O', ')', 'C', '=', 'C', '[C@H]', '3', '[C@H]', '1', 'C', '5']


# Assay-related text

## BPE model

In [17]:
def train_BPE_tokenizer(special_tokens: List[str] = ["[PAD]", "[CLS]", "[SEP]", "[UNK]", "[3H]", "[125I]", "[35S]", "[active]", "[inactive]"], 
                        assay_files: List[str]=['./gpcr_assay_descs.txt']):

    """ Based on BPE model, train a tokenizer from scratch using the entire dataset.
    
    Params:
    - special_tokens: List(str) - A list of special tokens that we want to add to the tokenizer
    - all_X_txt: List[str] - A list of path to the files that we should use for training
    
    Returns:
    - tokenizer: PreTrainedTokenizerFast object
    
    """

    from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
    # Initialize a tokenizer
    tokenizer = Tokenizer(models.BPE())

    # Specify a pre-tokenizer before training
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)

    # Train the tokenizer
    trainer = trainers.BpeTrainer(vocab_size=25_000, min_frequency=2, special_tokens=special_tokens)
    # To ensure a comprehensive vocabulary, use the entire datasets including those that will later be split into training and validation sets
    tokenizer.train(trainer=trainer, files=assay_files) 

    # Specify a decoder and post-process before training
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

    # Wrap this in a Tranformers tokenizer object:
    from transformers import PreTrainedTokenizerFast
    tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)

    return tokenizer

In [18]:
BPE_tokenizer = train_BPE_tokenizer()
# have a look at the vocabulary
vocab = BPE_tokenizer.get_vocab()
print(f"The number of tokens in the vocabulary is: {len(vocab)}")
# print the first 10 tokens
print(f"The first 10 tokens are: {list(vocab.keys())[:10]}")
# use this tokenizer to tokenize a sentence and then encode it
sentence = "Displacement of [3H]Naltindole from human delta opioid receptor expressed in CHO cell membrane. active"
tokenized_sentence = BPE_tokenizer.tokenize(sentence)
encoded = BPE_tokenizer(sentence)
print(f"The tokenized sentence is: {tokenized_sentence}")
print(f"The encoded sentence is: {encoded}")






TypeError: Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

In [None]:
## Try wordlevl and wordpiece tokenization

# Pretrain a model from scratch

In [28]:
import torch
torch.cuda.is_available()

False

In [29]:
# define the following config for the model
from transformers import RobertaConfig
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [None]:
# recreate the tokenizer in transformers
# It is necessary because it ennsures compatibility between the tokenizer and the transformer model. 
# The `transformers` provides additional features and integration that may not be available in the `tokenizers` library alone
from transformers import RobertaTokenizerFast
tokenizer = RobertaTokenizerFast.from_pretrained("./from_scratch", max_len=512)

In [None]:
# initialize the model.
# As we are training from scratch, we only initialize from a config, not from an existing pretrained model or checkpoint.

from transformers import RobertaForMaskedLM
model = RobertaForMaskedLM(config=config)
model.num_parameters()