In [1]:
# Import necessary libraries, load model and tokenizer
import re
from pathlib import Path

import pandas as pd
import torch
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaModel, BertModel, BertTokenizer, BertConfig


# Checkpoint identifiers, named once so each tokenizer/model pair is
# guaranteed to be loaded from the same checkpoint.
MOLE_CHECKPOINT = "seyonec/ChemBERTa-zinc-base-v1"
PRO_CHECKPOINT = "Rostlab/prot_bert"

# Molecule side: RoBERTa-style tokenizer and model.
mole_tokenizer = RobertaTokenizer.from_pretrained(MOLE_CHECKPOINT)
mole_model = RobertaModel.from_pretrained(MOLE_CHECKPOINT)

# Protein side: BERT-style tokenizer and model. Lower-casing is disabled;
# the sequences used in this notebook are written in upper-case letters.
pro_tokenizer = BertTokenizer.from_pretrained(PRO_CHECKPOINT, do_lower_case=False)
pro_model = BertModel.from_pretrained(PRO_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define the function that tokenizes the molecules
def mole_tokenize_and_encode(molecules, max_length=512):
    """Tokenize a batch of molecule strings with ``mole_tokenizer``.

    Despite the name, this function only tokenizes; it does not run the
    molecule model, so no embeddings are produced here.

    Args:
        molecules: A string or list of strings (SMILES in this notebook).
        max_length: Upper bound on tokens per molecule; longer inputs are
            truncated. Defaults to 512, the same limit the protein helper
            uses (presumably also the checkpoint's position limit -- TODO
            confirm against the model config).

    Returns:
        The tokenizer output holding PyTorch tensors, padded to the longest
        sequence in the batch.
    """
    # Truncation keeps a single very long molecule from producing a row
    # longer than max_length, mirroring the protein tokenization settings.
    return mole_tokenizer(
        molecules,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

In [20]:
# Defining the function to tokenize the proteins
def pro_tokenize_and_encode(proteins, max_length=512):
    """Tokenize a batch of protein sequences with ``pro_tokenizer``.

    Despite the name, this function only tokenizes; it does not run the
    protein model, so no embeddings are produced here. The sequences are
    passed to the tokenizer as given: any preprocessing (such as the
    space-separation done elsewhere in this notebook) is the caller's job.

    Args:
        proteins: A string or list of strings, one per protein sequence.
        max_length: Upper bound on tokens per sequence; longer inputs are
            truncated. Defaults to 512, the value previously hard-coded.

    Returns:
        The tokenizer output holding PyTorch tensors, padded to the longest
        sequence in the batch.
    """
    return pro_tokenizer(
        proteins,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

In [36]:
# Loading the data
# Number of leading rows to tokenize; kept small for a quick run.
N_SAMPLES = 10

dataset = load_dataset("jglaser/binding_affinity", split="train")

# Slice the rows first and pick columns afterwards, so only N_SAMPLES rows
# are read instead of selecting entire columns of the train split.
sample = dataset[:N_SAMPLES]
molecules = sample['smiles']

# Preprocess Protein: replace the residue codes U, Z, O and B with X, then
# put a space between every residue so each letter becomes its own token
# (presumably the input convention of the protein tokenizer -- confirm
# against the model card).
proteins = [" ".join(re.sub(r"[UZOB]", "X", seq)) for seq in sample['seq']]

In [35]:
# Sanity check on one short sequence: space-separate the residues, then run
# them through the same helper used for the real data (padding, truncation
# to 512 tokens, PyTorch tensors) and display the result.
sequence_Example = ["MTVPDRSEIAGKWYVVALAS"]
s = list(map(" ".join, sequence_Example))
print(s)
tokens = pro_tokenize_and_encode(s)
tokens

['M T V P D R S E I A G K W Y V V A L A S']


{'input_ids': tensor([[ 2, 21, 15,  8, 16, 14, 13, 10,  9, 11,  6,  7, 12, 24, 20,  8,  8,  6,
          5,  6, 10,  3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [37]:
# Running the Data through tokenizer:
# Tokenize both modalities with their respective helpers.
pro_tokenized = pro_tokenize_and_encode(proteins)
mole_tokenized = mole_tokenize_and_encode(molecules)

# Keep only the padded token-id matrices, one DataFrame per modality.
# The other tokenizer outputs are not carried forward.
pro_list, mole_list = (
    pd.DataFrame(batch['input_ids'].numpy())
    for batch in (pro_tokenized, mole_tokenized)
)

In [38]:
# Saving To csv
# Build the output paths with pathlib so they resolve to Data/<kind>/<kind>.csv
# on every OS. A backslash is only a path separator on Windows; elsewhere the
# old literal would have produced a file named "Data\protein\protein.csv".
protein_csv = Path("Data") / "protein" / "protein.csv"
molecule_csv = Path("Data") / "molecule" / "molecule.csv"

for frame, csv_path in ((pro_list, protein_csv), (mole_list, molecule_csv)):
    # to_csv does not create missing directories, so make them first.
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    # Raw token-id matrix only: no index column and no header row.
    frame.to_csv(csv_path, index=False, header=False)