In [1]:
# Import necessary packages
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer, AdamW
from datasets import load_dataset

In [2]:
# Load the dataset
dataset = load_dataset("jglaser/binding_affinity")

# Load the tokenizers.
# ProtBert's vocabulary is made of upper-case amino-acid letters, so lower-casing
# is disabled explicitly (this mirrors the usage shown on the ProtBert model card).
protein_tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
smiles_tokenizer = RobertaTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")


In [4]:

# Tokenization function
def tokenize_function(examples):
    """Tokenize one batch of protein sequences and canonical SMILES strings.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with a ``'seq'`` entry (iterable of amino-acid
            strings) and a ``'smiles_can'`` entry (iterable of SMILES strings).

    Returns:
        dict with two entries, ``'protein_input'`` and ``'smiles_input'``,
        holding the respective tokenizer outputs (padded/truncated to 512 tokens).
    """
    # ProtBert expects residues separated by single spaces, with the rare
    # residues U, Z, O and B replaced by X (see the ProtBert model card).
    # Feeding the raw string would not split the sequence into residue tokens.
    rare_to_x = str.maketrans("UZOB", "XXXX")
    spaced_sequences = [" ".join(seq.translate(rare_to_x)) for seq in examples['seq']]

    protein_input = protein_tokenizer(spaced_sequences, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
    smiles_input = smiles_tokenizer(examples['smiles_can'], truncation=True, padding='max_length', max_length=512, return_tensors='pt')
    return {'protein_input': protein_input, 'smiles_input': smiles_input}

# Tokenize the sequences and SMILES strings using multiple cores
dataset = dataset.map(tokenize_function, batched=True, num_proc=20)

# Carve out two small, DISJOINT subsets for quick experiments.
# The validation rows are drawn only from what remains after the training rows
# have been removed, so no example can appear in both sets; the fixed seed makes
# the subsets identical after a kernel restart.
# For a full run, call train_test_split(test_size=0.1, seed=SPLIT_SEED) once and
# take both 'train' and 'test' from that single result.
SPLIT_SEED = 42
train_holdout = dataset['train'].train_test_split(test_size=0.0001, seed=SPLIT_SEED)
train_dataset = train_holdout['test']
val_dataset = train_holdout['train'].train_test_split(test_size=0.00001, seed=SPLIT_SEED)['test']

Map (num_proc=20):   0%|          | 0/1836729 [00:00<?, ? examples/s]

NameError: name 'protein_tokenizer' is not defined

In [None]:
class BindingAffinityModel(nn.Module):
    """Two-encoder regressor for protein-ligand binding affinity.

    A protein encoder and a SMILES encoder each produce a sequence of hidden
    states; the hidden state at position 0 of each is concatenated and passed
    through a small MLP that outputs one value per example.
    """

    def __init__(self):
        super().__init__()

        self.protein_encoder = BertModel.from_pretrained("Rostlab/prot_bert")
        # The ChemBERTa checkpoint is a RoBERTa model (it is paired with
        # RobertaTokenizer), so it must be loaded with RobertaModel; loading it
        # through BertModel does not restore the pretrained weights properly.
        self.smiles_encoder = RobertaModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

        # Read the embedding widths from the encoders instead of assuming 768
        # for both: the two checkpoints do not share a hidden size, and a
        # hard-coded value makes the first Linear layer fail on shape.
        combined_size = (
            self.protein_encoder.config.hidden_size
            + self.smiles_encoder.config.hidden_size
        )

        self.fc = nn.Sequential(
            nn.Linear(combined_size, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 1)  # Final binding affinity value
        )

    def forward(self, protein_input, smiles_input):
        """Predict one affinity value per example.

        Args:
            protein_input: Mapping of keyword arguments for the protein encoder
                (unpacked with ``**``).
            smiles_input: Mapping of keyword arguments for the SMILES encoder
                (unpacked with ``**``).

        Returns:
            Tensor with a trailing dimension of size 1 (output of the final
            ``nn.Linear(256, 1)``).
        """
        # Use the hidden state at sequence position 0 as the pooled embedding.
        protein_embedding = self.protein_encoder(**protein_input).last_hidden_state[:, 0, :]
        smiles_embedding = self.smiles_encoder(**smiles_input).last_hidden_state[:, 0, :]

        concatenated_embeddings = torch.cat((protein_embedding, smiles_embedding), dim=1)

        return self.fc(concatenated_embeddings)

# Build the model once for the cells below; the constructor calls
# from_pretrained for both encoders, so this line loads both checkpoints.
model = BindingAffinityModel()

In [None]:

# Hyperparameters
EPOCHS = 10
LEARNING_RATE = 1e-4
BATCH_SIZE = 32

# Run on the GPU when one is available; the model and every batch must live
# on the same device.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(DEVICE)

# Define Loss & Optimizer
criterion = nn.MSELoss()  # Mean Squared Error
# torch.optim.AdamW replaces the deprecated AdamW shipped with transformers.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Dataloaders
# The datasets are formatted as torch tensors so the default collate function
# stacks each field into a (batch, ...) tensor; only the columns the model
# consumes are exposed.
MODEL_COLUMNS = ['protein_input', 'smiles_input', 'affinity']
train_dataloader = DataLoader(
    train_dataset.with_format('torch', columns=MODEL_COLUMNS),
    shuffle=True,
    batch_size=BATCH_SIZE,
)
val_dataloader = DataLoader(
    val_dataset.with_format('torch', columns=MODEL_COLUMNS),
    batch_size=BATCH_SIZE,
)


def compute_batch_loss(batch):
    """Return the MSE between the model's predictions and ``batch['affinity']``.

    Predictions and targets are both flattened to 1-D before the loss is taken:
    comparing a (B, 1) prediction with a (B,) target would broadcast to (B, B)
    and silently produce a wrong loss.
    """
    protein_input = {key: value.to(DEVICE) for key, value in batch['protein_input'].items()}
    smiles_input = {key: value.to(DEVICE) for key, value in batch['smiles_input'].items()}
    targets = batch['affinity'].to(DEVICE, dtype=torch.float32).reshape(-1)

    predictions = model(protein_input, smiles_input).reshape(-1)
    return criterion(predictions, targets)


# Training Loop
for epoch in range(EPOCHS):
    model.train()

    for batch in train_dataloader:
        optimizer.zero_grad()

        loss = compute_batch_loss(batch)

        loss.backward()
        optimizer.step()

    # Validation: example-weighted mean MSE over the validation subset.
    model.eval()
    val_loss_sum = 0.0
    val_examples = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch_examples = len(batch['affinity'])
            val_loss_sum += compute_batch_loss(batch).item() * batch_examples
            val_examples += batch_examples

    print(f"epoch {epoch + 1}/{EPOCHS}  val_mse={val_loss_sum / max(val_examples, 1):.4f}")


{'train': ['seq',
  'smiles',
  'affinity_uM',
  'neg_log10_affinity_M',
  'smiles_can',
  'affinity'],
 'no_kras': ['seq',
  'smiles',
  'affinity_uM',
  'neg_log10_affinity_M',
  'smiles_can',
  'affinity'],
 'covalent': ['seq',
  'smiles',
  'affinity_uM',
  'neg_log10_affinity_M',
  'smiles_can',
  'affinity']}

I am working on a machine learning project in Python for predicting protein-ligand binding affinity. Here is the code I have right now:

from transformers import BertModel, BertTokenizer
from transformers import AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset
import re

# Protein side: tokenizer (lower-casing disabled) and model, both loaded from "Rostlab/prot_bert".
prot_tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
prot_model = BertModel.from_pretrained("Rostlab/prot_bert")

# Molecule side: tokenizer and model, both loaded from "seyonec/ChemBERTa-zinc-base-v1".
# NOTE(review): the model is requested as AutoModelForMaskedLM, while the request
# below wants to use mol_model as an encoder — confirm this is the intended class.
mol_tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
mol_model = AutoModelForMaskedLM.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

# Load the binding-affinity dataset by name.
dataset = load_dataset("jglaser/binding_affinity")

The dataset has the following column names and structure:
{'train': ['seq',
  'smiles',
  'affinity_uM',
  'neg_log10_affinity_M',
  'smiles_can',
  'affinity'],
 'no_kras': ['seq',
  'smiles',
  'affinity_uM',
  'neg_log10_affinity_M',
  'smiles_can',
  'affinity'],
 'covalent': ['seq',
  'smiles',
  'affinity_uM',
  'neg_log10_affinity_M',
  'smiles_can',
  'affinity']}

I want to use prot_model as the encoder for the sequences and mol_model as the encoder for the smiles_can data. The model should then predict affinity values using a cross-attention mechanism over the encoded sequence and molecule data. I want you to write the entire program.

You are an experienced software engineer and machine learning engineer. 

Take a deep breath and work on this step by step.