In [2]:
# Given Imports
import torch
import torch.nn as nn
import torch.optim as optim
import re
import os
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertModel, BertTokenizer, RobertaModel, RobertaTokenizer, AdamW
from datasets import load_dataset

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
torch.cuda.empty_cache()

### Load Encoders and Tokenizers

In [4]:
# Protein encoder
prot_tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert")
prot_model = BertModel.from_pretrained("Rostlab/prot_bert").to(device)

# Molecule encoder
mol_tokenizer = RobertaTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
mol_model = RobertaModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1").to(device)

### Load Dataset

In [5]:
dataset = load_dataset("jglaser/binding_affinity")

# split_dataset = dataset["train"].train_test_split(train_size=0.8)
train_dataset = dataset["train"].train_test_split(train_size=0.00008)['train']
test_dataset = dataset["train"].train_test_split(test_size=0.00002)['test']

### Preprocess Data

In [6]:
def preprocess_function(example):
    example['seq'] = re.sub(r"[UZOB]", "X", example['seq'])
    return example

train_dataset = train_dataset.map(preprocess_function)
test_dataset = test_dataset.map(preprocess_function)

Map:   0%|          | 0/1469 [00:00<?, ? examples/s]

Map:   0%|          | 0/368 [00:00<?, ? examples/s]

In [7]:
def encode_protein(prot_seq):
    # Encode protein sequences
    prot_tokens = prot_tokenizer(prot_seq, padding=True, return_tensors='pt')
    with torch.no_grad():
        prot_outputs = prot_model(**prot_tokens.to(device))
    prot_representations = prot_outputs.last_hidden_state.mean(dim=1)
    return prot_representations

def encode_molecule(mol_smiles):
    # Encode molecule sequences
    mol_tokens = mol_tokenizer(mol_smiles, padding=True, return_tensors='pt')
    with torch.no_grad():
        chem_outputs = mol_model(**mol_tokens.to(device))
    chem_representations = chem_outputs.last_hidden_state.mean(dim=1)
    return chem_representations

def encode_sequences(prot_seq, mol_smiles):
    prot_representations = encode_protein(prot_seq)
    chem_representations = encode_molecule(mol_smiles)
    return prot_representations, chem_representations

In [8]:
def create_tensor_dataset(dataset):
    proteins, smiles, affinities = dataset["seq"], dataset["smiles_can"], dataset["affinity"]
    prot_rep, chem_rep = encode_sequences(proteins, smiles)
    return TensorDataset(prot_rep, chem_rep, torch.tensor(affinities))

train_tensor_dataset = create_tensor_dataset(train_dataset)
test_tensor_dataset = create_tensor_dataset(test_dataset)

train_loader = DataLoader(train_tensor_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_tensor_dataset, batch_size=32)

OutOfMemoryError: CUDA out of memory. Tried to allocate 15.26 GiB. GPU 0 has a total capacty of 6.00 GiB of which 0 bytes is free. Of the allocated memory 13.93 GiB is allocated by PyTorch, and 51.44 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
torch.save(train_tensor_dataset, 'data\\train_data_processed')
torch.save(test_tensor_dataset, 'data\\test_data_processed')

In [8]:
train_tensor_dataset = torch.load('data\\test_data_processed')
train_tensor_dataset = torch.load('data\\train_data_processed')