Authors: Thea Enache, Jake Basile

Libraries used:

In [None]:
import copy
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel

# Data Preparation

In [None]:
!pip install transformers datasets sklearn torch
!pip install gdown
!gdown --id 1iF3roCnrO0FbHh292g2r8LTeVNKbt72b
!gdown --id 1igkzOA8mTFdz3FJK6MUdHOMzykH3B_vp
!gdown --id 1QOb6_1MZ0O7bzX0ORTrdiFiUucOqdvvl
# This line of code is calling gdown to download from our google drive. "!gdown --id" remains the same, but to change the file change the unique ID
# which is the random letters and numbers at the end of the google drive link.
# First: Upload your data to the SHARED google drive folder
# Second: Click on your file in google drive, click open in new window
# Third: Check the url of this tab, it should look like :  https://drive.google.com/file/d/1iF3roCnrO0FbHh292g2r8LTeVNKbt72b/view
# The unique ID is the characters after /d/, in this case : 1iF3roCnrO0FbHh292g2r8LTeVNKbt72b
# Copy and paste your new, cleaned data's unique ID below, so that it is also downloaded alongside the other files:
# !gdown --id UniqueIDReplaceMe
# It saves the name of the file from google drive, so make sure it has a meaningful name in drive!
# This simply downloads the file to your runtime, to use the data call the function below (REMEMBER TO SEP='\T' FOR TSV DATA)

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Downloading...
From (original): https://drive.google.com/uc?id=1iF3roCnrO0FbHh292g2r8LTeVNKbt72b
From (redirected): https://drive.google.com/uc?id=1iF3roCnrO0FbHh292g2r8LTeVNKbt72b&confirm=t&uuid=3d0b96bd-21f7-471a-a

In [None]:
# Read the three BindingDB exports (tab-separated). With on_bad_lines='warn',
# rows with the wrong number of fields are skipped and reported instead of
# raising.
df, df1, df2 = (
    pd.read_csv(tsv_name, sep='\t', on_bad_lines='warn')
    for tsv_name in (
        'BindingDB_Covid-19.tsv',
        'BindingDB_BindingDB_Articles.tsv',
        'BindingDB_Patents.tsv',
    )
)

# Show each frame's column index.
for loaded_frame in (df, df1, df2):
    print(loaded_frame.columns)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Skipping line 1132170: expected 50 fields, saw 62
Skipping line 1132171: expected 50 fields, saw 62
Skipping line 1132172: expected 50 fields, saw 62
Skipping line 1132173: expected 50 fields, saw 62
Skipping line 1132174: expected 50 fields, saw 62
Skipping line 1132175: expected 50 fields, saw 62
Skipping line 1132176: expected 50 fields, saw 62
Skipping line 1132177: expected 50 fields, saw 62
Skipping line 1132178: expected 50 fields, saw 62
Skipping line 1132179: expected 50 fields, saw 62
Skipping line 1132180: expected 50 fields, saw 62
Skipping line 1132181: expected 50 fields, saw 62
Skipping line 1132182: expected 50 fields, saw 62
Skipping line 1132183: expected 50 fields, saw 62
Skipping line 1132184: expected 50 fields, saw 62
Skipping line 1132185: expected 50 fields, saw 62
Skipping line 1132186: expected 50 fields, saw 62
Skipping line 1132187: expected 50 fields, saw 62
Skipping line 1132188: expected 50 

Index(['BindingDB Reactant_set_id', 'Ligand SMILES', 'Ligand InChI',
       'Ligand InChI Key', 'BindingDB MonomerID', 'BindingDB Ligand Name',
       'Target Name',
       'Target Source Organism According to Curator or DataSource', 'Ki (nM)',
       'IC50 (nM)', 'Kd (nM)', 'EC50 (nM)', 'kon (M-1-s-1)', 'koff (s-1)',
       'pH', 'Temp (C)', 'Curation/DataSource', 'Article DOI',
       'BindingDB Entry DOI', 'PMID', 'PubChem AID', 'Patent Number',
       'Authors', 'Institution', 'Link to Ligand in BindingDB',
       'Link to Target in BindingDB',
       'Link to Ligand-Target Pair in BindingDB', 'Ligand HET ID in PDB',
       'PDB ID(s) for Ligand-Target Complex', 'PubChem CID of Ligand',
       'PubChem SID of Ligand', 'ChEBI ID of Ligand', 'ChEMBL ID of Ligand',
       'DrugBank ID of Ligand', 'IUPHAR_GRAC ID of Ligand',
       'KEGG ID of Ligand', 'ZINC ID of Ligand',
       'Number of Protein Chains in Target (>1 implies a multichain complex)',
       'BindingDB Target Chain Sequ

In [None]:
# Stack the three exports row-wise under a fresh 0..n-1 index; from here on
# `df` refers to the merged frame.
df = merged_df = pd.concat(objs=[df, df1, df2], ignore_index=True)

Data Cleaning:

In [None]:
# The raw data has many irrelevant columns; these are the ones we care about.
columns_to_keep = [
    'Ligand SMILES',
    'Target Name',
    'Target Source Organism According to Curator or DataSource',
    'Ki (nM)',
    'pH', # not used currently, kept for future research
    'Temp (C)' # not used currently, kept for future research
]

# rename() without inplace returns a new frame, so nothing is written into a
# slice of `df` (the in-place rename raised pandas' SettingWithCopyWarning).
df_filtered = df[columns_to_keep].rename(columns={
    'Target Source Organism According to Curator or DataSource': 'Target Source'  # shorter, clearer name
})

# Keep only rows where every retained column is present.
df_cleaned = df_filtered.dropna()
print(df_cleaned.head())
print(df_cleaned.shape) # was (17282, 6) when this notebook was written

# Tokenization and embedding take a long time, so work on a fixed random
# subset for now. min() keeps the cell working when fewer rows survive dropna.
SAMPLE_SIZE = 500
df_cleaned = df_cleaned.sample(n=min(SAMPLE_SIZE, len(df_cleaned)), random_state=123)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.rename(columns={


                                       Ligand SMILES  \
0  CCOC(=O)\C=C\[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...   
1  CCOC(=O)\C=C\[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...   
2  CCOC(=O)\C=C\[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...   
3  CCOC(=O)\C=C\[C@H](C[C@@H]1CCNC1=O)NC(=O)[C@H]...   
4  C[C@H](OC(C)(C)C)[C@H](NC(=O)OCc1ccccc1)C(=O)N...   

                 Target Name           Target Source Ki (nM)   pH Temp (C)  
0  Replicase polyprotein 1ab  Human SARS coronavirus  >10000  7.5  25.00 C  
1  Replicase polyprotein 1ab  Human SARS coronavirus    2260  7.5  25.00 C  
2  Replicase polyprotein 1ab  Human SARS coronavirus     660  7.5  25.00 C  
3  Replicase polyprotein 1ab  Human SARS coronavirus      58  7.5  25.00 C  
4  Replicase polyprotein 1ab  Human SARS coronavirus      53  7.5  25.00 C  
(17282, 6)


# Tokenization and Embedding

In [None]:
def get_tokenizer(texts, tokenizer):
    """Run `tokenizer` on `texts` and return its raw output.

    Debug helper for inspecting what a tokenizer produces. The tokenizer is
    called with PyTorch tensors requested, padding enabled and truncation
    enabled.
    """
    encoded = tokenizer(
        texts,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    return encoded
# A tokenizer splits its input into "words" (tokens) and maps each one to an integer ID.
# For ligand SMILES, the only tokenizer that makes sense is the ChemBERTa tokenizer.
# Protein (target) names are basically natural language, so the base BERT tokenizer makes more sense for them.
# Understanding tokenization is important for interpreting the embeddings below.

def get_embeddings(texts, model, tokenizer, batch_size=32):
    """Tokenize `texts` and return one mean-pooled embedding per text.

    Parameters
    ----------
    texts : str or sequence of str
        Text(s) to embed. A single string is treated as a one-element list.
    model : callable
        Called as ``model(**tokenizer_output)``; the result must expose
        ``last_hidden_state`` shaped (batch, tokens, hidden).
    tokenizer : callable
        Called with ``return_tensors='pt', padding=True, truncation=True``;
        its output must contain an ``'attention_mask'`` entry.
    batch_size : int, default 32
        Number of texts per forward pass.

    Returns
    -------
    torch.Tensor
        Shape (len(texts), hidden), not attached to the autograd graph.

    Raises
    ------
    ValueError
        If `texts` is empty or `batch_size` is smaller than 1.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        raise ValueError("texts must contain at least one string")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    pooled_batches = []
    # Inference only: without no_grad every activation is kept for backprop.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                return_tensors='pt',
                padding=True,
                truncation=True,
            )
            hidden = model(**inputs).last_hidden_state
            # Average over real tokens only, so an embedding does not depend on
            # how much padding the longest text in its batch introduced.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled_batches.append((hidden * mask).sum(dim=1) / token_counts)
    return torch.cat(pooled_batches, dim=0)

# Tokenizers used throughout the notebook.
chemberta_tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")  # for SMILES
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # for protein-name text
dnabert_tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6")
# Uncomment to inspect the raw token output:
# print(get_tokenizer(df_cleaned['Ligand SMILES'].tolist(), chemberta_tokenizer))

# General-purpose BERT: not trained on chemistry, so it may need pre-training first.
bert_model = BertModel.from_pretrained('bert-base-uncased')

# NOTE(review): the SMILES are tokenized with the ChemBERTa vocabulary but
# embedded by bert-base-uncased, which has its own vocabulary - confirm this
# pairing is intended.
bert_SmileEmbeddings = get_embeddings(
    texts=df_cleaned['Ligand SMILES'].tolist(),
    model=bert_model,
    tokenizer=chemberta_tokenizer,
)
# One list of floats per row.
df_cleaned['BERT_SmileEmbeddings'] = bert_SmileEmbeddings.tolist()
print("BERT Embeddings added to DataFrame:\n", df_cleaned)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/501 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/9.43k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/3.21k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

{'input_ids': tensor([[  0, 267,  21,  ...,   1,   1,   1],
        [  0, 398,  21,  ...,   1,   1,   1],
        [  0, 262, 263,  ...,   1,   1,   1],
        ...,
        [  0, 286,  12,  ...,   1,   1,   1],
        [  0, 267,  21,  ...,   1,   1,   1],
        [  0, 286,  63,  ...,   1,   1,   1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT Embeddings added to DataFrame:
                                             Ligand SMILES  \
38109   Cc1n[nH]c2ccc(cc12)-c1cncc(OC[C@@H](N)Cc2csc3c...   
519867    Fc1cc(NC(=O)Nc2ccc(cc2F)[C@@H]2CNCCO2)cc(c1)C#N   
31444                       CC(=O)Nc1ccc(cc1Cl)S(N)(=O)=O   
61986                  COc1cc(CNC(=O)CCCC\C=C\C(C)C)ccc1O   
642644  C[C@]1(CS(=O)(=O)C(C)(C)C(=N)N1)c1nc2ccc(cc2s1...   
71751   NS(=O)(=O)c1nnc(NC(=O)N[C@]23C[C@H]4C[C@H](C[C...   
606056  Nc1nn2cccnc2c1C(=O)Nc1cn(nc1-c1cc(Cl)ccc1OC(F)...   
583295  CC1(CC(=NO1)c1ccc2c(cc(CCCCC(O)=O)n(-c3ccc(F)c...   
640655                   COc1ccc(NC(=O)NCCCn2cncc2C)cc1OC   
90465                            Oc1ccc2cc(C#N)c(=O)oc2c1   
69633   CC1(C)C(=O)C(C)(C)c2cc(ccc12)C(=O)Nc1ccc(cc1)C...   
33492   CN(C)Cc1ccccc1-c1ccc(cc1)N1CC=Cc2c(nn(c2C1=O)-...   
33591   NC(=N)c1cccc(c1)-n1nc(cc1C(=O)Nc1ccc(cc1F)-n1c...   
53153   CC(C)C[C@H](NC(=O)CCC1CCCCC1)C(=O)NC(Cc1ccccc1...   
119328         N[C@H]1C[C@@H]1c1ccc(nc1)-c1cccc(

In [None]:
# DNABERT SMILES embeddings. The model is pretrained on DNA, so no pre-training
# is needed, though fine-tuning might be.
dnabert_model = AutoModel.from_pretrained("zhihan1996/DNA_bert_6")

# NOTE(review): SMILES are tokenized with the ChemBERTa tokenizer and embedded
# by DNABERT - confirm this pairing is intended.
dnabert_SmileEmbeddings = get_embeddings(
    texts=df_cleaned['Ligand SMILES'].tolist(),
    model=dnabert_model,
    tokenizer=chemberta_tokenizer,
)
# One list of floats per row.
df_cleaned['DNABERT_SmileEmbeddings'] = dnabert_SmileEmbeddings.tolist()
print("DNABERT Embeddings added to DataFrame:\n", df_cleaned)

config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/359M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/359M [00:00<?, ?B/s]

DNABERT Embeddings added to DataFrame:
                                             Ligand SMILES  \
38109   Cc1n[nH]c2ccc(cc12)-c1cncc(OC[C@@H](N)Cc2csc3c...   
519867    Fc1cc(NC(=O)Nc2ccc(cc2F)[C@@H]2CNCCO2)cc(c1)C#N   
31444                       CC(=O)Nc1ccc(cc1Cl)S(N)(=O)=O   
61986                  COc1cc(CNC(=O)CCCC\C=C\C(C)C)ccc1O   
642644  C[C@]1(CS(=O)(=O)C(C)(C)C(=N)N1)c1nc2ccc(cc2s1...   
71751   NS(=O)(=O)c1nnc(NC(=O)N[C@]23C[C@H]4C[C@H](C[C...   
606056  Nc1nn2cccnc2c1C(=O)Nc1cn(nc1-c1cc(Cl)ccc1OC(F)...   
583295  CC1(CC(=NO1)c1ccc2c(cc(CCCCC(O)=O)n(-c3ccc(F)c...   
640655                   COc1ccc(NC(=O)NCCCn2cncc2C)cc1OC   
90465                            Oc1ccc2cc(C#N)c(=O)oc2c1   
69633   CC1(C)C(=O)C(C)(C)c2cc(ccc12)C(=O)Nc1ccc(cc1)C...   
33492   CN(C)Cc1ccccc1-c1ccc(cc1)N1CC=Cc2c(nn(c2C1=O)-...   
33591   NC(=N)c1cccc(c1)-n1nc(cc1C(=O)Nc1ccc(cc1F)-n1c...   
53153   CC(C)C[C@H](NC(=O)CCC1CCCCC1)C(=O)NC(Cc1ccccc1...   
119328         N[C@H]1C[C@@H]1c1ccc(nc1)-c1cc

In [None]:
# ChemBERTa SMILES embeddings (pretrained model, with its own tokenizer).
chemberta_model = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

chemberta_SmileEmbeddings = get_embeddings(
    texts=df_cleaned['Ligand SMILES'].tolist(),
    model=chemberta_model,
    tokenizer=chemberta_tokenizer,
)
# One list of floats per row.
df_cleaned['ChemBERTa_SmileEmbeddings'] = chemberta_SmileEmbeddings.tolist()
print("ChemBERTa Embeddings added to DataFrame:\n", df_cleaned)

# All three ligand-SMILES embedding columns now exist.

pytorch_model.bin:   0%|          | 0.00/179M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/179M [00:00<?, ?B/s]

ChemBERTa Embeddings added to DataFrame:
                                             Ligand SMILES  \
38109   Cc1n[nH]c2ccc(cc12)-c1cncc(OC[C@@H](N)Cc2csc3c...   
519867    Fc1cc(NC(=O)Nc2ccc(cc2F)[C@@H]2CNCCO2)cc(c1)C#N   
31444                       CC(=O)Nc1ccc(cc1Cl)S(N)(=O)=O   
61986                  COc1cc(CNC(=O)CCCC\C=C\C(C)C)ccc1O   
642644  C[C@]1(CS(=O)(=O)C(C)(C)C(=N)N1)c1nc2ccc(cc2s1...   
71751   NS(=O)(=O)c1nnc(NC(=O)N[C@]23C[C@H]4C[C@H](C[C...   
606056  Nc1nn2cccnc2c1C(=O)Nc1cn(nc1-c1cc(Cl)ccc1OC(F)...   
583295  CC1(CC(=NO1)c1ccc2c(cc(CCCCC(O)=O)n(-c3ccc(F)c...   
640655                   COc1ccc(NC(=O)NCCCn2cncc2C)cc1OC   
90465                            Oc1ccc2cc(C#N)c(=O)oc2c1   
69633   CC1(C)C(=O)C(C)(C)c2cc(ccc12)C(=O)Nc1ccc(cc1)C...   
33492   CN(C)Cc1ccccc1-c1ccc(cc1)N1CC=Cc2c(nn(c2C1=O)-...   
33591   NC(=N)c1cccc(c1)-n1nc(cc1C(=O)Nc1ccc(cc1F)-n1c...   
53153   CC(C)C[C@H](NC(=O)CCC1CCCCC1)C(=O)NC(Cc1ccccc1...   
119328         N[C@H]1C[C@@H]1c1ccc(nc1)-c1

In [None]:
# BERT embeddings of the target (protein) names, using BERT's own tokenizer.
bert_ProteinEmbeddings = get_embeddings(
    texts=df_cleaned['Target Name'].tolist(),
    model=bert_model,
    tokenizer=bert_tokenizer,
)
# One list of floats per row.
df_cleaned['BERT_ProteinEmbeddings'] = bert_ProteinEmbeddings.tolist()
print("BERT Embeddings added to DataFrame:\n", df_cleaned)

In [None]:
# DNABERT embeddings of the target names. The model is pretrained on DNA, so no
# pre-training is needed, though fine-tuning might be.
dnabert_tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6")
dnabert_ProteinEmbeddings = get_embeddings(
    texts=df_cleaned['Target Name'].tolist(),
    model=dnabert_model,
    tokenizer=dnabert_tokenizer,
)
# One list of floats per row.
df_cleaned['DNABERT_ProteinEmbeddings'] = dnabert_ProteinEmbeddings.tolist()
print("DNABERT Embeddings added to DataFrame:\n", df_cleaned)

In [None]:
# ChemBERTa embeddings of the target names (pretrained model and tokenizer).
chemberta_tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
chemberta_ProteinEmbeddings = get_embeddings(
    texts=df_cleaned['Target Name'].tolist(),
    model=chemberta_model,
    tokenizer=chemberta_tokenizer,
)
# One list of floats per row.
df_cleaned['ChemBERTa_ProteinEmbeddings'] = chemberta_ProteinEmbeddings.tolist()
print("ChemBERTa Embeddings added to DataFrame:\n", df_cleaned)

# All three target-name embedding columns now exist.

Next, we just assign integer values to each unique entry in the Target Source column. Not so bad!

In [None]:
# Turn the categorical 'Target Source' strings into integer codes (starting at
# 0). The column used to be encoded twice - once with pd.Categorical and again
# with LabelEncoder, which overwrote the first result - so a single
# fit_transform is enough. LabelEncoder is already imported at the top of the
# notebook. The fitted encoder is kept in `label_encoder`.
label_encoder = LabelEncoder()
df_cleaned['Target Source Code'] = label_encoder.fit_transform(df_cleaned['Target Source'])

print(df_cleaned)
print(df_cleaned.columns)

# Data Loading and custom dataset creation

In [None]:

# custom dataset class
class MoleculeDataset(Dataset):
    """Pairs one ligand embedding and one protein embedding per row with its Ki.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Ki (nM)' column plus the two selected embedding
        columns, whose cells are 1-D numeric sequences.
    smiles_embeddings_index, protein_embeddings_index : int or str
        Position (as before) or name of the embedding column to use.

    Attributes
    ----------
    smiles_embeddings, protein_embeddings : list of numpy.ndarray
        Per-row vectors of the two selected columns.
    embeddings : torch.FloatTensor
        Shape (rows, smiles_dim + protein_dim): the two vectors concatenated.
    target : torch.FloatTensor
        Shape (rows,): Ki in nM.

    Raises
    ------
    ValueError
        If a selected column does not hold 1-D numeric vectors, or the
        concatenated vectors differ in length between rows.
    """

    def __init__(self, dataframe, smiles_embeddings_index, protein_embeddings_index):
        self.dataframe = dataframe

        # Convert only the two requested columns instead of all six.
        self.smiles_embeddings = self._embedding_column(dataframe, smiles_embeddings_index)
        self.protein_embeddings = self._embedding_column(dataframe, protein_embeddings_index)

        # Concatenate ligand and protein vectors row by row.
        combined = [
            np.concatenate(pair)
            for pair in zip(self.smiles_embeddings, self.protein_embeddings)
        ]
        if len({vector.shape[0] for vector in combined}) > 1:
            raise ValueError("Embedding vectors must have the same length in every row")

        # Build one contiguous array first; converting a Python list of
        # ndarrays straight to a tensor is very slow.
        self.embeddings = torch.tensor(
            np.asarray(combined, dtype=np.float32), dtype=torch.float32
        )

        self.target = torch.tensor(
            self._parse_ki(dataframe['Ki (nM)']), dtype=torch.float32
        )

    @staticmethod
    def _embedding_column(dataframe, column):
        """Return the chosen column as a list of 1-D numeric arrays."""
        name = column if isinstance(column, str) else dataframe.columns[column]
        vectors = [np.array(value) for value in dataframe[name]]
        for vector in vectors:
            if vector.ndim != 1 or not np.issubdtype(vector.dtype, np.number):
                raise ValueError(
                    f"Column {name!r} does not hold 1-D numeric embedding vectors"
                )
        return vectors

    @staticmethod
    def _parse_ki(series):
        """Parse Ki values to float32, keeping the bound of censored entries.

        Entries such as '>10000' are read as their bound (10000) rather than
        being coerced to NaN and labelled 0 nM. Anything still unparseable
        falls back to 0, as before.
        """
        as_text = series.astype(str).str.strip().str.lstrip('<>=~').str.strip()
        numeric = pd.to_numeric(as_text, errors='coerce')
        return numeric.fillna(0).to_numpy(dtype=np.float32)

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        return {'inputs': self.embeddings[idx], 'Ki': self.target[idx]}

#-------------------------------------------Thea 12/6
#fixed val split

# 80/20 train/validation split with a fixed seed.
train_df, val_df = train_test_split(df_cleaned, test_size=0.2, random_state=123)

# One-letter code -> embedding column. A combination key such as 'DC' means
# DNABERT SMILES embeddings + ChemBERTa protein embeddings.
SMILES_EMBEDDING_COLUMNS = {
    'B': 'BERT_SmileEmbeddings',
    'D': 'DNABERT_SmileEmbeddings',
    'C': 'ChemBERTa_SmileEmbeddings',
}
PROTEIN_EMBEDDING_COLUMNS = {
    'B': 'BERT_ProteinEmbeddings',
    'D': 'DNABERT_ProteinEmbeddings',
    'C': 'ChemBERTa_ProteinEmbeddings',
}
BATCH_SIZE = 32


def build_combination_datasets(frame):
    """Return {'BB': dataset, 'BD': ..., 'CC': ...} for every model pairing.

    Column positions are looked up by name, so the result does not depend on
    the order in which the embedding cells added their columns. Keys are
    produced in the order BB, BD, BC, DB, DD, DC, CB, CD, CC.
    """
    return {
        smiles_code + protein_code: MoleculeDataset(
            frame,
            frame.columns.get_loc(smiles_column),
            frame.columns.get_loc(protein_column),
        )
        for smiles_code, smiles_column in SMILES_EMBEDDING_COLUMNS.items()
        for protein_code, protein_column in PROTEIN_EMBEDDING_COLUMNS.items()
    }


train_datasets = build_combination_datasets(train_df)
val_datasets = build_combination_datasets(val_df)

# Shuffle only the training batches; validation order stays fixed.
train_loader = {
    name: DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    for name, dataset in train_datasets.items()
}

val_loader = {
    name: DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)
    for name, dataset in val_datasets.items()
}
#-------------------------------------------^

# Neural Network

In [None]:
# Neural Network Model
class DTIRegressionModel(nn.Module):
    """Three-layer MLP regressor over a [ligand | protein] embedding vector.

    Parameters
    ----------
    input_size : int
        Width of the vectors passed to ``forward`` (ligand and protein halves
        together, e.g. 1536 for two 768-wide embeddings).
    hidden_size1, hidden_size2 : int
        Widths of the two hidden layers.
    output_size : int
        Width of the output layer.
    embedding_combination_method : {'concatenation', 'sum', 'average'}
        'concatenation' feeds the full vector to the first layer; 'sum' and
        'average' split it into two equal halves and combine them
        element-wise, so the first layer is ``input_size // 2`` wide.

    Raises
    ------
    ValueError
        For an unknown method, or an odd ``input_size`` with 'sum'/'average'.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, embedding_combination_method):
        super(DTIRegressionModel, self).__init__()

        self.embedding_combination_method = embedding_combination_method

        if embedding_combination_method == 'concatenation':
            first_layer_width = input_size
        elif embedding_combination_method in ('sum', 'average'):
            if input_size % 2 != 0:
                raise ValueError(
                    "input_size must be even for 'sum' or 'average' so it can be split into two halves."
                )
            # Half the input width (768 for the usual 1536-wide input) rather
            # than a hard-coded 768.
            first_layer_width = input_size // 2
        else:
            raise ValueError("Invalid embedding_combination_method. Choose from 'concatenation', 'sum', or 'average'.")

        self.fc1 = nn.Linear(first_layer_width, hidden_size1)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.dropout = nn.Dropout(0.1)
        self.fc3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        """Map a (batch, input_size) tensor to (batch, output_size) predictions."""
        # Scale every row to unit L2 norm.
        x = torch.nn.functional.normalize(x, p=2, dim=1)

        if self.embedding_combination_method == 'concatenation':
            combined_embeddings = x
        elif self.embedding_combination_method in ('sum', 'average'):
            # First half is the ligand embedding, second half the protein one.
            half = x.shape[1] // 2
            smiles_embeddings = x[:, :half]
            protein_embeddings = x[:, half:2 * half]
            combined_embeddings = smiles_embeddings + protein_embeddings
            if self.embedding_combination_method == 'average':
                combined_embeddings = combined_embeddings / 2
        else:
            raise ValueError("Invalid embedding_combination_method. Choose from 'concatenation', 'sum', or 'average'.")

        # Standardize each row to zero mean / unit std (epsilon avoids division by zero).
        row_mean = combined_embeddings.mean(dim=1, keepdim=True)
        row_std = combined_embeddings.std(dim=1, keepdim=True)
        combined_embeddings = (combined_embeddings - row_mean) / (row_std + 1e-6)

        # fc1 -> ReLU -> dropout -> fc2 -> ReLU -> dropout -> fc3
        out = self.dropout(self.relu(self.fc1(combined_embeddings)))
        out = self.dropout(self.relu(self.fc2(out)))
        return self.fc3(out)

# Training and results

In [None]:
# Evaluate Huber
import copy  # Import the copy module for deep copying
def evaluate_huber_loss(model, loader):
    """Return the mean per-sample Huber loss of `model` over `loader`.

    Parameters
    ----------
    model : torch.nn.Module
        Switched to evaluation mode; called on each batch's 'inputs'.
    loader : iterable of dict
        Each batch provides 'inputs' and 'Ki' tensors.

    Returns
    -------
    float
        Summed loss divided by the number of samples seen.

    Raises
    ------
    ValueError
        If the loader yields no samples.
    """
    model.eval()
    total_loss = 0.0
    sample_count = 0
    with torch.no_grad():
        for batch in loader:
            # Column-shape the labels like the model output; a (B,) label
            # against a (B, 1) output would broadcast to a (B, B) loss matrix.
            labels = batch['Ki'].view(-1, 1)
            outputs = model(batch['inputs'])
            # smooth_l1_loss with its default beta of 1.0 is the Huber loss with delta=1.0.
            total_loss += F.smooth_l1_loss(outputs, labels, reduction='sum').item()
            sample_count += labels.shape[0]
    if sample_count == 0:
        raise ValueError("loader yielded no samples")
    # The loss was summed per sample, so average over samples, not batches.
    return total_loss / sample_count

# Module-level result stores. ProjectWork writes to totResults and to the two
# loss-history dicts; no code visible here writes to the module-level
# huber_loss_results.
totResults, huber_loss_results = dict(), dict()
train_loss_histories, val_loss_histories = dict(), dict()  # per-combination train / val loss per epoch

# Training and Evaluation Workflow
def ProjectWork(concatType, train_loader, val_loader, num_epochs=8, learning_rate=0.001):
    """Train one DTIRegressionModel per embedding combination and rank them.

    Parameters
    ----------
    concatType : str
        Passed to DTIRegressionModel as `embedding_combination_method`
        ('concatenation', 'sum' or 'average').
    train_loader, val_loader : dict
        Combination name -> loader of batches with 'inputs' and 'Ki'. Every
        key of `train_loader` must also be in `val_loader`.
    num_epochs : int, default 8
        Training epochs per combination (at least 1).
    learning_rate : float, default 0.001
        Adam learning rate.

    Side effects
    ------------
    Writes to the module-level dicts instead of returning a value:
    - train_loss_histories[combination] / val_loss_histories[combination]:
      mean loss per epoch (overwritten on each call, so copy between calls);
    - totResults[concatType]: {"Top 3 Models": [(combination, val_loss), ...]}
      for the three lowest final validation losses;
    - saved_models[concatType]: the three corresponding models.

    Raises
    ------
    ValueError
        If `num_epochs` is smaller than 1.
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    # (final val loss, combination, model) for the best three so far.
    top_3_models = []

    for combination, train_loader_comb in train_loader.items():
        val_loader_comb = val_loader[combination]

        # Read the feature width from one batch.
        input_size = next(iter(train_loader_comb))['inputs'].shape[1]
        model = DTIRegressionModel(
            input_size,  # 1536
            hidden_size1=128,
            hidden_size2=64,
            output_size=1,
            embedding_combination_method=concatType
        )

        criterion = nn.HuberLoss(delta=1.0)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        train_loss_history = []
        val_loss_history = []

        for epoch in range(num_epochs):
            # Training phase
            model.train()
            train_loss = 0.0
            for batch in train_loader_comb:
                inputs = batch['inputs']
                labels = batch['Ki'].view(-1, 1)  # match the (batch, 1) model output

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                train_loss += loss.item()

            train_loss /= len(train_loader_comb)  # mean of the per-batch losses
            train_loss_history.append(train_loss)

            # Validation phase
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for batch in val_loader_comb:
                    inputs = batch['inputs']
                    labels = batch['Ki'].view(-1, 1)
                    val_loss += criterion(model(inputs), labels).item()

            val_loss /= len(val_loader_comb)
            val_loss_history.append(val_loss)

            # Report the epoch-mean training loss, not the loss of the last batch.
            print(f'Combination: {combination}, Method: {concatType.capitalize()}, Epoch [{epoch + 1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

        # Publish this combination's loss curves.
        train_loss_histories[combination] = train_loss_history
        val_loss_histories[combination] = val_loss_history

        # Keep the three combinations with the lowest final validation loss.
        top_3_models.append((val_loss, combination, model))
        top_3_models = sorted(top_3_models, key=lambda entry: entry[0])[:3]

    # Save results for this combination method
    totResults[concatType] = {"Top 3 Models": [(comb, val) for val, comb, _ in top_3_models]}

    # Save models for this combination method
    saved_models[concatType] = [model for _, _, model in top_3_models]

# Fresh containers that ProjectWork fills, keyed by combination method.
saved_models, totResults = {}, {}

# ProjectWork overwrites the shared loss-history dicts on every call, so take a
# deep copy after each run before starting the next one.
ProjectWork("concatenation", train_loader, val_loader)
contrain_loss_histories = copy.deepcopy(train_loss_histories)
conval_loss_histories = copy.deepcopy(val_loss_histories)

ProjectWork("sum", train_loader, val_loader)
sumtrain_loss_histories = copy.deepcopy(train_loss_histories)
sumval_loss_histories = copy.deepcopy(val_loss_histories)

ProjectWork("average", train_loader, val_loader)
avgtrain_loss_histories = copy.deepcopy(train_loss_histories)
avgval_loss_histories = copy.deepcopy(val_loss_histories)

# Print the three best combinations for each method.
print("\nTop 3 Models Summary:")
for method_name, method_summary in totResults.items():
    print(f'\n{method_name.capitalize()}:')
    rank = 0
    for comb, val_loss in method_summary["Top 3 Models"]:
        rank += 1
        print(f'  Rank {rank}: Combination = {comb}, Val Loss = {val_loss:.4f}')

# Code for plots, designed for the Linux environment in which it was run.

In [None]:
# Define a function to plot the relative loss history for each method and save the plots
def plot_relative_loss_history(train_loss_histories, val_loss_histories, concatType, output_dir="plots"):
    """Plot each combination's loss as a proportion of the 'BB' baseline and save it.

    Parameters
    ----------
    train_loss_histories, val_loss_histories : dict
        Combination name -> list of per-epoch losses. The 'BB' entry is the
        baseline every other combination is divided by; if it is missing the
        panels stay empty.
    concatType : str
        Combination method, used in the titles and the file name.
    output_dir : str, default "plots"
        Directory for the PNG; created if it does not exist.

    The figure is written to ``<output_dir>/<concatType>_loss_proportions.png``
    and closed; nothing is returned.
    """
    # (histories, short tag, title word, line style, marker) for each panel.
    panels = (
        (train_loss_histories, 'Train', 'Training', '-', 'o'),
        (val_loss_histories, 'Val', 'Validation', '--', 'x'),
    )

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    for ax, (histories, tag, title_word, linestyle, marker) in zip(axes, panels):
        baseline = histories.get('BB', [])
        for combination, losses in histories.items():
            if combination == 'BB':
                continue  # the baseline itself is not plotted
            # A zero baseline loss gives NaN (a gap in the line) instead of
            # raising ZeroDivisionError.
            relative_loss = [
                loss / bb_loss if bb_loss else float('nan')
                for loss, bb_loss in zip(losses, baseline)
            ]
            ax.plot(relative_loss, label=f'{combination} - {tag}', linestyle=linestyle, marker=marker)

        ax.set_title(f'{title_word} Loss Proportions Over Epochs ({concatType.capitalize()})')
        ax.set_xlabel('Epochs')
        ax.set_ylabel(f'Relative Loss ({tag})')
        ax.legend()
        ax.grid(True)

    fig.tight_layout()  # avoid overlapping labels

    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, f"{concatType}_loss_proportions.png")
    fig.savefig(file_path)
    print(f"Plot saved to {file_path}")

    plt.close(fig)  # free the figure's memory

# After training is complete, generate and save graphs for each concatenation type

def plot_true_loss(train_loss_histories, val_loss_histories, concatType, output_dir="plots"):
    """
    Plots the true training and validation loss for each combination over epochs.

    Parameters:
    - train_loss_histories (dict): Dictionary with keys as combination names and values as lists of training losses.
    - val_loss_histories (dict): Dictionary with keys as combination names and values as lists of validation losses.
    - concatType (str): Type of concatenation to include in the plot title and file name.
    - output_dir (str): Directory for the PNG (default "plots"); created if it does not exist.

    The figure is written to <output_dir>/<concatType>_true_loss.png and closed; nothing is returned.
    """
    # (histories, short tag, title word, line style, marker) for each panel.
    panels = (
        (train_loss_histories, 'Train', 'Training', '-', 'o'),
        (val_loss_histories, 'Val', 'Validation', '--', 'x'),
    )

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    for ax, (histories, tag, title_word, linestyle, marker) in zip(axes, panels):
        for combination, losses in histories.items():
            ax.plot(losses, label=f'{combination} - {tag}', linestyle=linestyle, marker=marker)

        ax.set_title(f'{title_word} Loss Over Epochs ({concatType.capitalize()})')
        ax.set_xlabel('Epochs')
        ax.set_ylabel(f'True Loss ({tag})')
        ax.legend()
        ax.grid(True)

    fig.tight_layout()  # avoid overlapping labels

    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, f"{concatType}_true_loss.png")
    fig.savefig(file_path)
    print(f"Plot saved to {file_path}")

    plt.close(fig)  # free the figure's memory


# Save one relative-loss figure per combination method, using the history
# snapshots taken after each training run.
for method_name, train_snapshot, val_snapshot in (
    ("concatenation", contrain_loss_histories, conval_loss_histories),
    ("sum", sumtrain_loss_histories, sumval_loss_histories),
    ("average", avgtrain_loss_histories, avgval_loss_histories),
):
    plot_relative_loss_history(train_snapshot, val_snapshot, method_name)