# Supervised Model 2 - Neural Network
In this approach we train a neural network on encoded representations of the ligand (SMILES) and the target (amino-acid sequence) to predict the binding affinity between the two. It is a classic regression task in which we minimize the mean squared error (MSE) between the predicted and actual affinity values.


## Prerequisites

In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.6


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from rdkit import Chem
from rdkit.Chem import AllChem

## Data Loading

In [None]:
# Read the tab-separated EC50 binding table and preview its first rows.
data = pd.read_csv('EC50_bind.tsv', delimiter='\t')
data.head()

Unnamed: 0,drug_id,target_id,smiles,target_seq,origin_affinity,affinity
0,100000,P49862,CN1CCN(Cc2c(O)c(Cl)cc3c(cc(=O)oc23)-c2ccccc2)CC1,MARSLLLPLQILLLSLALETAGEEAQGDKIIDGAPCARGSHPWQVA...,68293,4.165624
1,100001,P49862,COc1ccccc1C1CC(=Nc2nnnn12)c1ccc(C)cc1,MARSLLLPLQILLLSLALETAGEEAQGDKIIDGAPCARGSHPWQVA...,23546,4.628083
2,100002,P49862,Cc1oc2c(CN3CCCC3)c(O)ccc2c(=O)c1-c1ccc(Br)cc1,MARSLLLPLQILLLSLALETAGEEAQGDKIIDGAPCARGSHPWQVA...,>69498,4.158021
3,100003,P49862,CCN1C(c2ccccn2)n2c(nc3ccccc23)-c2ccccc12,MARSLLLPLQILLLSLALETAGEEAQGDKIIDGAPCARGSHPWQVA...,>69511,4.15794
4,100004,P49862,Oc1ccc2c(occ(-c3ccc(Br)cc3)c2=O)c1CN1CCOCC1,MARSLLLPLQILLLSLALETAGEEAQGDKIIDGAPCARGSHPWQVA...,66092,4.179851


In [None]:
# Convert SMILES to fingerprints
def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    """Encode a SMILES string as a Morgan fingerprint bit array.

    Args:
        smiles: SMILES string describing the ligand.
        radius: Morgan fingerprint radius passed to RDKit.
        n_bits: Length of the folded fingerprint.

    Returns:
        A 1-D ``np.ndarray`` of length ``n_bits`` holding 0/1 values. An
        all-zero array is returned when ``smiles`` is not a string or RDKit
        cannot parse it, so every call yields the same type.
    """
    # Local import: DataStructs is not part of the notebook's import cell.
    from rdkit import DataStructs

    # Non-string entries (e.g. NaN from a missing cell) are treated as unparsable.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return np.zeros((n_bits,))

    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    # Copy the RDKit bit vector into a NumPy array so both branches return an
    # ndarray instead of an RDKit object on success and an ndarray on failure.
    fingerprint = np.zeros((0,))
    DataStructs.ConvertToNumpyArray(bit_vect, fingerprint)
    return fingerprint

# Convert sequences to amino acid composition
def sequence_to_composition(sequence):
    """Return the relative frequency of each of the 20 standard amino acids.

    Args:
        sequence: Amino-acid sequence as a string of one-letter codes.

    Returns:
        ``np.ndarray`` of length 20 ordered as ``'ACDEFGHIKLMNPQRSTVWY'``.
        Each entry is ``count(letter) / len(sequence)``. Letters outside
        that alphabet count towards the length but not towards any entry.
        An all-zero vector is returned for an empty or non-string input.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    # Empty strings would divide by zero and missing values (NaN) have no
    # length, so both map to an all-zero composition instead of raising.
    if not isinstance(sequence, str) or not sequence:
        return np.zeros(len(amino_acids))

    length = len(sequence)
    composition = [sequence.count(aa) / length for aa in amino_acids]
    return np.array(composition)

# Derive one feature object per row: a ligand fingerprint from the SMILES
# column and an amino-acid composition vector from the target sequence column.
data['Fingerprint'] = data['smiles'].map(smiles_to_fingerprint)
data['Composition'] = data['target_seq'].map(sequence_to_composition)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [None]:
# Fall back to the CPU when CUDA is unavailable so this cell does not fail on
# machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One float32 tensor per sample for each input branch, placed on `device`.
X_drug = [torch.tensor(fp, dtype=torch.float32).to(device) for fp in data['Fingerprint']]
X_target = [torch.tensor(comp, dtype=torch.float32).to(device) for comp in data['Composition']]
# Pair each drug tensor with the target tensor from the same row.
X = list(zip(X_drug, X_target))
# Regression labels as a single 1-D float32 tensor.
y = torch.tensor(data['affinity'].values, dtype=torch.float32).to(device)

In [None]:
from torch.utils.data import Dataset, DataLoader, random_split

class AffinityDataset(Dataset):
    """Map-style dataset that pairs each entry of ``X`` with its label in ``y``."""

    def __init__(self, X, y):
        # Both containers are stored as given; nothing is copied or converted.
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the label container."""
        sample_count = len(self.y)
        return sample_count

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

def collate_fn(batch):
    """Collate ``((drug, target), affinity)`` samples into one batch.

    Args:
        batch: Sequence of samples as returned by ``AffinityDataset.__getitem__``,
            i.e. ``((drug, target), affinity)``.

    Returns:
        Tuple ``(drugs, targets, affinities)`` where ``drugs`` and ``targets``
        are lists of the per-sample inputs and ``affinities`` is a 1-D float32
        tensor on the notebook-level ``device``.
    """
    drugs = [item[0][0] for item in batch]
    targets = [item[0][1] for item in batch]
    labels = [item[1] for item in batch]

    if labels and all(torch.is_tensor(label) for label in labels):
        # Stack tensor labels directly; torch.tensor(list_of_tensors) would
        # convert every element to a Python scalar first.
        affinities = torch.stack(labels).to(dtype=torch.float32)
    else:
        # Plain Python numbers (or an empty batch) keep the original path.
        affinities = torch.tensor(labels, dtype=torch.float32)

    return drugs, targets, affinities.to(device)

dataset = AffinityDataset(X, y)
test_split_ratio = 0.2
test_size = int(len(dataset) * test_split_ratio)
train_size = len(dataset) - test_size

# Seed the split so the same samples land in the test set on every run. This
# notebook reloads checkpoints saved by earlier runs; with an unseeded split,
# samples evaluated as "test" data may have been trained on previously.
split_seed = 42
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(split_seed),
)

batch_size = 32

# Training batches are reshuffled every epoch; test batches keep a fixed order.
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

## Model Initialization

In [None]:
import torch.nn as nn

class AffinityNN(nn.Module):
    """Two-branch feed-forward regressor for drug-target affinity.

    The drug and target inputs are encoded by separate
    Linear -> ReLU -> Dropout branches (128 and 64 hidden units). The two
    encodings are concatenated and mapped to a single value per sample.

    Args:
        drug_dim: Length of each drug input vector (default 2048).
        target_dim: Length of each target input vector (default 20).
        dropout: Dropout probability used after every hidden layer.
    """

    def __init__(self, drug_dim=2048, target_dim=20, dropout=0.3):
        super(AffinityNN, self).__init__()
        self.drug_fc = nn.Sequential(
            nn.Linear(drug_dim, 128),
            nn.ReLU(),
            nn.Dropout(dropout)
        )
        self.target_fc = nn.Sequential(
            nn.Linear(target_dim, 64),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

        self.fc_combined = nn.Sequential(
            nn.Linear(128 + 64, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 1)
        )

    @staticmethod
    def _as_batch(samples, device):
        """Stack a sequence of 1-D tensors into a 2-D batch on ``device``."""
        if not torch.is_tensor(samples):
            samples = torch.stack(list(samples))
        return samples.to(device)

    def forward(self, drugs, targets):
        """Predict one affinity value per (drug, target) pair.

        Args:
            drugs: List of 1-D drug tensors, or an already stacked 2-D tensor.
            targets: List of 1-D target tensors, or an already stacked 2-D tensor.

        Returns:
            Tensor of shape ``(batch, 1)`` on the model's device.
        """
        # Use the device the parameters live on instead of a notebook global,
        # so the module works wherever it has been moved.
        model_device = next(self.parameters()).device
        drug_batch = self._as_batch(drugs, model_device)
        target_batch = self._as_batch(targets, model_device)

        # Run each branch once on the whole batch rather than once per sample.
        drug_out = self.drug_fc(drug_batch)
        target_out = self.target_fc(target_batch)
        combined = torch.cat((drug_out, target_out), dim=1)
        output = self.fc_combined(combined)
        return output

In [None]:
model = AffinityNN()
# Use the GPU when one is present and fall back to the CPU otherwise, so the
# cell does not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

AffinityNN(
  (drug_fc): Sequential(
    (0): Linear(in_features=2048, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (target_fc): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (fc_combined): Sequential(
    (0): Linear(in_features=192, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [None]:
def initialize_weights(m):
    """Initialise ``nn.Linear`` layers: Xavier-uniform weights and zero bias.

    Intended for ``nn.Module.apply``; modules that are not ``nn.Linear`` are
    left untouched.

    Args:
        m: Module visited by ``apply``.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        # Layers built with bias=False have bias set to None; skip them.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Run initialize_weights on the model and each of its submodules.
model.apply(initialize_weights)

AffinityNN(
  (drug_fc): Sequential(
    (0): Linear(in_features=2048, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (target_fc): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (fc_combined): Sequential(
    (0): Linear(in_features=192, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)

## Training and Evaluation

In [None]:
best_model_path = "best_model.pth"

# Resume from an earlier checkpoint when one exists. On a fresh run there is no
# file yet, so training starts from the initialised weights instead of failing.
try:
    # map_location places the tensors on the active device; weights_only=True
    # restricts unpickling to tensor data rather than arbitrary objects.
    checkpoint = torch.load(best_model_path, map_location=device, weights_only=True)
except FileNotFoundError:
    print(f"No checkpoint found at {best_model_path}; starting from initialised weights.")
else:
    print(model.load_state_dict(checkpoint))

  model.load_state_dict(torch.load(best_model_path))


<All keys matched successfully>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

best_model_path = "best_model.pth"

# Lowest average training loss seen while this cell runs. It starts at infinity
# each time, so the first epoch always overwrites the checkpoint.
# NOTE(review): the checkpoint is selected on training loss (no validation set).
best_loss = float("inf")

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for drugs, targets, affinities in dataloader:
        drugs = [drug.to(device) for drug in drugs]
        targets = [target.to(device) for target in targets]
        affinities = affinities.to(device)

        optimizer.zero_grad()

        outputs = model(drugs, targets)

        # Give predictions and labels the same (batch, 1) shape so MSELoss
        # compares them element-wise without broadcasting.
        outputs = outputs.view(-1, 1)
        affinities = affinities.view(-1, 1)

        loss = criterion(outputs, affinities)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch mean losses for this epoch.
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    if avg_loss < best_loss:
        print(f"New best loss: {avg_loss:.4f}. Saving model...")
        best_loss = avg_loss
        torch.save(model.state_dict(), best_model_path)

print("Loading the best model...")
# map_location places the tensors on the active device; weights_only=True
# restricts unpickling to tensor data rather than arbitrary objects.
model.load_state_dict(torch.load(best_model_path, map_location=device, weights_only=True))
model.eval()

Epoch 1/20, Loss: 2.9766
New best loss: 2.9766. Saving model...
Epoch 2/20, Loss: 1.9402
New best loss: 1.9402. Saving model...
Epoch 3/20, Loss: 1.7555
New best loss: 1.7555. Saving model...
Epoch 4/20, Loss: 1.6190
New best loss: 1.6190. Saving model...
Epoch 5/20, Loss: 1.5055
New best loss: 1.5055. Saving model...
Epoch 6/20, Loss: 1.4265
New best loss: 1.4265. Saving model...
Epoch 7/20, Loss: 1.3429
New best loss: 1.3429. Saving model...
Epoch 8/20, Loss: 1.2914
New best loss: 1.2914. Saving model...
Epoch 9/20, Loss: 1.2336
New best loss: 1.2336. Saving model...
Epoch 10/20, Loss: 1.1958
New best loss: 1.1958. Saving model...
Epoch 11/20, Loss: 1.1484
New best loss: 1.1484. Saving model...
Epoch 12/20, Loss: 1.1044
New best loss: 1.1044. Saving model...
Epoch 13/20, Loss: 1.0818
New best loss: 1.0818. Saving model...
Epoch 14/20, Loss: 1.0443
New best loss: 1.0443. Saving model...
Epoch 15/20, Loss: 1.0181
New best loss: 1.0181. Saving model...
Epoch 16/20, Loss: 0.9979
New best

  model.load_state_dict(torch.load(best_model_path))


AffinityNN(
  (drug_fc): Sequential(
    (0): Linear(in_features=2048, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (target_fc): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (fc_combined): Sequential(
    (0): Linear(in_features=192, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

best_model_path = "best_model.pth"

# Lowest average training loss seen while this cell runs. It starts at infinity
# each time, so the first epoch always overwrites the checkpoint.
# NOTE(review): the checkpoint is selected on training loss (no validation set).
best_loss = float("inf")

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for drugs, targets, affinities in dataloader:
        drugs = [drug.to(device) for drug in drugs]
        targets = [target.to(device) for target in targets]
        affinities = affinities.to(device)

        optimizer.zero_grad()

        outputs = model(drugs, targets)

        # Give predictions and labels the same (batch, 1) shape so MSELoss
        # compares them element-wise without broadcasting.
        outputs = outputs.view(-1, 1)
        affinities = affinities.view(-1, 1)

        loss = criterion(outputs, affinities)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch mean losses for this epoch.
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    if avg_loss < best_loss:
        print(f"New best loss: {avg_loss:.4f}. Saving model...")
        best_loss = avg_loss
        torch.save(model.state_dict(), best_model_path)

print("Loading the best model...")
# map_location places the tensors on the active device; weights_only=True
# restricts unpickling to tensor data rather than arbitrary objects.
model.load_state_dict(torch.load(best_model_path, map_location=device, weights_only=True))
model.eval()

Epoch 1/20, Loss: 0.8854
New best loss: 0.8854. Saving model...
Epoch 2/20, Loss: 0.8640
New best loss: 0.8640. Saving model...
Epoch 3/20, Loss: 0.8509
New best loss: 0.8509. Saving model...
Epoch 4/20, Loss: 0.8421
New best loss: 0.8421. Saving model...
Epoch 5/20, Loss: 0.8175
New best loss: 0.8175. Saving model...
Epoch 6/20, Loss: 0.8072
New best loss: 0.8072. Saving model...
Epoch 7/20, Loss: 0.7853
New best loss: 0.7853. Saving model...
Epoch 8/20, Loss: 0.7711
New best loss: 0.7711. Saving model...
Epoch 9/20, Loss: 0.7616
New best loss: 0.7616. Saving model...
Epoch 10/20, Loss: 0.7472
New best loss: 0.7472. Saving model...
Epoch 11/20, Loss: 0.7345
New best loss: 0.7345. Saving model...
Epoch 12/20, Loss: 0.7174
New best loss: 0.7174. Saving model...
Epoch 13/20, Loss: 0.7086
New best loss: 0.7086. Saving model...
Epoch 14/20, Loss: 0.6974
New best loss: 0.6974. Saving model...
Epoch 15/20, Loss: 0.6861
New best loss: 0.6861. Saving model...
Epoch 16/20, Loss: 0.6726
New best

  model.load_state_dict(torch.load(best_model_path))


AffinityNN(
  (drug_fc): Sequential(
    (0): Linear(in_features=2048, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (target_fc): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (fc_combined): Sequential(
    (0): Linear(in_features=192, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

best_model_path = "best_model.pth"

# Lowest average training loss seen while this cell runs. It starts at infinity
# each time, so the first epoch always overwrites the checkpoint.
# NOTE(review): the checkpoint is selected on training loss (no validation set).
best_loss = float("inf")

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for drugs, targets, affinities in dataloader:
        drugs = [drug.to(device) for drug in drugs]
        targets = [target.to(device) for target in targets]
        affinities = affinities.to(device)

        optimizer.zero_grad()

        outputs = model(drugs, targets)

        # Give predictions and labels the same (batch, 1) shape so MSELoss
        # compares them element-wise without broadcasting.
        outputs = outputs.view(-1, 1)
        affinities = affinities.view(-1, 1)

        loss = criterion(outputs, affinities)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch mean losses for this epoch.
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    if avg_loss < best_loss:
        print(f"New best loss: {avg_loss:.4f}. Saving model...")
        best_loss = avg_loss
        torch.save(model.state_dict(), best_model_path)

print("Loading the best model...")
# map_location places the tensors on the active device; weights_only=True
# restricts unpickling to tensor data rather than arbitrary objects.
model.load_state_dict(torch.load(best_model_path, map_location=device, weights_only=True))
model.eval()

Epoch 1/20, Loss: 0.5902
New best loss: 0.5902. Saving model...
Epoch 2/20, Loss: 0.5764
New best loss: 0.5764. Saving model...
Epoch 3/20, Loss: 0.5709
New best loss: 0.5709. Saving model...
Epoch 4/20, Loss: 0.5658
New best loss: 0.5658. Saving model...
Epoch 5/20, Loss: 0.5553
New best loss: 0.5553. Saving model...
Epoch 6/20, Loss: 0.5493
New best loss: 0.5493. Saving model...
Epoch 7/20, Loss: 0.5456
New best loss: 0.5456. Saving model...
Epoch 8/20, Loss: 0.5408
New best loss: 0.5408. Saving model...
Epoch 9/20, Loss: 0.5344
New best loss: 0.5344. Saving model...
Epoch 10/20, Loss: 0.5271
New best loss: 0.5271. Saving model...
Epoch 11/20, Loss: 0.5263
New best loss: 0.5263. Saving model...
Epoch 12/20, Loss: 0.5201
New best loss: 0.5201. Saving model...
Epoch 13/20, Loss: 0.5159
New best loss: 0.5159. Saving model...
Epoch 14/20, Loss: 0.5149
New best loss: 0.5149. Saving model...
Epoch 15/20, Loss: 0.5105
New best loss: 0.5105. Saving model...
Epoch 16/20, Loss: 0.5065
New best

  model.load_state_dict(torch.load(best_model_path))


AffinityNN(
  (drug_fc): Sequential(
    (0): Linear(in_features=2048, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (target_fc): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (fc_combined): Sequential(
    (0): Linear(in_features=192, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

best_model_path = "best_model.pth"

# Lowest average training loss seen while this cell runs. It starts at infinity
# each time, so the first epoch always overwrites the checkpoint.
# NOTE(review): the checkpoint is selected on training loss (no validation set).
best_loss = float("inf")

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for drugs, targets, affinities in dataloader:
        drugs = [drug.to(device) for drug in drugs]
        targets = [target.to(device) for target in targets]
        affinities = affinities.to(device)

        optimizer.zero_grad()

        outputs = model(drugs, targets)

        # Give predictions and labels the same (batch, 1) shape so MSELoss
        # compares them element-wise without broadcasting.
        outputs = outputs.view(-1, 1)
        affinities = affinities.view(-1, 1)

        loss = criterion(outputs, affinities)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean of the per-batch mean losses for this epoch.
    avg_loss = running_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    if avg_loss < best_loss:
        print(f"New best loss: {avg_loss:.4f}. Saving model...")
        best_loss = avg_loss
        torch.save(model.state_dict(), best_model_path)

print("Loading the best model...")
# map_location places the tensors on the active device; weights_only=True
# restricts unpickling to tensor data rather than arbitrary objects.
model.load_state_dict(torch.load(best_model_path, map_location=device, weights_only=True))
model.eval()

Epoch 1/20, Loss: 0.4921
New best loss: 0.4921. Saving model...
Epoch 2/20, Loss: 0.4891
New best loss: 0.4891. Saving model...
Epoch 3/20, Loss: 0.4845
New best loss: 0.4845. Saving model...
Epoch 4/20, Loss: 0.4827
New best loss: 0.4827. Saving model...
Epoch 5/20, Loss: 0.4836
Epoch 6/20, Loss: 0.4837
Epoch 7/20, Loss: 0.4772
New best loss: 0.4772. Saving model...
Epoch 8/20, Loss: 0.4779
Epoch 9/20, Loss: 0.4735
New best loss: 0.4735. Saving model...
Epoch 10/20, Loss: 0.4720
New best loss: 0.4720. Saving model...
Epoch 11/20, Loss: 0.4711
New best loss: 0.4711. Saving model...
Epoch 12/20, Loss: 0.4691
New best loss: 0.4691. Saving model...
Epoch 13/20, Loss: 0.4677
New best loss: 0.4677. Saving model...
Epoch 14/20, Loss: 0.4626
New best loss: 0.4626. Saving model...
Epoch 15/20, Loss: 0.4630
Epoch 16/20, Loss: 0.4614
New best loss: 0.4614. Saving model...
Epoch 17/20, Loss: 0.4583
New best loss: 0.4583. Saving model...
Epoch 18/20, Loss: 0.4590
Epoch 19/20, Loss: 0.4591
Epoch 20

  model.load_state_dict(torch.load(best_model_path))


AffinityNN(
  (drug_fc): Sequential(
    (0): Linear(in_features=2048, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (target_fc): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (fc_combined): Sequential(
    (0): Linear(in_features=192, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [None]:
# `google.colab` is supplied by the Colab runtime, so this cell is Colab-specific.
from google.colab import files
# Send the saved checkpoint file to the browser as a download.
files.download('best_model.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# NOTE(review): this file name differs from the "best_model.pth" written by the
# training cells; it looks like a re-uploaded copy of a downloaded checkpoint.
# Confirm the file is present before running.
best_model_path = "best_model(1).pth"
# map_location places the tensors on the active device; weights_only=True
# restricts unpickling to tensor data rather than arbitrary objects.
model.load_state_dict(torch.load(best_model_path, map_location=device, weights_only=True))

  model.load_state_dict(torch.load(best_model_path))


<All keys matched successfully>

In [None]:
# Define the loss here so the cell also works when the model was loaded from a
# checkpoint and no training cell has been run in this session.
criterion = nn.MSELoss()

model.eval()
test_loss = 0.0
test_samples = 0

with torch.no_grad():
    for drugs, targets, affinities in test_dataloader:
        predictions = model(drugs, targets)
        loss = criterion(predictions.view(-1), affinities)
        # criterion returns the batch mean; weight it by the batch size so a
        # shorter final batch does not count as much as a full one.
        batch_samples = affinities.numel()
        test_loss += loss.item() * batch_samples
        test_samples += batch_samples

# max(..., 1) avoids a division by zero when the test split is empty.
print(f"Test Loss: {test_loss / max(test_samples, 1)}")

Test Loss: 0.5851542018499458


## Inference

In [None]:
# Resolve the device first so the checkpoint can be mapped straight onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# map_location places the tensors on the active device; weights_only=True
# restricts unpickling to tensor data rather than arbitrary objects.
model.load_state_dict(torch.load(best_model_path, map_location=device, weights_only=True))
model.to(device)
model.eval()

# Show predictions for one batch of held-out data. The training loader would
# show samples the model has already been fitted to.
for drugs, targets, affinities in test_dataloader:
    drugs = [drug.to(device) for drug in drugs]
    targets = [target.to(device) for target in targets]
    affinities = affinities.to(device)

    with torch.no_grad():
        predictions = model(drugs, targets)

    predictions = predictions.view(-1).cpu().numpy()

    # Report the actual number of samples; the last batch can be smaller than
    # the configured batch size.
    print(f"Predicted Affinities (Batch of {len(predictions)}):")
    print(predictions)

    true_affinities = affinities.cpu().numpy()
    print(f"True Affinities (Batch of {len(true_affinities)}):")
    print(true_affinities)

    # Only the first batch is displayed.
    break

Predicted Affinities (Batch of 32):
[5.7246404 5.30561   7.125929  7.3421645 7.009601  7.976798  4.3262377
 7.3268914 5.9205623 8.43082   4.837859  5.0588255 6.3647103 6.056521
 6.162412  5.2577167 6.129155  6.7968106 7.481848  7.5472183 7.7630215
 4.410719  5.227548  6.5005474 4.198215  4.9020133 3.8372068 4.3602743
 7.0100107 7.1810684 6.915792  7.184063 ]
True Affinities (Batch of 32):
[5.3279023 5.223299  7.5086384 7.5228786 7.09691   8.6575775 4.345246
 8.        4.9999566 8.88941   4.5228643 4.9999566 6.939302  5.999566
 6.183096  5.1573906 6.3018994 7.080922  9.167491  8.045757  8.045757
 4.3031983 4.9999566 6.2412395 3.5228772 5.        4.171134  4.
 7.318759  7.1307683 7.39794   7.337242 ]


  model.load_state_dict(torch.load(best_model_path))
