### 1. Importing important libraries

In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import timeit

import os
import pickle
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import  confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix
from matplotlib.pyplot import figure
import wandb

wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdanish2562022[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### 2. Basic EDA

In [2]:
# Read the raw train / test interaction tables (protein pairs + label, embeddings not yet attached).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../csv_files_new_ppi/training_and_test_set/train_set_without_embedding.csv",
        "../../csv_files_new_ppi/training_and_test_set/test_set_without_embedding.csv",
    )
)

In [3]:
# Balance the training set: keep at most the first 100000 rows of each class.
train_pos = train.loc[train["label"] == 1].iloc[:100000]
train_neg = train.loc[train["label"] == 0].iloc[:100000]
train = pd.concat([train_pos, train_neg])

# Balance the test set: keep every positive and an equal number of leading negatives.
test_pos = test.loc[test["label"] == 1]
test_neg = test.loc[test["label"] == 0].iloc[:len(test_pos)]
test = pd.concat([test_pos, test_neg])
# Last expression of the cell: shows the per-class counts of the balanced test set.
test["label"].value_counts()

1    21347
0    21347
Name: label, dtype: int64

In [4]:
# Carve the validation set out of the balanced test set: the first 10000 rows of
# each class become `val`, everything after them stays in `test`.
# Both masks are taken from `test` before it is reassigned below.
pos_rows = test["label"] == 1
neg_rows = test["label"] == 0

val_pos = test[pos_rows].iloc[:10000]
val_neg = test[neg_rows].iloc[:10000]
val = pd.concat([val_pos, val_neg])

test_pos = test[pos_rows].iloc[10000:]
test_neg = test[neg_rows].iloc[10000:]
test = pd.concat([test_pos, test_neg])

In [5]:
# Report the number of rows in each split.
for split_name, split_frame in (("Train", train), ("Test", test), ("val", val)):
    print(f"Size of {split_name} dataset: ", len(split_frame))

Size of Train dataset:  200000
Size of Test dataset:  22694
Size of val dataset:  20000


In [6]:
# Report the class balance of every split.
# Fixes: the validation counts used to be printed under the label "test set",
# and `value_counts()[k]` raised KeyError when a class was absent from a split
# (`.get(k, 0)` reports 0 instead).
class_balance_splits = (("training", train), ("test", test), ("val", val))
for split_position, (split_name, split_frame) in enumerate(class_balance_splits):
    if split_position > 0:
        # Separator line between splits, same as before.
        print("----"*57)
    label_counts = split_frame["label"].value_counts()
    print(f"Number of negative points in {split_name} set: {label_counts.get(0, 0)}")
    print(f"Number of positive points in {split_name} set: {label_counts.get(1, 0)}")

Number of negative points in training set: 100000
Number of positive points in training set: 100000
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Number of negative points in test set: 11347
Number of positive points in test set: 11347
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Number of negative points in test set: 10000
Number of positive points in test set: 10000


### 3. Importing embedding vectors from pickle file

In [7]:
# `dc` is indexed by protein name further down (see `return_embed`).
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
embedding_pickle_path = "../../pickle/embedding_vectors_pickle/sum_of_amino_acid_vector.pickle"
with open(embedding_pickle_path, 'rb') as pickle_file:
    dc = pickle.load(pickle_file)

In [8]:
def return_embed(prot_name):
    """Look up the embedding stored for ``prot_name`` in the global ``dc`` mapping.

    Args:
        prot_name: key to look up in ``dc``.

    Returns:
        ``dc[prot_name]`` when the key is present, otherwise ``np.nan`` so that
        rows without an embedding can be removed later with ``dropna``.

    Only a missing key is treated as "no embedding". Any other failure (for
    example ``dc`` not being loaded yet) now propagates instead of being
    silently turned into NaN by a bare ``except``.
    """
    try:
        return dc[prot_name]
    except KeyError:
        return np.nan
# Attach one embedding column per interaction partner to every split.
for split_frame in (train, test, val):
    for partner in ("A", "B"):
        split_frame[f"embed_vec_protein_{partner}"] = split_frame[f"Protein_{partner}"].apply(return_embed)

# Drop rows whose lookup returned NaN.
# NOTE: dropna() without `subset` removes rows with a missing value in *any* column.
train, test, val = (split_frame.dropna() for split_frame in (train, test, val))

In [9]:
def _collect_features(frame, desc=None):
    """Pull the two embedding columns and the label column out of ``frame``.

    Iterates the three columns directly instead of building a row Series with
    ``frame.iloc[i]`` three times per row, which is what made the original loop
    slow. The resulting lists hold the same values in the same order.

    Args:
        frame: DataFrame with columns ``embed_vec_protein_A``,
            ``embed_vec_protein_B`` and ``label``.
        desc: optional label for the progress bar.

    Returns:
        Tuple ``(features_a, features_b, labels)`` of three lists with one
        ``np.ndarray`` per row of ``frame``.
    """
    features_a, features_b, labels = [], [], []
    rows = zip(frame["embed_vec_protein_A"], frame["embed_vec_protein_B"], frame["label"])
    for vec_a, vec_b, label in tqdm(rows, total=len(frame), desc=desc):
        features_a.append(np.array(vec_a))
        features_b.append(np.array(vec_b))
        labels.append(np.array(label))
    return features_a, features_b, labels


train_features_Protein_A, train_features_Protein_B, train_label = _collect_features(train, "train")
test_features_Protein_A, test_features_Protein_B, test_label = _collect_features(test, "test")
val_features_Protein_A, val_features_Protein_B, val_label = _collect_features(val, "val")

100%|████████████████████████████████████████████████| 200000/200000 [00:43<00:00, 4573.98it/s]
100%|██████████████████████████████████████████████████| 22694/22694 [00:05<00:00, 4521.98it/s]
100%|██████████████████████████████████████████████████| 20000/20000 [00:04<00:00, 4537.48it/s]


In [10]:
# Turn each per-row list into a single NumPy array, one split at a time.
train_features_Protein_A, train_features_Protein_B, train_label = map(
    np.array, (train_features_Protein_A, train_features_Protein_B, train_label)
)

test_features_Protein_A, test_features_Protein_B, test_label = map(
    np.array, (test_features_Protein_A, test_features_Protein_B, test_label)
)

val_features_Protein_A, val_features_Protein_B, val_label = map(
    np.array, (val_features_Protein_A, val_features_Protein_B, val_label)
)

### 4. Dataloader

In [11]:
class Data(Dataset):
    """Dataset of aligned triples: features of A, features of B, and the label.

    The three containers are indexed with the same position; the length of the
    dataset is the length of ``X_data_A``.
    """

    def __init__(self, X_data_A,X_data_B, y_data):
        # Store the three parallel containers as given (no copy, no validation).
        self.X_data_A, self.X_data_B, self.y_data = X_data_A, X_data_B, y_data

    def __getitem__(self, index):
        """Return ``(A[index], B[index], y[index])`` as a tuple."""
        sample_a = self.X_data_A[index]
        sample_b = self.X_data_B[index]
        target = self.y_data[index]
        return sample_a, sample_b, target

    def __len__ (self):
        """Number of samples, taken from the first feature container."""
        first_container = self.X_data_A
        return len(first_container)

In [12]:
def _to_dataset(features_a, features_b, labels):
    """Convert the three arrays with ``torch.FloatTensor`` and wrap them in ``Data``."""
    return Data(*(torch.FloatTensor(array) for array in (features_a, features_b, labels)))


train_data = _to_dataset(train_features_Protein_A, train_features_Protein_B, train_label)

test_data = _to_dataset(test_features_Protein_A, test_features_Protein_B, test_label)

val_data = _to_dataset(val_features_Protein_A, val_features_Protein_B, val_label)

In [13]:

# All three loaders share the same settings.
# NOTE: drop_last=True discards the final partial batch, so evaluation on the
# val/test loaders skips up to 511 rows; the model's forward pass in this
# notebook reshapes to a fixed batch of 512, which is why full batches are required.
_loader_kwargs = dict(batch_size=512, shuffle=True, drop_last=True)
train_loader = DataLoader(dataset=train_data, **_loader_kwargs)
test_loader = DataLoader(dataset=test_data, **_loader_kwargs)
val_loader = DataLoader(dataset=val_data, **_loader_kwargs)

In [14]:
# Sanity check: print the tensor shapes of the first training batch only.
for first_batch in train_loader:
    for batch_tensor in first_batch:
        print(batch_tensor.shape)
    break
    

torch.Size([512, 1024])
torch.Size([512, 1024])
torch.Size([512])


### 5. Building Models

In [36]:
class BertClassifier(nn.Module):
    """Siamese binary classifier over a pair of fixed-length embedding vectors.

    Both inputs go through the same encoder (one ``Conv1d`` followed by three
    linear layers, shared weights). The two encodings are multiplied
    element-wise and the product is fed to a stack of fully connected layers
    that ends in a single logit per pair.

    Changes compared with the previous version:
      * the batch size is read from the input instead of being fixed at 512, so
        partial batches and single samples in eval mode work;
      * the flattened convolution size is derived from ``embed_dim`` (it was the
        literal 33726 = 33 * (1024 - 2), which ignored the ``embed_dim`` argument);
      * the duplicated per-branch code lives in ``_encode``.
    Layer names and creation order are unchanged, so existing ``state_dict``
    checkpoints still load.

    Args:
        config: object exposing ``dim_1``, ``dim_2``, ``layer_fc_1``,
            ``layer_fc_2`` and ``dropout`` as attributes.
        embed_dim: length of each input embedding vector (default 1024).
    """

    def __init__(self,config, embed_dim =1024):
        super(BertClassifier,self).__init__()
        self.relu = nn.ReLU()
        self.config = config
        self.embed_dim = embed_dim
        self.conv1  = nn.Conv1d(in_channels = 1,out_channels = 33, kernel_size = 3, stride=1)
        # Conv1d with kernel 3, stride 1 and no padding shortens the sequence by 2;
        # 33 output channels -> 33 * (embed_dim - 2) flattened features.
        conv_features = self.conv1.out_channels * (embed_dim - self.conv1.kernel_size[0] + 1)
        self.fc1 = nn.Linear(conv_features,config.dim_1)
        self.fc1_2 = nn.Linear(config.dim_1,config.dim_1)
        self.fc1_3 = nn.Linear(config.dim_1,config.dim_1)
        self.fully_connected_layers_1 = nn.ModuleList([nn.Linear(config.dim_1,config.dim_1)
                                                    for _ in range(config.layer_fc_1)])

        self.fc_2 = nn.Linear(config.dim_1,config.dim_2)
        self.fully_connected_layers_2 = nn.ModuleList([nn.Linear(config.dim_2,config.dim_2)
                                                    for _ in range(config.layer_fc_2)])
        self.bn2 = nn.BatchNorm1d(num_features=config.dim_2)
        self.fc3 = nn.Linear(config.dim_2,config.dim_1)

        self.fc4 = nn.Linear(config.dim_1,256)
        self.drop = nn.Dropout(p = 0.2)
        self.fc5 = nn.Linear(256,128)
        self.fc6 = nn.Linear(128,64)
        self.fc7 = nn.Linear(64,32)
        self.fc8 = nn.Linear(32,16)
        self.fc9 = nn.Linear(16,8)
        self.fc10 = nn.Linear(8,1)

    def _encode(self, inputs):
        """Encode one side of the pair with the shared conv + linear stack."""
        batch_size = inputs.size(0)
        # Add the single input channel expected by Conv1d: (batch, 1, embed_dim).
        features = inputs.reshape(batch_size, 1, -1)
        features = self.relu(self.conv1(features))
        # Flatten channels x positions into one feature vector per sample.
        features = features.reshape(batch_size, -1)
        features = self.relu(self.fc1(features))
        features = self.relu(self.fc1_2(features))
        # The constant x100 scaling is kept from the original implementation.
        return self.relu(self.fc1_3(features))*100

    def forward(self, inputs_A,inputs_B):
        """Return one raw logit per pair.

        Args:
            inputs_A: tensor of shape ``(batch, embed_dim)`` for the first protein.
            inputs_B: tensor of the same shape for the second protein.

        Returns:
            Tensor of shape ``(batch, 1)`` with un-normalised logits (apply a
            sigmoid, or use ``BCEWithLogitsLoss``, to obtain probabilities).
        """
        output_A = self._encode(inputs_A)
        output_B = self._encode(inputs_B)

        # Element-wise product: the pair representation does not depend on input order.
        output = torch.mul(output_A, output_B)
        for layer in self.fully_connected_layers_1:
            output = self.relu(layer(output))
        output = self.relu(self.fc_2(output))
        for layer in self.fully_connected_layers_2:
            output = self.relu(layer(output))

        # NOTE: in training mode BatchNorm1d needs more than one sample per batch.
        output  = self.bn2(output)
        output = self.relu(self.fc3(output))
        if self.config.dropout:
            output = self.drop(output)

        output = self.relu(self.fc4(output))
        output = self.relu(self.fc5(output))
        output = self.relu(self.fc6(output))
        # fc7..fc10 are applied without activations in between, as in the original.
        output = self.fc7(output)
        output = self.fc8(output)
        output = self.fc9(output)
        output = self.fc10(output)

        return output

In [37]:
def build_optimizer(network, optimizer,learning_rate, momentum, weight_decay, amsgrad):
    """Create the torch optimizer named by ``optimizer`` for ``network``.

    Previously only ``"adam"`` was handled and every other name failed with
    ``UnboundLocalError`` on the return statement. The SGD and RMSprop branches
    that were commented out are restored, and unknown names raise a clear error.

    Args:
        network: module whose ``parameters()`` are optimised.
        optimizer: one of ``"sgd"``, ``"adam"`` or ``"rms_prop"``.
        learning_rate: learning rate passed as ``lr``.
        momentum: momentum for ``"sgd"`` and ``"rms_prop"`` (ignored by ``"adam"``).
        weight_decay: L2 penalty passed to every optimizer.
        amsgrad: AMSGrad switch, used by ``"adam"`` only.

    Returns:
        The constructed ``torch.optim.Optimizer``.

    Raises:
        ValueError: if ``optimizer`` is not one of the supported names.
    """
    if optimizer == "sgd":
        return optim.SGD(network.parameters(),
                         lr = learning_rate, momentum = momentum, weight_decay = weight_decay)

    if optimizer == "adam":
        return optim.Adam(network.parameters(),
                          lr = learning_rate, betas = (0.9,0.999), weight_decay = weight_decay,
                          amsgrad = amsgrad)

    if optimizer == "rms_prop":
        return optim.RMSprop(network.parameters(),
                             lr = learning_rate, alpha = 0.99, momentum = momentum,
                             weight_decay = weight_decay)

    raise ValueError(
        f"Unsupported optimizer {optimizer!r}; expected 'sgd', 'adam' or 'rms_prop'."
    )

In [38]:
# sweep_config = {
#     'method': 'random'
    
#     }
# metric = {
#     'name': 'val_accuracy',
#     'goal': 'maximize'   
#     }
# early_terminate = {"type": "hyperband",
#       "min_iter": 3 }

# sweep_config['metric'] = metric 
# sweep_config['early_terminate'] = early_terminate 

# parameters_dict = {
    
#     'layer_fc_1': {
#         'values': [2]
#         },
   
#     'dim_1': {
#           'values': [2048]
#         },
    
#     'layer_fc_2': {
#         'values': [2]
#         },
#     'dim_2': {
#           'values': [512]
#         },
    
    
#     'dropout': {
#           'values': [False]
#         },
   
    
    
#     'optimizer': {
#           'values': ['adam']   #
#         }
#     ,
  
    
#     'learning_rate': {
#             'values':[0.0001]
#         },
    
    
#     'momentum': {
#           'values': [0.95]
#         },
    
#     'weight_decay': {
#             'values': [0.009827436437331628]
#         },
   
        
#     'amsgrad': {
#           'values': [False]
#         },
    
    
#     }


# sweep_config['parameters'] = parameters_dict
# parameters_dict.update({
#     'epochs': {
#         'value': 10000}
#     })


import pprint

# Fixed hyper-parameters for a single run (used in place of the commented-out sweep above).
config = {
    "layer_fc_1": 2,
    "dim_1": 2048,
    "layer_fc_2": 2,
    "dim_2": 512,
    "dropout": False,
    "optimizer": "adam",
    "learning_rate": 7e-05,
    "momentum": 0.95,
    "weight_decay": 0.01,
    "amsgrad": False,
    "epochs": 500,
}

pprint.pprint(config)

{'amsgrad': False,
 'dim_1': 2048,
 'dim_2': 512,
 'dropout': False,
 'epochs': 500,
 'layer_fc_1': 2,
 'layer_fc_2': 2,
 'learning_rate': 7e-05,
 'momentum': 0.95,
 'optimizer': 'adam',
 'weight_decay': 0.01}


### 6. Training

In [39]:
import random
import time
from tqdm import tqdm
# Module-level criterion: `evaluate` reads this global.
loss_fn = nn.BCEWithLogitsLoss()
def train(config, train_dataloader,val_dataloader = None):
    """Train a fresh ``BertClassifier`` and checkpoint the best validation epoch.

    NOTE: defining this function rebinds the notebook-level name ``train``,
    which earlier cells use for the training DataFrame; re-running those cells
    afterwards requires reloading the data first.

    Changes compared with the previous version:
      * metrics are logged correctly when ``val_dataloader`` is ``None``
        (``val_loss`` / ``val_accuracy`` used to be unbound -> ``NameError``);
      * labels are reshaped with ``reshape(-1, 1)`` rather than a fixed 512;
      * the progress bar wraps the loader directly so it can show the total;
      * an empty training loader no longer divides by zero.

    Args:
        config: object exposing ``epochs``, ``optimizer``, ``learning_rate``,
            ``momentum``, ``weight_decay``, ``amsgrad`` and the model settings
            read by ``BertClassifier``.
        train_dataloader: iterable of ``(inputs_A, inputs_B, labels)`` batches.
        val_dataloader: optional iterable of the same kind; when given, the
            model is evaluated after every epoch and the best-accuracy epoch is
            written to ``best_model_trained_fc_v2.pth``.

    Returns:
        None. Progress goes to stdout, ``result.txt`` (appended) and wandb.
    """
    best_accuracy = 0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Start training...\n")
    epochs = config.epochs

    model = BertClassifier(config).to(device)
    # Local criterion for the training loop (shadows the module-level one).
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = build_optimizer(model, config.optimizer, config.learning_rate,
                                config.momentum, config.weight_decay, config.amsgrad)

    # Guard against an empty loader when averaging the loss.
    num_batches = max(len(train_dataloader), 1)

    for epoch_i in range(1,epochs+1):

        total_loss = 0
        model.train()

        for inputs_A, inputs_B, b_labels in tqdm(train_dataloader):
            inputs_A = inputs_A.to(device)
            inputs_B = inputs_B.to(device)
            # Column vector so the labels line up with the (batch, 1) logits.
            b_labels = b_labels.to(device).float().reshape(-1, 1)

            model.zero_grad()
            logits = model(inputs_A,inputs_B)
            loss = loss_fn(logits,b_labels)
            total_loss += loss.item()
            # BCEWithLogitsLoss already returns the batch mean (a scalar).
            loss.backward()
            optimizer.step()

        avg_train_loss = total_loss / num_batches

        metrics = {'epoch': epoch_i, "train_loss": avg_train_loss}
        summary = f"Epoch: {epoch_i} | Training Loss: {avg_train_loss}"

        if val_dataloader is not None:
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                # NOTE(review): 'loss' stores the criterion module, not a loss
                # value; kept as-is so existing checkpoint readers keep working.
                torch.save({
                    'epoch': epoch_i ,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss_fn,
                    }, 'best_model_trained_fc_v2.pth')
            metrics["val_loss"] = val_loss
            metrics["val_accuracy"] = val_accuracy
            summary += f"  | Validation Loss: {val_loss}  | Accuracy: {val_accuracy:.2f}"

        wandb.log(metrics)
        print(summary)
        with open('result.txt', 'a') as f:
            print(summary, file=f)

    print("\n")
    if val_dataloader is not None:
        # Final log entry carries the best validation accuracy of the run.
        wandb.log({"val_accuracy": best_accuracy,
                   })
        print(f"Training complete! Best accuracy: {best_accuracy:.2f}%.")
    else:
        print("Training complete!")
    

def evaluate(model,val_dataloader):
    """Compute mean loss and accuracy (in percent) of ``model`` on a loader.

    Changes compared with the previous version:
      * labels are reshaped with ``reshape(-1, 1)`` instead of a fixed 512, so
        batches of any size (including a final partial batch) are accepted;
      * tensors are moved to the device the model lives on, instead of
        re-deriving a device from CUDA availability for every batch;
      * loss and accuracy are averaged per sample, which equals the former
        mean-of-batch-means when all batches have the same size and stays
        correct when they do not.

    Uses the module-level ``loss_fn`` as criterion.

    Args:
        model: module called as ``model(inputs_A, inputs_B)`` returning
            ``(batch, 1)`` logits.
        val_dataloader: iterable of ``(inputs_A, inputs_B, labels)`` batches.

    Returns:
        Tuple ``(val_loss, val_accuracy)`` of floats; both are NaN when the
        loader yields no samples.
    """
    model.eval()

    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for inputs_A, inputs_B, b_labels in tqdm(val_dataloader):
            inputs_A = inputs_A.to(device)
            inputs_B = inputs_B.to(device)
            b_labels = b_labels.to(device).float().reshape(-1, 1)

            logits = model(inputs_A,inputs_B)
            batch_size = b_labels.size(0)

            # loss_fn returns the batch mean; weight it by the batch size.
            total_loss += loss_fn(logits, b_labels).item() * batch_size

            # Threshold the sigmoid output at 0.5 to get hard predictions.
            preds = torch.round(torch.sigmoid(logits))
            total_correct += (preds.float() == b_labels).sum().item()
            total_samples += batch_size

    if total_samples == 0:
        return float("nan"), float("nan")

    val_loss = total_loss / total_samples
    val_accuracy = 100.0 * total_correct / total_samples

    return val_loss, val_accuracy

In [40]:
def make(config):
    """Build the model, loss criterion and optimizer described by ``config``.

    Prints which device is used, places a new ``BertClassifier`` on it and
    returns ``(model, criterion, optimizer)``.
    """
    # Choose the device and announce it.
    use_gpu = torch.cuda.is_available()
    device = torch.device('cuda' if use_gpu else 'cpu')
    print('The code uses GPU...' if use_gpu else 'The code uses CPU!!!')

    model = BertClassifier(config).to(device)

    # Loss and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = build_optimizer(
        model,
        config.optimizer,
        config.learning_rate,
        config.momentum,
        config.weight_decay,
        config.amsgrad,
    )

    return model, criterion, optimizer

In [41]:
def model_pipeline(config=None):
    """Run one wandb-tracked training run with the given hyper-parameters.

    Uses the module-level ``train_loader`` and ``val_loader``.

    Changes compared with the previous version:
      * removed the loop ``for i,j in test_loader`` - the loader yields
        3-tuples, so it raised ``ValueError`` once training had finished, and
        its result was never used;
      * uploads the checkpoint that ``train`` actually writes
        (``best_model_trained_fc_v2.pth``); ``model.onnx`` is still uploaded
        when such a file exists, but nothing in this notebook creates it.

    Args:
        config: hyper-parameter mapping handed to ``wandb.init``.
    """
    # tell wandb to get started
    with wandb.init(project="Approach_2_elementwise_multiplication_siamese_network_on_sum_", config=config):
        # access all HPs through wandb.config, so logging matches execution!
        config = wandb.config

        # train the model (the best epoch is checkpointed inside `train`)
        train(config, train_loader,val_dataloader = val_loader)

        # upload whichever model files are present on disk
        for artifact_path in ("best_model_trained_fc_v2.pth", "model.onnx"):
            if os.path.exists(artifact_path):
                wandb.save(artifact_path)


In [None]:
# Start the wandb-tracked training run with the fixed `config` dict defined above.
model_pipeline(config)

Start training...



390it [01:17,  5.03it/s]
100%|██████████████████████████████████████████████████████████| 39/39 [00:02<00:00, 13.39it/s]


Epoch: 1 | Training Loss: 0.5961955872101662  | Validation Loss: 0.5726679105025071  | Accuracy: 69.81


390it [01:27,  4.47it/s]
100%|██████████████████████████████████████████████████████████| 39/39 [00:03<00:00, 10.31it/s]


Epoch: 2 | Training Loss: 0.4104445727971884  | Validation Loss: 0.5708672006924947  | Accuracy: 70.97


390it [01:33,  4.17it/s]
100%|██████████████████████████████████████████████████████████| 39/39 [00:03<00:00, 10.32it/s]


Epoch: 3 | Training Loss: 0.36676665185353696  | Validation Loss: 0.6243361677878942  | Accuracy: 72.16


390it [01:35,  4.10it/s]
100%|██████████████████████████████████████████████████████████| 39/39 [00:03<00:00, 10.31it/s]


Epoch: 4 | Training Loss: 0.3405290381266521  | Validation Loss: 0.5323965266729013  | Accuracy: 74.37


390it [01:37,  3.99it/s]
100%|██████████████████████████████████████████████████████████| 39/39 [00:03<00:00, 10.32it/s]


Epoch: 5 | Training Loss: 0.323610611527394  | Validation Loss: 0.6827986591901535  | Accuracy: 70.03


390it [01:36,  4.06it/s]
100%|██████████████████████████████████████████████████████████| 39/39 [00:03<00:00, 10.33it/s]


Epoch: 6 | Training Loss: 0.3111689699383882  | Validation Loss: 0.7498671274918777  | Accuracy: 70.12


390it [01:36,  4.06it/s]
100%|██████████████████████████████████████████████████████████| 39/39 [00:03<00:00, 10.29it/s]


Epoch: 7 | Training Loss: 0.30025386611620586  | Validation Loss: 0.5762704412142435  | Accuracy: 74.22


390it [01:34,  4.13it/s]
100%|██████████████████████████████████████████████████████████| 39/39 [00:03<00:00, 10.32it/s]


Epoch: 8 | Training Loss: 0.29293968398601583  | Validation Loss: 0.6381603005604867  | Accuracy: 74.34


390it [01:35,  4.09it/s]
100%|██████████████████████████████████████████████████████████| 39/39 [00:03<00:00, 10.33it/s]


Epoch: 9 | Training Loss: 0.2841126173352584  | Validation Loss: 0.6551333451882387  | Accuracy: 70.65


390it [01:39,  3.92it/s]
100%|██████████████████████████████████████████████████████████| 39/39 [00:03<00:00, 10.32it/s]


Epoch: 10 | Training Loss: 0.28093048746769245  | Validation Loss: 0.5488307185662098  | Accuracy: 74.67


390it [01:35,  4.06it/s]
100%|██████████████████████████████████████████████████████████| 39/39 [00:03<00:00, 10.32it/s]


Epoch: 11 | Training Loss: 0.2744011797966101  | Validation Loss: 0.7865165074666342  | Accuracy: 71.04


95it [00:24,  4.46it/s]

### 7. Testing model

In [None]:
model = bert