In [None]:
# Colab only: mount Google Drive at /content/drive so that the dataset CSVs and
# checkpoint files referenced by absolute /content/drive/MyDrive/... paths later
# in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Shell escape: print the kernel's current working directory (sanity check).
!pwd

/content


In [None]:
import torch
import torch.nn as nn

import numpy as np
import random
from torch.utils.data import Dataset
import os
import pandas as pd
import pdb

import torch.nn.functional as F
from torch.utils.data import DataLoader
import pdb, random
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Seed every random number generator this notebook imports (Python's `random`,
# NumPy and PyTorch) from one constant, so a fresh "Restart & Run All" is
# repeatable. torch.manual_seed stays last so the cell still displays the
# torch Generator as its output.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7868e8e92550>

In [None]:
class RBPNet(nn.Module):
    """Fully connected classifier: two hidden ReLU layers followed by a linear head.

    Data flow: ``input_size -> hidden_size1 -> hidden_size2 -> output_size``.
    The last layer has no activation, so ``forward`` returns raw logits.
    """

    def __init__(self, input_size=2048, hidden_size1=1024, hidden_size2=512, output_size=58):
        super().__init__()
        # The attribute names below become the state_dict key prefixes
        # (fc1.*, fc2.*, fc3.*), so they must stay stable for saved checkpoints.
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        """Return the logits for ``x`` (last dimension must equal ``input_size``)."""
        hidden = x
        # Both hidden layers share the same stateless ReLU module.
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.fc3(hidden)

In [None]:
class PhageDataset(Dataset):
    """Embedding/host pairs loaded from a CSV file.

    The CSV must contain a ``Host`` column. Feature columns are those whose
    header is one of the decimal strings ``start_index`` ..
    ``start_index + embeddings_size - 1``; headers in that range that are
    missing from the file are silently skipped by the ``isin`` selection.

    Each item is a dict with:
      * ``"rbp_embedding"``: the feature row for that sample (NumPy array);
      * ``"host_vector"``: float32 one-hot vector whose length is the number of
        distinct ``Host`` values in this file.
    """

    def __init__(self, data_path, embeddings_size, start_index):
        """Read ``data_path`` and cache the host labels and feature matrix."""
        rbp_embeddings = pd.read_csv(data_path,
                                     low_memory = False)
        self.hosts = rbp_embeddings['Host'].tolist()

        feature_columns = [str(i) for i in range(start_index, embeddings_size + start_index)]
        self.features = rbp_embeddings.loc[:, rbp_embeddings.columns.isin(feature_columns)].to_numpy()

        # Computed once here: building set(self.hosts) inside __getitem__ made
        # every sample fetch O(N) and a full epoch O(N^2).
        self.num_hosts = len(set(self.hosts))

        print("Total hosts: ", self.num_hosts)
        print("Total phages: ", len(self.hosts))

    def __len__(self):
        """Number of samples (rows in the CSV)."""
        return len(self.hosts)

    def __getitem__(self, idx):
        """Return the embedding and one-hot host vector for row ``idx``."""
        host_id = self.hosts[idx]
        rbp_embedding = self.features[idx]
        host_vector = np.zeros(self.num_hosts, dtype=np.float32)
        # The Host value is used directly as the one-hot position.
        # NOTE(review): this assumes Host holds integer class ids smaller than
        # the number of distinct hosts; a negative id (e.g. -1) wraps around to
        # the end of the vector under NumPy indexing -- confirm this is intended.
        host_vector[host_id] = 1.0
        sample = {"rbp_embedding": rbp_embedding, "host_vector": host_vector}
        return sample

In [None]:
class PhageTrainer:
    """Bundles the RBPNet model, its optimizer and the training data iterator.

    ``step`` runs one optimizer update on the training data, ``save`` writes the
    model weights, and ``predict`` loads saved weights and scores a CSV file.
    """

    def __init__(self,data_path, embeddings_size, start_index):
        """Load the dataset at ``data_path`` and build the model and optimizer.

        ``embeddings_size`` and ``start_index`` select the feature columns
        (see ``PhageDataset``) and are remembered for ``predict``.
        """
        # Use the GPU when there is one, otherwise fall back to the CPU instead
        # of crashing in .cuda().
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.embeddings_size = embeddings_size
        self.start_index = start_index

        phage_dataset = PhageDataset(data_path, embeddings_size, start_index)
        print("Total dataset size:", len(phage_dataset))
        self.phage_dataloader = DataLoader(
            phage_dataset, batch_size=128, shuffle=True, num_workers=1
        )

        # Size the first layer from the requested embedding width so that
        # embeddings other than 2048-d do not hit a shape mismatch.
        self.model = RBPNet(input_size=embeddings_size).to(self.device)

        # Weight clipping is disabled unless a positive bound is assigned here.
        self.clip_value = None

        self._setup_optimizers()

        self.phage_iter = iter(self.phage_dataloader)

        # Summed over all elements; step() divides by the batch size.
        self.bce_loss = nn.BCEWithLogitsLoss(reduction="sum")

    def _clip_weights(self):
        """
        Clamps every model parameter to [-clip_value, clip_value].

        Does nothing while ``self.clip_value`` is None.
        """
        if self.clip_value is None:
            return
        with torch.no_grad():
            for p in self.model.parameters():
                p.clamp_(-1.0 * self.clip_value, self.clip_value)

    def _setup_optimizers(self):
        """Create the Adam optimizer and the MultiStepLR scheduler."""
        # Number of micro-batches whose gradients are accumulated per step().
        self.iter_size = 1
        self.optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=3e-4,
            weight_decay=0.00001)
        # NOTE(review): nothing in this class calls self.scheduler.step(), so
        # the learning rate never decays. The milestones look epoch-based --
        # confirm the intended schedule before wiring it in.
        self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
            self.optimizer, milestones=[30, 80], gamma=0.1
        )

    def save(self, model_path):
        """Write the model's state_dict to ``model_path + ".pth"``."""
        torch.save(self.model.state_dict(), model_path + ".pth")

    @torch.no_grad()
    def predict(self,model_path, data_path):
        """Load weights from ``model_path`` and score every row of ``data_path``.

        Feature columns are selected with the ``embeddings_size`` and
        ``start_index`` given to the constructor. Also overwrites
        ``self.hosts`` and ``self.host_to_idx`` and leaves the model in eval mode.

        Returns:
            (probabilities, hosts): a NumPy array with one softmax-normalised
            row per CSV row, and the list of values from the ``Host`` column.
        """
        # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
        state = torch.load(model_path, map_location=self.device)
        load_result = self.model.load_state_dict(state, strict=False)
        # strict=False tolerates mismatched keys; report them so a wrong
        # checkpoint does not silently score with untrained weights.
        if load_result.missing_keys or load_result.unexpected_keys:
            print(
                "Warning: checkpoint/model key mismatch. missing=%s unexpected=%s"
                % (list(load_result.missing_keys), list(load_result.unexpected_keys))
            )

        rbp_embeddings = pd.read_csv(data_path,
                                     low_memory = False)
        self.hosts = rbp_embeddings['Host'].tolist()
        self.host_to_idx = { h: i for i, h in enumerate(list(set(self.hosts))) }

        feature_columns = [
            str(i)
            for i in range(self.start_index, self.embeddings_size + self.start_index)
        ]
        features = rbp_embeddings.loc[:, rbp_embeddings.columns.isin(feature_columns)].to_numpy()

        self.model.eval()
        logits = self.model(torch.tensor(features).to(self.device).float())

        # One vectorised softmax across the class dimension replaces the
        # per-row Python loop.
        probabilities = F.softmax(logits, dim=1)

        return probabilities.cpu().numpy(), self.hosts

    def step(self):
        """Run one optimizer update and return ``[mean_loss]``.

        Gradients from ``self.iter_size`` consecutive batches are accumulated
        before the update; the returned value is the per-sample loss averaged
        over those batches.
        """
        self.optimizer.zero_grad()
        total_loss = 0.0
        for _ in range(self.iter_size):
            try:
                phage_sample = next(self.phage_iter)
            except StopIteration:
                # End of an epoch: restart the DataLoader iterator.
                print("bbox dataloader reset.")
                self.phage_iter = iter(self.phage_dataloader)
                phage_sample = next(self.phage_iter)

            labels = phage_sample["host_vector"].float().to(self.device)
            output = self.model(phage_sample["rbp_embedding"].float().to(self.device))
            # Summed BCE divided by the batch size -> per-sample loss.
            loss = self.bce_loss(output, labels) / output.shape[0]
            loss.backward()
            # Accumulate every micro-batch, not just the last one, so the
            # division by iter_size below is a true mean.
            total_loss += loss.detach().item()

        self.optimizer.step()

        return [
            total_loss / self.iter_size,
        ]


In [None]:
def do_training(data_path, embeddings_size, start_index, max_iters=20000,
                save_iter=100, snap_iter=1000,
                checkpoint_dir="/content/drive/MyDrive/checkpoints"):
    """Train a PhageTrainer on ``data_path`` and checkpoint it periodically.

    Args:
        data_path: CSV file passed to ``PhageTrainer``.
        embeddings_size: number of feature columns, passed to ``PhageTrainer``.
        start_index: header of the first feature column, passed to ``PhageTrainer``.
        max_iters: number of ``trainer.step()`` calls.
        save_iter: every ``save_iter`` iterations the rolling checkpoint
            ``<checkpoint_dir>/rbp_net_v1.pth`` is overwritten.
        snap_iter: every ``snap_iter`` iterations a numbered snapshot
            ``<checkpoint_dir>/rbp_net_<iteration>.pth`` is written as well.
        checkpoint_dir: directory that receives the checkpoints; it is created
            if it does not exist.

    A Ctrl-C / kernel interrupt prints "User Exit." and returns. Any other
    exception raised by a step is printed and that iteration is logged with a
    NaN loss; training then continues with the next iteration.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)
    latest_path = os.path.join(checkpoint_dir, "rbp_net_v1")

    trainer = PhageTrainer(data_path, embeddings_size, start_index)
    for iter_no in range(max_iters):
        try:
            batch_loss = trainer.step()
        except KeyboardInterrupt:
            # Return instead of exit(1): calling exit() from a notebook cell
            # tears down the kernel and everything held in memory with it.
            print("User Exit.")
            return
        except Exception as err:
            # Keep the best-effort "skip the bad step" behaviour, but surface
            # the error and log NaN rather than a fake loss of 0.0.
            print("[Iter %d/%d] step failed: %r" % (iter_no, max_iters, err))
            batch_loss = [float("nan")]
        print(
            "[Iter %d/%d] seg_loss = %f"
            % (iter_no, max_iters, batch_loss[0])
        )
        completed = iter_no + 1
        if completed % snap_iter == 0:
            trainer.save(os.path.join(checkpoint_dir, "rbp_net_%d" % completed))
        # Not `elif`: the rolling checkpoint must also be refreshed on snapshot
        # iterations, otherwise rbp_net_v1.pth is stale after the final one.
        if completed % save_iter == 0:
            trainer.save(latest_path)

In [None]:
@torch.no_grad()
def do_prediction(model_path,data_path,embeddings_size,start_index,k):
    """Score ``data_path`` with the checkpoint at ``model_path`` and report metrics.

    For every row the class with the highest probability is predicted, but only
    when it beats the runner-up by at least ``k``; otherwise the row is
    labelled ``-1``. The weighted F1 score and the accuracy against the
    ``Host`` column are printed and appended to ``results.txt``.

    Args:
        model_path: checkpoint file handed to ``PhageTrainer.predict``.
        data_path: CSV file to score.
        embeddings_size, start_index: forwarded to ``PhageTrainer``.
        k: minimum required gap between the top-1 and top-2 probabilities.
    """
    trainer = PhageTrainer(data_path, embeddings_size,start_index)
    output, hosts = trainer.predict(model_path, data_path)

    # Work in float64 so the margin comparison matches plain Python float
    # arithmetic on the same values.
    probs = np.asarray(output, dtype=np.float64)
    ranked = np.sort(probs, axis=1)
    margin = ranked[:, -1] - ranked[:, -2]  # top-1 minus top-2 probability
    top_class = probs.argmax(axis=1)        # first index of the maximum
    final_op = np.where(margin >= k, top_class, -1).tolist()

    # Per-class sample weights keyed by host id (-1 is the rejection label).
    class_weights = {19: 0.15956136027599802, -1: 0.12148841793987186, 46: 0.07787087235091178, 54: 0.06419418432725481, 40: 0.06320847708230655, 55: 0.05704780680137999, 24: 0.04891572203055693, 18: 0.03523903400689995, 31: 0.030556924593395762, 50: 0.030064070970921637, 5: 0.025135534746180386, 42: 0.02168555938886151, 49: 0.017619517003449974, 16: 0.017496303597831445, 9: 0.015278462296697881, 27: 0.01232134056185313, 1: 0.012074913750616067, 47: 0.011951700344997535, 2: 0.011705273533760474, 57: 0.01145884672252341, 52: 0.009980285855101035, 43: 0.009117792015771316, 33: 0.008748151798915723, 56: 0.007146377525874815, 21: 0.0065303104977821585, 14: 0.006407097092163627, 36: 0.006407097092163627, 17: 0.006037456875308034, 20: 0.005914243469689502, 53: 0.005298176441596846, 41: 0.005174963035978314, 51: 0.005051749630359783, 37: 0.0049285362247412515, 8: 0.004805322819122721, 13: 0.004435682602267127, 11: 0.004189255791030064, 44: 0.004189255791030064, 34: 0.004066042385411533, 30: 0.004066042385411533, 6: 0.0038196155741744703, 29: 0.0034499753573188764, 4: 0.0032035485460818135, 10: 0.0030803351404632825, 12: 0.002957121734844751, 26: 0.0028339083292262196, 35: 0.0022178413011335633, 0: 0.002094627895515032, 39: 0.002094627895515032, 38: 0.001971414489896501, 32: 0.001971414489896501, 23: 0.001971414489896501, 22: 0.0016017742730409068, 45: 0.0016017742730409068, 3: 0.0014785608674223755, 48: 0.0013553474618038443, 7: 0.0012321340561853129, 25: 0.0012321340561853129, 28: 0.0012321340561853129, 15: 0.0012321340561853129}

    weighted_f1 = f1_score(hosts,final_op, average="weighted", sample_weight=[class_weights[i] for i in hosts])
    accuracy = accuracy_score(hosts,final_op)

    report_lines = [
        data_path+"\n",
        "k = "+str(k)+"\n",
        "f1_score:" + str(weighted_f1) + "\n",
        "accuracy:"+ str(accuracy) + "\n",
        "\n",
    ]

    for line in report_lines:
        print(line)

    # The original opened results.txt and never wrote to or closed it; append
    # the report there and let the context manager close the handle.
    with open('results.txt', 'a') as results:
        results.writelines(report_lines)

In [None]:
# Train on the combined-embedding training split: 2048 feature columns whose
# headers start at "1".
do_training(
    data_path="/content/drive/MyDrive/Bacteriophage_Research/combined_embeddings/train.csv",
    embeddings_size=2048,
    start_index=1,
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[Iter 15038/20000] seg_loss = 0.517640
[Iter 15039/20000] seg_loss = 0.484088
[Iter 15040/20000] seg_loss = 0.626154
[Iter 15041/20000] seg_loss = 0.495865
[Iter 15042/20000] seg_loss = 0.541078
[Iter 15043/20000] seg_loss = 0.479591
[Iter 15044/20000] seg_loss = 0.430097
[Iter 15045/20000] seg_loss = 0.402290
[Iter 15046/20000] seg_loss = 0.447895
[Iter 15047/20000] seg_loss = 0.632508
[Iter 15048/20000] seg_loss = 0.428556
[Iter 15049/20000] seg_loss = 0.335293
[Iter 15050/20000] seg_loss = 0.553426
[Iter 15051/20000] seg_loss = 0.456514
[Iter 15052/20000] seg_loss = 0.326030
[Iter 15053/20000] seg_loss = 0.491916
[Iter 15054/20000] seg_loss = 0.402529
[Iter 15055/20000] seg_loss = 0.334365
[Iter 15056/20000] seg_loss = 0.375790
[Iter 15057/20000] seg_loss = 0.391716
[Iter 15058/20000] seg_loss = 0.304922
[Iter 15059/20000] seg_loss = 0.371826
[Iter 15060/20000] seg_loss = 0.523804
[Iter 15061/20000] seg_loss = 0.410729

In [None]:
# # do_training("/home/sumanth/Documents/sem7/Topics_in_AI/Bacteriophage-Research/filtered_data/DNA_train.csv", 768, 0)
# Evaluate the rolling checkpoint on the test split for top-1/top-2 margin
# thresholds 0.6, 0.7, 0.8, 0.9 and 1.0.
for k in range(6, 11):
    do_prediction(
        model_path="/content/drive/MyDrive/checkpoints/rbp_net_v1.pth",
        data_path="/content/drive/MyDrive/Bacteriophage_Research/combined_embeddings/test.csv",
        embeddings_size=2048,
        start_index=1,
        k=k / 10,
    )

Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/combined_embeddings/test.csv

k = 0.6

f1_score:0.7121649750488619

accuracy:0.7243716116313454



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/combined_embeddings/test.csv

k = 0.7

f1_score:0.7225616423671835

accuracy:0.7252341054706752



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/combined_embeddings/test.csv

k = 0.8

f1_score:0.7245160244002068

accuracy:0.7201823558403154



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/combined_embeddings/test.csv

k = 0.9

f1_score:0.7109503984313323

accuracy:0.7026860522424839



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/combined_embeddings/test.csv

k = 1.0

f1_score:0.08060641001693

In [None]:
# Reset PyTorch's global RNG to seed 0 again so cells run after this point
# start from a known random state.
torch.manual_seed(0)

