In [None]:
# Mount Google Drive into the Colab VM at /content/drive; the dataset CSVs
# and checkpoint files referenced later in this notebook live under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn

import numpy as np
import random
from torch.utils.data import Dataset
import os
import pandas as pd
import pdb

import torch.nn.functional as F
from torch.utils.data import DataLoader
import pdb, random
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Seed torch's global RNG before any model or DataLoader is created.
torch.manual_seed(0)

<torch._C.Generator at 0x7fcde417e4d0>

In [None]:
class RBPNet(nn.Module):
    """Fully connected classifier: two ReLU-activated hidden layers and a linear head.

    Args:
        input_size: Width of the input feature vector.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        output_size: Number of raw scores (logits) produced per sample.
    """

    def __init__(self, input_size=2048, hidden_size1=512, hidden_size2=256, output_size=58):
        super().__init__()
        # Attribute names and creation order define the state_dict keys and the
        # order in which the RNG is consumed for weight initialisation.
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        """Return the un-normalised scores for a batch ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        # No activation on the head: callers apply sigmoid/softmax themselves.
        return self.fc3(hidden)

In [None]:
class PhageDataset(Dataset):
    """Dataset of embedding rows paired with a one-hot host vector.

    The CSV at ``data_path`` must contain a ``Host`` column and feature columns
    whose names are the decimal strings ``start_index`` ..
    ``start_index + embeddings_size - 1``. Feature columns are taken in the
    order they appear in the file.

    ``Host`` values are used directly as indices into the one-hot vector, so
    they must be integer ids. A negative id indexes from the end of the vector
    (NumPy semantics).

    Args:
        data_path: Path of the CSV file to load.
        embeddings_size: Number of consecutive feature columns to select.
        start_index: Name (as an integer) of the first feature column.
    """

    def __init__(self, data_path, embeddings_size, start_index):
        rbp_embeddings = pd.read_csv(data_path, low_memory = False)
        self.hosts = rbp_embeddings['Host'].tolist()
        # Computed once here: rebuilding set(self.hosts) inside __getitem__
        # costs O(N) per sample, i.e. O(N^2) per pass over the data.
        self.num_hosts = len(set(self.hosts))

        feature_columns = [str(i) for i in range(start_index, embeddings_size + start_index)]
        self.features = rbp_embeddings.loc[:, rbp_embeddings.columns.isin(feature_columns)].to_numpy()

        print("Total hosts: ", self.num_hosts)
        print("Total phages: ", len(self.hosts))

    def __len__(self):
        """Return the number of rows (phages) in the dataset."""
        return len(self.hosts)

    def __getitem__(self, idx):
        """Return ``{"rbp_embedding": feature row, "host_vector": one-hot float32}``.

        The one-hot vector has one slot per distinct host value in this file.
        """
        host_vector = np.zeros(self.num_hosts, dtype=np.float32)
        host_vector[self.hosts[idx]] = 1.0
        return {"rbp_embedding": self.features[idx], "host_vector": host_vector}

In [None]:
class PhageTrainer:
    """Bundles an RBPNet model with its optimizer, data stream and loss.

    Args:
        data_path: CSV file handed to ``PhageDataset`` for training batches.
        embeddings_size: Number of feature columns; also the model input width.
        start_index: Name (as an integer) of the first feature column.

    Attributes:
        device: ``cuda`` when available, otherwise ``cpu``.
        clip_value: Bound used by ``_clip_weights``; ``None`` until a caller sets it.
        iter_size: Number of micro-batches accumulated per ``step`` call.
    """

    def __init__(self,data_path, embeddings_size, start_index):
        self.embeddings_size = embeddings_size
        # Remembered so predict() selects the same feature columns as PhageDataset.
        self.start_index = start_index
        # Fall back to CPU so the trainer also works on hosts without CUDA.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Weight clipping is opt-in; _clip_weights() needs a bound assigned here first.
        self.clip_value = None

        phage_dataset = PhageDataset(data_path, embeddings_size, start_index)
        print("Total dataset size:", len(phage_dataset))
        self.phage_dataloader = DataLoader(
            phage_dataset, batch_size=128, shuffle=True, num_workers=1
        )

        self.model = RBPNet(input_size=embeddings_size).to(self.device)

        self._setup_optimizers()

        self.phage_iter = iter(self.phage_dataloader)

        # Summed over all elements; step() divides by the batch size itself.
        self.bce_loss = nn.BCEWithLogitsLoss(reduction="sum")

    def _clip_weights(self):
        """Clamp every model parameter in place to ``[-clip_value, clip_value]``.

        Raises:
            ValueError: If ``self.clip_value`` has not been set.
        """
        if self.clip_value is None:
            raise ValueError("clip_value must be set before calling _clip_weights()")
        with torch.no_grad():
            for p in self.model.parameters():
                p.clamp_(-1.0 * self.clip_value, self.clip_value)

    def _setup_optimizers(self):
        """Create the Adam optimizer and the MultiStepLR scheduler."""
        self.iter_size = 1
        self.optimizer = torch.optim.Adam(
            self.model.parameters(),
            lr=3e-4,
            weight_decay=0.00001)
        # NOTE(review): nothing in this class calls self.scheduler.step(), so the
        # learning rate stays at its initial value. Confirm whether the
        # milestones were meant to be applied (and in which unit).
        self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
            self.optimizer, milestones=[30, 80], gamma=0.1
        )

    def save(self, model_path):
        """Write the model's state_dict to ``model_path + ".pth"``."""
        torch.save(self.model.state_dict(), model_path + ".pth")

    @torch.no_grad()
    def predict(self,model_path, data_path):
        """Load a checkpoint and score every row of the CSV at ``data_path``.

        Args:
            model_path: Path of a state_dict file written by ``save``.
            data_path: CSV with a ``Host`` column and the feature columns.

        Returns:
            Tuple ``(probabilities, hosts)``: a NumPy array with one
            softmax-normalised row per CSV row, and the list of ``Host`` values.
        """
        # NOTE(review): strict=False silently ignores missing or unexpected
        # keys, so a checkpoint from a different architecture loads without error.
        self.model.load_state_dict(
            torch.load(model_path, map_location=self.device),
            strict=False,
        )

        rbp_embeddings = pd.read_csv(data_path, low_memory = False)
        self.hosts = rbp_embeddings['Host'].tolist()
        self.host_to_idx = { h: i for i, h in enumerate(list(set(self.hosts))) }

        # Use the configured start_index rather than assuming columns start at "1".
        feature_columns = [
            str(i) for i in range(self.start_index, self.embeddings_size + self.start_index)
        ]
        features = rbp_embeddings.loc[:, rbp_embeddings.columns.isin(feature_columns)].to_numpy()

        self.model.eval()
        logits = self.model(torch.tensor(features).float().to(self.device))
        # One softmax over the class axis replaces the per-row Python loop.
        probabilities = F.softmax(logits, dim=1)

        return probabilities.cpu().numpy(), self.hosts

    def step(self):
        """Run one optimizer update over ``iter_size`` micro-batches.

        Returns:
            One-element list holding the mean per-sample loss of the
            micro-batches processed in this call.
        """
        self.optimizer.zero_grad()
        seg_loss = 0.0
        for _ in range(self.iter_size):
            try:
                phage_sample = next(self.phage_iter)
            except StopIteration:
                # Data exhausted: start a new pass over the DataLoader.
                print("bbox dataloader reset.")
                self.phage_iter = iter(self.phage_dataloader)
                phage_sample = next(self.phage_iter)

            labels = phage_sample["host_vector"].float().to(self.device)
            output = self.model(phage_sample["rbp_embedding"].float().to(self.device))
            # Summed BCE divided by the batch size gives a per-sample loss.
            loss = self.bce_loss(output, labels) / output.shape[0]
            loss.backward()
            # Accumulate every micro-batch so the average below is meaningful
            # when iter_size > 1 (previously only the last loss was kept).
            seg_loss += loss.detach().item()

        self.optimizer.step()

        return [
            seg_loss / self.iter_size,
        ]

In [None]:
def do_training(data_path,embeddings_size,start_index,
                max_iters=20000, save_iter=100, snap_iter=1000,
                checkpoint_dir="/content/drive/MyDrive/checkpoints"):
    """Train a fresh PhageTrainer and write checkpoints as training progresses.

    Args:
        data_path: Training CSV forwarded to ``PhageTrainer``.
        embeddings_size: Number of feature columns / model input width.
        start_index: Name (as an integer) of the first feature column.
        max_iters: Number of ``trainer.step()`` calls to make.
        save_iter: Interval at which the rolling checkpoint ``rbp_net_v1`` is rewritten.
        snap_iter: Interval at which a numbered snapshot ``rbp_net_<iter>`` is written.
        checkpoint_dir: Directory receiving the checkpoints; created if missing.

    Raises:
        SystemExit: With code 1 when interrupted by the user.
    """
    trainer = PhageTrainer(data_path, embeddings_size, start_index)
    os.makedirs(checkpoint_dir, exist_ok=True)
    latest_path = os.path.join(checkpoint_dir, "rbp_net_v1")

    for iter_no in range(max_iters):
        try:
            batch_loss = trainer.step()
        except KeyboardInterrupt:
            print("User Exit.")
            # Same effect as the interactive exit(1) helper, without relying on
            # a builtin that only the site module / IPython provides.
            raise SystemExit(1) from None
        except Exception as exc:
            # Still best-effort, but the failure is reported and logged as NaN
            # instead of being disguised as a loss of 0.0.
            print("[Iter %d/%d] step failed: %r" % (iter_no, max_iters, exc))
            batch_loss = [float("nan")]
        print(
            "[Iter %d/%d] seg_loss = %f"
            % (iter_no, max_iters, batch_loss[0])
        )

        completed = iter_no + 1
        if completed % snap_iter == 0:
            trainer.save(os.path.join(checkpoint_dir, "rbp_net_%d" % completed))
        # Independent `if` (was `elif`): the rolling checkpoint must also be
        # refreshed on snapshot iterations, otherwise it lags behind the final
        # weights when max_iters is a multiple of snap_iter.
        if completed % save_iter == 0:
            trainer.save(latest_path)

In [None]:
@torch.no_grad()
def do_prediction(model_path,data_path,embeddings_size,start_index,k):
    """Score a CSV with a saved checkpoint and print weighted F1 and accuracy.

    A row is assigned its highest-probability class only when that probability
    exceeds the runner-up by at least ``k``; otherwise it is labelled ``-1``.

    Args:
        model_path: Checkpoint file passed to ``PhageTrainer.predict``.
        data_path: CSV with a ``Host`` column and the feature columns.
        embeddings_size: Number of feature columns / model input width.
        start_index: Name (as an integer) of the first feature column.
        k: Minimum margin between the top two probabilities.

    Raises:
        KeyError: If a ``Host`` value has no entry in the hard-coded weight table.
    """
    trainer = PhageTrainer(data_path, embeddings_size,start_index)
    output, hosts = trainer.predict(model_path, data_path)

    # float64 keeps the margin arithmetic identical to plain Python floats.
    probs = np.asarray(output, dtype=np.float64)
    # argmax returns the first maximal index, like list.index(max(...)).
    best_class = probs.argmax(axis=1)
    top_two = np.sort(probs, axis=1)[:, -2:]
    margin = top_two[:, 1] - top_two[:, 0]
    final_op = np.where(margin >= k, best_class, -1).tolist()

    # Per-host sample weights, hard-coded.
    # NOTE(review): values look like class frequencies of one specific test
    # split — confirm before using this function on other data.
    class_weights = {19: 0.15956136027599802, -1: 0.12148841793987186, 46: 0.07787087235091178, 54: 0.06419418432725481, 40: 0.06320847708230655, 55: 0.05704780680137999, 24: 0.04891572203055693, 18: 0.03523903400689995, 31: 0.030556924593395762, 50: 0.030064070970921637, 5: 0.025135534746180386, 42: 0.02168555938886151, 49: 0.017619517003449974, 16: 0.017496303597831445, 9: 0.015278462296697881, 27: 0.01232134056185313, 1: 0.012074913750616067, 47: 0.011951700344997535, 2: 0.011705273533760474, 57: 0.01145884672252341, 52: 0.009980285855101035, 43: 0.009117792015771316, 33: 0.008748151798915723, 56: 0.007146377525874815, 21: 0.0065303104977821585, 14: 0.006407097092163627, 36: 0.006407097092163627, 17: 0.006037456875308034, 20: 0.005914243469689502, 53: 0.005298176441596846, 41: 0.005174963035978314, 51: 0.005051749630359783, 37: 0.0049285362247412515, 8: 0.004805322819122721, 13: 0.004435682602267127, 11: 0.004189255791030064, 44: 0.004189255791030064, 34: 0.004066042385411533, 30: 0.004066042385411533, 6: 0.0038196155741744703, 29: 0.0034499753573188764, 4: 0.0032035485460818135, 10: 0.0030803351404632825, 12: 0.002957121734844751, 26: 0.0028339083292262196, 35: 0.0022178413011335633, 0: 0.002094627895515032, 39: 0.002094627895515032, 38: 0.001971414489896501, 32: 0.001971414489896501, 23: 0.001971414489896501, 22: 0.0016017742730409068, 45: 0.0016017742730409068, 3: 0.0014785608674223755, 48: 0.0013553474618038443, 7: 0.0012321340561853129, 25: 0.0012321340561853129, 28: 0.0012321340561853129, 15: 0.0012321340561853129}

    # The former open('results.txt', 'a') was never written to or closed, so
    # the leaked handle is gone; results are reported via print only.
    print(data_path+"\n")
    print("k = "+str(k)+"\n")
    print("f1_score:" + str(f1_score(hosts,final_op, average="weighted", sample_weight=[class_weights[i] for i in hosts])) + "\n")
    print("accuracy:"+ str(accuracy_score(hosts,final_op)) + "\n")
    print("\n")

In [None]:
# Train on rbp_embeddings_esm1b_train.csv using the 1024 feature columns
# named "1" through "1024" (embeddings_size=1024, start_index=1).
do_training("/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_esm1b_train.csv",1024,1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[Iter 15038/20000] seg_loss = 1.114301
[Iter 15039/20000] seg_loss = 0.953655
[Iter 15040/20000] seg_loss = 0.955669
[Iter 15041/20000] seg_loss = 0.907124
[Iter 15042/20000] seg_loss = 0.953311
[Iter 15043/20000] seg_loss = 0.995972
[Iter 15044/20000] seg_loss = 1.106127
[Iter 15045/20000] seg_loss = 0.889802
[Iter 15046/20000] seg_loss = 1.156934
[Iter 15047/20000] seg_loss = 1.154852
[Iter 15048/20000] seg_loss = 0.827217
[Iter 15049/20000] seg_loss = 0.834697
[Iter 15050/20000] seg_loss = 0.974472
[Iter 15051/20000] seg_loss = 0.827062
[Iter 15052/20000] seg_loss = 1.078196
[Iter 15053/20000] seg_loss = 0.988710
[Iter 15054/20000] seg_loss = 0.851574
[Iter 15055/20000] seg_loss = 0.927002
[Iter 15056/20000] seg_loss = 0.944703
[Iter 15057/20000] seg_loss = 1.026505
[Iter 15058/20000] seg_loss = 0.993271
[Iter 15059/20000] seg_loss = 1.317530
[Iter 15060/20000] seg_loss = 1.022500
[Iter 15061/20000] seg_loss = 0.910930

In [None]:
# Evaluate the rolling checkpoint rbp_net_v1.pth on the esm1b test CSV for
# margin thresholds k = 0.6, 0.7, 0.8, 0.9 and 1.0. Every do_training call
# rewrites rbp_net_v1.pth, so run this right after the matching training cell.
for k in range(6,11,1):
  do_prediction("/content/drive/MyDrive/checkpoints/rbp_net_v1.pth","/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_esm1b_test.csv",1024,1,k/10)

Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_esm1b_test.csv

k = 0.6

f1_score:0.7376173685601572

accuracy:0.6982503696402168



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_esm1b_test.csv

k = 0.7

f1_score:0.7306848820756173

accuracy:0.685312962050271



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_esm1b_test.csv

k = 0.8

f1_score:0.7174544329769204

accuracy:0.6696648595367176



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_esm1b_test.csv

k = 0.9

f1_score:0.6720844647405227

accuracy:0.6290044356826022



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophag

# ESM

In [None]:
# Reset torch's global RNG seed to 0 before the next training run.
torch.manual_seed(0)

<torch._C.Generator at 0x7fcde417e4d0>

In [None]:
# Train on rbp_embeddings_esm_train.csv using the 1024 feature columns
# named "1" through "1024" (embeddings_size=1024, start_index=1).
do_training("/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_esm_train.csv",1024,1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[Iter 15038/20000] seg_loss = 0.584675
[Iter 15039/20000] seg_loss = 0.461054
[Iter 15040/20000] seg_loss = 0.485520
[Iter 15041/20000] seg_loss = 0.386467
[Iter 15042/20000] seg_loss = 0.421728
[Iter 15043/20000] seg_loss = 0.510776
[Iter 15044/20000] seg_loss = 0.551185
[Iter 15045/20000] seg_loss = 0.466964
[Iter 15046/20000] seg_loss = 0.449022
[Iter 15047/20000] seg_loss = 0.555205
[Iter 15048/20000] seg_loss = 0.462300
[Iter 15049/20000] seg_loss = 0.383192
[Iter 15050/20000] seg_loss = 0.403207
[Iter 15051/20000] seg_loss = 0.401098
[Iter 15052/20000] seg_loss = 0.461811
[Iter 15053/20000] seg_loss = 0.363368
[Iter 15054/20000] seg_loss = 0.475373
[Iter 15055/20000] seg_loss = 0.524357
[Iter 15056/20000] seg_loss = 0.445514
[Iter 15057/20000] seg_loss = 0.533797
[Iter 15058/20000] seg_loss = 0.495713
[Iter 15059/20000] seg_loss = 0.620389
[Iter 15060/20000] seg_loss = 0.426611
[Iter 15061/20000] seg_loss = 0.443202

In [None]:
# Evaluate the rolling checkpoint rbp_net_v1.pth on the esm test CSV for
# margin thresholds k = 0.6, 0.7, 0.8, 0.9 and 1.0. Every do_training call
# rewrites rbp_net_v1.pth, so run this right after the matching training cell.
for k in range(6,11,1):
  do_prediction("/content/drive/MyDrive/checkpoints/rbp_net_v1.pth","/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_esm_test.csv",1024,1,k/10)

Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_esm_test.csv

k = 0.6

f1_score:0.7063769225141528

accuracy:0.7172252341054707



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_esm_test.csv

k = 0.7

f1_score:0.708692666070282

accuracy:0.7127895515032036



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_esm_test.csv

k = 0.8

f1_score:0.708066311000062

accuracy:0.7060128141941844



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_esm_test.csv

k = 0.9

f1_score:0.7004079672181526

accuracy:0.6933218334154756



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Researc

# ProtT5

In [None]:
# Reset the torch seed, then train on rbp_embeddings_prott5_train.csv using
# the 1024 feature columns named "1" through "1024".
torch.manual_seed(0)
do_training("/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prott5_train.csv",1024,1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[Iter 15038/20000] seg_loss = 0.829699
[Iter 15039/20000] seg_loss = 0.637503
[Iter 15040/20000] seg_loss = 0.640070
[Iter 15041/20000] seg_loss = 0.637780
[Iter 15042/20000] seg_loss = 0.636031
[Iter 15043/20000] seg_loss = 0.685856
[Iter 15044/20000] seg_loss = 0.864311
[Iter 15045/20000] seg_loss = 0.627909
[Iter 15046/20000] seg_loss = 0.784051
[Iter 15047/20000] seg_loss = 0.808938
[Iter 15048/20000] seg_loss = 0.723041
[Iter 15049/20000] seg_loss = 0.562485
[Iter 15050/20000] seg_loss = 0.631817
[Iter 15051/20000] seg_loss = 0.629082
[Iter 15052/20000] seg_loss = 0.750048
[Iter 15053/20000] seg_loss = 0.582482
[Iter 15054/20000] seg_loss = 0.638859
[Iter 15055/20000] seg_loss = 0.679504
[Iter 15056/20000] seg_loss = 0.632409
[Iter 15057/20000] seg_loss = 0.729159
[Iter 15058/20000] seg_loss = 0.744107
[Iter 15059/20000] seg_loss = 0.841351
[Iter 15060/20000] seg_loss = 0.827387
[Iter 15061/20000] seg_loss = 0.759883

In [None]:
# Evaluate the rolling checkpoint rbp_net_v1.pth on the prott5 test CSV for
# margin thresholds k = 0.6, 0.7, 0.8, 0.9 and 1.0. Every do_training call
# rewrites rbp_net_v1.pth, so run this right after the matching training cell.
for k in range(6,11,1):
  do_prediction("/content/drive/MyDrive/checkpoints/rbp_net_v1.pth","/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prott5_test.csv",1024,1,k/10)

Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prott5_test.csv

k = 0.6

f1_score:0.716955391930005

accuracy:0.712419911286348



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prott5_test.csv

k = 0.7

f1_score:0.7122285006045399

accuracy:0.7025628388368654



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prott5_test.csv

k = 0.8

f1_score:0.6923644685629722

accuracy:0.6833415475603746



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prott5_test.csv

k = 0.9

f1_score:0.6575633548492406

accuracy:0.6546328240512568



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriop

# ProtTransAlbert

In [None]:
# Reset the torch seed, then train on rbp_embeddings_prottransalbert_train.csv
# using the 1024 feature columns named "1" through "1024".
torch.manual_seed(0)
do_training("/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prottransalbert_train.csv",1024,1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[Iter 15038/20000] seg_loss = 1.053537
[Iter 15039/20000] seg_loss = 0.878626
[Iter 15040/20000] seg_loss = 0.964215
[Iter 15041/20000] seg_loss = 0.835661
[Iter 15042/20000] seg_loss = 0.913311
[Iter 15043/20000] seg_loss = 0.918220
[Iter 15044/20000] seg_loss = 1.085905
[Iter 15045/20000] seg_loss = 0.854702
[Iter 15046/20000] seg_loss = 1.065205
[Iter 15047/20000] seg_loss = 1.179272
[Iter 15048/20000] seg_loss = 0.799900
[Iter 15049/20000] seg_loss = 0.755315
[Iter 15050/20000] seg_loss = 0.982337
[Iter 15051/20000] seg_loss = 0.727653
[Iter 15052/20000] seg_loss = 1.073614
[Iter 15053/20000] seg_loss = 1.028423
[Iter 15054/20000] seg_loss = 0.768033
[Iter 15055/20000] seg_loss = 0.881963
[Iter 15056/20000] seg_loss = 0.833432
[Iter 15057/20000] seg_loss = 0.968110
[Iter 15058/20000] seg_loss = 0.896405
[Iter 15059/20000] seg_loss = 1.187231
[Iter 15060/20000] seg_loss = 0.950198
[Iter 15061/20000] seg_loss = 0.914059

In [None]:
# Evaluate the rolling checkpoint rbp_net_v1.pth on the prottransalbert test
# CSV for margin thresholds k = 0.6, 0.7, 0.8, 0.9 and 1.0. Every do_training
# call rewrites rbp_net_v1.pth, so run this right after the matching training cell.
for k in range(6,11,1):
  do_prediction("/content/drive/MyDrive/checkpoints/rbp_net_v1.pth","/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prottransalbert_test.csv",1024,1,k/10)

Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prottransalbert_test.csv

k = 0.6

f1_score:0.7103923164963484

accuracy:0.6795219319862001



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prottransalbert_test.csv

k = 0.7

f1_score:0.7064407903550222

accuracy:0.6700344997535732



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prottransalbert_test.csv

k = 0.8

f1_score:0.6930777295186247

accuracy:0.65352390340069



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prottransalbert_test.csv

k = 0.9

f1_score:0.6627942685909127

accuracy:0.6249383932971907



Total hosts:  59
Total phages:  8116
Total dataset size: 8

# ProtTransBert

In [None]:
# Reset the torch seed, then train on rbp_embeddings_prottransbert_train.csv
# using the 1024 feature columns named "1" through "1024".
torch.manual_seed(0)
do_training("/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prottransbert_train.csv",1024,1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[Iter 15038/20000] seg_loss = 1.228175
[Iter 15039/20000] seg_loss = 1.003846
[Iter 15040/20000] seg_loss = 1.004243
[Iter 15041/20000] seg_loss = 1.011603
[Iter 15042/20000] seg_loss = 1.034536
[Iter 15043/20000] seg_loss = 1.062008
[Iter 15044/20000] seg_loss = 1.295435
[Iter 15045/20000] seg_loss = 0.926275
[Iter 15046/20000] seg_loss = 1.237411
[Iter 15047/20000] seg_loss = 1.171013
[Iter 15048/20000] seg_loss = 0.877594
[Iter 15049/20000] seg_loss = 0.936627
[Iter 15050/20000] seg_loss = 0.984529
[Iter 15051/20000] seg_loss = 0.894713
[Iter 15052/20000] seg_loss = 1.152079
[Iter 15053/20000] seg_loss = 1.000247
[Iter 15054/20000] seg_loss = 0.954779
[Iter 15055/20000] seg_loss = 0.996117
[Iter 15056/20000] seg_loss = 0.917230
[Iter 15057/20000] seg_loss = 1.026802
[Iter 15058/20000] seg_loss = 1.023797
[Iter 15059/20000] seg_loss = 1.368523
[Iter 15060/20000] seg_loss = 1.111694
[Iter 15061/20000] seg_loss = 1.018897

In [None]:
# Evaluate the rolling checkpoint rbp_net_v1.pth on the prottransbert test
# CSV for margin thresholds k = 0.6, 0.7, 0.8, 0.9 and 1.0. Every do_training
# call rewrites rbp_net_v1.pth, so run this right after the matching training cell.
for k in range(6,11,1):
  do_prediction("/content/drive/MyDrive/checkpoints/rbp_net_v1.pth","/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prottransbert_test.csv",1024,1,k/10)

Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prottransbert_test.csv

k = 0.6

f1_score:0.6992703160172143

accuracy:0.6663380975850173



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prottransbert_test.csv

k = 0.7

f1_score:0.6932847555586024

accuracy:0.6550024642681124



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prottransbert_test.csv

k = 0.8

f1_score:0.6701534828963841

accuracy:0.6336865450961064



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_prottransbert_test.csv

k = 0.9

f1_score:0.6263575899916911

accuracy:0.5947511089206505



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/c

# ProtXLNet

In [None]:
# Reset the torch seed, then train on rbp_embeddings_protxlnet_train.csv
# using the 1024 feature columns named "1" through "1024".
torch.manual_seed(0)
do_training("/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_protxlnet_train.csv",1024,1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[Iter 15038/20000] seg_loss = 0.965949
[Iter 15039/20000] seg_loss = 0.945883
[Iter 15040/20000] seg_loss = 0.998197
[Iter 15041/20000] seg_loss = 0.834972
[Iter 15042/20000] seg_loss = 0.917012
[Iter 15043/20000] seg_loss = 0.887336
[Iter 15044/20000] seg_loss = 1.029763
[Iter 15045/20000] seg_loss = 0.986781
[Iter 15046/20000] seg_loss = 1.043246
[Iter 15047/20000] seg_loss = 1.095380
[Iter 15048/20000] seg_loss = 0.828301
[Iter 15049/20000] seg_loss = 0.805347
[Iter 15050/20000] seg_loss = 1.010691
[Iter 15051/20000] seg_loss = 0.749197
[Iter 15052/20000] seg_loss = 1.000209
[Iter 15053/20000] seg_loss = 0.888437
[Iter 15054/20000] seg_loss = 0.987692
[Iter 15055/20000] seg_loss = 0.918273
[Iter 15056/20000] seg_loss = 0.910535
[Iter 15057/20000] seg_loss = 0.910194
[Iter 15058/20000] seg_loss = 1.014954
[Iter 15059/20000] seg_loss = 1.007102
[Iter 15060/20000] seg_loss = 0.860850
[Iter 15061/20000] seg_loss = 0.756554

In [None]:
# Evaluate the rolling checkpoint rbp_net_v1.pth on the protxlnet test CSV for
# margin thresholds k = 0.6, 0.7, 0.8, 0.9 and 1.0. Every do_training call
# rewrites rbp_net_v1.pth, so run this right after the matching training cell.
for k in range(6,11,1):
  do_prediction("/content/drive/MyDrive/checkpoints/rbp_net_v1.pth","/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_protxlnet_test.csv",1024,1,k/10)

Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_protxlnet_test.csv

k = 0.6

f1_score:0.7015753107463117

accuracy:0.6775505174963036



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_protxlnet_test.csv

k = 0.7

f1_score:0.7095094783669513

accuracy:0.6738541153277476



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_protxlnet_test.csv

k = 0.8

f1_score:0.6929953768713625

accuracy:0.6569738787580088



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_protxlnet_test.csv

k = 0.9

f1_score:0.6625801079202386

accuracy:0.6291276490882208



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyD

# SeqVec

In [None]:
# Reset the torch seed, then train on rbp_embeddings_seqvec_train.csv using
# the 1024 feature columns named "1" through "1024".
torch.manual_seed(0)
do_training("/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_seqvec_train.csv",1024,1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[Iter 15038/20000] seg_loss = 0.786977
[Iter 15039/20000] seg_loss = 0.536680
[Iter 15040/20000] seg_loss = 0.616498
[Iter 15041/20000] seg_loss = 0.538764
[Iter 15042/20000] seg_loss = 0.600502
[Iter 15043/20000] seg_loss = 0.533476
[Iter 15044/20000] seg_loss = 0.775292
[Iter 15045/20000] seg_loss = 0.620888
[Iter 15046/20000] seg_loss = 0.675591
[Iter 15047/20000] seg_loss = 0.725230
[Iter 15048/20000] seg_loss = 0.546032
[Iter 15049/20000] seg_loss = 0.538147
[Iter 15050/20000] seg_loss = 0.542613
[Iter 15051/20000] seg_loss = 0.476137
[Iter 15052/20000] seg_loss = 0.660292
[Iter 15053/20000] seg_loss = 0.575208
[Iter 15054/20000] seg_loss = 0.547040
[Iter 15055/20000] seg_loss = 0.654849
[Iter 15056/20000] seg_loss = 0.554262
[Iter 15057/20000] seg_loss = 0.661884
[Iter 15058/20000] seg_loss = 0.625158
[Iter 15059/20000] seg_loss = 0.759705
[Iter 15060/20000] seg_loss = 0.696401
[Iter 15061/20000] seg_loss = 0.550765

In [None]:
# Evaluate the rolling checkpoint rbp_net_v1.pth on the seqvec test CSV for
# margin thresholds k = 0.6, 0.7, 0.8, 0.9 and 1.0. Every do_training call
# rewrites rbp_net_v1.pth, so run this right after the matching training cell.
for k in range(6,11,1):
  do_prediction("/content/drive/MyDrive/checkpoints/rbp_net_v1.pth","/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_seqvec_test.csv",1024,1,k/10)

Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_seqvec_test.csv

k = 0.6

f1_score:0.690617534815918

accuracy:0.6988664366683095



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_seqvec_test.csv

k = 0.7

f1_score:0.6890101675989079

accuracy:0.6934450468210941



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_seqvec_test.csv

k = 0.8

f1_score:0.6784895150663057

accuracy:0.6798915722030557



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacteriophage_Research/preprocessed_data/rbp_embeddings_seqvec_test.csv

k = 0.9

f1_score:0.6673891533707274

accuracy:0.6649827501232134



Total hosts:  59
Total phages:  8116
Total dataset size: 8116
/content/drive/MyDrive/Bacterio