In [1]:
%matplotlib inline 

import pandas as pd
import numpy as np
import editdistance as ed
from tqdm import tqdm_notebook
from matplotlib import pyplot as plt
from collections import Counter
import time

import torch
from torch import nn
from torch import optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.metrics import accuracy_score, f1_score, classification_report

import uuid

import utils
import importlib
importlib.reload(utils)
from utils import *

In [2]:
# Positive examples come from the stimulated sample, negatives from the
# unstimulated one (as indicated by the file names).
df_pos = load_dataset("data/GSM3155092_P01_CRVstim_CD8_beta.txt.gz")
df_neg = load_dataset("data/GSM3155090_P01_unstim_CD8_beta.txt.gz")

# NOTE(review): filter_specific lives in utils; presumably it removes from the
# negative set the sequences that also appear in the positive set -- confirm.
df_neg = filter_specific(df_pos, df_neg)

# Report the number of rows left in each set, one count per line.
print(len(df_pos), len(df_neg), sep="\n")

 - Dropped 3106 duplicates
 - Dropped 62656 duplicates
 - Removed 15566 sequences
80214
495796


In [131]:
# model_1.pth
# class CMVModel(nn.Module):
    
#     def __init__(self):
#         super(CMVModel, self).__init__()
        
#         self._hidden_size = 64
#         self.rnn = nn.GRU(input_size=len(oh_dict["A"]), hidden_size=self._hidden_size, num_layers=2, bidirectional=True, dropout=0, batch_first=True)
#         self.final = nn.Sequential(nn.LeakyReLU(),
#                                    nn.BatchNorm1d(self._hidden_size),
#                                    nn.Linear(self._hidden_size, self._hidden_size), 
#                                    nn.LeakyReLU(),
#                                    nn.BatchNorm1d(self._hidden_size),
#                                    nn.Linear(self._hidden_size, 1), 
#                                    nn.Sigmoid())
#         torch.nn.init.kaiming_uniform_(self.final[2].weight, nonlinearity="leaky_relu")
#         torch.nn.init.kaiming_uniform_(self.final[5].weight, nonlinearity="leaky_relu")
        
    
#     def forward(self, batch):
#         x = self.rnn(batch)[1]
#         x = (x[-2, :, :] + x[-1, :, :]).mul(.5)
#         x = self.final(x)
#         return x


# model_2.pth
# class CMVModel(nn.Module):
    
#     def __init__(self):
#         super(CMVModel, self).__init__()
        
#         self._hidden_size = 64
#         self.rnn = nn.GRU(input_size=len(oh_dict["A"]), hidden_size=self._hidden_size, num_layers=2, bidirectional=True, dropout=0.0, batch_first=True)
# #         self.final = nn.Sequential(nn.LeakyReLU(),
# #                                    nn.BatchNorm1d(self._hidden_size),
# #                                    nn.Linear(self._hidden_size, 1), 
# #                                    nn.Sigmoid())
#         self.final = nn.Sequential(nn.Linear(self._hidden_size, 1), 
#                                    nn.Sigmoid())
        
    
#     def forward(self, batch):
#         x = self.rnn(batch)[1]
#         x = (x[-2, :, :] + x[-1, :, :]).mul(.5)
#         x = self.final(x)
#         return x


class CMVModel(nn.Module):
    """Bidirectional GRU binary classifier for encoded sequences.

    The final hidden states of the last GRU layer (forward and backward
    directions) are averaged and fed through a linear layer followed by a
    sigmoid, producing one probability per input sequence.

    Args:
        hidden_size: Number of hidden units per GRU direction.
        num_layers: Number of stacked GRU layers.
        rnn_dropout: Dropout value forwarded to ``nn.GRU``.
        input_size: Number of features per time step. When ``None`` (the
            default) it is read from the notebook-level ``oh_dict`` as
            ``len(oh_dict["A"])``, which is the original behaviour. Passing
            it explicitly removes the dependency on that global.
    """

    def __init__(self, hidden_size=32, num_layers=1, rnn_dropout=0, input_size=None):
        super(CMVModel, self).__init__()

        if input_size is None:
            # Backward-compatible default: derive the feature width from the
            # global encoding dictionary (must already be defined).
            input_size = len(oh_dict["A"])

        self._hidden_size = hidden_size
        self.rnn = nn.GRU(
            input_size=input_size,
            hidden_size=self._hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            dropout=rnn_dropout,
            batch_first=True,
        )
        self.final = nn.Sequential(
            nn.Linear(self._hidden_size, 1),
            nn.Sigmoid(),
        )

    def forward(self, batch):
        """Return one sigmoid output per sequence.

        Args:
            batch: Batched, batch-first input tensor for the GRU, i.e.
                ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``[0, 1]``.
        """
        # self.rnn(...) returns (output, h_n); only h_n is used. h_n is laid
        # out as (num_layers * 2, batch, hidden) for a bidirectional GRU.
        x = self.rnn(batch)[1]
        # The last two entries are the forward and backward final states of
        # the top layer; average them into a single representation.
        x = (x[-2, :, :] + x[-1, :, :]).mul(.5)
        x = self.final(x)
        return x

In [130]:
seed = 42
# Actually apply the seed so the random train/test split (and any later
# numpy / torch randomness in this kernel) is reproducible.
np.random.seed(seed)
torch.manual_seed(seed)

oh_dict = load_dict()

TEST_SIZE = 10000


def split_train_test(df, test_size):
    """Randomly hold out ``test_size`` rows of ``df``.

    The split is purely positional, so it is correct regardless of what the
    DataFrame index looks like (non-contiguous or duplicated labels).
    Selecting by position and then dropping by label, as was done before,
    only matches when the index is exactly ``0..len(df)-1``.

    Args:
        df: DataFrame to split.
        test_size: Number of rows to place in the test part.

    Returns:
        Tuple ``(train_df, test_df)``.
    """
    test_positions = np.random.choice(len(df), test_size, replace=False)
    train_mask = np.ones(len(df), dtype=bool)
    train_mask[test_positions] = False
    return df.iloc[train_mask], df.iloc[test_positions, :]


df_pos_train, df_pos_test = split_train_test(df_pos, TEST_SIZE)
df_neg_train, df_neg_test = split_train_test(df_neg, TEST_SIZE)

# Sanity check: train and test together must account for every row.
assert len(df_pos_train) + len(df_pos_test) == len(df_pos)
assert len(df_neg_train) + len(df_neg_test) == len(df_neg)

print("Train DFs sizes:", len(df_pos_train), len(df_neg_train))
print("Test DFs sizes:", len(df_pos_test), len(df_neg_test))


# Encode the CDR3 sequences of every split with the loaded dictionary.
X_pos_train = seq2vec(df_pos_train["CDR3.sequence"], oh_dict)
X_pos_test = seq2vec(df_pos_test["CDR3.sequence"], oh_dict)
X_neg_train = seq2vec(df_neg_train["CDR3.sequence"], oh_dict)
X_neg_test = seq2vec(df_neg_test["CDR3.sequence"], oh_dict)

print("Train Tensors sizes:", len(X_pos_train), len(X_neg_train))
print("Test Tensors sizes:", len(X_pos_test), len(X_neg_test))

Train DFs sizes: 70214 485796
Test DFs sizes: 10000 10000
Train Tensors sizes: 70214 485796
Test Tensors sizes: 10000 10000


In [132]:
#
# Update the model
#
# Build a fresh model with the default hyper-parameters, then move it to the GPU.
model = CMVModel()
model = model.to("cuda")

In [167]:
# Optimisation hyper-parameters (alternative learning rate kept for reference: 0.0003).
BATCH_SIZE = 64
LR = 0.001

# Randomly subsample the negative training tensors so that both classes
# contribute the same number of examples.
indices = np.random.choice(len(X_neg_train), len(X_pos_train), replace=False)
X_neg_train_ss = X_neg_train[indices].contiguous()

# Each dataset item is a (negative, positive) pair; axes 1 and 2 are swapped
# before the tensors are handed to the loaders.
dl_train = DataLoader(
    TensorDataset(X_neg_train_ss.transpose(2, 1), X_pos_train.transpose(2, 1)),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
    drop_last=False,
)
dl_test = DataLoader(
    TensorDataset(X_neg_test.transpose(2, 1), X_pos_test.transpose(2, 1)),
    batch_size=2048,
    shuffle=False,
    num_workers=1,
    pin_memory=True,
    drop_last=False,
)

criterion = nn.BCELoss()
optimiser = optim.Adam(model.parameters(), lr=LR)

In [137]:
def run_epoch(model, criterion, optimiser, dl, train_mode=True):
    """Run one full pass over ``dl``, optionally updating the model.

    Every item of ``dl`` is a ``(batch_neg, batch_pos)`` pair that is expanded
    into one or more ``(batch, y_true)`` sub-batches by ``create_batches``.
    The losses of those sub-batches are summed and, in training mode, a single
    optimiser step is taken per item of ``dl``.

    Args:
        model: ``nn.Module`` to train or evaluate. Inputs are moved to the
            device of its first parameter (``"cuda"`` if it has none).
        criterion: Loss callable taking ``(y_pred, y_true)``.
        optimiser: Optimiser for ``model``; only used when ``train_mode``.
        dl: Iterable yielding ``(batch_neg, batch_pos)`` pairs.
        train_mode: ``True`` to train, ``False`` to evaluate without
            gradient tracking.

    Returns:
        Tuple ``(loss_list, pred_list, true_list, seconds)``: the summed loss
        per item of ``dl`` (floats), the rounded predictions and the targets
        per sub-batch (numpy arrays), and the wall-clock duration of the pass.
    """
    model.train(train_mode)

    # Follow the model's device instead of assuming a GPU is present.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    loss_list = []
    pred_list = []
    true_list = []

    start = time.time()
    # Disable autograd during evaluation: no graph is needed there.
    with torch.set_grad_enabled(train_mode):
        for batch_neg, batch_pos in dl:
            if train_mode:
                optimiser.zero_grad()
            loss = None

            for batch, y_true in create_batches(batch_neg, batch_pos):
                y_pred = model(batch.to(device)).reshape((-1,))
                step_loss = criterion(y_pred, y_true.to(device))
                loss = step_loss if loss is None else loss + step_loss

                pred_list.append(y_pred.detach().cpu().round().numpy())
                true_list.append(y_true.detach().cpu().numpy())

            if loss is None:
                # create_batches produced nothing for this item: there is
                # nothing to back-propagate or to record.
                continue

            if train_mode:
                loss.backward()
                optimiser.step()

            loss_list.append(loss.item())
    end = time.time()

    return loss_list, pred_list, true_list, end - start

In [168]:
EPOCHS = 5

for i_epoch in range(1, EPOCHS+1):
    # Training pass: updates the model and collects predictions on the fly.
    loss_list, pred_list, true_list, exec_time = run_epoch(model, criterion, optimiser, dl_train, True)
    pred_list = np.concatenate(pred_list).ravel()
    true_list = np.concatenate(true_list).ravel()
    cr = classification_report(true_list, pred_list, target_names=["CMV-", "CMV+"])

    print("Epoch:{0:4}".format(i_epoch))
    # Fixed-point formatting: a precision without a type code switches to
    # scientific notation as soon as the duration reaches 10 seconds.
    print("  - Train (in {0:.2f}s): ".format(exec_time))
    # Only the leading part of the report (header and per-class rows) is shown.
    print(cr[:162])

    # Evaluation pass on the held-out data: no parameter updates.
    loss_list, pred_list, true_list, exec_time = run_epoch(model, criterion, optimiser, dl_test, False)
    pred_list = np.concatenate(pred_list).ravel()
    true_list = np.concatenate(true_list).ravel()
    cr = classification_report(true_list, pred_list, target_names=["CMV-", "CMV+"])
    print("  - Test (in {0:.2f}s): ".format(exec_time))
    print(cr[:162])
    print()
    # Overwrite the checkpoint after every epoch.
    torch.save(model.state_dict(), "./model.pth")

Epoch:   1
  - Train (in 8.8s): 
              precision    recall  f1-score   support

        CMV-       0.56      0.58      0.57     70214
        CMV+       0.57      0.55      0.56     70214
  - Test (in 0.73s): 
              precision    recall  f1-score   support

        CMV-       0.54      0.56      0.55     10000
        CMV+       0.54      0.52      0.53     10000

Epoch:   2
  - Train (in 8.6s): 
              precision    recall  f1-score   support

        CMV-       0.56      0.58      0.57     70214
        CMV+       0.57      0.55      0.56     70214
  - Test (in 0.64s): 
              precision    recall  f1-score   support

        CMV-       0.53      0.55      0.54     10000
        CMV+       0.54      0.52      0.53     10000

Epoch:   3
  - Train (in 8.5s): 
              precision    recall  f1-score   support

        CMV-       0.57      0.58      0.57     70214
        CMV+       0.57      0.56      0.56     70214
  - Test (in 0.52s): 
              prec

array([0., 1., 0., ..., 1., 0., 1.], dtype=float32)