In [9]:
# Load libraries
import torch 
import pandas as pd 
import numpy as np
import json
import tokenizers

from torch import tensor
from torch.utils.data import DataLoader, Dataset
from transformers import  RobertaConfig
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.models import WordLevel, BPE
from tokenizers import pre_tokenizers, normalizers, Tokenizer
from tokenizers.normalizers import Lowercase, NFD
from tokenizers.pre_tokenizers import ByteLevel, Whitespace
from torch.utils.tensorboard import SummaryWriter



In [10]:
# Load settings
with open('settings.json', 'r') as inFile:
    settings = json.load(inFile)

# Set device: first CUDA GPU when available, otherwise CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load model
# SECURITY: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU still loads on a CPU-only machine.
# NOTE(review): 'best_model' appears to be a fully pickled module, so its class
# (printed as `Net` below) presumably has to be importable in this kernel; it is
# not defined in the cells shown here - confirm.
model = torch.load('best_model', map_location=device)
model.to(device)
model.eval()

Net(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(26, 1032, padding_idx=1)
      (position_embeddings): Embedding(512, 1032, padding_idx=1)
      (token_type_embeddings): Embedding(2, 1032)
      (LayerNorm): LayerNorm((1032,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1032, out_features=1032, bias=True)
              (key): Linear(in_features=1032, out_features=1032, bias=True)
              (value): Linear(in_features=1032, out_features=1032, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1032, out_features=1032, bias=True)
              (LayerNorm): LayerNorm((1032,), 

In [11]:
class CDR3Dataset(Dataset):
    """Dataset of CDR3ab sequences read from a CSV file, tokenized on access.

    The CSV path is taken from ``settings["file"]["train_data"]`` or
    ``settings["file"]["test_data"]``. The file must contain a ``CDR3ab``
    column plus the column(s) used as target.

    Args:
        settings: Configuration dict; only ``settings["file"]`` is read here.
        train: Read the training file when truthy, the test file otherwise.
        label: Target selector, one of ``LABEL_COLUMNS``. ``"multilabel"``
            stacks the columns in ``MULTILABEL_COLUMNS`` into one target vector;
            any other value uses the column of that name.
        tokenizer: ``tokenizers.Tokenizer`` whose model is ``WordLevel`` or
            ``BPE``. It is only used by ``__getitem__``.
        equal: When true, down-sample rows with label 1 and rows with label 0
            to the size of the rarest label value. Sampling is random and not
            seeded here.

    Raises:
        ValueError: If ``label`` is not in ``LABEL_COLUMNS``, or if ``equal`` is
            requested together with the ``"num_label"`` label.
    """

    # Values accepted for ``label``.
    LABEL_COLUMNS = ["activatedby_HA", "activatedby_NP", "activatedby_HCRT", "activated_any", "multilabel", "negative"]
    # Columns that make up the target vector when ``label == "multilabel"``.
    MULTILABEL_COLUMNS = ["activatedby_HA", "activatedby_NP", "activatedby_HCRT"]

    def __init__(self, settings:dict, train:bool = True, label:str = None, tokenizer:"tokenizers.Tokenizer"=None, equal:bool=False) -> None:
        # Checked before the whitelist: "num_label" is not in LABEL_COLUMNS, so
        # testing it afterwards could never trigger this more specific message.
        if equal and label == "num_label":
            raise ValueError("Equal size sets only allowed for binary classifications. num_label is multiclass.")
        if label not in self.LABEL_COLUMNS:
            raise ValueError("Invalid label type. Expected one of %s" % self.LABEL_COLUMNS)
        self.label = label

        split_key = "train_data" if train else "test_data"
        self.path_to_data = settings["file"][split_key]
        self.data = pd.read_csv(self.path_to_data)

        if equal:
            # Balance the two classes by sampling each down to the size of the
            # rarest label value; ignore_index restores a 0..n-1 index.
            min_sample = np.min(self.data[self.label].value_counts())
            data_pos = self.data[self.data[self.label] == 1].sample(min_sample)
            data_neg = self.data[self.data[self.label] == 0].sample(min_sample)
            self.data = pd.concat([data_pos, data_neg], ignore_index=True)

        if label == "multilabel":
            # Each of the target columns is binary; there is one output per column.
            self.labels = [0, 1]
            self.n_labels = len(self.MULTILABEL_COLUMNS)
        else:
            self.labels = np.unique(self.data[[self.label]])
            self.n_labels = len(self.labels)

        # Longest sequence in characters; used as the fixed padding length.
        self.max_len = self.data.CDR3ab.str.len().max()

        self.tokenizer = tokenizer

    def __getitem__(self, index:int):
        """Return the encoded sequence and target for the row at position ``index``.

        Returns:
            dict with ``"ids"`` and ``"attention_mask"`` (long tensors padded to
            ``max_len``), ``"CDR3ab"`` (the raw sequence string) and
            ``"target"`` (long tensor).

        Raises:
            TypeError: If the tokenizer is missing or its model is neither
                ``WordLevel`` nor ``BPE``.
        """
        sequence = self.data.CDR3ab.iloc[index]

        tokenizer_model = getattr(self.tokenizer, "model", None)
        if isinstance(tokenizer_model, tokenizers.models.WordLevel):
            # Word-level vocabulary: separate the characters with spaces so
            # each one is encoded as its own token.
            text = " ".join(sequence)
        elif isinstance(tokenizer_model, tokenizers.models.BPE):
            text = sequence
        else:
            raise TypeError(
                "CDR3Dataset needs a tokenizer with a WordLevel or BPE model, got %s"
                % type(tokenizer_model).__name__
            )

        # Padding is configured per access rather than once in __init__ because
        # the tokenizer object may be shared by datasets with different max_len.
        self.tokenizer.enable_padding(length=self.max_len)
        encodings = self.tokenizer.encode(text)
        item = {
            "ids": tensor(encodings.ids, dtype=torch.long),
            "attention_mask": tensor(encodings.attention_mask, dtype=torch.long),
            "CDR3ab": sequence,
        }

        if self.label == "multilabel":
            item["target"] = tensor(self.data[self.MULTILABEL_COLUMNS].iloc[index], dtype=torch.long)
        else:
            item["target"] = tensor(self.data[self.label].iloc[index], dtype=torch.long)
        return item

    def __len__(self):
        """Number of rows currently held (after optional balancing)."""
        return len(self.data)

In [12]:
# Create the TensorBoard summary writer; its log directory is runs/test/
writer = SummaryWriter("runs/test/")

In [13]:
# Create the tokenizer with the tokenizers library.
# Tokenizer.from_file is a static constructor that rebuilds the whole tokenizer
# (model included) from the serialized file, so no placeholder Tokenizer(BPE())
# or Tokenizer(WordLevel()) instance has to be created first.
tokenizer_kind = settings["param"]["tokenizer"]
if tokenizer_kind not in ("BPE", "WL"):
    raise ValueError("Unknown tokenizer. Tokenizer argument must be BPE or WL.")
tokenizer = Tokenizer.from_file(settings["tokenizer"][tokenizer_kind])
tokenizer.enable_padding()

# Create training and test datasets; both share the same tokenizer object
dataset_params = {"label": settings["database"]["label"], "tokenizer": tokenizer}
train_data = CDR3Dataset(settings, train=True, equal=False, **dataset_params)
test_data = CDR3Dataset(settings, train=False, **dataset_params)

# Create dataloaders.
# shuffle=True is applied to both loaders, so next(iter(loader)) in the cells
# below yields a random batch of `batch_size` sequences.
loader_params = {
    'batch_size': 20,
    'shuffle': True,
    'num_workers': 0,
}
train_dataloader = DataLoader(train_data, **loader_params)
test_dataloader = DataLoader(test_data, **loader_params)

In [14]:
# Test tokenizers: load the BPE and WordLevel tokenizers side by side.
# Tokenizer.from_file is a static constructor, so it is called on the class
# directly instead of on a throwaway instance.
bpe_tokenizer = Tokenizer.from_file(settings["tokenizer"]["BPE"])
wl_tokenizer = Tokenizer.from_file(settings["tokenizer"]["WL"])
bpe_tokenizer.enable_padding()
wl_tokenizer.enable_padding()

# Draw one shuffled training batch and keep its first CDR3ab sequence as the example
sample_train = next(iter(train_dataloader))
cdr = sample_train['CDR3ab'][0]
cdr

'CAASEGAQKLVF_CASSFPLRGIYEQYF'

In [30]:
# Encode the example sequence with the BPE tokenizer and report the result
cdr = sample_train['CDR3ab'][0]
encoded = bpe_tokenizer.encode(cdr)
# Round trip from ids back to text (kept for inspection, not printed here)
decoded = bpe_tokenizer.decode(encoded.ids)
print(
    "Ids: " + str(encoded.ids),
    "Tokens: " + str(encoded.tokens),
    "Vocab size:" + str(bpe_tokenizer.get_vocab_size()),
    sep="\n",
)


Ids: [388, 6, 403, 5, 1554, 1018]
Tokens: ['Ġcavi', 'a', 'kllf', '_', 'casslap', 'seklff']
Vocab size:3817


In [18]:
# Encode the same sequence with the WordLevel tokenizer; the characters are
# joined with spaces before encoding
spaced_cdr = " ".join(cdr)
encoded = wl_tokenizer.encode(spaced_cdr)
decoded = wl_tokenizer.decode(encoded.ids)
print(
    "Ids: " + str(encoded.ids),
    "Tokens: " + str(encoded.tokens),
    sep="\n",
)

Ids: [9, 7, 7, 5, 17, 6, 7, 12, 19, 11, 16, 8, 15, 9, 7, 5, 5, 8, 22, 11, 18, 6, 21, 13, 17, 12, 13, 8]
Tokens: ['c', 'a', 'a', 's', 'e', 'g', 'a', 'q', 'k', 'l', 'v', 'f', '_', 'c', 'a', 's', 's', 'f', 'p', 'l', 'r', 'g', 'i', 'y', 'e', 'q', 'y', 'f']


In [31]:
# Draw one shuffled training batch (its size is set by loader_params) and predict on it
sample_train = next(iter(train_dataloader))

# Predict
torch.cuda.empty_cache()
model.eval()
ids = sample_train["ids"].to(device)
attention_mask = sample_train["attention_mask"].to(device)
# Inference only: no_grad avoids recording the autograd graph for this forward pass
with torch.no_grad():
    outs = model(ids, attention_mask)

# Bring to CPU / numpy. The targets are not fed to the model, so they are
# converted straight from the batch without a round trip through the device.
targets = sample_train["target"].numpy()
outs = np.around(outs.cpu().numpy(), decimals=3)

# NOTE(review): the column names below assume the datasets were built with the
# "multilabel" label, i.e. three target columns and three model outputs - confirm.
cols = ["activatedby_HA", "activatedby_NP", "activatedby_HCRT"]
cols_prob = ["PROB_" + col for col in cols]

# Create dataframe of model outputs
outs_df = pd.DataFrame(outs, columns=cols_prob)

# Create dataframe of targets, with the raw sequence as first column
targets_df = pd.DataFrame(targets, columns=cols)
targets_df.insert(0, 'CDR3ab', sample_train['CDR3ab'])

# Concat targets and outputs side by side for comparison
comp_df = pd.concat([targets_df, outs_df], axis=1)
comp_df



Unnamed: 0,CDR3ab,activatedby_HA,activatedby_NP,activatedby_HCRT,PROB_activatedby_HA,PROB_activatedby_NP,PROB_activatedby_HCRT
0,CAESRYNTDKLIF_CASRDYVGGGTEAFF,0,0,0,0.15,0.0,0.0
1,CIVIQTWAVEKLTF_CASSPTGVSYEQYF,0,0,0,0.002,0.0,0.0
2,CAVGEGNNDMRF_CASSPGVGGNQPQHF,0,1,0,0.0,0.999,0.0
3,CAVRDNDYKLSF_CASSQDALVTDTQYF,0,0,0,0.001,0.0,0.0
4,CAVRLDSWGKLQF_CASSPLSSGGNTIYF,0,0,0,0.0,0.0,0.0
5,CATGVYGQNFVF_CASSLGDRGQYEQYF,0,1,0,0.0,0.999,0.0
6,CAYRRRDDKIIF_CASSLAGAHTEAFF,0,0,0,0.0,0.036,0.0
7,CAVSVGGNKLVF_CASSSSGSRNQPQHF,1,1,0,0.987,0.962,0.0
8,CATALYNTDKLIF_CASSQGATGGTNYGYTF,1,0,0,0.986,0.001,0.001
9,CAMSSGGSNYKLTF_CASSPGWGNQPQHF,0,0,0,0.003,0.0,0.0


In [33]:

# Draw one shuffled test batch (its size is set by loader_params) and predict on it
sample_test = next(iter(test_dataloader))

# Predict
model.eval()
ids = sample_test["ids"].to(device)
attention_mask = sample_test["attention_mask"].to(device)
# Inference only: no_grad avoids recording the autograd graph for this forward pass
with torch.no_grad():
    outs = model(ids, attention_mask)

# Bring to CPU / numpy. The targets are not fed to the model, so they are
# converted straight from the batch without a round trip through the device.
targets = sample_test["target"].numpy()
outs = np.around(outs.cpu().numpy(), decimals=3)

# NOTE(review): the column names below assume the datasets were built with the
# "multilabel" label, i.e. three target columns and three model outputs - confirm.
cols = ["activatedby_HA", "activatedby_NP", "activatedby_HCRT"]
cols_prob = ["PROB_" + col for col in cols]

# Create dataframe of model outputs
outs_df = pd.DataFrame(outs, columns=cols_prob)

# Create dataframe of targets, with the raw sequence as first column
targets_df = pd.DataFrame(targets, columns=cols)
targets_df.insert(0, 'CDR3ab', sample_test['CDR3ab'])

# Concat targets and outputs side by side for comparison
comp_df = pd.concat([targets_df, outs_df], axis=1)
comp_df



Unnamed: 0,CDR3ab,activatedby_HA,activatedby_NP,activatedby_HCRT,PROB_activatedby_HA,PROB_activatedby_NP,PROB_activatedby_HCRT
0,CAVNYLGGKLIF_CASSPGTGGNSPLHF,0,1,0,0.84,0.992,0.0
1,CALITGSARQLTF_CASSLTSGETNEQFF,0,0,0,0.0,0.127,0.0
2,CALSYSNYQLIW_CASSEGTGDYGYTF,0,0,0,0.0,0.029,0.0
3,CAVNARLMF_CASSTQGAGEAFF,0,0,0,0.002,0.0,0.0
4,CAVTSNFGNEKLTF_CASSFRQGSSYEQYF,0,0,0,0.001,0.0,0.0
5,CAPRRGAQKLVF_CASSELVADTQYF,0,0,0,0.001,0.0,0.0
6,CAPRGSGNTPLVF_CASSTVQGASEKLFF,0,0,0,0.001,0.0,0.0
7,CALGDTGRRALTF_CASSTGTGGYNEQFF,0,0,0,0.002,0.0,0.0
8,CIVRRVYGGSQGNLIF_CASSLGGQQGGDTQYF,0,0,0,0.009,0.0,0.0
9,CAVSTSSNYKLTF_CASSSQTAGANVLTF,0,0,0,0.0,0.0,0.0


In [41]:
# Train data: number of rows per value of the activated_by column
train_data.data["activated_by"].value_counts()

negative      2783
NP136          766
HA69           535
HCRT           269
HCRT|NP136      33
HA69|NP136      28
HA69|HCRT        6
Name: activated_by, dtype: int64

In [26]:
# Number of rows in the training data
len(train_data.data)

4420

In [38]:
# Test data: number of rows per value of the activated_by column
test_data.data["activated_by"].value_counts()

negative      1174
NP136          365
HA69           192
HCRT           143
HA69|NP136      14
HCRT|NP136       6
Name: activated_by, dtype: int64

In [27]:
# Number of rows in the test data
len(test_data.data)

1894

In [48]:
# Distribution of the activatedby_NP label in the test data.
# NOTE(review): the output recorded for this cell (1682 ones out of 1894 rows)
# does not match the activated_by counts recorded earlier, where only
# 365 + 14 + 6 = 385 rows mention NP136. The cells were executed out of order,
# so the data may have changed in between - verify the label column.
test_data.data["activatedby_NP"].value_counts()

1    1682
0     212
Name: activatedby_NP, dtype: int64