Create inputs CDR3-Epitope for language translation
==========================================

The program needs three files for each of the two languages (lang1 and lang2):
1. train 
2. dev (most likely validation set for setting hyperparams?)
3. test

The above files are suffixed with a language extension (e.g. "en" and "vi") and have one sentence per line. The lines are paired (at least) in the training set. 

_train.en_
"That report was written by 620 scientists from 40 countries ."
_train.vi_
"Nghiên cứu được viết bởi 620 nhà khoa học từ 40 quốc gia khác nhau ."

In addition, it also needs a vocabulary file for lang1 and for lang2.


In [1]:
from ipywidgets import widgets
import tensorflow as tf
import r2python
import numpy as np
import platform
import pandas as pd
if platform.system() != 'Darwin':
    import matplotlib as mpl
    mpl.use('Agg')
import matplotlib.pyplot as plt
from time import strftime
from sklearn import metrics
import os
import  csv
# import pixiedust

We will split each CDR3 and antigen (epitope) sequence into overlapping k-mers with stride = 1. Each output line ends with the token ".".

In [2]:
def _sequence_to_kmers(sequence, kmer_len):
    """Return the stride-1 k-mers of `sequence` followed by the "." end token."""
    # A sequence shorter than kmer_len produces an empty range, i.e. just ["."].
    kmers = [sequence[start:start + kmer_len]
             for start in range(len(sequence) - kmer_len + 1)]
    return kmers + ["."]


def _write_space_separated(filename, rows):
    """Write each row of tokens to `filename` as one space-separated line."""
    # newline="" lets the csv writer control line endings ("\n" on every OS).
    with open(filename, "w", newline="") as f:
        csv.writer(f, lineterminator='\n', delimiter=' ').writerows(rows)


def create_antigen_cdr3b_kmers(
    train_set_filename,
    outputDir=".",
    prefix="train",
    kmer_len=3,
    vocab_CDR3b = [],
    vocab_epitope = [],
    min_CDR3b_len=None,
    max_CDR3b_len=None,
    # min_CDR3a_len=7,
    # max_CDR3a_len=17,
    min_epitope_len=None,
    max_epitope_len=None
):
    """
    Create paired k-mer "sentences" for CDR3b and epitope sequences.

    The input table is read as tab-separated text and must contain the
    columns ``CDR3b`` and ``Epitope``. Rows missing either value are dropped,
    then only the first row for each distinct CDR3b is kept, then the optional
    length filters are applied cumulatively. Each remaining sequence is split
    into overlapping k-mers (stride 1) terminated by a "." token.

    Four files are written into ``outputDir``:

    * ``{prefix}.cdr3b``          - one CDR3b k-mer sentence per line
    * ``{prefix}.epitope``        - the paired epitope k-mer sentence per line
    * ``{prefix}_vocab.cdr3b``    - ``<unk>``, ``<s>``, ``</s>`` then the CDR3b tokens
    * ``{prefix}_vocab.epitopes`` - same, for the epitope tokens

    Parameters
    ----------
    train_set_filename : path of the tab-separated input table.
    outputDir : existing directory that receives the output files.
    prefix : file-name prefix, e.g. "train" or "test".
    kmer_len : length of each k-mer.
    vocab_CDR3b, vocab_epitope : tokens to merge into the vocabulary built
        from this data set (e.g. the training vocabulary when building the
        test set). The arguments are not modified.
    min_CDR3b_len, max_CDR3b_len : inclusive CDR3b length bounds, or None.
    min_epitope_len, max_epitope_len : inclusive epitope length bounds, or None.

    Returns
    -------
    (cdr3s_kmers, epitopes_kmers, vocab_CDR3b, vocab_epitope)
        The two lists of k-mer sentences (lists of tokens) and the two merged
        vocabularies as sorted lists, without the special tokens.
    """
    # `sep` must be passed by keyword: pandas >= 2.0 rejects it positionally.
    df_nu = pd.read_csv(train_set_filename, sep="\t")

    # A pair with a missing sequence cannot be tokenised (len() would fail).
    df_select = df_nu.dropna(subset=['CDR3b', 'Epitope'])
    df_select = df_select[~df_select['CDR3b'].duplicated(keep='first')]

    # Apply every requested bound on top of the previous ones. The earlier
    # version restarted from the unfiltered frame for the epitope minimum and
    # thereby discarded the CDR3b filters.
    length_bounds = (
        ('CDR3b', min_CDR3b_len, max_CDR3b_len),
        ('Epitope', min_epitope_len, max_epitope_len),
    )
    for column, min_len, max_len in length_bounds:
        if min_len is not None:
            df_select = df_select[df_select[column].str.len() >= min_len]
        if max_len is not None:
            df_select = df_select[df_select[column].str.len() <= max_len]

    print("Dataset size:")
    print(df_select.shape)

    # Both lists come from the same filtered frame, so line i of the .cdr3b
    # file is paired with line i of the .epitope file.
    cdr3s_kmers = [_sequence_to_kmers(cdr3, kmer_len)
                   for cdr3 in df_select['CDR3b'].tolist()]
    epitopes_kmers = [_sequence_to_kmers(epitope, kmer_len)
                      for epitope in df_select['Epitope'].tolist()]

    _write_space_separated(f'{outputDir}/{prefix}.cdr3b', cdr3s_kmers)
    _write_space_separated(f'{outputDir}/{prefix}.epitope', epitopes_kmers)

    # Create the vocabularies. Sorting makes the token order (and therefore
    # the vocab files) reproducible; a bare set() order changes between runs.
    special_tokens = ["<unk>", "<s>", "</s>"]

    vocab_CDR3b = sorted(
        {token for sentence in cdr3s_kmers for token in sentence} | set(vocab_CDR3b)
    )
    _write_space_separated(
        f'{outputDir}/{prefix}_vocab.cdr3b',
        [[token] for token in special_tokens + vocab_CDR3b],
    )

    vocab_epitope = sorted(
        {token for sentence in epitopes_kmers for token in sentence} | set(vocab_epitope)
    )
    _write_space_separated(
        f'{outputDir}/{prefix}_vocab.epitopes',
        [[token] for token in special_tokens + vocab_epitope],
    )

    return cdr3s_kmers, epitopes_kmers, vocab_CDR3b, vocab_epitope

In [3]:
# Location of the input tables and of the generated NMT input files.
# NOTE(review): this is a machine-specific absolute path; point DATA_DIR at
# your own checkout before running.
DATA_DIR = "/Users/avi/project/code/deeplearning/antigen_recognition/data"
KMER_LEN = 2

outputDir = os.path.join(DATA_DIR, "nmt_inps_2mer")

# exist_ok avoids the check-then-create race and is a no-op on re-runs.
os.makedirs(outputDir, exist_ok=True)

# Training set: only the vocabularies are needed afterwards.
_, _, vocab_train_cdr3b, vocab_train_epitope = create_antigen_cdr3b_kmers(
    train_set_filename=os.path.join(DATA_DIR, "vdjdb_B.tsv"),
    outputDir=outputDir,
    kmer_len=KMER_LEN,
)

# Test set: the training vocabularies are passed in so they are merged into
# the vocabulary files written for the test set.
out_test = create_antigen_cdr3b_kmers(
    train_set_filename=os.path.join(DATA_DIR, "CDR3a_CDR3b_Epitope.tsv"),
    outputDir=outputDir,
    prefix="test",
    kmer_len=KMER_LEN,
    vocab_CDR3b=vocab_train_cdr3b,
    vocab_epitope=vocab_train_epitope,
)


Dataset size:
(23555, 2)
Dataset size:
(5857, 3)


In [5]:
%%bash -s "$outputDir"
# Split the paired test files into the first 3000 lines and the remainder.
# $1 is the output directory passed in by the cell magic.
# `tail -n +K` starts AT line K, so the remainder must begin at line 3001;
# starting at 3000 would put line 3000 in both halves.
head -n 3000 "$1/test.epitope" > "$1/test_3000.epitope"
tail -n +3001 "$1/test.epitope" > "$1/test_end.epitope"

head -n 3000 "$1/test.cdr3b" > "$1/test_3000.cdr3b"
tail -n +3001 "$1/test.cdr3b" > "$1/test_end.cdr3b"

In [4]:
# print(len(out_train))
# print(len(vocab_train_cdr3b))
# print(len(out_test[3]))

# Scratch check of the "one token per line" vocabulary format: build a small
# list with repeated values and dump it to temp.txt, one element per row.
aa = [1, 2, 3, 4, 4, 4, 5] + []
set(aa)

with open("temp.txt", "w") as scratch_file:
    scratch_writer = csv.writer(scratch_file, lineterminator='\n', delimiter=' ')
    scratch_writer.writerows([value] for value in aa)

Todos: 

+ Remove C and A. So that the training and testing set are good. 
+ create 2mer mode
+ Make inference with current model
+ Change the loss function
+ Try attention model