### NetMHCpan_Analysis

This notebook post-processes the output produced by NetMHCpan. The saved files are used to produce figures in other notebooks in this directory.

In [1]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tqdm
from multiprocessing import Pool

%matplotlib inline

In [166]:
# Allele whose NetMHCpan results are processed; it also names the output files.
allele = 'HLA-B35_03'
# Root directory holding one sub-directory of NetMHCpan results per allele.
RESULTS_DIR = '/dfs/user/yhr/neopeptide/results/'
# glob returns paths in arbitrary order; sorting keeps the order of the
# saved output reproducible from one run to the next.
files = sorted(glob.glob(RESULTS_DIR+allele+'/*'))

In [2]:
# Read and parse all peptide lines from text

def get_peptides(f):
    """Collect the raw peptide result rows from one NetMHCpan output file.

    A result table is located by its header line (characters 1-3 of the
    line are ``Pos``). The line directly after the header is skipped, and
    every following line is collected until a line starting with ``-`` is
    reached. All tables found in the file are processed.

    Parameters
    ----------
    f : str
        Path of the output file to parse.

    Returns
    -------
    list of str
        Unparsed rows (trailing newline included) in file order.

    Raises
    ------
    OSError
        If the file cannot be opened for reading.
    IndexError
        If a table is not closed by a line starting with ``-`` before the
        end of the file (for example, truncated output).
    """
    # Read-only mode: parsing never writes, and "r+" would reject files the
    # current user is not allowed to modify.
    with open(f, "r") as handle:
        lines = handle.readlines()

    peptide_text = []
    n_lines = len(lines)
    for header_idx, line in enumerate(lines):
        if line[1:4] != 'Pos':
            continue
        # Skip the line right under the header (presumably the dashed
        # separator); data rows start on the line after it.
        row_idx = header_idx + 2
        while True:
            if row_idx >= n_lines:
                # Same exception type as running off the end of the list,
                # but with a message that identifies the offending file.
                raise IndexError(
                    "%s: result table starting at line %d is not terminated "
                    "by a line beginning with '-'" % (f, header_idx + 1)
                )
            row = lines[row_idx]
            if row[0] == '-':
                break
            peptide_text.append(row)
            row_idx += 1
    return peptide_text

def get_BA(peptide_out):
    """Extract the identifier and binding-affinity value from one result row.

    The row is split on single spaces and empty fragments are discarded.
    Field 15 is converted to ``float`` and field 10 is returned unchanged.

    Parameters
    ----------
    peptide_out : str
        One raw result row, as returned by ``get_peptides``.

    Returns
    -------
    tuple
        ``(field_10, float(field_15))``.

    Raises
    ------
    IndexError
        If the row has fewer than 16 non-empty fields.
    ValueError
        If field 15 is not a valid float literal.
    """
    # Tokenise once; only single spaces act as separators.
    fields = [token for token in peptide_out.split(' ') if token]
    affinity = fields[15]
    identifier = fields[10]
    return (identifier, float(affinity))

def get_all_BA(peptide_outs):
    """Apply ``get_BA`` to every row of one file's results.

    Parameters
    ----------
    peptide_outs : iterable of str
        Raw result rows.

    Returns
    -------
    list of tuple
        One ``get_BA`` result per input row, in input order.
    """
    return list(map(get_BA, peptide_outs))

In [169]:
# Get all peptide results for a given allele
peptide_outs_all = []
for f in files:
    # Skip non-result files in the directory (names ending in 'sh' or 'ip',
    # presumably .sh job scripts and .zip archives). Both suffixes must be
    # excluded together; joining the two tests with `or` accepts every file.
    if f.endswith(('sh', 'ip')):
        continue
    try:
        peptide_outs_all.append(get_peptides(f))
    except Exception as err:
        # Best effort: an unreadable or malformed file must not abort the
        # whole run, but report it so missing data does not go unnoticed.
        print('Skipping %s: %r' % (f, err))

In [3]:
def Map(F, x, workers):
    """Apply ``F`` to each element of ``x`` in a pool of worker processes.

    Results are consumed through ``Pool.imap`` so that a tqdm progress bar
    can advance as items complete.

    Parameters
    ----------
    F : callable
        Function applied to each element.
    x : sized iterable
        Inputs; ``len(x)`` is used as the progress-bar total.
    workers : int
        Number of worker processes to spawn.

    Returns
    -------
    list
        Results in the same order as ``x``.
    """
    with Pool(workers) as pool:
        lazy_results = pool.imap(F, x)
        progress = tqdm.tqdm(lazy_results, total=len(x))
        # Drain the iterator while the pool is still alive.
        ret = list(progress)
    return ret

In [None]:
# Read in binding affinity for all output files
BAs = Map(get_all_BA, peptide_outs_all, workers=50)

# Concatenate the per-file result lists into one flat list
BAs_flat = []
for sublist in BAs:
    BAs_flat.extend(sublist)

100%|██████████| 68/68 [06:29<00:00,  5.73s/it]  


In [None]:
# Save output to file
with open('BAs_flat_'+allele+'.txt', 'w') as f:
    for it, item in enumerate(BAs_flat):
        # Progress marker every million records
        if not it % 1000000:
            print(it)
        # One record per line, written as its string representation
        f.write(str(item) + "\n")

In [None]:
# Tabulate the flattened get_BA results, one row per tuple
plot_df = pd.DataFrame(data=BAs_flat)

In [None]:
# Persist the dataframe as a pickle named after the allele
pickle_path = 'BAs_flat_'+allele+'.pkl'
plot_df.to_pickle(pickle_path)

### Read in processed data

In [4]:
# Read in file
with open('BAs_flat_'+allele+'.txt', 'r') as f:
    # One saved record per line; drop surrounding whitespace (incl. the newline)
    content = [line.strip() for line in f]

In [18]:
# Split each saved record on commas and trim the enclosing parentheses
data_mat = [
    [field.strip('()') for field in record.split(',')]
    for record in content
]

In [19]:
# Create a dataframe
plot_df = pd.DataFrame(data_mat)
# get_BA emits (id, affinity) tuples, so after parsing the text dump the
# numeric affinity sits in column 1. Column 0 holds the quoted identifier
# string and cannot be cast to float.
plot_df[1] = plot_df[1].astype('float')

In [23]:
# Save dataframe as pickle
pickle_path = 'BAs_flat_'+allele+'.pkl'
plot_df.to_pickle(pickle_path)