In [1]:
import re
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score
import json

In [2]:
# Columns the rest of the notebook reads: the `hit` flag, the allele/peptide
# pair, and one score column per predictor.
KEPT_COLUMNS = ["hit", "allele", "peptide", "mhcflurry_1local", "netmhc", "netmhcpan"]

# `usecols` keeps the parser from materialising columns that would be thrown
# away immediately; the trailing [KEPT_COLUMNS] restores the explicit column
# order, because `usecols` alone returns columns in file order.
df = pd.read_csv(
    "abelin_peptides.all_predictions.csv.bz2", sep=",", usecols=KEPT_COLUMNS
)[KEPT_COLUMNS]

# Punctuation removed from allele names (e.g. "HLA-A*02:03" -> "HLAA0203").
# Inside a character class `|` is a literal pipe, not alternation, so it is
# not listed here; `-` is placed last so that it is taken literally.
_ALLELE_PUNCTUATION = re.compile(r"[*:-]")


def unify_alleles(x):
    """Return allele name `x` with every '*', ':' and '-' removed."""
    return _ALLELE_PUNCTUATION.sub("", x)


df["allele"] = df["allele"].map(unify_alleles)

print(len(df))
df.head()

2566814


Unnamed: 0,hit,allele,peptide,mhcflurry_1local,netmhc,netmhcpan
0,0,HLAA0203,VGKPEMQQKI,24390.961883,28521.26,29805.9
1,0,HLAB4403,LERYPKVALRVLF,15158.077039,7484.45,2175.5
2,0,HLAA0301,AHKRSERLQRAPL,34574.000722,31664.31,31651.3
3,0,HLAB5401,QLKFAPFKCVVPT,35389.874783,394.8,2328.2
4,0,HLAA0101,DFLNNLATGLVFIIV,38669.110483,21391.3,24540.4


In [3]:
# Score columns to evaluate, one per predictor.
column_list = ["mhcflurry_1local", "netmhc", "netmhcpan"]

# Distinct (sorted) allele names present in the table.
allele_list = np.unique(df["allele"].to_numpy())

# Peptide length of every row, positionally aligned with `df`, plus the
# sorted set of distinct lengths that occur.
length_vec = df["peptide"].map(len).to_numpy()
length_list = np.unique(length_vec)

In [4]:
def process(allele_df, column):
    """Fraction of hits among the top-ranked rows of one predictor column.

    With n = number of rows whose ``hit`` equals 1, the rows are ranked by
    ascending ``column`` value and the share of hits among the first n rows
    is returned. (Presumably the columns hold predicted affinities where a
    smaller value means a stronger binder -- confirm against the data source.)

    Parameters
    ----------
    allele_df : pandas.DataFrame
        Rows to evaluate; must contain a ``hit`` column and ``column``.
        Its index is irrelevant: ranking is purely positional.
    column : str
        Name of the score column to rank by.

    Returns
    -------
    float
        Sum of ``hit`` over the n best-ranked rows divided by n, or NaN when
        the frame contains no hits (n == 0).
    """
    hits = allele_df["hit"].to_numpy()
    n_of_binders = int(np.count_nonzero(hits == 1))
    if n_of_binders == 0:
        # No positives: the ratio is undefined; avoid a 0/0 RuntimeWarning.
        return np.nan

    # Rank on the raw values with np.argsort rather than Series.argsort:
    # the latter reports missing values as -1, which is not a usable
    # position, whereas np.argsort simply sorts NaN scores to the end.
    ranking = np.argsort(allele_df[column].to_numpy())
    return hits[ranking[:n_of_binders]].sum() / n_of_binders

# score_d maps str((min_len, max_len)) -> {allele: [one score per entry of
# column_list, in that order]}.
score_d = {}
# Other length windows can be evaluated by iterating over, e.g.:
# [(8, 11), (8, 10), (8, 12)] + [(x, x) for x in range(8, 16)]
for (min_len, max_len) in [(8, 11)]:
    print((min_len, max_len))
    range_key = str((min_len, max_len))
    score_d[range_key] = {}

    # The length filter is the same for every allele: build it once.
    in_length_range = (length_vec >= min_len) & (length_vec <= max_len)

    for allele in allele_list:
        # The row subset does not depend on the predictor column, so select
        # it once per allele instead of once per column.
        allele_df = df.loc[(df["allele"] == allele) & in_length_range, ]

        if allele != "HLAA0204":
            scores = [process(allele_df, col) for col in column_list]
        else:
            # Only netmhcpan is scored for HLAA0204; the first two slots are
            # filled with a -0.0 placeholder (presumably the other predictors
            # have no predictions for this allele -- confirm).
            scores = [-.0, -.0, process(allele_df, "netmhcpan")]

        print("{:<8} {:>7.4} {:>7.4} {:>7.4}".format(allele, *scores))
        score_d[range_key][allele] = scores
    print()

(8, 11)
HLAA0101  0.8511   0.817   0.848
HLAA0201  0.7045  0.6903  0.7098
HLAA0203  0.7531   0.734  0.7239
HLAA0204    -0.0    -0.0  0.7022
HLAA0207  0.3887  0.4709  0.5778
HLAA0301  0.6502  0.6283   0.655
HLAA2402  0.7966   0.754  0.7737
HLAA2902   0.767  0.7478  0.7682
HLAA3101   0.554  0.5434  0.5583
HLAA6802  0.6102  0.5999  0.6109
HLAB3501  0.7105  0.6934  0.7289
HLAB4402  0.7914  0.7579  0.8029
HLAB4403  0.8137  0.7612  0.7937
HLAB5101  0.7254  0.6846  0.6823
HLAB5401  0.6974  0.6741  0.6778
HLAB5701  0.7334   0.685  0.7097

