In [1]:
%matplotlib inline

import pandas as pd
import re
import numpy as np
from collections import Counter 
from matplotlib import pyplot as plt

unify_alleles = lambda x: re.sub('[*|:|-]', '', x)

In [2]:
HLA_OF_INTEREST = "HLAA0201"

# Default AAE data
### just split to two datasets

In [77]:
df1 = pd.read_csv("curated.csv.gz")
df1["mhc"] = df1["mhc"].map(unify_alleles)

df2 = pd.read_csv("jci.csv.gz")
df2["mhc"] = df2["mhc"].map(unify_alleles)

df3 = pd.concat([df1, df2])
df3 = df3.loc[df3["mhc"] == HLA_OF_INTEREST, :]

In [78]:
df3["len"] = df3["sequence"].map(len)
df3 = df3.loc[(df3["len"] >= 8) & (df3["len"] <= 11), :]
df3.reset_index(inplace=True, drop=True)

In [79]:
np.random.seed(42)
perm = np.random.permutation(len(df3))
train_size = int(len(perm) * 0.8)
train_size, len(perm) - train_size

(14280, 3571)

In [80]:
train_ind = perm[:train_size]
test_ind = perm[train_size:]
print(Counter(df3["len"][train_ind]))
print(Counter(df3["len"][test_ind]))

Counter({9: 10312, 10: 3243, 11: 501, 8: 224})
Counter({9: 2557, 10: 823, 11: 137, 8: 54})


In [81]:
df3.loc[train_ind, :].to_csv("aae_train.csv.gz", compression="gzip")
df3.loc[test_ind, :].to_csv("aae_test.csv.gz", compression="gzip")

# Quality AAE data
### Find quality data and get the Abelin data for testing

In [64]:
def load_mhc_data(names, qualitative=False, hla_of_interest=None):
    if type(names) is not list:
        names = [names]

    res = []
    for df_name in names:
        df = pd.read_csv(df_name)
        
        df["mhc"] = df["mhc"].map(unify_alleles)
        df = df.loc[df["mhc"] == hla_of_interest, :]
        
        df["len"] = df["sequence"].map(len)
        df = df.loc[(df["len"] >= 8) & (df["len"] <= 11), :]

        if qualitative:
            df["binder"] = df["meas"]
        else:
            df["meas"].values[df["meas"] > 50000] = 50000
            df["binder"] = np.where(df["meas"].values <= 500, 1, 0)
            
        df.reset_index(inplace=True, drop=True)
        res.append(df)
        print(df_name, "--", len(df), "rows")
    
    return res if len(res) > 1 else res[0]

In [65]:
def merge_mhc_data(datasets):

    def _get_confidence(x):
        x_sum = x["binder"].sum()
        if_all_zero = x_sum == 0
        if_all_ones = x_sum == len(x)
        if if_all_zero or if_all_ones:
            return len(x)
        else:
            return 0
    
    if type(datasets) is not list:
        datasets = [datasets]
    print("Pre-merge:")
    for df in datasets:
        print(" --", len(df), "rows")

    pd1 = pd.concat(datasets)
    print("First merge:", len(pd1), "rows")

    tmp = pd1.groupby(["mhc", "sequence"]).apply(_get_confidence)
    tmp2 = tmp.reset_index()
    tmp2.columns = ["mhc", "sequence", "confidence"]

    pd1_new = pd1.merge(tmp2).sort_values(by=["sequence"])
    print("Confidence stats:")
    counter = Counter(pd1_new["confidence"].apply(str) + "_" + pd1_new["binder"].apply(str))
    for conf_val in range(0, max(map(lambda x: int(x.split("_")[0]), counter.keys()))):
        val_str = str(conf_val)
        print(" --", val_str + ":", counter.get(val_str + "_0", 0), counter.get(val_str + "_1", 0))
    
    return pd1_new

In [73]:
def remove_duplicates(train_df, test_df):
    # Drop all intersected peptides from train_df
    df_merged = train_df.merge(test_df.drop_duplicates(keep=False, subset=["mhc", "sequence"]), 
                               on=["mhc", "sequence"], how="left", indicator=True)
    print("before drop:", len(df_merged))
    df_merged = df_merged.loc[df_merged["_merge"] == "left_only", :]
    print("after drop:", len(df_merged))

    df_merged = df_merged.reset_index(drop=True)
    df_merged["meas"] = df_merged["meas_x"]
    return df_merged.drop(["meas_x", "meas_y", "_merge", "binder_x", "len_x", "len_y", "binder_y"], axis=1)

In [90]:
datasets = load_mhc_data(["curated.csv.gz", "mhc_data.csv.gz"], False, HLA_OF_INTEREST)
final_df = merge_mhc_data(datasets)
abelin_df = load_mhc_data("abelin.csv.gz", True, HLA_OF_INTEREST)
abelin_df.loc[abelin_df["binder"] == 1, "meas"] = 200
abelin_df.loc[abelin_df["binder"] == 0, "meas"] = 20000
final_df = remove_duplicates(final_df, abelin_df)

curated.csv.gz -- 15605 rows
mhc_data.csv.gz -- 34821 rows
Pre-merge:
 -- 15605 rows
 -- 34821 rows
First merge: 50426 rows
Confidence stats:
 -- 0: 2527 2427
 -- 1: 1164 13154
 -- 2: 10652 7526
 -- 3: 7461 4128
 -- 4: 48 696
 -- 5: 70 345
 -- 6: 12 126
 -- 7: 0 42
abelin.csv.gz -- 133818 rows
before drop: 50426
after drop: 47914


In [91]:
confidence_threshold = 2
counter = Counter(final_df["confidence"] >= confidence_threshold)
print("After threshold:\n", 
      "--", counter[True], "high-quality pts.\n",
      "--", counter[False], "low-quality pts.")

final_df.loc[final_df["confidence"] >= confidence_threshold, :].to_csv("aae_train_high.csv.gz", compression="gzip")
final_df.loc[final_df["confidence"] < confidence_threshold, :].to_csv("aae_train_low.csv.gz", compression="gzip")
abelin_df.to_csv("aae_test_v2.csv.gz", compression="gzip")

After threshold:
 -- 31154 high-quality pts.
 -- 16760 low-quality pts.


In [89]:
Counter(abelin_df["meas"])

Counter({200: 2413, 20000: 131405})