# Reformat Dash et al. data

## Setup

In [1]:
from pathlib import Path
import pandas as pd
from pandas import isna, notna
import tidytcells
from tqdm import tqdm

In [2]:
# Machine-specific absolute locations: `raw_dir` is where the raw Dash CSVs are
# read from, `preprocessed_dir` is where the reformatted table is written.
# NOTE(review): these paths only resolve on the original author's machine.
raw_dir = Path(
    "/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data/tcrdist/"
)
preprocessed_dir = Path("/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data/preprocessed/dash")

### Reformat and clean data

In [30]:
# Read the three raw Dash CSV files that live in `raw_dir`, in the order
# human, mouse (first file), mouse (second file).
dash_human, dash_mouse_1, dash_mouse_2 = (
    pd.read_csv(raw_dir / csv_name)
    for csv_name in ("dash_human.csv", "dash.csv", "dash2.csv")
)

In [58]:
# Stack the three tables row-wise (each table keeps its own row labels, so
# labels repeat across the result) and rename the chain, epitope and count
# columns in the same expression.
combined = pd.concat(
    [dash_human, dash_mouse_1, dash_mouse_2]
).rename(
    columns={
        # alpha chain
        "v_a_gene": "TRAV",
        "cdr3_a_aa": "CDR3A",
        "j_a_gene": "TRAJ",
        # beta chain
        "v_b_gene": "TRBV",
        "cdr3_b_aa": "CDR3B",
        "j_b_gene": "TRBJ",
        # target and clone count
        "epitope": "Epitope",
        "count": "duplicate_count",
    }
)

In [70]:
# Display the combined table as the cell output (pandas elides the middle rows
# of a long frame when rendering it).
combined

Unnamed: 0.1,subject,Epitope,duplicate_count,TRAV,TRAJ,CDR3A,TRBV,TRBJ,CDR3B,clone_id,cdr3_a_nucseq,cdr3_b_nucseq,Unnamed: 0,cdr1_a_aa,cdr2_a_aa,pmhc_a_aa,cdr1_b_aa,cdr2_b_aa,pmhc_b_aa
0,human_subject0005,BMLF,1,TRAV5*01,TRAJ31*01,CAADSNARFMF,TRBV20-1*01,TRBJ1-3*01,CSARDRVGNTIYF,clone_0,,,,,,,,,
1,human_subject0005,BMLF,1,TRAV5*01,TRAJ31*01,CAADSNARLMF,TRBV20-1*01,TRBJ1-3*01,CSARDRVGNTIYF,clone_1,,,,,,,,,
2,human_subject0028,M1,1,TRAV13-1*01,TRAJ43*01,CAAENNNDMRF,TRBV7-6*01,TRBJ2-7*01,CASSLGTSYEQYF,clone_2,,,,,,,,,
3,human_subject0029,M1,1,TRAV41*01,TRAJ40*01,CAAETTSGTYKYIF,TRBV14*01,TRBJ2-7*01,CASSPIAGSSYEQYF,clone_3,,,,,,,,,
4,human_subject0011,M1,1,TRAV27*01,TRAJ42*01,CAAGGSQGNLIF,TRBV19*02,TRBJ2-7*01,CASSIRSSYEQYF,clone_4,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1919,mouse_subject0065,M45,1,TRAV12-3*04,TRAJ37*01,CALRVITGNTGKLIF,TRBV19*03,TRBJ1-5*01,CASRYRGDNQAPLF,mouse_tcr1772.clone,tgtgctttgagggtcataacaggcaataccggaaaactcatcttt,tgtgccagcagatacaggggagacaaccaggctccgcttttt,1919.0,TIYS.....HPF,SSTD...NKR,HKSS.SS,FNH.......DS,SIT....END,E.KKSS
1920,mouse_subject0065,M45,2,TRAV5-4*01,TRAJ37*01,CAASTGNTGKLIF,TRBV17*01,TRBJ2-3*01,CASSPLGSSAETLYF,mouse_tcr1765.clone,tgtgctgcaagcacaggcaataccggaaaactcatcttt,tgtgctagcagccccctggggtctagtgcagaaacgctgtatttt,1920.0,NSA......SNY,IRSN...MER,DKKA.KR,MNH.......DT,YYD....KIL,P.NNSF
1921,mouse_subject0065,M45,1,TRAV8-1*01,TRAJ26*01,CATGDNYAQGLTF,TRBV13-3*01,TRBJ1-5*01,CASSDWTSQAPLF,mouse_tcr1775.clone,tgtgctactggggataactatgcccagggattaaccttc,tgtgccagcagtgattggacatcccaggctccgcttttt,1921.0,TSI.......TA,IRSN...ERE,DTSS.QS,NNH.......DY,SYV....ADS,P.SQEN
1922,mouse_subject0045,PA,1,TRAV6D-6*01,TRAJ22*01,CALGSGGSWQLIF,TRBV29*01,TRBJ1-1*01,CASSGPEVFF,mouse_tcr1919.clone,tgtgctctggggtccgggggcagctggcaactcatcttt,tgtgctagcagtggcccagaagtcttcttt,1922.0,ATSI....AYPN,VITA...GQK,NKET.TS,MSH.......ET,SYD....VDS,K.KREH


In [80]:
def _pmhc_label(row):
    """Build the `<Epitope>_<pmhc_a_aa>_<pmhc_b_aa>` label for one row.

    Each component is interpolated with an f-string, so a missing value ends
    up in the label as its text form (e.g. a float NaN becomes "nan").
    """
    return f"{row['Epitope']}_{row['pmhc_a_aa']}_{row['pmhc_b_aa']}"


# Row-wise label combining the epitope with both pMHC-contact columns.
combined["pMHC"] = combined.apply(_pmhc_label, axis=1)

In [81]:
# Group the combined table by full alpha chain and, separately, by full beta
# chain (V gene, CDR3, J gene). `dropna=False` keeps chains with a missing
# component as their own groups; `as_index=False` keeps the keys as columns.
alpha_gb = combined.groupby(by=["TRAV", "CDR3A", "TRAJ"], as_index=False, dropna=False)
beta_gb = combined.groupby(by=["TRBV", "CDR3B", "TRBJ"], as_index=False, dropna=False)

In [89]:
# Per alpha chain: how many distinct pMHC labels / epitopes it appears with.
dash_alpha_pmhc_count = alpha_gb["pMHC"].nunique()
dash_alpha_epitope_count = alpha_gb["Epitope"].nunique()

# Keep only the alpha chains seen with more than one distinct target.
dash_promiscuous_alpha_by_pmhc = dash_alpha_pmhc_count.loc[
    dash_alpha_pmhc_count["pMHC"].gt(1)
]
dash_promiscuous_alpha_by_epitope = dash_alpha_epitope_count.loc[
    dash_alpha_epitope_count["Epitope"].gt(1)
]

In [90]:
# Per beta chain: how many distinct pMHC labels / epitopes it appears with.
dash_beta_pmhc_count = beta_gb["pMHC"].nunique()
dash_beta_epitope_count = beta_gb["Epitope"].nunique()

# Keep only the beta chains seen with more than one distinct target.
dash_promiscuous_beta_by_pmhc = dash_beta_pmhc_count.loc[
    dash_beta_pmhc_count["pMHC"].gt(1)
]
dash_promiscuous_beta_by_epitope = dash_beta_epitope_count.loc[
    dash_beta_epitope_count["Epitope"].gt(1)
]

In [91]:
# Write each promiscuous-chain table to a CSV in the current working
# directory, without the row index. Order: alpha tables first, then beta.
for _promiscuous_table, _csv_name in (
    (dash_promiscuous_alpha_by_pmhc, "dash_promiscuous_alpha_by_pmhc.csv"),
    (dash_promiscuous_alpha_by_epitope, "dash_promiscuous_alpha_by_epitope.csv"),
    (dash_promiscuous_beta_by_pmhc, "dash_promiscuous_beta_by_pmhc.csv"),
    (dash_promiscuous_beta_by_epitope, "dash_promiscuous_beta_by_epitope.csv"),
):
    _promiscuous_table.to_csv(_csv_name, index=False)

In [98]:
# Build the table that gets saved for downstream use, from the human data only:
# rename the chain, epitope and count columns, then keep just those columns.
for_ml = dash_human.rename(
    columns={
        "v_a_gene": "TRAV",
        "cdr3_a_aa": "CDR3A",
        "j_a_gene": "TRAJ",
        "v_b_gene": "TRBV",
        "cdr3_b_aa": "CDR3B",
        "j_b_gene": "TRBJ",
        "epitope": "Epitope",
        "count": "duplicate_count",
    }
)
# The explicit .copy() makes `for_ml` an independent frame. Without it the
# column subset is tracked by pandas as a potential copy of the renamed frame,
# and the later in-place column assignments raise SettingWithCopyWarning.
for_ml = for_ml[
    ["TRAV", "CDR3A", "TRAJ", "TRBV", "CDR3B", "TRBJ", "Epitope", "duplicate_count"]
].copy()

In [99]:
# Standardize every V/J gene column with tidytcells. Entries that are not
# strings (e.g. missing values) are replaced with None instead of being
# passed to the standardizer.
for _gene_column in ("TRAV", "TRAJ", "TRBV", "TRBJ"):
    for_ml[_gene_column] = for_ml[_gene_column].map(
        lambda symbol: (
            tidytcells.tcr.standardize(symbol, enforce_functional=True)
            if isinstance(symbol, str)
            else None
        )
    )

### Save reformatted data

In [100]:
# Make sure the output directory exists before writing; to_csv does not create
# missing parent directories and would otherwise fail with an OSError.
preprocessed_dir.mkdir(parents=True, exist_ok=True)

# Save the reformatted table without the row index.
for_ml.to_csv(preprocessed_dir / "preprocessed.csv", index=False)