# Extract epitope-labelled evaluation data from VDJdb

## Setup

In [29]:
from pathlib import Path
import pandas as pd
from pandas import isna, notna
import tidytcells
from tqdm import tqdm

In [30]:
travs = {
    "TRAV1-1",
    "TRAV1-2",
    "TRAV2",
    "TRAV3",
    "TRAV4",
    "TRAV5",
    "TRAV6",
    "TRAV7",
    "TRAV8-1",
    "TRAV8-2",
    "TRAV8-3",
    "TRAV8-4",
    "TRAV8-6",
    "TRAV9-1",
    "TRAV9-2",
    "TRAV10",
    "TRAV12-1",
    "TRAV12-2",
    "TRAV12-3",
    "TRAV13-1",
    "TRAV13-2",
    "TRAV14/DV4",
    "TRAV16",
    "TRAV17",
    "TRAV18",
    "TRAV19",
    "TRAV20",
    "TRAV21",
    "TRAV22",
    "TRAV23/DV6",
    "TRAV24",
    "TRAV25",
    "TRAV26-1",
    "TRAV26-2",
    "TRAV27",
    "TRAV29/DV5",
    "TRAV30",
    "TRAV34",
    "TRAV35",
    "TRAV36/DV7",
    "TRAV38-1",
    "TRAV38-2/DV8",
    "TRAV39",
    "TRAV40",
    "TRAV41"
}

trajs = {
    "TRAJ3",
    "TRAJ4",
    "TRAJ5",
    "TRAJ6",
    "TRAJ7",
    "TRAJ8",
    "TRAJ9",
    "TRAJ10",
    "TRAJ11",
    "TRAJ12",
    "TRAJ13",
    "TRAJ14",
    "TRAJ15",
    "TRAJ16",
    "TRAJ17",
    "TRAJ18",
    "TRAJ20",
    "TRAJ21",
    "TRAJ22",
    "TRAJ23",
    "TRAJ24",
    "TRAJ26",
    "TRAJ27",
    "TRAJ28",
    "TRAJ29",
    "TRAJ30",
    "TRAJ31",
    "TRAJ32",
    "TRAJ33",
    "TRAJ34",
    "TRAJ35",
    "TRAJ36",
    "TRAJ37",
    "TRAJ38",
    "TRAJ39",
    "TRAJ40",
    "TRAJ41",
    "TRAJ42",
    "TRAJ43",
    "TRAJ44",
    "TRAJ45",
    "TRAJ46",
    "TRAJ47",
    "TRAJ48",
    "TRAJ49",
    "TRAJ50",
    "TRAJ52",
    "TRAJ53",
    "TRAJ54",
    "TRAJ56",
    "TRAJ57"
}

trbvs = {
    "TRBV2",
    "TRBV3-1",
    "TRBV4-1",
    "TRBV4-2",
    "TRBV4-3",
    "TRBV5-1",
    "TRBV5-4",
    "TRBV5-5",
    "TRBV5-6",
    "TRBV5-8",
    "TRBV6-1",
    "TRBV6-2",
    "TRBV6-3",
    "TRBV6-4",
    "TRBV6-5",
    "TRBV6-6",
    "TRBV6-8",
    "TRBV6-9",
    "TRBV7-2",
    "TRBV7-3",
    "TRBV7-4",
    "TRBV7-6",
    "TRBV7-7",
    "TRBV7-8",
    "TRBV7-9",
    "TRBV9",
    "TRBV10-1",
    "TRBV10-2",
    "TRBV10-3",
    "TRBV11-1",
    "TRBV11-2",
    "TRBV11-3",
    "TRBV12-3",
    "TRBV12-4",
    "TRBV12-5",
    "TRBV13",
    "TRBV14",
    "TRBV15",
    "TRBV16",
    "TRBV18",
    "TRBV19",
    "TRBV20-1",
    "TRBV24-1",
    "TRBV25-1",
    "TRBV27",
    "TRBV28",
    "TRBV29-1",
    "TRBV30"
}


trbjs = {
    "TRBJ1-1",
    "TRBJ1-2",
    "TRBJ1-3",
    "TRBJ1-4",
    "TRBJ1-5",
    "TRBJ1-6",
    "TRBJ2-1",
    "TRBJ2-2",
    "TRBJ2-3",
    "TRBJ2-4",
    "TRBJ2-5",
    "TRBJ2-6",
    "TRBJ2-7"
}

In [31]:
raw_dir = Path('/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data/raw/vdjdb')
preprocessed_dir = Path('/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data/preprocessed/vdjdb')

## Reformat and clean data

In [32]:
df = pd.read_csv(raw_dir/'vdjdb_20220607.tsv',sep='\t')

In [33]:
filtered = df[df['Species'] == 'HomoSapiens']

In [34]:
filtered.head()

Unnamed: 0,complex.id,Gene,CDR3,V,J,Species,MHC A,MHC B,MHC class,Epitope,Epitope gene,Epitope species,Reference,Method,Meta,CDR3fix,Score
0,1,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CIVRAPGRADMRF"", ""cdr3_old"": ""CIVRAPG...",2
1,1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSYLPGQGDHYSNQPQHF"", ""cdr3_old"": ""...",2
2,0,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEAGQGFFSNQPQHF"", ""cdr3_old"": ""C...",2
3,2,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CAVPSGAGSYQLTF"", ""cdr3_old"": ""CAVPSG...",2
4,2,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEPGQGFYSNQPQHF"", ""cdr3_old"": ""C...",2


In [35]:
filtered['Epitope species'].unique()

array(['HIV-1', 'TriticumAestivum', 'CMV', 'SARS-CoV-2', 'HomoSapiens',
       'EBV', 'M.tuberculosis', 'HTLV-1', 'InfluenzaA',
       'SaccharomycesCerevisiae', 'HCV', 'E.Coli', 'HHV', 'synthetic',
       'DENV1', 'DENV3/4', 'SelaginellaMoellendorffii',
       'PseudomonasFluorescens', 'PseudomonasAeruginosa', 'HIV1',
       'Homo sapiens', 'YFV', 'HSV-2', 'DENV2', 'MCPyV', 'HPV',
       'StreptomycesKanamyceticus', 'HIV', 'HCoV-HKU1'], dtype=object)

In [36]:
filtered = filtered[filtered['Epitope species'] != 'SARS-CoV-2']

In [37]:
len(filtered)

73761

In [38]:
def reformat(df: pd.DataFrame) -> pd.DataFrame:
    reformatted_rows = []

    # Process bulk data
    bulk_data = df[df['complex.id'] == 0]
    for _, row in tqdm(bulk_data.iterrows(), total=len(bulk_data)):
        if row['Gene'] == 'TRA':
            chain_cols = ['TRAV', 'CDR3A', 'TRAJ']
        else:
            chain_cols = ['TRBV', 'CDR3B', 'TRBJ']
        reformatted_rows.append(
            {
                chain_cols[0]: row['V'],
                chain_cols[1]: row['CDR3'],
                chain_cols[2]: row['J'],
                'Epitope': row['Epitope'],
                'MHCA': row['MHC A'],
                'MHCB': row['MHC B'],
                'duplicate_count': pd.NA
            }
        )

    # Process single cell data
    sc_complex_ids = df[df['complex.id'] != 0]['complex.id'].unique()
    for complex_id in tqdm(sc_complex_ids):
        tcr_info = df[df['complex.id'] == complex_id]
        tcr_info = tcr_info.drop_duplicates(subset=['V','J','CDR3'])

        tra_info = tcr_info[tcr_info['Gene'] == 'TRA'].iloc[0]
        trb_info = tcr_info[tcr_info['Gene'] == 'TRB'].iloc[0]

        reformatted_rows.append(
            {
                'TRAV': tra_info['V'],
                'CDR3A': tra_info['CDR3'],
                'TRAJ': tra_info['J'],
                'TRBV': trb_info['V'],
                'CDR3B': trb_info['CDR3'],
                'TRBJ': trb_info['J'],
                'Epitope': tra_info['Epitope'],
                'MHCA': tra_info['MHC A'],
                'MHCB': tra_info['MHC B'],
                'duplicate_count': pd.NA
            }
        )

    reformatted_df = pd.DataFrame.from_records(reformatted_rows)
    reformatted_df = reformatted_df.drop_duplicates()
    return reformatted_df[
        ['TRAV','CDR3A','TRAJ','TRBV','CDR3B','TRBJ','Epitope','MHCA','MHCB','duplicate_count']
    ]

In [39]:
filtered = reformat(filtered)

100%|██████████| 25163/25163 [00:01<00:00, 19496.24it/s]
100%|██████████| 24299/24299 [00:47<00:00, 512.27it/s]


In [40]:
filtered.head()

Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA,MHCB,duplicate_count
0,,,,TRBV13*01,CASSFEAGQGFFSNQPQHF,TRBJ1-5*01,FLKEKGGL,HLA-B*08,B2M,
1,,,,TRBV13*01,CASSFEAGQGFFSNQPQHF,TRBJ1-5*01,FLKEQGGL,HLA-B*08,B2M,
2,,,,TRBV13*01,CASSFEAGQGFFSNQPQHF,TRBJ1-5*01,FLKETGGL,HLA-B*08,B2M,
3,,,,TRBV13*01,CASSFEAGQGFFSNQPQHF,TRBJ1-5*01,FLKEMGGL,HLA-B*08,B2M,
4,,,,TRBV7-2*01,CASSFGVEDEQYF,TRBJ2-7*01,LQPFPQPELPYPQPQ,HLA-DQA1*05:01,HLA-DQB1*02:01,


In [41]:
filtered = filtered.drop_duplicates(subset=['TRAV','CDR3A','TRAJ','TRBV','CDR3B','TRBJ'], keep=False)

In [42]:
len(filtered)

41478

In [43]:
filtered['TRAV'] = filtered['TRAV'].map(lambda x: None if isna(x) else tidytcells.tcr.standardise(x, enforce_functional=True))
filtered['TRAJ'] = filtered['TRAJ'].map(lambda x: None if isna(x) else tidytcells.tcr.standardise(x, enforce_functional=True))
filtered['TRBV'] = filtered['TRBV'].map(lambda x: None if isna(x) else tidytcells.tcr.standardise(x, enforce_functional=True))
filtered['TRBJ'] = filtered['TRBJ'].map(lambda x: None if isna(x) else tidytcells.tcr.standardise(x, enforce_functional=True))

In [44]:
len(filtered)

41478

## Create general (messy) evaluation dataset

In [45]:
subsampled = filtered.groupby('Epitope').filter(lambda x: len(x) >= 100)

In [46]:
subsampled = subsampled.groupby('Epitope').sample(100, random_state=420)

In [47]:
len(subsampled)

3400

In [48]:
subsampled.to_csv(preprocessed_dir/'evaluation.csv', index=False)

## Create beta-guaranteed evaluation dataset

In [49]:
filtered_beta = filtered[filtered.notna()['CDR3B']]
filtered_beta = filtered_beta.drop_duplicates(subset=['TRBV','CDR3B','TRBJ'], keep=False)

In [50]:
len(filtered_beta)

31463

In [51]:
filtered_beta.head()

Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA,MHCB,duplicate_count
4,,,,TRBV7-2*01,CASSFGVEDEQYF,TRBJ2-7*01,LQPFPQPELPYPQPQ,HLA-DQA1*05:01,HLA-DQB1*02:01,
5,,,,TRBV3-1*01,CASSSLNTQYF,TRBJ2-3*01,LQPFPQPELPYPQPQ,HLA-DQA1*05:01,HLA-DQB1*02:01,
6,,,,TRBV7-3*01,CASSIRSTDTQYF,TRBJ2-3*01,LQPFPQPELPYPQPQ,HLA-DQA1*05:01,HLA-DQB1*02:01,
7,,,,TRBV4-1*01,CASSQVTLPTETQYF,TRBJ2-5*01,LQPFPQPELPYPQPQ,HLA-DQA1*05:01,HLA-DQB1*02:01,
8,,,,TRBV7-2*01,CASNFGVEDEQYF,TRBJ2-7*01,LQPFPQPELPYPQPQ,HLA-DQA1*05:01,HLA-DQB1*02:01,


In [52]:
subsampled_beta = filtered_beta.groupby('Epitope').filter(lambda x: len(x) >= 100)

In [53]:
subsampled_beta = subsampled_beta.groupby('Epitope').sample(100, random_state=420)

In [54]:
len(subsampled_beta)

2900

In [55]:
subsampled_beta.to_csv(preprocessed_dir/'evaluation_beta.csv', index=False)