In [1]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm

In [2]:
# Directory that holds the VDJdb export read below and receives the
# evaluation CSV written at the end of the notebook.
# NOTE(review): absolute, user-specific path -- must be edited before this
# notebook can run on another machine.
vdjdb_dir = Path('/home/yutanagano/UCLOneDrive/MBPhD/data/tcr_embedder/data_sources/vdjdb')

In [3]:
# Load the tab-separated VDJdb export into a DataFrame.
vdjdb_tsv_path = vdjdb_dir / 'vdjdb_20220607.tsv'
df = pd.read_csv(vdjdb_tsv_path, sep='\t')

In [4]:
# Preview the first rows of the raw table to check the column layout.
df.head()

Unnamed: 0,complex.id,Gene,CDR3,V,J,Species,MHC A,MHC B,MHC class,Epitope,Epitope gene,Epitope species,Reference,Method,Meta,CDR3fix,Score
0,1,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CIVRAPGRADMRF"", ""cdr3_old"": ""CIVRAPG...",2
1,1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSYLPGQGDHYSNQPQHF"", ""cdr3_old"": ""...",2
2,0,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEAGQGFFSNQPQHF"", ""cdr3_old"": ""C...",2
3,2,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CAVPSGAGSYQLTF"", ""cdr3_old"": ""CAVPSG...",2
4,2,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEPGQGFYSNQPQHF"", ""cdr3_old"": ""C...",2


In [5]:
# List the distinct epitope source labels present in the raw table.
# NOTE(review): the recorded output contains near-duplicate spellings
# (e.g. 'HIV-1'/'HIV1'/'HIV', 'Synthetic'/'synthetic',
# 'HomoSapiens'/'Homo sapiens'); exact-match filters treat them as distinct.
df['Epitope species'].unique()

array(['HIV-1', 'TriticumAestivum', 'CMV', 'SARS-CoV-2', 'HomoSapiens',
       'EBV', 'M.tuberculosis', 'HTLV-1', 'GallusGallus', 'MusMusculus',
       'InfluenzaA', 'Synthetic', 'VSV', 'SaccharomycesCerevisiae',
       'ManducaSexta', 'HCV', 'synthetic', 'E.Coli', 'HHV', 'LCMV',
       'DENV1', 'DENV3/4', 'PlasmodiumBerghei',
       'SelaginellaMoellendorffii', 'PseudomonasFluorescens',
       'PseudomonasAeruginosa', 'HIV1', 'Homo sapiens', 'YFV', 'RSV',
       'HSV-2', 'DENV2', 'MCMV', 'MCPyV', 'HPV',
       'StreptomycesKanamyceticus', 'HIV', 'HCoV-HKU1', 'SIV'],
      dtype=object)

In [6]:
# Keep every record except those whose epitope species is exactly 'SARS-CoV-2'.
is_sars_cov_2 = df['Epitope species'] == 'SARS-CoV-2'
filtered = df[~is_sars_cov_2]

In [7]:
# Row count of the table after the species filter.
len(filtered)

81024

In [8]:
def reformat(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape a one-chain-per-row table into one TCR per row.

    Rows with ``complex.id == 0`` are handled individually: each becomes one
    output row in which only that row's chain columns are filled (TRA columns
    when ``Gene == 'TRA'``, TRB columns for any other ``Gene`` value).

    Rows sharing a non-zero ``complex.id`` are merged into one output row built
    from the first TRA record and the first TRB record of that complex (after
    dropping repeated V/J/CDR3 combinations). Epitope and MHC values come from
    the TRA record, or from the TRB record when the complex has no TRA record.
    If one chain is absent its columns are left missing; a complex with
    neither a TRA nor a TRB record is skipped.

    Args:
        df: Table with the columns ``complex.id``, ``Gene``, ``V``, ``J``,
            ``CDR3``, ``Epitope``, ``MHC A`` and ``MHC B``.

    Returns:
        A de-duplicated DataFrame with the columns TRAV, CDR3A, TRAJ, TRBV,
        CDR3B, TRBJ, Epitope, MHCA, MHCB and duplicate_count, in that order.
        ``duplicate_count`` is always ``pd.NA``. The full column set is
        returned even when ``df`` is empty or contains only one chain type.
    """
    output_cols = [
        'TRAV', 'CDR3A', 'TRAJ', 'TRBV', 'CDR3B', 'TRBJ',
        'Epitope', 'MHCA', 'MHCB', 'duplicate_count'
    ]
    reformatted_rows = []

    # Process bulk data
    bulk_data = df[df['complex.id'] == 0]
    for _, row in tqdm(bulk_data.iterrows(), total=len(bulk_data)):
        if row['Gene'] == 'TRA':
            chain_cols = ['TRAV', 'CDR3A', 'TRAJ']
        else:
            chain_cols = ['TRBV', 'CDR3B', 'TRBJ']
        reformatted_rows.append(
            {
                chain_cols[0]: row['V'],
                chain_cols[1]: row['CDR3'],
                chain_cols[2]: row['J'],
                'Epitope': row['Epitope'],
                'MHCA': row['MHC A'],
                'MHCB': row['MHC B'],
                'duplicate_count': pd.NA
            }
        )

    # Process single cell data
    # One groupby pass replaces re-filtering the whole frame per complex id;
    # sort=False keeps the complexes in order of first appearance.
    sc_data = df[df['complex.id'] != 0]
    sc_groups = sc_data.groupby('complex.id', sort=False)
    for _, tcr_info in tqdm(sc_groups, total=sc_groups.ngroups):
        tcr_info = tcr_info.drop_duplicates(subset=['V','J','CDR3'])

        tra_rows = tcr_info[tcr_info['Gene'] == 'TRA']
        trb_rows = tcr_info[tcr_info['Gene'] == 'TRB']
        if tra_rows.empty and trb_rows.empty:
            # Nothing usable for this complex.
            continue

        record = {}
        if not tra_rows.empty:
            tra_info = tra_rows.iloc[0]
            record['TRAV'] = tra_info['V']
            record['CDR3A'] = tra_info['CDR3']
            record['TRAJ'] = tra_info['J']
        if not trb_rows.empty:
            trb_info = trb_rows.iloc[0]
            record['TRBV'] = trb_info['V']
            record['CDR3B'] = trb_info['CDR3']
            record['TRBJ'] = trb_info['J']

        # Prefer the TRA record for shared annotations; fall back to TRB
        # when the complex has no TRA record.
        shared_info = tra_rows.iloc[0] if not tra_rows.empty else trb_rows.iloc[0]
        record['Epitope'] = shared_info['Epitope']
        record['MHCA'] = shared_info['MHC A']
        record['MHCB'] = shared_info['MHC B']
        record['duplicate_count'] = pd.NA

        reformatted_rows.append(record)

    # Passing `columns` guarantees the full schema (and column order) even
    # when no record supplied a given key, e.g. for empty input.
    reformatted_df = pd.DataFrame(reformatted_rows, columns=output_cols)
    return reformatted_df.drop_duplicates()

In [9]:
# Convert the filtered table to one-TCR-per-row format.
# NOTE(review): this rebinds `filtered`, so the cell is not idempotent --
# running it a second time passes the already-reformatted frame (which has
# no 'complex.id' column) back into reformat().
filtered = reformat(filtered)

100%|██████████| 27934/27934 [00:01<00:00, 22976.66it/s]
100%|██████████| 26545/26545 [00:31<00:00, 841.61it/s]


In [10]:
# Keep only the epitopes that are represented by at least 100 rows.
epitope_groups = filtered.groupby('Epitope')
filtered = epitope_groups.filter(lambda group: len(group) >= 100)

In [11]:
# Draw 100 rows per epitope without replacement (every group kept by the
# previous cell has at least 100 rows). The seed is pinned so that re-running
# the notebook regenerates the same evaluation subset.
subsampled = filtered.groupby('Epitope').sample(n=100, random_state=42)

In [12]:
# Write the subsample to 'evaluation.csv' inside vdjdb_dir; index=False
# leaves the DataFrame index out of the file.
subsampled.to_csv(vdjdb_dir/'evaluation.csv', index=False)