# Extract epitope-labelled evaluation data from VDJdb

## Setup

In [1]:
import os
from pathlib import Path

import pandas as pd
import tidytcells
from pandas import isna, notna
from tqdm import tqdm

In [2]:
# Root of the project's data directory. The default is the original
# machine-specific location; set TCR_EMBEDDER_DATA_DIR to run the notebook
# elsewhere without editing this cell.
data_root = Path(
    os.environ.get(
        "TCR_EMBEDDER_DATA_DIR",
        "/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data",
    )
)
# Input: raw VDJdb export.  Output: the evaluation CSVs written further down.
raw_dir = data_root / "raw" / "vdjdb"
preprocessed_dir = data_root / "preprocessed" / "vdjdb"

## Reformat and clean data

In [3]:
# Load the tab-separated VDJdb export.
df = pd.read_csv(raw_dir.joinpath("vdjdb_20220607.tsv"), delimiter="\t")

In [4]:
# Keep only records whose TCR species is human.
filtered = df.loc[df["Species"].eq("HomoSapiens")]

In [5]:
filtered.head(n=5)

Unnamed: 0,complex.id,Gene,CDR3,V,J,Species,MHC A,MHC B,MHC class,Epitope,Epitope gene,Epitope species,Reference,Method,Meta,CDR3fix,Score
0,1,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CIVRAPGRADMRF"", ""cdr3_old"": ""CIVRAPG...",2
1,1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSYLPGQGDHYSNQPQHF"", ""cdr3_old"": ""...",2
2,0,TRB,CASSFEAGQGFFSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEAGQGFFSNQPQHF"", ""cdr3_old"": ""C...",2
3,2,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CAVPSGAGSYQLTF"", ""cdr3_old"": ""CAVPSG...",2
4,2,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEPGQGFYSNQPQHF"", ""cdr3_old"": ""C...",2


In [6]:
# Distinct epitope source organisms, in order of first appearance.
pd.unique(filtered["Epitope species"])

array(['HIV-1', 'TriticumAestivum', 'CMV', 'SARS-CoV-2', 'HomoSapiens',
       'EBV', 'M.tuberculosis', 'HTLV-1', 'InfluenzaA',
       'SaccharomycesCerevisiae', 'HCV', 'E.Coli', 'HHV', 'synthetic',
       'DENV1', 'DENV3/4', 'SelaginellaMoellendorffii',
       'PseudomonasFluorescens', 'PseudomonasAeruginosa', 'HIV1',
       'Homo sapiens', 'YFV', 'HSV-2', 'DENV2', 'MCPyV', 'HPV',
       'StreptomycesKanamyceticus', 'HIV', 'HCoV-HKU1'], dtype=object)

In [7]:
# Drop every record whose epitope comes from SARS-CoV-2.
filtered = filtered.loc[filtered["Epitope species"].ne("SARS-CoV-2")]

In [8]:
filtered.shape[0]

73761

In [9]:
def reformat(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape a one-row-per-chain VDJdb table into one row per TCR.

    Rows whose ``complex.id`` is 0 are treated as unpaired ("bulk") chains:
    each one becomes its own output row with only the alpha or only the beta
    columns filled in. Rows that share a non-zero ``complex.id`` are treated
    as the chains of a single TCR and are merged into one output row.

    Parameters
    ----------
    df : pd.DataFrame
        Table with the columns ``complex.id``, ``Gene``, ``V``, ``J``,
        ``CDR3``, ``Epitope``, ``MHC A`` and ``MHC B``.

    Returns
    -------
    pd.DataFrame
        De-duplicated table with the columns ``TRAV``, ``CDR3A``, ``TRAJ``,
        ``TRBV``, ``CDR3B``, ``TRBJ``, ``Epitope``, ``MHCA``, ``MHCB`` and
        ``duplicate_count`` (always ``pd.NA``). All columns are present even
        when the input has no alpha rows, no beta rows, or no rows at all.
        The index is not reset after de-duplication.
    """
    output_columns = [
        "TRAV",
        "CDR3A",
        "TRAJ",
        "TRBV",
        "CDR3B",
        "TRBJ",
        "Epitope",
        "MHCA",
        "MHCB",
        "duplicate_count",
    ]
    # Output (V, CDR3, J) column names for each chain; alpha is listed first
    # so that it is preferred as the source of the epitope/MHC annotation.
    chain_columns = {
        "TRA": ("TRAV", "CDR3A", "TRAJ"),
        "TRB": ("TRBV", "CDR3B", "TRBJ"),
    }
    reformatted_rows = []

    # Process bulk data
    bulk_data = df[df["complex.id"] == 0]
    for _, row in tqdm(bulk_data.iterrows(), total=len(bulk_data)):
        # Any gene other than TRA is written to the beta columns.
        gene = "TRA" if row["Gene"] == "TRA" else "TRB"
        v_col, cdr3_col, j_col = chain_columns[gene]
        reformatted_rows.append(
            {
                v_col: row["V"],
                cdr3_col: row["CDR3"],
                j_col: row["J"],
                "Epitope": row["Epitope"],
                "MHCA": row["MHC A"],
                "MHCB": row["MHC B"],
                "duplicate_count": pd.NA,
            }
        )

    # Process single cell data
    # De-duplicate once and group once, rather than re-scanning the whole
    # table with a boolean mask for every complex id.
    paired_data = df[df["complex.id"] != 0].drop_duplicates(
        subset=["complex.id", "V", "J", "CDR3"]
    )
    # sort=False keeps complexes in order of first appearance.
    complexes = paired_data.groupby("complex.id", sort=False)
    for _, tcr_info in tqdm(complexes, total=complexes.ngroups):
        record = {}
        chains_found = []
        for gene, (v_col, cdr3_col, j_col) in chain_columns.items():
            chain_rows = tcr_info[tcr_info["Gene"] == gene]
            if chain_rows.empty:
                # A complex may lack one chain; leave its columns missing
                # instead of raising IndexError on .iloc[0].
                continue
            chain = chain_rows.iloc[0]
            record[v_col] = chain["V"]
            record[cdr3_col] = chain["CDR3"]
            record[j_col] = chain["J"]
            chains_found.append(chain)

        if not chains_found:
            continue

        # Annotation comes from the alpha row when there is one, otherwise
        # from the beta row.
        annotation = chains_found[0]
        record["Epitope"] = annotation["Epitope"]
        record["MHCA"] = annotation["MHC A"]
        record["MHCB"] = annotation["MHC B"]
        record["duplicate_count"] = pd.NA
        reformatted_rows.append(record)

    reformatted_df = pd.DataFrame.from_records(reformatted_rows)
    reformatted_df = reformatted_df.drop_duplicates()
    # reindex (rather than plain selection) creates any column that no record
    # populated, so a single-chain-only or empty input does not raise KeyError.
    return reformatted_df.reindex(columns=output_columns)

In [10]:
# Collapse the per-chain records into one row per TCR.
filtered = filtered.pipe(reformat)

100%|██████████| 25163/25163 [00:01<00:00, 23982.28it/s]
100%|██████████| 24299/24299 [00:28<00:00, 855.53it/s]


In [11]:
filtered.head(n=5)

Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA,MHCB,duplicate_count
0,,,,TRBV13*01,CASSFEAGQGFFSNQPQHF,TRBJ1-5*01,FLKEKGGL,HLA-B*08,B2M,
1,,,,TRBV13*01,CASSFEAGQGFFSNQPQHF,TRBJ1-5*01,FLKEQGGL,HLA-B*08,B2M,
2,,,,TRBV13*01,CASSFEAGQGFFSNQPQHF,TRBJ1-5*01,FLKETGGL,HLA-B*08,B2M,
3,,,,TRBV13*01,CASSFEAGQGFFSNQPQHF,TRBJ1-5*01,FLKEMGGL,HLA-B*08,B2M,
4,,,,TRBV7-2*01,CASSFGVEDEQYF,TRBJ2-7*01,LQPFPQPELPYPQPQ,HLA-DQA1*05:01,HLA-DQB1*02:01,


In [12]:
# Remove every copy of a TCR that occurs more than once (keep=False keeps no
# copy at all), e.g. a receptor listed against several epitopes.
# NOTE(review): this runs before the V/J names are standardised, so copies that
# differ only in raw gene spelling would not be caught here — confirm the
# ordering is intended.
filtered = filtered[
    ~filtered.duplicated(
        subset=["TRAV", "CDR3A", "TRAJ", "TRBV", "CDR3B", "TRBJ"], keep=False
    )
]

In [13]:
# Standardise all four V/J gene columns with tidytcells; missing values become
# None. NOTE(review): the effect of enforce_functional=True on non-functional
# genes is not visible here — check the tidytcells documentation.
for gene_col in ["TRAV", "TRAJ", "TRBV", "TRBJ"]:
    filtered[gene_col] = filtered[gene_col].map(
        lambda gene: None
        if isna(gene)
        else tidytcells.tcr.standardise(gene, enforce_functional=True)
    )

In [14]:
filtered.shape[0]

41478

## Create general (messy) evaluation dataset

In [15]:
# Keep only epitopes that have at least 100 TCRs.
subsampled = filtered.groupby("Epitope").filter(
    lambda epitope_group: epitope_group.shape[0] >= 100
)

In [16]:
# Draw exactly 100 TCRs per epitope with a fixed seed.
subsampled = subsampled.groupby("Epitope").sample(n=100, random_state=420)

In [17]:
subsampled.shape[0]

3400

In [18]:
# Write the general evaluation set without the index column.
subsampled.to_csv(preprocessed_dir.joinpath("evaluation.csv"), index=False)

## Create beta-guaranteed evaluation dataset

In [19]:
# Require a beta CDR3 and a beta V gene, then remove every copy of any beta
# chain (V, CDR3, J) that occurs more than once.
filtered_beta = filtered.dropna(subset=["CDR3B", "TRBV"]).drop_duplicates(
    subset=["TRBV", "CDR3B", "TRBJ"], keep=False
)

In [20]:
filtered_beta.shape[0]

31456

In [21]:
filtered_beta.head(n=5)

Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA,MHCB,duplicate_count
4,,,,TRBV7-2*01,CASSFGVEDEQYF,TRBJ2-7*01,LQPFPQPELPYPQPQ,HLA-DQA1*05:01,HLA-DQB1*02:01,
5,,,,TRBV3-1*01,CASSSLNTQYF,TRBJ2-3*01,LQPFPQPELPYPQPQ,HLA-DQA1*05:01,HLA-DQB1*02:01,
6,,,,TRBV7-3*01,CASSIRSTDTQYF,TRBJ2-3*01,LQPFPQPELPYPQPQ,HLA-DQA1*05:01,HLA-DQB1*02:01,
7,,,,TRBV4-1*01,CASSQVTLPTETQYF,TRBJ2-5*01,LQPFPQPELPYPQPQ,HLA-DQA1*05:01,HLA-DQB1*02:01,
8,,,,TRBV7-2*01,CASNFGVEDEQYF,TRBJ2-7*01,LQPFPQPELPYPQPQ,HLA-DQA1*05:01,HLA-DQB1*02:01,


In [22]:
# Keep only epitopes that have at least 100 beta-complete TCRs.
subsampled_beta = filtered_beta.groupby("Epitope").filter(
    lambda epitope_group: epitope_group.shape[0] >= 100
)

In [23]:
# Draw exactly 100 TCRs per epitope with a fixed seed.
subsampled_beta = subsampled_beta.groupby("Epitope").sample(
    n=100, random_state=420
)

In [24]:
subsampled_beta.shape[0]

2900

In [25]:
# Write the beta-guaranteed evaluation set without the index column.
subsampled_beta.to_csv(
    preprocessed_dir.joinpath("evaluation_beta.csv"), index=False
)

## Create alpha-beta-guaranteed evaluation dataset

In [26]:
# Keep only TCRs for which both the alpha and the beta CDR3 are known.
filtered_alphabeta = filtered.dropna(subset=["CDR3A", "CDR3B"])

In [27]:
filtered_alphabeta.shape[0]

20812

In [28]:
filtered_alphabeta.head(n=5)

Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA,MHCB,duplicate_count
25166,TRAV38-2/DV8*01,CAYRPPGTYKYIF,TRAJ40*01,TRBV14*01,CASSALASLNEQFF,TRBJ2-1*01,FLKEKGGL,HLA-B*08,B2M,
25176,TRAV38-1*01,CAYTVLGNEKLTF,TRAJ48*01,TRBV28*01,CASSFTPYNEQFF,TRBJ2-1*01,ELAGIGILTV,HLA-A*02,B2M,
25177,TRAV12-2*01,CAVAGYGGSQGNLIF,TRAJ42*01,TRBV28*01,CASSPQGLGTEAFF,TRBJ1-1*01,ELAGIGILTV,HLA-A*02,B2M,
25178,TRAV12-2*01,CAVSFGNEKLTF,TRAJ48*01,TRBV28*01,CAEGQGFVGQPQHF,TRBJ1-5*01,ELAGIGILTV,HLA-A*02,B2M,
25179,TRAV12-2*01,CAVTHYGGSQGNLIF,TRAJ42*01,TRBV28*01,CASLRSAVWADTQYF,TRBJ2-3*01,ELAGIGILTV,HLA-A*02,B2M,


In [29]:
# Keep only epitopes that have at least 100 fully paired TCRs.
subsampled_alphabeta = filtered_alphabeta.groupby("Epitope").filter(
    lambda epitope_group: epitope_group.shape[0] >= 100
)

In [30]:
# Draw exactly 100 TCRs per epitope with a fixed seed.
subsampled_alphabeta = subsampled_alphabeta.groupby("Epitope").sample(
    n=100, random_state=420
)

In [31]:
subsampled_alphabeta.shape[0]

1400

In [32]:
# Write the alpha-beta-guaranteed evaluation set without the index column.
subsampled_alphabeta.to_csv(
    preprocessed_dir.joinpath("evaluation_alphabeta.csv"), index=False
)