In [1]:
import os
from pathlib import Path
import sys

# Resolve the project root (two levels above the current working directory)
# only once: the working directory is changed below, so recomputing it on a
# re-run of this cell would walk further up the tree.
if "__project_dir__" not in globals():
    __project_dir__ = Path.cwd().parents[1].resolve()

# sys.path entries must be plain strings; other types (including Path objects)
# are ignored by the import system, so convert explicitly.
sys.path.append(str(__project_dir__))
os.chdir(__project_dir__)

In [2]:
import pandas as pd
from pandas import DataFrame
import tidytcells as tt

In [3]:
# Output root for preprocessed tables, and input root for the raw downloads.
# NOTE(review): both are absolute, machine-specific locations.
processed_dir = Path(
    "/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data/preprocessed"
)
raw_dir = Path(
    "/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data/raw"
)

In [4]:
# Load the "clonotypes" sheet of the Minervina workbook from the raw data folder.
df = pd.read_excel(
    io=raw_dir / "minervina" / "Minervina_SARS_CoV_2_TCRs.xlsx",
    sheet_name="clonotypes",
)

In [5]:
# Remove repeated records that agree on both nucleotide CDR3s, all four V/J
# calls and the epitope label; the first occurrence of each is kept.
df = df.drop_duplicates(
    subset=["cdr3b_nt", "vb", "jb", "cdr3a_nt", "va", "ja", "epitope"],
    keep="first",
)

In [6]:
def extract_mhca_and_epitope(encoding):
    """Split an ``<MHC>_<epitope>`` label into its two parts.

    Everything before the first underscore is treated as the MHC symbol and is
    passed through ``tt.mhc.standardize``; everything after it (including any
    further underscores) is returned unchanged as the epitope. A label without
    an underscore yields an empty epitope string.

    Returns:
        A ``(standardized_mhc, epitope)`` tuple.
    """
    mhc_symbol, _, epitope_label = encoding.partition("_")
    return tt.mhc.standardize(mhc_symbol), epitope_label

# Decode the combined "epitope" label row by row and store the two returned
# values as the new "MHCA" and "Epitope" columns.
df[["MHCA", "Epitope"]] = df.apply(
    lambda clonotype: extract_mhca_and_epitope(clonotype["epitope"]),
    axis=1,
    result_type="expand",
)

In [7]:
def drop_chains_independently(df: DataFrame):
    """Keep one row per alpha CDR3 and one row per beta CDR3 within each epitope.

    Rows are ordered by ``Degree`` (largest first). Duplicates of the
    ``(cdr3a_nt, Epitope)`` pair are removed first, then duplicates of the
    ``(cdr3b_nt, Epitope)`` pair are removed from what remains; in both passes
    the earliest row in that ordering is the one kept. The input frame is not
    modified.
    """
    return (
        df.sort_values(by="Degree", ascending=False)
        .drop_duplicates(subset=["cdr3a_nt", "Epitope"], keep="first")
        .drop_duplicates(subset=["cdr3b_nt", "Epitope"], keep="first")
    )

# Second working table: the same records with alpha- and beta-chain CDR3
# duplicates removed per epitope.
tcrs_chains_dropped_independently = drop_chains_independently(df=df)

In [8]:
# Spot check: distinct nucleotide sequences that encode one beta-chain CDR3.
df.loc[df["cdr3b"] == "CASSPDIEAFF", "cdr3b_nt"].unique()

array(['TGTGCCAGCAGCCCGGATATTGAAGCTTTCTTT',
       'TGTGCCAGCAGCCCCGACATTGAAGCTTTCTTT',
       'TGTGCCAGCAGCCCAGACATAGAAGCTTTCTTT',
       'TGTGCCAGCAGCCCGGACATTGAAGCTTTCTTT',
       'TGTGCCAGCAGCCCTGACATTGAAGCTTTCTTT',
       'TGTGCCAGCAGTCCGGACATTGAAGCTTTCTTT',
       'TGTGCCAGCAGCCCCGATATTGAAGCTTTCTTT',
       'TGTGCCAGCAGCCCAGATATTGAAGCTTTCTTT',
       'TGTGCCAGCAGCCCCGACATCGAAGCTTTCTTT',
       'TGTGCCAGCAGCCCAGACATCGAAGCTTTCTTT',
       'TGTGCCAGCAGCCCAGACATTGAAGCTTTCTTT'], dtype=object)

In [9]:
def rename_columns(df: DataFrame):
    """Return a new frame holding only the TCR, epitope and MHC columns.

    The V/J gene and CDR3 columns are renamed to ``TRAV``, ``CDR3A``, ``TRAJ``,
    ``TRBV``, ``CDR3B`` and ``TRBJ``; ``Epitope`` and ``MHCA`` keep their
    names. Columns appear in the order of the mapping below, every other
    column is discarded, and the input frame is not modified.
    """
    column_map = {
        "va": "TRAV",
        "cdr3a": "CDR3A",
        "ja": "TRAJ",
        "vb": "TRBV",
        "cdr3b": "CDR3B",
        "jb": "TRBJ",
        "Epitope": "Epitope",
        "MHCA": "MHCA",
    }
    return df[list(column_map)].rename(columns=column_map)

In [10]:
# Apply the same column selection and renaming to the full table and to the
# chain-deduplicated table.
tcrs, tcrs_chains_dropped_independently = map(
    rename_columns, (df, tcrs_chains_dropped_independently)
)

In [11]:
def standardize_gene_symbols(df: DataFrame):
    """Return a copy of ``df`` with the four TCR gene columns standardised.

    Every value in ``TRAV``, ``TRAJ``, ``TRBV`` and ``TRBJ`` is replaced by the
    result of ``tt.tcr.standardise(value, enforce_functional=True)``. All other
    columns are left as they are, and the input frame is not modified.
    """
    standardized = df.copy(deep=True)
    for gene_column in ("TRAV", "TRAJ", "TRBV", "TRBJ"):
        standardized[gene_column] = standardized[gene_column].map(
            lambda symbol: tt.tcr.standardise(symbol, enforce_functional=True)
        )
    return standardized

In [12]:
# Standardise the gene symbols of both working tables in the same way.
tcrs, tcrs_chains_dropped_independently = map(
    standardize_gene_symbols, (tcrs, tcrs_chains_dropped_independently)
)

  warn(
  warn(


In [13]:
# Spot check: rows of the chain-deduplicated table that share one beta-chain CDR3.
tcrs_chains_dropped_independently.loc[
    tcrs_chains_dropped_independently["CDR3B"] == "CASSPDIEAFF"
]

Unnamed: 0,TRAV,CDR3A,TRAJ,TRBV,CDR3B,TRBJ,Epitope,MHCA
2211,TRAV12-1,CVVNGADKLIF,TRAJ34,TRBV7-9,CASSPDIEAFF,TRBJ1-1,YLQ,HLA-A*02
508,TRAV12-1,CVVNRADKLIF,TRAJ34,TRBV7-9,CASSPDIEAFF,TRBJ1-1,YLQ,HLA-A*02
146,TRAV12-1,CVVNSGDKLTF,TRAJ46,TRBV7-9,CASSPDIEAFF,TRBJ1-1,YLQ,HLA-A*02
3089,TRAV12-1,CVVNEDDKIIF,TRAJ30,TRBV7-9,CASSPDIEAFF,TRBJ1-1,AEV,HLA-B*44
3988,TRAV12-1,CVVNEDDKIIF,TRAJ30,TRBV7-9,CASSPDIEAFF,TRBJ1-1,YLQ,HLA-A*02
1545,TRAV12-1,CVVNEDDKIIF,TRAJ30,TRBV7-9,CASSPDIEAFF,TRBJ1-1,YLQ,HLA-A*02
2001,TRAV12-2,CAVNRDDKIIF,TRAJ30,TRBV7-9,CASSPDIEAFF,TRBJ1-1,YLQ,HLA-A*02
1809,TRAV12-2,CAVNGDDKIIF,TRAJ30,TRBV7-9,CASSPDIEAFF,TRBJ1-1,YLQ,HLA-A*02
536,TRAV12-2,CAVNGLGMGGGADGLTF,TRAJ45,TRBV7-9,CASSPDIEAFF,TRBJ1-1,YLQ,HLA-A*02
116,TRAV21,CAVSNSGYSTLTF,TRAJ11,TRBV7-9,CASSPDIEAFF,TRBJ1-1,YLQ,HLA-A*02


In [14]:
def aggregate_similar_clones(df: DataFrame):
    """Merge rows that agree on every TCR, epitope and MHC field.

    An all-missing ``MHCB`` column is added and each input row is counted as
    one clone. Rows with identical values in all key columns are collapsed to
    a single row whose ``clone_count`` is the number of rows merged. Because
    grouping uses ``dropna=False``, rows with missing key values are grouped
    rather than dropped. The input frame is not modified.
    """
    group_keys = [
        "TRAV",
        "CDR3A",
        "TRAJ",
        "TRBV",
        "CDR3B",
        "TRBJ",
        "Epitope",
        "MHCA",
        "MHCB",
    ]
    counted = df.assign(MHCB=pd.NA, clone_count=1)
    return counted.groupby(group_keys, as_index=False, dropna=False).agg(
        {"clone_count": "sum"}
    )

In [15]:
# Collapse identical clones in both working tables.
tcrs, tcrs_chains_dropped_independently = map(
    aggregate_similar_clones, (tcrs, tcrs_chains_dropped_independently)
)

In [17]:
def drop_trivial_epitope_groups(df: DataFrame):
    """Drop incomplete rows, then drop epitopes represented by a single row.

    Rows missing any of ``TRAV``, ``CDR3A``, ``TRBV``, ``CDR3B`` or ``Epitope``
    are removed first. Group sizes are computed only afterwards, so an epitope
    that is reduced to one usable row by that removal is also discarded; every
    epitope in the result therefore has at least two rows. Row order and index
    labels of the surviving rows are preserved, and the input frame is not
    modified.
    """
    complete = df.dropna(subset=["TRAV", "CDR3A", "TRBV", "CDR3B", "Epitope"])
    # Count group members on the cleaned rows; counting before the dropna would
    # let single-row epitopes slip through.
    return complete.groupby("Epitope").filter(lambda group: len(group) > 1)

In [18]:
# Filter both working tables with the same epitope-group rule.
tcrs, tcrs_chains_dropped_independently = map(
    drop_trivial_epitope_groups, (tcrs, tcrs_chains_dropped_independently)
)

In [19]:
# Write both tables as CSV files (index column omitted) into the "minervina"
# folder under the preprocessed-data directory.
for output_frame, output_name in (
    (tcrs, "preprocessed.csv"),
    (
        tcrs_chains_dropped_independently,
        "preprocessed_chains_dropped_independently.csv",
    ),
):
    output_frame.to_csv(processed_dir / "minervina" / output_name, index=False)