In [1]:
import os
from pathlib import Path
import sys

# Resolve the project root (two levels above the notebook's working
# directory) only once per kernel session. The globals() guard keeps the
# value stable when this cell is re-run after the os.chdir() below has
# already moved the working directory.
if "__project_dir__" not in globals():
    __project_dir__ = Path.cwd().parents[1].resolve()

# sys.path entries must be plain strings: the import system ignores any
# other type, so the Path is converted before it is registered. The
# membership check avoids stacking duplicate entries on repeated runs.
if str(__project_dir__) not in sys.path:
    sys.path.append(str(__project_dir__))
os.chdir(__project_dir__)

In [2]:
import pandas as pd
import tidytcells as tt

In [3]:
# Root of the project's data directory. The default is the original author's
# local path; set the TCR_EMBEDDER_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
data_dir = Path(
    os.environ.get(
        "TCR_EMBEDDER_DATA_DIR",
        "/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data",
    )
)
# Input (raw) and output (preprocessed) locations, both under the data root.
raw_dir = data_dir / "raw"
processed_dir = data_dir / "preprocessed"

In [4]:
# Read the "clonotypes" sheet of the Minervina workbook from the raw data
# directory into a DataFrame.
minervina_workbook = raw_dir / "minervina" / "Minervina_SARS_CoV_2_TCRs.xlsx"
tcrs = pd.read_excel(minervina_workbook, sheet_name="clonotypes")

In [5]:
# Rows that agree on every one of these columns are treated as the same
# record; only the first occurrence of each is kept.
dedup_columns = ["cdr3b_nt", "vb", "jb", "cdr3a_nt", "va", "ja", "epitope"]
tcrs = tcrs.drop_duplicates(subset=dedup_columns)

In [6]:
def extract_mhca_and_epitope(encoding):
    """Split a combined label into a standardised MHC symbol and an epitope label.

    The text before the first underscore is passed through
    ``tt.mhc.standardize``; everything after that first underscore is returned
    unchanged as the epitope label, so any further underscores are preserved.
    When ``encoding`` contains no underscore the epitope label is an empty
    string.

    Args:
        encoding: String label; presumably of the form "<MHC>_<epitope>"
            (TODO confirm against the source spreadsheet).

    Returns:
        A ``(mhca, epitope)`` tuple.
    """
    # partition() cuts at the first underscore only, leaving the remainder intact.
    mhc_symbol, _, epitope_label = encoding.partition("_")
    return tt.mhc.standardize(mhc_symbol), epitope_label

# Parse every value of the raw "epitope" column and store the two parts as
# the new "MHCA" and "Epitope" columns, aligned on the frame's own index.
parsed_labels = pd.DataFrame(
    [extract_mhca_and_epitope(label) for label in tcrs["epitope"]],
    index=tcrs.index,
    columns=["MHCA", "Epitope"],
)
tcrs[["MHCA", "Epitope"]] = parsed_labels

In [7]:
# Explicit old -> new mapping for the TCR gene/CDR3 columns. Pairing each
# source column with its new name avoids relying on two parallel lists
# staying in the same order.
tcr_column_names = {
    "va": "TRAV",
    "cdr3a": "CDR3A",
    "ja": "TRAJ",
    "vb": "TRBV",
    "cdr3b": "CDR3B",
    "jb": "TRBJ",
}
# Keep only the columns used downstream (in this order) and rename them;
# rename() returns a new DataFrame rather than relabelling the subset in place.
tcrs = tcrs[[*tcr_column_names, "Epitope", "MHCA"]].rename(columns=tcr_column_names)

In [8]:
# Apply the same tidytcells standardisation to each V/J gene column.
# NOTE(review): enforce_functional=True presumably rejects non-functional
# genes - confirm the exact return value against the tidytcells docs.
for gene_column in ["TRAV", "TRAJ", "TRBV", "TRBJ"]:
    tcrs[gene_column] = tcrs[gene_column].map(
        lambda symbol: tt.tcr.standardise(symbol, enforce_functional=True)
    )

  warn(
  warn(


In [9]:
# Columns that together identify one output row.
clonotype_columns = [
    "TRAV", "CDR3A", "TRAJ", "TRBV", "CDR3B", "TRBJ", "Epitope", "MHCA", "MHCB",
]

# Add an all-missing MHCB column and a per-row count of 1, then collapse
# identical rows by summing the counts. dropna=False keeps groups whose key
# contains missing values (MHCB is missing for every row).
tcrs = (
    tcrs.assign(MHCB=pd.NA, clone_count=1)
    .groupby(clonotype_columns, as_index=False, dropna=False)
    .agg({"clone_count": "sum"})
)

In [10]:
# Display the number of rows recorded for each epitope label.
rows_by_epitope = tcrs.groupby(by="Epitope")
rows_by_epitope.size()

Epitope
AEA         1
AEV        57
ALS       196
DTD        85
FTS       445
LLY        52
LTD       394
NQK       142
NTN         9
NYN       166
PTD       153
QEL        31
QYI       102
RVA        68
TTD      1887
VEN_H       4
VYF       269
VYI        91
YLQ       285
dtype: int64

In [11]:
# Keep only epitopes that are represented by at least this many rows;
# epitopes with a single row are dropped.
MIN_ROWS_PER_EPITOPE = 2
filtered = tcrs.groupby("Epitope").filter(
    lambda epitope_rows: len(epitope_rows) >= MIN_ROWS_PER_EPITOPE
)

In [12]:
# Write the filtered table as CSV without the index column. The output
# directory is created first so the write does not fail on a fresh checkout
# where the "minervina" folder does not exist yet.
output_path = processed_dir / "minervina" / "preprocessed.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
filtered.to_csv(output_path, index=False)