In [1]:
import json
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd
import seaborn
import tidytcells

# Global plotting configuration: seaborn theme with no overrides, then the
# "darkgrid" style.
seaborn.set_theme()
seaborn.set_style("darkgrid")
# NOTE(review): no figure has been created at this point, so there is nothing
# to lay out; the recorded output of this cell is an empty 640x480 figure with
# 0 Axes. This call looks like a leftover and can probably be removed - TODO confirm.
plt.tight_layout()

<Figure size 640x480 with 0 Axes>

In [2]:
# Both directories live under the same project data root; only the
# "raw" / "preprocessed" stage component differs.
data_root = Path("/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data")
raw_dir = data_root / "raw" / "tanno"
preprocessed_dir = data_root / "preprocessed" / "tanno"

In [3]:
# Load a single tab-separated export to inspect the raw column layout.
example_path = raw_dir / "data" / "A1 memory.txt"
example = pd.read_csv(example_path, sep="\t")

In [4]:
# Preview the five rows with the largest "Clustered" values.
(
    example
    .sort_values(by="Clustered", ascending=False)
    .head(n=5)
)

Unnamed: 0,Clustered,Collapsed,ClusterID,CDRH3_AA,CDRL3_AA,CDRH3_NT,CDRL3_NT,VH,DH,JH,VL,JL
0,8899,5544,1,CASSLQGARETQYF,CALLSTGGGNKLTF,TGTGCCAGCAGCTTACAGGGGGCGCGGGAGACCCAGTACTTC,TGTGCTCTCCTTTCCACGGGAGGAGGAAACAAACTCACCTTT,TRBV11-2,TRBD1,TRBJ2-5,TRAV9-2,TRAJ10
1,1945,1077,2,CSWKREIVEQFF,CAVSDRNTNAGKSTF,TGCAGTTGGAAGCGGGAGATAGTTGAGCAGTTCTTC,TGTGCTGTGAGTGATAGGAACACCAATGCAGGCAAATCAACCTTT,TRBV20-1,TRBD2,TRBJ2-1,TRAV8-4,TRAJ27
2,1215,867,3,CASSSQGANTEAFF,CIVRKRNQFYF,TGTGCCAGCAGCTCCCAAGGAGCGAACACTGAAGCTTTCTTT,TGCATCGTCAGAAAACGTAACCAGTTCTATTTT,TRBV9,,TRBJ1-1,TRAV26-1,TRAJ49
3,1185,866,4,CASSFQGQGGQPQHF,CAVRPRDTGGFKTIF,TGTGCCAGCAGTTTCCAGGGACAGGGCGGTCAGCCCCAGCATTTT,TGTGCTGTGAGGCCTCGGGATACTGGAGGCTTCAAAACTATCTTT,TRBV12-3,TRBD1,TRBJ1-5,TRAV21,TRAJ9
4,1002,785,5,CSAPLAGVSYNEQFF,CIVRSYNYGQNFVF,TGCAGTGCGCCACTAGCGGGCGTCTCCTACAATGAGCAGTTCTTC,TGCATCGTCAGATCTTATAACTATGGTCAGAATTTTGTCTTT,TRBV20-1,TRBD2,TRBJ2-1,TRAV26-1,TRAJ26


In [5]:
# Load every per-sample export, filter likely mispairings, and rename the
# heavy/light ("H"/"L") columns to the beta/alpha names used downstream.
dfs = []

# Path.iterdir() yields entries in arbitrary, filesystem-dependent order.
# The row order of `combined` feeds the seeded shuffle that defines the
# train/test split, so iterate in sorted order to keep the split reproducible.
# NOTE: if the original listing order was not already sorted, the resulting
# split will differ from previously generated outputs.
for f in sorted((raw_dir / "data").iterdir()):
    df = pd.read_csv(f, sep="\t")

    # Filter for potential mispairings: within one file, keep only the row
    # with the highest "Clustered" value for each distinct CDR3 nucleotide
    # sequence, first for the H chain and then for the L chain.
    df = df.sort_values(by="Clustered", ascending=False)
    df = df.drop_duplicates("CDRH3_NT", keep="first")
    df = df.drop_duplicates("CDRL3_NT", keep="first")

    # Keep V gene / CDR3 amino acids / J gene per chain (L -> alpha, H -> beta).
    df = df[["VL", "CDRL3_AA", "JL", "VH", "CDRH3_AA", "JH"]]
    df.columns = ["TRAV", "CDR3A", "TRAJ", "TRBV", "CDR3B", "TRBJ"]

    dfs.append(df)

combined = pd.concat(dfs, axis="index")

# Standardise the gene symbols of all four gene columns with tidytcells.
# NOTE(review): enforce_functional=True presumably blanks out symbols that do
# not resolve to a functional gene - confirm against the tidytcells docs.
for gene_column in ("TRAV", "TRAJ", "TRBV", "TRBJ"):
    combined[gene_column] = combined[gene_column].map(
        lambda x: tidytcells.tcr.standardise(x, enforce_functional=True)
    )

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [6]:
# Hold out a fixed fraction of the (seeded) shuffled rows as the test set;
# everything after the cut-off row becomes the training set.
test_frac = 0.1

shuffled = combined.sample(frac=1, random_state=12345)
test_rows = int(test_frac * len(combined))

test, train = shuffled.iloc[:test_rows], shuffled.iloc[test_rows:]

In [7]:
def combine_similar_clones(df):
    """Collapse rows describing the same TCR and total their clone counts.

    Rows are grouped on the six TCR columns (missing values form their own
    groups rather than being dropped). If ``df`` has no ``clone_count``
    column, every input row counts as one clone.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns TRAV, CDR3A, TRAJ, TRBV, CDR3B and TRBJ, and
        optionally clone_count. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct TCR with columns TRAV, CDR3A, TRAJ, TRBV, CDR3B,
        TRBJ, Epitope, MHCA, MHCB, clone_count. Epitope/MHCA/MHCB are filled
        with ``pd.NA``; rows are sorted by all columns and re-indexed from 0.
    """
    tcr_columns = ["TRAV", "CDR3A", "TRAJ", "TRBV", "CDR3B", "TRBJ"]
    placeholder_columns = ["Epitope", "MHCA", "MHCB"]

    working = df.copy()
    if "clone_count" not in working:
        working["clone_count"] = 1

    collapsed = working.groupby(
        tcr_columns, as_index=False, dropna=False
    ).aggregate({"clone_count": "sum"})

    # No epitope / MHC annotation is available here; add empty placeholders.
    collapsed[placeholder_columns] = pd.NA

    collapsed = collapsed[tcr_columns + placeholder_columns + ["clone_count"]]

    return collapsed.sort_values(by=collapsed.columns.tolist(), ignore_index=True)

In [8]:
# Collapse duplicate TCRs in the full data set and in each split.
combined, test, train = (
    combine_similar_clones(frame) for frame in (combined, test, train)
)

In [9]:
def generate_tcr_string(tcr_row):
    """Build a single string key for a TCR row.

    The TRAV, CDR3A, TRAJ, TRBV, CDR3B and TRBJ entries of ``tcr_row`` are
    formatted with their default string representation and concatenated in
    that order with no separator.
    """
    fields = ("TRAV", "CDR3A", "TRAJ", "TRBV", "CDR3B", "TRBJ")
    return "".join(format(tcr_row[field], "") for field in fields)

def remove_tcrs_in_b_from_a(df_a, df_b):
    """Return the rows of ``df_a`` whose TCR does not occur in ``df_b``.

    Two rows are treated as the same TCR when ``generate_tcr_string`` yields
    the same string for both.

    Parameters
    ----------
    df_a : pandas.DataFrame
        Frame to filter; it is not modified.
    df_b : pandas.DataFrame
        Frame whose TCRs must be excluded from the result.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows of ``df_a`` that have no matching TCR in ``df_b``.
    """
    tcrs_in_b = set(df_b.apply(generate_tcr_string, axis=1))

    # Series.isin always produces a boolean mask, so the negation below is
    # well defined, and membership is tested without a Python-level lambda.
    in_b = df_a.apply(generate_tcr_string, axis=1).isin(tcrs_in_b)

    # Filter first, then copy: avoids duplicating all of df_a just to discard
    # part of it.
    return df_a[~in_b].copy()

In [10]:
# Drop from the training split every TCR that also appears in the test split.
train = remove_tcrs_in_b_from_a(df_a=train, df_b=test)

In [11]:
# Query tidytcells for the gene-level, functionality "F" symbols of each of
# the four gene families.
travs, trajs, trbvs, trbjs = (
    tidytcells.tcr.query(precision="gene", functionality="F", contains=family)
    for family in ("TRAV", "TRAJ", "TRBV", "TRBJ")
)

In [12]:
# Each split must contain exactly the queried set of genes in every gene
# column (missing values are ignored).
expected_genes = {"TRAV": travs, "TRAJ": trajs, "TRBV": trbvs, "TRBJ": trbjs}

for dataset in (train, test):
    for column, queried_genes in expected_genes.items():
        assert queried_genes == set(dataset[column].dropna())

In [51]:
# Total clone count per TRBV gene over the combined data set (rows with a
# missing TRBV are skipped); plain ints so the result is JSON-serialisable.
trbv_column = combined["TRBV"]

bv_counts = {
    gene: int(combined.loc[trbv_column == gene, "clone_count"].sum())
    for gene in trbv_column.dropna().unique()
}

In [52]:
# Persist the TRBV counts as indented JSON in the current working directory.
with open("trbv_frequencies.json", "w") as json_file:
    json_file.write(json.dumps(bv_counts, indent=4))

In [53]:
# Write the full de-duplicated data set without the index column.
combined_csv_path = preprocessed_dir / "combined.csv"
combined.to_csv(combined_csv_path, index=False)

In [54]:
# Write the training split, then the test split, without the index column.
for file_name, split_frame in (("train.csv", train), ("test.csv", test)):
    split_frame.to_csv(preprocessed_dir / file_name, index=False)

In [55]:
# Save five test-set rows, drawn with a fixed seed, as exemplar TCRs.
exemplars = test.sample(n=5, random_state=420)
exemplars.to_csv(preprocessed_dir / "exemplars.csv", index=False)