In [None]:
import os
from pathlib import Path
import sys

# Resolve the project root only once per kernel. The cell changes the working
# directory below, so recomputing from Path.cwd() on a re-run would climb two
# more levels up.
if "__project_dir__" not in globals():
    __project_dir__ = Path.cwd().parents[1].resolve()

# sys.path entries must be plain strings: the import machinery ignores any
# other type, so appending the Path object itself has no effect on imports.
if str(__project_dir__) not in sys.path:
    sys.path.append(str(__project_dir__))
os.chdir(__project_dir__)

In [None]:
import json
from pathlib import Path
import pandas as pd
import tidytcells

In [None]:
# Input and output locations, relative to the current working directory.
raw_dir = Path("tcr_data/raw/tanno")
preprocessed_dir = Path("tcr_data/preprocessed/tanno")

In [None]:
# Read a single tab-separated raw file so its columns can be inspected.
example_path = raw_dir / "data" / "A1 memory.txt"
example = pd.read_csv(example_path, sep="\t")

In [None]:
# Show the first rows of the example file ordered by descending "Clustered".
example_by_clustered = example.sort_values(by="Clustered", ascending=False)
example_by_clustered.head()

In [None]:
dfs = []

# Path.iterdir() yields entries in arbitrary, filesystem-dependent order.
# Sorting fixes the concatenation order so that any seeded shuffling of
# `combined` gives the same result on every machine.
for f in sorted((raw_dir / "data").iterdir()):
    df = pd.read_csv(f, sep="\t")

    # Filter for potential mispairings: for every nucleotide CDR3 (first the
    # "H" chain, then the "L" chain) keep only the row with the largest
    # "Clustered" value.
    df = df.sort_values(by="Clustered", ascending=False)
    df = df.drop_duplicates("CDRH3_NT", keep="first")
    df = df.drop_duplicates("CDRL3_NT", keep="first")

    # Keep the V gene / CDR3 amino-acid / J gene columns and rename them:
    # the "L" columns become the alpha-chain fields, the "H" columns the
    # beta-chain fields.
    df = df[["VL", "CDRL3_AA", "JL", "VH", "CDRH3_AA", "JH"]]
    df.columns = ["TRAV", "CDR3A", "TRAJ", "TRBV", "CDR3B", "TRBJ"]

    dfs.append(df)

combined = pd.concat(dfs, axis="index")

# Standardise every gene column with tidytcells, accepting functional genes
# only. NOTE(review): presumably unresolvable symbols come back as missing
# values, which the dropna below relies on — confirm against tidytcells docs.
for gene_column in ("TRAV", "TRAJ", "TRBV", "TRBJ"):
    combined[gene_column] = combined[gene_column].map(
        lambda x: tidytcells.tr.standardise(x, enforce_functional=True)
    )

# Rows must have both V genes and both CDR3s; J genes are allowed to be missing.
combined = combined.dropna(subset=["TRAV", "CDR3A", "TRBV", "CDR3B"])

In [None]:
# Shuffle all rows with a fixed seed, then hold out the leading `test_frac`
# share as the test set and keep the remainder for training.
test_frac = 0.1

shuffled = combined.sample(frac=1, random_state=12345)
test_rows = int(test_frac * len(combined))

test, train = shuffled.iloc[:test_rows], shuffled.iloc[test_rows:]

In [None]:
def combine_similar_clones(df):
    """Collapse rows with identical gene/CDR3 fields into one row per clone.

    The input frame is not modified. If it has no "clone_count" column, every
    row counts as one clone. Rows are grouped on the six TCR columns (missing
    values form their own groups) and their counts are summed. Empty
    "Epitope", "MHCA" and "MHCB" columns are added, and the rows are returned
    in a shuffled order determined by a fixed seed.
    """
    tcr_columns = ["TRAV", "CDR3A", "TRAJ", "TRBV", "CDR3B", "TRBJ"]
    annotation_columns = ["Epitope", "MHCA", "MHCB"]

    clones = df.copy()
    if "clone_count" not in clones:
        clones["clone_count"] = 1

    grouped = clones.groupby(tcr_columns, as_index=False, dropna=False)
    collapsed = grouped.aggregate({"clone_count": "sum"})

    # No epitope/MHC information is attached here; the columns are filled with NA.
    collapsed[annotation_columns] = pd.NA
    collapsed = collapsed[tcr_columns + annotation_columns + ["clone_count"]]

    return collapsed.sample(frac=1, random_state=12345)

In [None]:
# Collapse duplicate clones in the full data set and in each split.
combined, test, train = (
    combine_similar_clones(combined),
    combine_similar_clones(test),
    combine_similar_clones(train),
)

In [None]:
def generate_tcr_string(tcr_row):
    """Build a string key for one TCR.

    The TRAV, CDR3A, TRAJ, TRBV, CDR3B and TRBJ fields of `tcr_row` are
    concatenated in that order with no separator. Missing values are rendered
    with their default text form, so two rows missing the same field still
    produce equal keys.
    """
    fields = ("TRAV", "CDR3A", "TRAJ", "TRBV", "CDR3B", "TRBJ")
    return "".join(format(tcr_row[field]) for field in fields)


def remove_tcrs_in_b_from_a(df_a, df_b):
    """Return a copy of `df_a` without the rows whose TCR also occurs in `df_b`.

    Two rows are the same TCR when `generate_tcr_string` gives them the same
    key. Neither input is modified. Either frame may be empty: an empty
    `df_b` removes nothing and an empty `df_a` yields an empty frame with the
    same columns.
    """
    # Iterate rows explicitly rather than using DataFrame.apply(axis=1): on an
    # empty frame apply returns a DataFrame instead of a Series, which breaks
    # the key handling that follows.
    tcrs_in_b = {generate_tcr_string(row) for _, row in df_b.iterrows()}

    keep = pd.Series(
        [generate_tcr_string(row) not in tcrs_in_b for _, row in df_a.iterrows()],
        index=df_a.index,
        dtype=bool,
    )

    # Filter first, then copy, so only the surviving rows are duplicated.
    # The mask is positional, so repeated index labels cannot cause misalignment.
    return df_a.loc[keep.to_numpy()].copy()

In [None]:
# Remove from the training set every TCR that also appears in the test set.
train = remove_tcrs_in_b_from_a(df_a=train, df_b=test)

In [None]:
# Query tidytcells at gene precision for the symbols with functionality "F",
# once per gene-name pattern.
travs, trajs, trbvs, trbjs = (
    tidytcells.tr.query(precision="gene", functionality="F", contains_pattern=pattern)
    for pattern in ("TRAV", "TRAJ", "TRBV", "TRBJ")
)

In [None]:
# Each split must contain exactly the queried gene set in every gene column
# (missing values are ignored).
expected_genes = {"TRAV": travs, "TRAJ": trajs, "TRBV": trbvs, "TRBJ": trbjs}

for dataset in (train, test):
    for gene_column, genes in expected_genes.items():
        assert genes == set(dataset[gene_column].dropna())

In [None]:
# Total clone count per TRBV gene, computed in a single grouped pass instead
# of rescanning the whole column once per gene. groupby drops missing keys by
# default, and sort=False keeps the genes in order of first appearance.
bv_counts = {
    bv: int(bv_count)
    for bv, bv_count in combined.groupby("TRBV", sort=False)["clone_count"].sum().items()
}

In [None]:
# Write the TRBV counts as indented JSON into the current working directory.
with open("trbv_frequencies.json", "w") as frequencies_file:
    frequencies_file.write(json.dumps(bv_counts, indent=4))

### Compute pGen of TCRs in the test set

#### OLGA setup

In [None]:
from olga import load_model
import olga.generation_probability as pgen

# Locate the default models relative to the imported olga package itself
# rather than a hard-coded virtualenv layout, so the notebook works whatever
# the environment's location or Python version.
OLGA_PATH = Path(load_model.__file__).resolve().parent
DEFAULT_HUMAN_T_BETA_PATH = OLGA_PATH / "default_models" / "human_T_beta"
DEFAULT_HUMAN_T_ALPHA_PATH = OLGA_PATH / "default_models" / "human_T_alpha"

In [None]:
def get_alpha_pgen_model():
    """Build the generation-probability model from the files in DEFAULT_HUMAN_T_ALPHA_PATH.

    Loads the genomic data (model parameters plus the V and J anchor files)
    and the generative model (marginals) with the VJ loader classes.

    Returns:
        A `pgen.GenerationProbabilityVJ` built from the two loaded objects.
    """
    model_dir = DEFAULT_HUMAN_T_ALPHA_PATH

    genomic_data = load_model.GenomicDataVJ()
    genomic_data.load_igor_genomic_data(
        model_dir / "model_params.txt",
        model_dir / "V_gene_CDR3_anchors.csv",
        model_dir / "J_gene_CDR3_anchors.csv",
    )

    generative_model = load_model.GenerativeModelVJ()
    generative_model.load_and_process_igor_model(model_dir / "model_marginals.txt")

    return pgen.GenerationProbabilityVJ(generative_model, genomic_data)

In [None]:
def get_beta_pgen_model():
    """Build the generation-probability model from the files in DEFAULT_HUMAN_T_BETA_PATH.

    Loads the genomic data (model parameters plus the V and J anchor files)
    and the generative model (marginals) with the VDJ loader classes.

    Returns:
        A `pgen.GenerationProbabilityVDJ` built from the two loaded objects.
    """
    model_dir = DEFAULT_HUMAN_T_BETA_PATH

    genomic_data = load_model.GenomicDataVDJ()
    genomic_data.load_igor_genomic_data(
        model_dir / "model_params.txt",
        model_dir / "V_gene_CDR3_anchors.csv",
        model_dir / "J_gene_CDR3_anchors.csv",
    )

    generative_model = load_model.GenerativeModelVDJ()
    generative_model.load_and_process_igor_model(model_dir / "model_marginals.txt")

    return pgen.GenerationProbabilityVDJ(generative_model, genomic_data)

In [None]:
def compute_pgen(model, cdr3, v, j) -> float:
    if pd.isna(cdr3):
        return None
    
    if pd.isna(v):
        v = None
    if pd.isna(j):
        j = None

    return model.compute_aa_CDR3_pgen(cdr3, v, j)

#### Compute pGens

In [None]:
alpha_model = get_alpha_pgen_model()


def _alpha_pgen_for_row(row):
    """Return the pGen of the row's CDR3A under `alpha_model`, given its TRAV and TRAJ."""
    return compute_pgen(alpha_model, row["CDR3A"], row["TRAV"], row["TRAJ"])


# Adds an "alpha_pgen" column to the test set, computed row by row.
test["alpha_pgen"] = test.apply(_alpha_pgen_for_row, axis=1)

In [None]:
beta_model = get_beta_pgen_model()


def _beta_pgen_for_row(row):
    """Return the pGen of the row's CDR3B under `beta_model`, given its TRBV and TRBJ."""
    return compute_pgen(beta_model, row["CDR3B"], row["TRBV"], row["TRBJ"])


# Adds a "beta_pgen" column to the test set, computed row by row.
test["beta_pgen"] = test.apply(_beta_pgen_for_row, axis=1)

In [None]:
# Create the output directory if needed so the write does not fail on a fresh
# checkout; this is a no-op when the directory already exists.
preprocessed_dir.mkdir(parents=True, exist_ok=True)
combined.to_csv(preprocessed_dir / "combined.csv", index=False)

In [None]:
# Write each split to its own CSV file without the index column.
for split_frame, file_name in ((train, "train.csv"), (test, "test.csv")):
    split_frame.to_csv(preprocessed_dir / file_name, index=False)

In [None]:
# Save five test rows, drawn with a fixed seed, as exemplars.
exemplars = test.sample(n=5, random_state=420)
exemplars.to_csv(preprocessed_dir / "exemplars.csv", index=False)