In [1]:
from pathlib import Path
import pandas as pd
from pandas import DataFrame
import tidytcells as tt

In [2]:
# Root of the data tree. This is a machine-specific absolute path; edit this
# single constant to point the notebook at a different location.
DATA_DIR = Path("/home/yutanagano/UCLOneDrive/MBPhD/projects/tcr_embedder/data")

# Directory scanned for input TSV files, and directory the output CSVs are written to.
raw_dir = DATA_DIR / "raw" / "rds"
preprocessed_dir = DATA_DIR / "preprocessed" / "rds"

In [3]:
# Path.iterdir() yields entries in arbitrary order, so sort them to make the
# load order (and the row order of anything concatenated from it) reproducible.
# The substring test matches any name containing "tsv", e.g. "x.tsv" or "x.tsv.gz".
paths_to_tsvs = sorted(item for item in raw_dir.iterdir() if "tsv" in item.name)

In [5]:
def load_and_clean_decombinator_tsv(path_to_tsv: Path) -> DataFrame:
    """Read one tab-separated file and return its productive rows, standardized.

    Rows whose ``productive`` column equals ``"T"`` are kept. Five columns are
    selected and renamed (``v_call`` -> ``TRBV``, ``junction_aa`` -> ``CDR3B``,
    ``j_call`` -> ``TRBJ``; ``decombinator_id`` and ``duplicate_count`` keep
    their names), and the result is passed through
    ``standardize_simplified_decombinator_dataframe``.

    Args:
        path_to_tsv: Location of the tab-separated file to read.

    Returns:
        The frame returned by ``standardize_simplified_decombinator_dataframe``.
    """
    source_columns = ["decombinator_id", "v_call", "junction_aa", "j_call", "duplicate_count"]
    output_columns = ["decombinator_id", "TRBV", "CDR3B", "TRBJ", "duplicate_count"]

    table = pd.read_csv(path_to_tsv, sep="\t")
    is_productive = table.productive == "T"

    # Take an explicit copy: the standardization step overwrites columns on
    # the frame it is given.
    selected = table.loc[is_productive, source_columns].copy()
    selected.columns = output_columns
    return standardize_simplified_decombinator_dataframe(selected)

def standardize_simplified_decombinator_dataframe(simplified_decombinator_dataframe: DataFrame) -> DataFrame:
    """Standardize the ``TRBV`` and ``TRBJ`` columns and drop incomplete rows.

    Every value in the two gene columns is passed through
    ``tt.tcr.standardize`` with ``enforce_functional=True`` and
    ``suppress_warnings=True``. The columns are overwritten on the frame that
    was passed in, so callers that need the original values must pass a copy.

    Args:
        simplified_decombinator_dataframe: Frame with ``TRBV`` and ``TRBJ`` columns.

    Returns:
        The result of ``dropna()`` on the updated frame.
    """
    def standardize_gene(gene_name):
        return tt.tcr.standardize(gene_name, enforce_functional=True, suppress_warnings=True)

    for gene_column in ("TRBV", "TRBJ"):
        simplified_decombinator_dataframe[gene_column] = (
            simplified_decombinator_dataframe[gene_column].map(standardize_gene)
        )

    # NOTE(review): presumably names that fail standardization come back as
    # missing values and are removed here -- confirm against tidytcells.
    return simplified_decombinator_dataframe.dropna()

In [6]:
# One cleaned frame per input file, in the same order as paths_to_tsvs.
tsvs = list(map(load_and_clean_decombinator_tsv, paths_to_tsvs))

In [7]:
# Stack the per-file frames into one table. ignore_index=True replaces the
# per-file row labels (which would otherwise repeat across files) with a
# fresh 0..n-1 index, so every row of `combined` has a unique label.
combined = pd.concat(tsvs, ignore_index=True)

In [8]:
# Rows with duplicate_count greater than 2. Filter first, then copy, so only
# the selected rows are duplicated rather than the whole combined frame.
hyperexpanded = combined[combined.duplicate_count > 2].copy()

In [10]:
# Rows with duplicate_count greater than 1. Filter first, then copy, so only
# the selected rows are duplicated rather than the whole combined frame.
memory_like = combined[combined.duplicate_count > 1].copy()

In [11]:
# Rows with duplicate_count exactly 1. Filter first, then copy, so only the
# selected rows are duplicated rather than the whole combined frame.
naive_like = combined[combined.duplicate_count == 1].copy()

In [12]:
def collapse_clones(standardized_decombinator_dataframe: DataFrame) -> DataFrame:
    """Collapse rows sharing the same (TRBV, CDR3B, TRBJ) into one counted row.

    The input frame is left untouched: counting is done on a projection of the
    three key columns, so no helper column is added to the caller's data.

    Args:
        standardized_decombinator_dataframe: Frame with at least the columns
            ``TRBV``, ``CDR3B`` and ``TRBJ``.

    Returns:
        A frame with columns ``TRBV``, ``CDR3B``, ``TRBJ`` and ``clone_count``,
        where ``clone_count`` is the number of input rows carrying that key
        combination. Rows are ordered by the grouping keys.
    """
    key_columns = ["TRBV", "CDR3B", "TRBJ"]

    # assign() returns a new frame, so the constant column used for counting
    # never reaches the caller's DataFrame.
    counted = standardized_decombinator_dataframe[key_columns].assign(clone_count=1)
    collapsed_dataframe = counted.groupby(
        by=key_columns,
        as_index=False
    ).aggregate({"clone_count": "sum"})
    return collapsed_dataframe

In [13]:
# One row per distinct (TRBV, CDR3B, TRBJ) in the full combined table, with its row count.
combined_collapsed = collapse_clones(combined)

In [16]:
# One row per distinct (TRBV, CDR3B, TRBJ) among the duplicate_count > 2 rows, with its row count.
hyperexpanded_collapsed = collapse_clones(hyperexpanded)

In [14]:
# One row per distinct (TRBV, CDR3B, TRBJ) among the duplicate_count > 1 rows, with its row count.
memory_like_collapsed = collapse_clones(memory_like)

In [15]:
# One row per distinct (TRBV, CDR3B, TRBJ) among the duplicate_count == 1 rows, with its row count.
naive_like_collapsed = collapse_clones(naive_like)

In [20]:
# Number of rows drawn from each collapsed table.
# NOTE(review): the origin of 734355 is not shown in this notebook -- presumably
# chosen to match the size of another dataset; confirm and record it here.
SUBSAMPLE_SIZE = 734355

# Identical size and seed for every draw so the subsamples are reproducible.
_sampling_options = {"n": SUBSAMPLE_SIZE, "random_state": 12345}

combined_collapsed_subsampled = combined_collapsed.sample(**_sampling_options)
memory_like_collapsed_subsampled = memory_like_collapsed.sample(**_sampling_options)
naive_like_collapsed_subsampled = naive_like_collapsed.sample(**_sampling_options)

In [21]:
# Display the subsampled combined table as this cell's output.
combined_collapsed_subsampled

Unnamed: 0,TRBV,CDR3B,TRBJ,clone_count
6616803,TRBV6-1,CASSYSTSGLAKNIQYF,TRBJ2-4,1
1556466,TRBV18,CASSPGDGFYEQYF,TRBJ2-7,1
6751207,TRBV6-5,CASRGLAGAIEGETQYF,TRBJ2-5,1
4071323,TRBV29-1,CSSRRGGGETQYF,TRBJ2-5,1
2036577,TRBV2,CASGRADTQYF,TRBJ2-3,3
...,...,...,...,...
2289735,TRBV2,CASSVRQGIDTQYF,TRBJ2-3,1
2979369,TRBV20-1,CSATGLAGGPSNEQFF,TRBJ2-1,1
5616598,TRBV5-1,CASSLSWTSPEGYTF,TRBJ1-2,2
7998002,TRBV7-7,CASSYNGPGGQETQYF,TRBJ2-5,1


In [22]:
# Create the output directory up front: to_csv does not create missing
# directories, and failing here would waste all of the preceding work.
preprocessed_dir.mkdir(parents=True, exist_ok=True)

# Write every collapsed table in full, without the row index.
for output_name, collapsed_frame in {
    "combined.csv": combined_collapsed,
    "hyperexpanded.csv": hyperexpanded_collapsed,
    "memory_like.csv": memory_like_collapsed,
    "naive_like.csv": naive_like_collapsed,
}.items():
    collapsed_frame.to_csv(preprocessed_dir / output_name, index=False)

In [23]:
# Write every subsampled table, without the row index.
for output_name, subsampled_frame in {
    "combined_subsampled.csv": combined_collapsed_subsampled,
    "memory_like_subsampled.csv": memory_like_collapsed_subsampled,
    "naive_like_subsampled.csv": naive_like_collapsed_subsampled,
}.items():
    subsampled_frame.to_csv(preprocessed_dir / output_name, index=False)