In [1]:
import pandas as pd
import pyrepseq as prs
import tidytcells
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Base directory for the TCR data files; the cells below build both their
# input path and their output path from it.
tcr_data_path = Path("..") / "tcr_data"

In [2]:
# Load the full table; `df` itself is left untouched by the steps below.
df = pd.read_csv(tcr_data_path/"full_table.csv")

# Only keep data where annotation is CD4 or CD8, one row per 'bioidentity'
# (the first occurrence is kept). Boolean filtering already yields a new frame,
# so no explicit copy of `df` is needed.
tc_df = (
    df[df['annotation_L1'].isin(['CD4', 'CD8'])]
    .drop_duplicates('bioidentity')
    .reset_index(drop=True)
)

# Subsample so that each donor has equal CD4 and CD8 populations.
# The grouping uses 'annotation_L1' -- the column that was filtered above and
# that is renamed to 'label' by the standardisation step further down. At this
# point of the cell no 'label' column has been created yet.
# NOTE(review): if full_table.csv ships its own 'label' column with different
# values, confirm which of the two columns the balancing should follow.
grouped = tc_df.groupby(['donor', 'annotation_L1']).size().unstack(fill_value=0)
min_counts = grouped[['CD4', 'CD8']].min(axis=1)  # per-donor size of the smaller class

# Iterate over the groups instead of calling `groupby(...).apply(...)`:
# `apply` operating on the grouping columns is deprecated since pandas 2.2, and
# once they are excluded the 'donor' column would be lost after
# `reset_index(drop=True)`. Iteration always yields the complete sub-frames,
# visits the groups in the same (sorted) order, and `sample` with the same
# `random_state` picks the same rows, so the resulting table is unchanged.
subsampled = pd.concat(
    [
        group.sample(n=min_counts.loc[donor], random_state=1)
        for (donor, _celltype), group in tc_df.groupby(['donor', 'annotation_L1'])
    ]
)
tc_df = subsampled.reset_index(drop=True)

# Standardize data set to use with sceptr: rename the raw columns to the
# TRAV/CDR3A/TRAJ/TRBV/CDR3B/TRBJ/label/donor schema given in the mapping.
tc_df_standard = prs.standardize_dataframe(
    tc_df,
    {"v_gene_A":"TRAV", "cdr3_A":"CDR3A", "j_gene_A":"TRAJ",
     "v_gene_B":"TRBV", "cdr3_B":"CDR3B", "j_gene_B":"TRBJ", "annotation_L1":"label", "donor":"donor"},
    suppress_warnings=True,
    tcr_precision='allele',
).reset_index(drop=True)



In [3]:
# Discard every row that has a missing value in any column.
# NOTE(review): this runs after the per-donor CD4/CD8 subsampling, so rows
# removed here may leave the two classes unequal again -- confirm that this is
# acceptable for the downstream analysis.
tc_df_2 = tc_df_standard.dropna(axis=0, how="any")

In [7]:
# Work on an independent copy so that `tc_df_2` keeps its current values.
tc_df_final = tc_df_2.copy(deep=True)

# Pass every V/J gene column through tidytcells with `enforce_functional=True`.
# NOTE(review): presumably tidytcells returns None for symbols it rejects --
# confirm that this step does not introduce new missing values after the
# earlier dropna.
for gene_column in ("TRAV", "TRAJ", "TRBV", "TRBJ"):
    tc_df_final[gene_column] = tc_df_2[gene_column].map(
        lambda symbol: tidytcells.tr.standardise(symbol, enforce_functional=True)
    )

In [9]:
# Write the preprocessed table without the row index.
# `to_csv` does not create missing directories, so make sure "preprocessed/"
# exists first; `exist_ok=True` keeps re-running this cell harmless.
output_path = tcr_data_path/"preprocessed"/"celltype_binary.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
tc_df_final.to_csv(output_path, index=False)