In [40]:
import pandas as pd

# Paired test splits, one per MHC annotation level (allele / gene).
df_paired_allele, df_paired_gene = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "../../data/WnB_Download/paired/allele/test.tsv",
        "../../data/WnB_Download/paired/gene/test.tsv",
    )
)

# Matching tables from the EDA folder; later cells read their
# `min_levenshtein_cdr_*_to_seen` columns.
df_paired_allele_levenshtein, df_paired_gene_levenshtein = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "../../data/EDA/paired/paired_allele_levenshtein_reclassified.tsv",
        "../../data/EDA/paired/paired_gene_levenshtein_reclassified.tsv",
    )
)

In [41]:
# Columns whose combined values are used as the join key between tables.
match_columns = [
    "TCR_name",
    "TRAV",
    "TRAJ",
    "TRA_CDR3",
    "TRBV",
    "TRBJ",
    "TRB_CDR3",
    "TRAC",
    "TRBC",
    "Epitope",
    "MHC",
    "Binding",
]

In [42]:
# Attach the `task` label of the allele test split to every row of the
# Levenshtein table by left-joining on the shared key columns.
df_merged_allele = df_paired_allele_levenshtein.merge(
    df_paired_allele, on=match_columns, how="left", indicator=True
)

# Count matched rows once. Computing this outside the f-string also avoids
# nesting double quotes inside a double-quoted f-string, which is a
# SyntaxError on Python versions before 3.12.
n_both_allele = int((df_merged_allele["_merge"] == "both").sum())
print(f"in total {len(df_merged_allele)} entries. both: {n_both_allele}")
if len(df_merged_allele) == n_both_allele:
    print("Merge is correct")

# The join must be exactly one-to-one: every left row matched, and no row
# duplicated by repeated keys on the right. Checking only the "both" count
# against the left table would let one unmatched row and one duplicated row
# cancel each other out, so the merged length is enforced as well.
if not (len(df_merged_allele) == n_both_allele == len(df_paired_allele_levenshtein)):
    raise Exception("ERROR")

# Keep the right-hand (test split) label `task_y` as `task` and drop the merge
# bookkeeping columns. `assign` appends `task` as the last column, which is
# where a plain column assignment would put it.
df_merged_allele = df_merged_allele.assign(task=df_merged_allele["task_y"]).drop(
    columns=["_merge", "task_x", "task_y"]
)

# Per-task row counts must be unchanged relative to the test split.
for task_name in ("TPP1", "TPP2", "TPP3"):
    n_expected = int((df_paired_allele["task"] == task_name).sum())
    n_merged = int((df_merged_allele["task"] == task_name).sum())
    if n_expected != n_merged:
        raise Exception("WRONG TPP")

in total 15609 entries. both: 15609
Merge is correct


In [43]:
print("ALLELE:")
print("Each entry has a minimum levenshtein to the seen data. From this values, we take the mean.")

# Filter each task subset once; the TRA and TRB means below both read from
# these two frames, so re-filtering between prints is unnecessary.
df_levenshtein_TPP2 = df_merged_allele[df_merged_allele["task"] == "TPP2"]
df_levenshtein_TPP3 = df_merged_allele[df_merged_allele["task"] == "TPP3"]

# Mean of the per-row minimum distance, alpha column first, then beta.
print(f"mean levenshtein distance of TRA TPP2: {df_levenshtein_TPP2['min_levenshtein_cdr_alpha_to_seen'].mean()}")
print(f"mean levenshtein distance of TRA TPP3: {df_levenshtein_TPP3['min_levenshtein_cdr_alpha_to_seen'].mean()}")
print(f"mean levenshtein distance of TRB TPP2: {df_levenshtein_TPP2['min_levenshtein_cdr_beta_to_seen'].mean()}")
print(f"mean levenshtein distance of TRB TPP3: {df_levenshtein_TPP3['min_levenshtein_cdr_beta_to_seen'].mean()}")

ALLELE:
Each entry has a minimum levenshtein to the seen data. From this values, we take the mean.
mean levenshtein distance of TRA TPP2: 1.169956002514142
mean levenshtein distance of TRA TPP3: 1.1241379310344828
mean levenshtein distance of TRB TPP2: 1.7622878692646136
mean levenshtein distance of TRB TPP3: 1.8827586206896552


In [44]:
# Write the allele table with the restored `task` column as a tab-separated
# file, without the index.
df_merged_allele.to_csv(
    "../../data/EDA/paired/paired_allele_levenshtein.tsv",
    sep="\t",
    index=False,
)

In [45]:
# Attach the `task` label of the gene test split to every row of the
# Levenshtein table by left-joining on the shared key columns.
df_merged_gene = df_paired_gene_levenshtein.merge(
    df_paired_gene, on=match_columns, how="left", indicator=True
)

# Count matched rows once. Computing this outside the f-string also avoids
# nesting double quotes inside a double-quoted f-string, which is a
# SyntaxError on Python versions before 3.12.
n_both_gene = int((df_merged_gene["_merge"] == "both").sum())
print(f"in total {len(df_merged_gene)} entries. both: {n_both_gene}")
if len(df_merged_gene) == n_both_gene:
    print("Merge is correct")

# The join must be exactly one-to-one: every left row matched, and no row
# duplicated by repeated keys on the right. Checking only the "both" count
# against the left table would let one unmatched row and one duplicated row
# cancel each other out, so the merged length is enforced as well.
if not (len(df_merged_gene) == n_both_gene == len(df_paired_gene_levenshtein)):
    raise Exception("ERROR")

# Keep the right-hand (test split) label `task_y` as `task` and drop the merge
# bookkeeping columns. `assign` appends `task` as the last column, which is
# where a plain column assignment would put it.
df_merged_gene = df_merged_gene.assign(task=df_merged_gene["task_y"]).drop(
    columns=["_merge", "task_x", "task_y"]
)

# Per-task row counts must be unchanged relative to the test split.
for task_name in ("TPP1", "TPP2", "TPP3"):
    n_expected = int((df_paired_gene["task"] == task_name).sum())
    n_merged = int((df_merged_gene["task"] == task_name).sum())
    if n_expected != n_merged:
        raise Exception("WRONG TPP")

in total 14415 entries. both: 14415


Merge is correct


In [46]:
print("GENE:")
print("Each entry has a minimum levenshtein to the seen data. From this values, we take the mean.")

# Filter each task subset once; the TRA and TRB means below both read from
# these two frames, so re-filtering between prints is unnecessary.
df_levenshtein_TPP2 = df_merged_gene[df_merged_gene["task"] == "TPP2"]
df_levenshtein_TPP3 = df_merged_gene[df_merged_gene["task"] == "TPP3"]

# Mean of the per-row minimum distance, alpha column first, then beta.
print(f"mean levenshtein distance of TRA TPP2: {df_levenshtein_TPP2['min_levenshtein_cdr_alpha_to_seen'].mean()}")
print(f"mean levenshtein distance of TRA TPP3: {df_levenshtein_TPP3['min_levenshtein_cdr_alpha_to_seen'].mean()}")
print(f"mean levenshtein distance of TRB TPP2: {df_levenshtein_TPP2['min_levenshtein_cdr_beta_to_seen'].mean()}")
print(f"mean levenshtein distance of TRB TPP3: {df_levenshtein_TPP3['min_levenshtein_cdr_beta_to_seen'].mean()}")

GENE:
Each entry has a minimum levenshtein to the seen data. From this values, we take the mean.
mean levenshtein distance of TRA TPP2: 1.1970317297850563
mean levenshtein distance of TRA TPP3: 1.3406593406593406
mean levenshtein distance of TRB TPP2: 1.829324462640737
mean levenshtein distance of TRB TPP3: 2.142857142857143


In [47]:
# Write the gene table with the restored `task` column as a tab-separated
# file, without the index.
df_merged_gene.to_csv(
    "../../data/EDA/paired/paired_gene_levenshtein.tsv",
    sep="\t",
    index=False,
)