In [None]:
import pandas as pd
import numpy as np
import tidytcells as tt

# Run configuration and input loading.
# Every setting below is assigned only when the name is not already defined,
# so a value placed in the namespace before this cell runs takes precedence
# over the default (presumably set by a driver notebook/script -- confirm).
if 'precision' not in locals():
    precision = "gene"  # possible values are gene and allele

if 'custom_dataset_path' not in locals():
    custom_dataset_path = "../data/customDatasets/" + precision + "/"

# --- beta-chain inputs: VDJdb and McPAS-TCR are read tab-separated, IEDB with
# --- the read_csv default separator.
if 'vdjdb_beta_read_path' not in locals():
    vdjdb_beta_read_path = "../data/VDJdb/cleaned_data/vdjdb_cleaned_data_beta.tsv"
vdjdb_beta_df = pd.read_csv(vdjdb_beta_read_path, sep="\t")

if 'mcpastcr_beta_read_path' not in locals():
    mcpastcr_beta_read_path = "../data/McPAS-TCR/cleaned_data/mcpastcr_cleaned_data_beta.tsv"
mcpastcr_beta_df = pd.read_csv(mcpastcr_beta_read_path, sep="\t")

if 'iedb_beta_read_path' not in locals():
    iedb_beta_read_path = "../data/IEDB/cleaned_data/IEDB_cleaned_data_beta.csv"
iedb_beta_df = pd.read_csv(iedb_beta_read_path)

# --- paired-chain inputs, same three sources and separators as above.
if 'vdjdb_paired_read_path' not in locals():
    vdjdb_paired_read_path = "../data/VDJdb/cleaned_data/vdjdb_cleaned_data_paired.tsv"
vdjdb_paired_df = pd.read_csv(vdjdb_paired_read_path, sep="\t")

if 'mcpastcr_paired_read_path' not in locals():
    mcpastcr_paired_read_path = "../data/McPAS-TCR/cleaned_data/mcpastcr_cleaned_data_paired.tsv"
mcpastcr_paired_df = pd.read_csv(mcpastcr_paired_read_path, sep="\t")

if 'iedb_paired_read_path' not in locals():
    iedb_paired_read_path = "../data/IEDB/cleaned_data/IEDB_cleaned_data_paired.csv"
iedb_paired_df = pd.read_csv(iedb_paired_read_path)

# --- output file names; they are appended to custom_dataset_path when writing.
if 'output_file_beta' not in locals():
    output_file_beta = "beta_concatenated.tsv"

if 'output_file_paired' not in locals():
    output_file_paired = "paired_concatenated.tsv"

In [None]:
def standardize_mhc(val):
    """Standardize one MHC value with tidytcells.

    Strings are passed to ``tt.mh.standardize`` with species "homosapiens" and
    the notebook-level ``precision`` setting; any non-string input (e.g. a
    missing value) yields NaN.
    """
    if not isinstance(val, str):
        return np.nan
    return tt.mh.standardize(val, species="homosapiens", precision=precision)

def standardize_vj(val):
    """Standardize one TCR V/J gene value with tidytcells.

    Strings are passed to ``tt.tr.standardize`` (as ``gene=``) with species
    "homosapiens" and the notebook-level ``precision`` setting; any non-string
    input yields NaN.
    """
    if not isinstance(val, str):
        return np.nan
    return tt.tr.standardize(gene=val, species="homosapiens", precision=precision)

def standardize_cdr3(val):
    """Standardize one CDR3/junction sequence with tidytcells.

    Strings are passed to ``tt.junction.standardize`` (as ``seq=``); any
    non-string input yields NaN.
    """
    if not isinstance(val, str):
        return np.nan
    return tt.junction.standardize(seq=val)

In [None]:
# Columns that are not carried into the beta dataset (alpha-chain fields and
# sequence/construct metadata).
obsolete_columns = [
    "TRA_leader", "TRB_leader", "Linker", "Link_order",
    "TRA_5_prime_seq", "TRA_3_prime_seq", "TRB_5_prime_seq", "TRB_3_prime_seq",
    "Score", "MHC class", "TRAC", "TRAV", "TRAJ", "TRA_CDR3",
]

# Stack the three cleaned beta tables row-wise, then drop the unused columns.
beta_df = pd.concat(
    [vdjdb_beta_df, mcpastcr_beta_df, iedb_beta_df], axis=0
).drop(columns=obsolete_columns)
print(f"length of beta_df: {len(beta_df.index)}")

In [None]:
# info_score = number of non-null fields in a row, counted over every column
# currently in beta_df. Sorting by it (descending) puts the most complete rows
# first, which matters for the later drop_duplicates(keep="first").
beta_df["info_score"] = beta_df.notna().sum(axis="columns")
beta_df = beta_df.sort_values(by="info_score", ascending=False)

In [None]:
# Standardize each annotated column in place with its matching helper.
for _column, _standardizer in (
    ("MHC", standardize_mhc),
    ("TRBV", standardize_vj),
    ("TRBJ", standardize_vj),
    ("TRB_CDR3", standardize_cdr3),
):
    beta_df[_column] = beta_df[_column].apply(_standardizer)

# Rows whose TRB_CDR3 is null after standardization are discarded.
beta_df = beta_df.dropna(subset=["TRB_CDR3"])

In [None]:
# Two-pass de-duplication of the beta table:
#   1. collapse rows that agree on every column except TCR_name/info_score,
#      keeping the first (beta_df is sorted by info_score, best first);
#   2. remove *all* rows whose (TRB_CDR3, Epitope) key occurs more than once.
# The rows removed in pass 2 are collected in beta_df_removed_entries so that
# later cells can decide which of them to add back.
print("The following script removes a lot of rows. They are kept and some of them get added again later")
cultivated_columns = beta_df.columns.difference(["TCR_name", "info_score"]).tolist()
most_important_columns = ["TRB_CDR3", "Epitope"]

beta_df_distinct = beta_df.drop_duplicates(subset=cultivated_columns, keep="first")
_removed_by_distinct = len(beta_df.index) - len(beta_df_distinct.index)
print(f"distinct entries (all columns, keep=first). {_removed_by_distinct} entries removed.")

beta_df_no_duplicates = beta_df_distinct.drop_duplicates(subset=most_important_columns, keep=False)
_removed_by_key = len(beta_df_distinct.index) - len(beta_df_no_duplicates.index)
print(f"removed all duplicates (CDR3, Epitope) from distinct values (most_important_columns, keep=False). {_removed_by_key} entries removed.")

# Left join on all shared columns; rows flagged "left_only" exist in
# beta_df_distinct but not in beta_df_no_duplicates, i.e. they were dropped.
beta_df_removed_entries = beta_df_distinct.merge(beta_df_no_duplicates, how="left", indicator=True)
beta_df_removed_entries = beta_df_removed_entries.loc[beta_df_removed_entries["_merge"] == "left_only"]
print(f"beta removed entries df length: {len(beta_df_removed_entries.index)}")
print()
print()

# Sanity check: no (TRB_CDR3, Epitope) key may appear in both the removed rows
# and the kept rows.
beta_df_removed_entries = beta_df_removed_entries.drop(columns="_merge")
merged_df = beta_df_removed_entries.merge(
    beta_df_no_duplicates, on=most_important_columns, indicator=True, how="inner"
)
if not merged_df.empty:
    print(merged_df)  # show the offending rows
    print("There are identical rows between the two DataFrames.")
    raise Exception("ERROR: There shouldn't be identical rows")

In [None]:
def is_duplicate(base_row, compare_row, columns):
    """Tell whether ``compare_row`` adds nothing beyond ``base_row``.

    Args:
        base_row: mapping/Series indexed by column name (the reference row).
        compare_row: mapping/Series with the same columns plus an
            ``'is_duplicated'`` flag.
        columns: iterable of column names to compare.

    Returns:
        True if ``compare_row`` is already flagged as duplicated, or if every
        non-null value it holds in ``columns`` equals the value in
        ``base_row``. False as soon as a non-null value differs, including the
        case where ``base_row`` is null in that column.
    """
    # Truthiness instead of `is True`: a flag read from a pandas row is
    # typically numpy.bool_, for which an identity test against True is always
    # False. A missing flag is treated as "not flagged".
    already_flagged = compare_row['is_duplicated']
    if not pd.isna(already_flagged) and bool(already_flagged):
        return True

    for key in columns:
        compare_value = compare_row[key]
        # Null fields in compare_row carry no information and cannot conflict.
        if pd.isna(compare_value):
            continue
        if base_row[key] != compare_value:
            return False

    return True


In [None]:
# Decide which of the removed rows can be added back: within each
# (TRB_CDR3, Epitope) group a row survives unless is_duplicate() reports it as
# redundant with respect to an unflagged row of equal or higher info_score.
beta_df_removed_entries['is_duplicated'] = False

beta_df_removed_grouped = beta_df_removed_entries.groupby(most_important_columns)
number_of_groups = len(beta_df_removed_grouped)
print("Number of groups formed:", number_of_groups)

# Survivors are collected per group and concatenated once after the loop;
# concatenating inside the loop re-copies the accumulated frame every time.
# The empty frame stays first so the result has the expected columns even when
# no group exists.
kept_groups = [pd.DataFrame(columns=beta_df_removed_entries.columns)]

for name, group in beta_df_removed_grouped:
    # Highest info_score first; the fresh 0..n-1 index lets .at[k, ...] address
    # the k-th row directly.
    group = group.sort_values(by=['info_score'], ascending=False).reset_index(drop=True)
    group_size = len(group.index)

    for i in range(group_size - 1):
        if group.at[i, 'is_duplicated']:
            continue

        base_row = group.iloc[i]  # unchanged while scanning the rows below it
        for j in range(i + 1, group_size):
            if not group.at[j, 'is_duplicated'] and is_duplicate(base_row, group.iloc[j], cultivated_columns):
                group.at[j, 'is_duplicated'] = True

    kept_groups.append(group[group['is_duplicated'] == False])

duplicates_to_add = pd.concat(kept_groups)

In [None]:
# Report how many removed rows survived, and make sure no merge-indicator
# column is left on them (errors="ignore": it is fine if it is already gone).
_n_readd = len(duplicates_to_add.index)
print(f"{_n_readd} can be re-added to the no-duplicated dataframe")
duplicates_to_add = duplicates_to_add.drop(columns="_merge", errors="ignore")

In [None]:
def analyzer(description, df_plain, df_clean):
    """Print a row-count and mean-``info_score`` comparison of two frames.

    Args:
        description: label printed in the first line.
        df_plain: the frame before cleaning; must have an ``info_score`` column.
        df_clean: the frame after cleaning; must have an ``info_score`` column.

    Returns:
        None. Four lines are written to stdout.
    """
    # Single quotes inside the replacement fields: reusing the enclosing double
    # quote inside an f-string is a SyntaxError on Python versions before 3.12.
    print(f"for {description}:")
    print(f"size difference is: {len(df_plain.index)-len(df_clean.index)}")
    print(f"  {len(df_clean.index)} information score cleaned: {df_clean['info_score'].mean()}")
    print(f"  {len(df_plain.index)} information score dropout: {df_plain['info_score'].mean()}")

In [None]:
# Integrity check: a row that is about to be re-added must not match a row of
# beta_df_no_duplicates on all cultivated columns.
merged_df = beta_df_no_duplicates.merge(
    duplicates_to_add, on=cultivated_columns, indicator=True, how="inner"
)

# Any overlap is treated as a hard error.
if not merged_df.empty:
    print(merged_df)  # show the offending rows
    print("There are identical rows between the two DataFrames.")
    raise Exception("ERROR: There shouldn't be identical rows")

# Final beta table = unique rows + re-added survivors.
final_beta_df = pd.concat([beta_df_no_duplicates, duplicates_to_add])
_n_plain_beta = len(beta_df.index)
print(f"from the plain dataset which has {_n_plain_beta} entries, {_n_plain_beta-len(final_beta_df.index)} entries have been removed.")

In [None]:
# Compare the standardized-but-undeduplicated table with the final one.
analyzer(description="beta dataset ", df_plain=beta_df, df_clean=final_beta_df)

In [None]:
# Renumber TCR_name 1..N, mark every row as a binder, and drop the helper
# score now that de-duplication is done.
# NOTE(review): the 'is_duplicated' helper column carried in via
# duplicates_to_add is not dropped anywhere in this notebook, so it ends up in
# the written file -- confirm whether that is intended.
final_beta_df = final_beta_df.assign(
    TCR_name=range(1, len(final_beta_df) + 1),
    Binding=1,
).drop(columns="info_score")
print(f"final_beta_df length = {len(final_beta_df)}")

In [None]:
# Bare expression: shows the final beta table as this cell's output for a visual check.
final_beta_df

In [None]:
# Write the beta dataset as TSV without the DataFrame index.
_beta_output_path = custom_dataset_path + output_file_beta
final_beta_df.to_csv(_beta_output_path, sep="\t", index=False)

-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------

From here on, the same concatenation and de-duplication steps are applied to the paired (TRA + TRB) datasets.

In [None]:
# Columns that are not carried into the paired dataset (sequence/construct
# metadata only; the alpha-chain fields are kept here).
obsolete_columns = [
    "TRA_leader", "TRB_leader", "Linker", "Link_order",
    "TRA_5_prime_seq", "TRA_3_prime_seq", "TRB_5_prime_seq", "TRB_3_prime_seq",
    "Score", "MHC class",
]

# Stack the three cleaned paired tables row-wise, then drop the unused columns.
paired_df = pd.concat(
    [vdjdb_paired_df, mcpastcr_paired_df, iedb_paired_df], axis=0
).drop(columns=obsolete_columns)
print(f"length of paired_df: {len(paired_df.index)}")

In [None]:
# info_score = number of non-null fields in a row, counted over every column
# currently in paired_df. Sorting by it (descending) puts the most complete
# rows first, which matters for the later drop_duplicates(keep="first").
paired_df["info_score"] = paired_df.notna().sum(axis="columns")
paired_df = paired_df.sort_values(by="info_score", ascending=False)

In [None]:
# Standardize each annotated column in place with its matching helper.
for _column, _standardizer in (
    ("MHC", standardize_mhc),
    ("TRAV", standardize_vj),
    ("TRBV", standardize_vj),
    ("TRAJ", standardize_vj),
    ("TRBJ", standardize_vj),
    ("TRA_CDR3", standardize_cdr3),
    ("TRB_CDR3", standardize_cdr3),
):
    paired_df[_column] = paired_df[_column].apply(_standardizer)

# A paired entry needs both CDR3 sequences; rows where either is null after
# standardization are discarded.
paired_df = paired_df.dropna(subset=["TRA_CDR3", "TRB_CDR3"])

In [None]:
# Two-pass de-duplication of the paired table:
#   1. collapse rows that agree on every column except TCR_name/info_score,
#      keeping the first (paired_df is sorted by info_score, best first);
#   2. remove *all* rows whose (TRA_CDR3, TRB_CDR3, Epitope) key occurs more
#      than once.
# The rows removed in pass 2 are collected in paired_df_removed_entries so that
# later cells can decide which of them to add back.
# Note: cultivated_columns and most_important_columns are rebound here,
# replacing the values used in the beta section.
print(f"The following script removes a lot of rows. They are kept and some of them get added again later")
cultivated_columns = paired_df.columns.difference(["TCR_name", "info_score"]).tolist()
most_important_columns = ["TRA_CDR3", "TRB_CDR3", "Epitope"]

paired_df_distinct = paired_df.drop_duplicates(subset=cultivated_columns, keep="first")
print(f"distinct entries (all columns, keep=first). {len(paired_df.index)-len(paired_df_distinct.index)} entries removed.")

paired_df_no_duplicates = paired_df_distinct.drop_duplicates(subset=most_important_columns, keep=False)
# The message names most_important_columns because that is the subset used by
# the keep=False pass above (it previously said "cultivated columns").
print(f"removed all duplicates (TRA_CDR3, TRB_CDR3, Epitope) from distinct values (most_important_columns, keep=False). {len(paired_df_distinct.index)-len(paired_df_no_duplicates.index)} entries removed.")

# Left join on all shared columns; rows flagged "left_only" exist in
# paired_df_distinct but not in paired_df_no_duplicates, i.e. they were dropped.
paired_df_removed_entries = pd.merge(paired_df_distinct, paired_df_no_duplicates, how="left", indicator=True)
paired_df_removed_entries = paired_df_removed_entries[paired_df_removed_entries['_merge'] == 'left_only']
print(f"paired removed entries df length: {len(paired_df_removed_entries.index)}")
print()
print()

# Sanity check: no (TRA_CDR3, TRB_CDR3, Epitope) key may appear in both the
# removed rows and the kept rows.
paired_df_removed_entries = paired_df_removed_entries.drop(columns="_merge")
merged_df = pd.merge(paired_df_removed_entries, paired_df_no_duplicates, on=most_important_columns, indicator=True, how='inner')
if not merged_df.empty:
    print(merged_df)  # show the offending rows
    print("There are identical rows between the two DataFrames.")
    raise Exception("ERROR: There shouldn't be identical rows")



In [None]:
# Decide which of the removed rows can be added back: within each
# (TRA_CDR3, TRB_CDR3, Epitope) group a row survives unless is_duplicate()
# reports it as redundant with respect to an unflagged row of equal or higher
# info_score.
paired_df_removed_entries['is_duplicated'] = False

paired_df_removed_grouped = paired_df_removed_entries.groupby(most_important_columns)

# Survivors are collected per group and concatenated once after the loop;
# concatenating inside the loop re-copies the accumulated frame every time.
# The empty frame stays first so the result has the expected columns even when
# no group exists.
kept_groups = [pd.DataFrame(columns=paired_df_removed_entries.columns)]

for name, group in paired_df_removed_grouped:
    # Highest info_score first; the fresh 0..n-1 index lets .at[k, ...] address
    # the k-th row directly.
    group = group.sort_values(by=['info_score'], ascending=False).reset_index(drop=True)
    group_size = len(group.index)

    # Compare every pair (i, j) with i < j, *including* the last row of the
    # group. The earlier bounds (size-2 / size-1) never examined the last row,
    # so it was always re-added even when redundant; the bounds now match the
    # beta-section loop.
    for i in range(group_size - 1):
        if group.at[i, 'is_duplicated']:
            continue

        base_row = group.iloc[i]  # unchanged while scanning the rows below it
        for j in range(i + 1, group_size):
            if not group.at[j, 'is_duplicated'] and is_duplicate(base_row, group.iloc[j], cultivated_columns):
                group.at[j, 'is_duplicated'] = True

    kept_groups.append(group[group['is_duplicated'] == False])

duplicates_to_add = pd.concat(kept_groups)

In [None]:

# Disabled earlier approach, kept inside a bare string literal so none of it is
# executed: it re-added every removed row whose info_score equals the maximum
# of its group. Superseded by the is_duplicate()-based loop.
'''
# Now I want to re-add some of the duplicated elements which have the highest info_score.
paired_df_removed_grouped = paired_df_removed_entries.groupby(most_important_columns)
duplicates_to_add = pd.DataFrame(columns=paired_df_removed_entries.columns) #  create empty dataframe

for name, group in paired_df_removed_grouped:
    highest_info_score = group['info_score'].max()

    for index, row in group.iterrows():
      removed = True
      if row["info_score"] == highest_info_score:
        removed = False
        row = pd.DataFrame([row])
        duplicates_to_add = pd.concat([duplicates_to_add, row], ignore_index=True)

print(f"{len(duplicates_to_add.index)} will be re-added to the no-duplicates dataframe")
'''


In [None]:
# Report how many removed rows survived, and make sure no merge-indicator
# column is left on them (errors="ignore": it is fine if it is already gone).
_n_readd = len(duplicates_to_add.index)
print(f"{_n_readd} can be re-added to the no-duplicated dataframe")
duplicates_to_add = duplicates_to_add.drop(columns="_merge", errors="ignore")

In [None]:
# Integrity check: a row that is about to be re-added must not share its
# (TRA_CDR3, TRB_CDR3, Epitope) key with any row of paired_df_no_duplicates.
merged_df = paired_df_no_duplicates.merge(
    duplicates_to_add, on=most_important_columns, indicator=True, how="inner"
)

# Any overlap is treated as a hard error.
if not merged_df.empty:
    print(merged_df)  # show the offending rows
    print("There are identical rows between the two DataFrames.")
    raise Exception("ERROR: There shouldn't be identical rows")

# Final paired table = unique rows + re-added survivors.
final_paired_df = pd.concat([paired_df_no_duplicates, duplicates_to_add])
_n_plain_paired = len(paired_df.index)
print(f"from the plain dataset which has {_n_plain_paired} entries, {_n_plain_paired-len(final_paired_df.index)} entries have been removed.")

In [None]:
# Compare the standardized-but-undeduplicated table with the final one.
analyzer(description="paired dataset", df_plain=paired_df, df_clean=final_paired_df)

In [None]:
# Renumber TCR_name 1..N, mark every row as a binder, and drop the helper
# score now that de-duplication is done.
# NOTE(review): the 'is_duplicated' helper column carried in via
# duplicates_to_add is not dropped anywhere in this notebook, so it ends up in
# the written file -- confirm whether that is intended.
final_paired_df = final_paired_df.assign(
    TCR_name=range(1, len(final_paired_df) + 1),
    Binding=1,
).drop(columns="info_score")
print(f"final_paired_df length: {len(final_paired_df)}")

In [None]:
# Write the paired dataset as TSV without the DataFrame index.
_paired_output_path = custom_dataset_path + output_file_paired
final_paired_df.to_csv(_paired_output_path, sep="\t", index=False)