In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
# Default input/output locations. Each name is only assigned when it is not
# already defined in the current namespace, so pre-set values are respected.
# NOTE(review): the guards presumably let a calling notebook or script inject
# these parameters before this cell runs — confirm.

# Tab-separated table produced by the fitting step (input of this notebook).
if 'fitted_input_file' not in locals():
    fitted_input_file = "../../data/McPAS-TCR/fitted_data/mcpastcr_fitted_data_both.tsv"

# Directory that receives the cleaned output tables.
if 'path_prefix_cleaned' not in locals():
    path_prefix_cleaned = "../../data/McPAS-TCR/cleaned_data"

# Output file name for rows that carry both a TRA and a TRB CDR3.
if 'cleaned_file_paired' not in locals():
    cleaned_file_paired = "mcpastcr_cleaned_data_paired.tsv"

# Output file name for rows that carry a TRB CDR3.
if 'cleaned_file_beta' not in locals():
    cleaned_file_beta = "mcpastcr_cleaned_data_beta.tsv"

In [None]:
# Read the tab-separated fitted table from `fitted_input_file` and display it.
mcpastcr_fitted_both_df = pd.read_csv(
    fitted_input_file,
    sep="\t",
)
mcpastcr_fitted_both_df

In [None]:
# Show the column names of the fitted input table.
mcpastcr_fitted_both_df.columns

In [None]:
# Start the cleaned table from an independent copy of the fitted table.
# A plain assignment would only alias the same object, so the placeholder
# columns added to the cleaned frame afterwards would also appear in
# `mcpastcr_fitted_both_df`.
mcpastcr_cleaned_both_df = mcpastcr_fitted_both_df.copy()

In [None]:
# Columns that are created (or overwritten) with NaN placeholders.
new_columns = [
    "TCR_name",
    "TRAC", "TRBC",
    "TRA_leader", "TRB_leader",
    "Linker", "Link_order",
    "TRA_5_prime_seq", "TRA_3_prime_seq",
    "TRB_5_prime_seq", "TRB_3_prime_seq",
]
for placeholder_column in new_columns:
    mcpastcr_cleaned_both_df[placeholder_column] = np.nan

# Column order of the output template; every other column is dropped by the
# selection below.
template_columns_order = [
    "TCR_name",
    "TRAV", "TRAJ", "TRA_CDR3",
    "TRBV", "TRBJ", "TRB_CDR3",
    "TRAC", "TRBC",
    "TRA_leader", "TRB_leader",
    "Linker", "Link_order",
    "TRA_5_prime_seq", "TRA_3_prime_seq",
    "TRB_5_prime_seq", "TRB_3_prime_seq",
]
mcpastcr_cleaned_both_df = mcpastcr_cleaned_both_df.loc[:, template_columns_order]

mcpastcr_cleaned_both_df.columns

In [None]:
# Attach the epitope of each row from the fitted table (aligned on the index).
# `assign` replaces an existing "Epitope" column instead of appending a second
# one, so re-running this cell does not create duplicate columns.
mcpastcr_cleaned_both_df = mcpastcr_cleaned_both_df.assign(
    Epitope=mcpastcr_fitted_both_df["Epitope"]
)
mcpastcr_cleaned_both_df.columns


In [None]:
# print(mcpastcr_fitted_both_df["T.Cell.Type"].to_string())

In [None]:
# Translate the "T.Cell.Type" annotation into an MHC class label:
# "CD8" -> "MHCI", "CD4" -> "MHCII".
value_mapping = {
    "CD8": "MHCI",
    "CD4": "MHCII",
    # Identity entries keep this cell idempotent: without them, running the
    # cell a second time would map the already-converted labels to NaN.
    "MHCI": "MHCI",
    "MHCII": "MHCII",
}

# Series.map with a dict turns every value that is not a key of the dict into
# NaN, so no separate clean-up of unexpected values is needed.
# na_action='ignore' propagates existing NaN values without looking them up.
mcpastcr_fitted_both_df["T.Cell.Type"] = mcpastcr_fitted_both_df["T.Cell.Type"].map(
    value_mapping, na_action='ignore'
)
mcpastcr_fitted_both_df.columns

In [None]:
# Add the remaining output columns.
mcpastcr_cleaned_both_df["Score"] = np.nan  # placeholder, no score available here
mcpastcr_cleaned_both_df["TCR_name"] = range(1, len(mcpastcr_cleaned_both_df) + 1)  # running number as TCR_name
mcpastcr_cleaned_both_df["MHC"] = mcpastcr_fitted_both_df["MHC"]
mcpastcr_cleaned_both_df["MHC class"] = mcpastcr_fitted_both_df["T.Cell.Type"]

# Rows annotated as MHC class I; computed once and reused for the report and the filter.
is_mhc_class_one = mcpastcr_cleaned_both_df["MHC class"] == "MHCI"

print(f"MHC Class I has {int(is_mhc_class_one.sum())} entries")
print(f"whole dataframe has {len(mcpastcr_cleaned_both_df.index)} entries")

# Keep only MHC class I rows. The explicit copy makes the result an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
mcpastcr_cleaned_both_df = mcpastcr_cleaned_both_df[is_mhc_class_one].copy()
print(f"filtered to only use MHC Class I. Length of dataset: {len(mcpastcr_cleaned_both_df.index)}")

In [None]:
def convert_region_format(entry):
    """Replace every ':' in a string entry with '*'.

    Parameters
    ----------
    entry : object
        A single cell value.

    Returns
    -------
    object
        The string with all ':' replaced by '*' when `entry` is a `str`;
        otherwise `entry` itself, unchanged (NaN, None, numbers, ...).
    """
    # A missing value is never a str, so the type check alone covers NaN/None.
    # Checking the type first also avoids calling pd.isna on list-like entries,
    # which yields an array whose truth value is ambiguous.
    if not isinstance(entry, str):
        return entry

    # str.replace is a no-op when ':' is absent, so no membership test is needed.
    return entry.replace(':', '*')


In [None]:
# Run convert_region_format over every value of the V/J gene columns.
region_columns = ['TRAV', 'TRAJ', 'TRBV', 'TRBJ']
for column in region_columns:
    converted_values = mcpastcr_cleaned_both_df[column].map(convert_region_format)
    mcpastcr_cleaned_both_df[column] = converted_values

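**TODO (check this):** the next cells drop every row in which `TRAV`, `TRBV`, `TRAJ` or `TRBJ` starts with a lowercase `m`. Confirm that this filter is intended before relying on the result.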

In [None]:
def starts_with_m(df, columns):
    """Flag rows in which any of the given columns starts with 'm'.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    columns : iterable of str
        Names of the columns of `df` to check.

    Returns
    -------
    pandas.Series
        Boolean Series indexed like `df`; True where the string form of at
        least one checked value starts with "m". Missing values never match.
    """
    # Seed the mask on df's own index. A default RangeIndex would be aligned
    # against df.index by `|`, producing the union of both indexes whenever df
    # has been filtered beforehand.
    mask = pd.Series(False, index=df.index)
    for column in columns:
        column_mask = df[column].map(
            lambda entry: pd.notna(entry) and str(entry).startswith("m")
        ).astype(bool)
        mask = mask | column_mask
    return mask

# Build the row mask over the four V/J gene columns; it is used to drop rows
# in the following cell.
columns_to_check = ["TRAV", "TRBV", "TRAJ", "TRBJ"]
mask = starts_with_m(df=mcpastcr_cleaned_both_df, columns=columns_to_check)

In [None]:
# Drop the rows flagged by `mask`. The explicit copy makes the result an
# independent frame, so the `.loc[:, column] = ...` assignments in the
# following cells do not raise SettingWithCopyWarning.
mcpastcr_cleaned_both_df = mcpastcr_cleaned_both_df[~mask].copy()

In [None]:
# Gene columns to clean.
columns_to_update = ["TRAV", "TRBV", "TRAJ", "TRBJ"]

# Remove "/D" together with all following characters up to (not including)
# the next '*', or up to the end of the string when there is no '*'.
for gene_column in columns_to_update:
    without_d_part = mcpastcr_cleaned_both_df[gene_column].str.replace(
        r"\/D[^*]*", "", regex=True
    )
    mcpastcr_cleaned_both_df.loc[:, gene_column] = without_d_part


In [None]:
# Gene columns to clean.
columns_to_update = ["TRBV", "TRBJ", "TRAV", "TRAJ"]

# Cut each value at the first ',' or '/': the separator and everything after
# it are removed.
for gene_column in columns_to_update:
    first_part_only = mcpastcr_cleaned_both_df[gene_column].str.replace(
        r"[,\/].*", "", regex=True
    )
    mcpastcr_cleaned_both_df.loc[:, gene_column] = first_part_only


In [None]:
# Display the cleaned table after the gene-name clean-up steps.
mcpastcr_cleaned_both_df

In [None]:
# Presence flags for the two CDR3 columns.
has_alpha_cdr3 = mcpastcr_cleaned_both_df["TRA_CDR3"].notna()
has_beta_cdr3 = mcpastcr_cleaned_both_df["TRB_CDR3"].notna()

# Paired rows: both TRA_CDR3 and TRB_CDR3 are present.
paired_condition = has_alpha_cdr3 & has_beta_cdr3

# Beta rows: TRB_CDR3 is present, whether or not TRA_CDR3 is
# (so the paired rows are included here as well).
beta_only_condition = has_beta_cdr3

# Split the cleaned table according to the two conditions.
mcpastcr_cleaned_paired_df = mcpastcr_cleaned_both_df.loc[paired_condition]
mcpastcr_cleaned_beta_df = mcpastcr_cleaned_both_df.loc[beta_only_condition]


In [None]:
# Display the rows that have both a TRA_CDR3 and a TRB_CDR3.
mcpastcr_cleaned_paired_df

In [None]:
# Display the rows that have a TRB_CDR3.
mcpastcr_cleaned_beta_df

In [None]:
# Create the output directory if it is missing; to_csv does not create parent
# directories and would fail otherwise.
os.makedirs(path_prefix_cleaned, exist_ok=True)

# Write both tables as tab-separated files without the index column.
mcpastcr_cleaned_paired_df.to_csv(f"{path_prefix_cleaned}/{cleaned_file_paired}", sep="\t", index=False)
mcpastcr_cleaned_beta_df.to_csv(f"{path_prefix_cleaned}/{cleaned_file_beta}", sep="\t", index=False)