In [None]:
import os

import numpy as np
import pandas as pd

# Notebook parameters. A default is only assigned when the name is not already
# defined, so values set before this cell runs are kept.
if 'precision' not in locals():
  precision = "gene"  # allele or gene

# Input table and output location both depend on the chosen precision.
if 'input_file' not in locals():
  input_file = f"../../data/customDatasets/{precision}/paired_concatenated.tsv"

if 'paired_output_folder' not in locals():
  paired_output_folder = f"../../data/splitted_data/{precision}/paired"

# File names of the three splits written at the end of the notebook.
if 'validation_file_name' not in locals():
  validation_file_name = "validation.tsv"

if 'test_file_name' not in locals():
  test_file_name = "test.tsv"

if 'train_file_name' not in locals():
  train_file_name = "train.tsv"

# Target share of the data that should end up in the test split.
if 'aimed_test_ratio' not in locals():
  aimed_test_ratio = 0.3

# Load the tab-separated input table.
df = pd.read_csv(input_file, sep='\t', low_memory=False)

First, the data entries (without negative data) are analysed.

In [None]:
# Name of the helper column that identifies a paired TCR by both CDR3 values.
tcr_key = "tcr_key"

# astype(str) turns missing CDR3 values into text so the concatenation cannot fail.
df[tcr_key] = df['TRA_CDR3'].astype(str) + '_' + df['TRB_CDR3'].astype(str)

# "distinct" keeps one row per TCR key; "unique" keeps only rows whose value
# occurs exactly once in the whole table (keep=False drops every duplicate).
distinct_tcrs = df.drop_duplicates(subset=[tcr_key], keep="first")
unique_epitopes = df.drop_duplicates(subset=["Epitope"], keep=False)
unique_tcrs = df.drop_duplicates(subset=[tcr_key], keep=False)

print(f"distinct tcr's: {len(distinct_tcrs)} from {len(df)}")
print(f"unique tcr's: {len(unique_tcrs)} from {len(df)}")
# Single quotes inside the f-string: reusing the outer quote character is a
# SyntaxError on Python versions before 3.12.
print(f"unique epitopes: {len(unique_epitopes['Epitope'])} from {len(df)}")

Now a train and a test set are created. The test set should consist only of TPP2 and TPP3 tasks (TPP => TCR–Peptide/Epitope Pairing).
TPP2 means the epitope is seen in training but the TCR is unseen.
TPP3 means neither the TCR nor the epitope is seen in training.

In [None]:
# Rows whose TCR key occurs more than once stay in train; rows whose TCR key
# occurs exactly once (unique_tcrs) form the initial test set. The duplicated()
# mask selects the train rows directly instead of anti-joining on every column.
# .copy() makes df_train an independent frame so the column assignment below
# does not write into a slice of df.
df_train = df[df.duplicated(subset=[tcr_key], keep=False)].copy()
df_train["task"] = ""
train_epitopes = set(df_train["Epitope"])

# Every test TCR is unseen in train by construction, so only the epitope
# decides between TPP2 (epitope seen in train) and TPP3 (epitope unseen).
df_test = unique_tcrs.copy()
df_test["task"] = np.where(df_test["Epitope"].isin(train_epitopes), "TPP2", "TPP3")

number_of_TPP3 = (df_test['task'] == 'TPP3').sum()
number_of_TPP2 = (df_test['task'] == 'TPP2').sum()
number_of_TPP1 = (df_test['task'] == 'TPP1').sum()
test_ratio = len(df_test)/(len(df_test) + len(df_train))

print(f"train data has {len(df_train)} entries")
print(f"test data has {len(df_test)} entries")
print(f"test data has {number_of_TPP1} TPP1 tasks (seen tcr & seen epitopes).")
print(f"test data has {number_of_TPP2} TPP2 tasks (unseen tcr & seen epitopes).")
print(f"test data has {number_of_TPP3} TPP3 tasks (unseen tcr & unseen epitope).")
print(f"the train/test ratio is {(1-test_ratio)}/{test_ratio}")

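If the test ratio is below 0.3, we fill up the test data with TPP1 tasks (seen TCR & seen epitope). If the ratio is higher than 0.3, surplus TPP2 entries are moved back to the training set; an exception is thrown if there are not enough TPP2 entries to move, to prevent working with unbalanced data.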

In [None]:
def tpp1_from_train_to_test(amount, df_train, df_test, key_column="tcr_key"):
  """Move `amount` rows from df_train to df_test and label them as TPP1.

  A row qualifies only while both its Epitope and its TCR key still occur more
  than once among the remaining training rows, so every moved epitope and TCR
  stays represented in train. Rows are taken in df_train order.

  Args:
    amount: number of rows to move; values <= 0 move nothing.
    df_train: training frame with 'Epitope', 'task' and `key_column` columns.
    df_test: test frame with a 'task' column.
    key_column: name of the column that holds the TCR key.

  Returns:
    Tuple (df_train, df_test) after the move. The frames passed in are not
    modified.

  Raises:
    Exception: if fewer than `amount` rows qualify.
  """
  number_of_TPP1 = (df_test['task'] == 'TPP1').sum()
  number_of_TPP2 = (df_test['task'] == 'TPP2').sum()
  print(f"{amount} entries from train will be moved to test (TPP1)")
  print(f"df_train size before: {len(df_train.index)}")
  print(f"number of tpp1 before: {number_of_TPP1}")
  print(f"number of tpp2 before: {number_of_TPP2}")

  if amount > 0:
    # Occurrence counts are taken once and decremented as rows are picked.
    # Counts only ever shrink, so a row that did not qualify earlier can never
    # qualify later; one pass in row order therefore picks the same rows as
    # repeatedly taking the first qualifying row. value_counts() leaves out
    # missing values, so such rows get a count of 0 and never qualify.
    epitope_counts = df_train['Epitope'].value_counts().to_dict()
    tcr_counts = df_train[key_column].value_counts().to_dict()

    picked_positions = []
    for position, (epitope, tcr) in enumerate(zip(df_train['Epitope'], df_train[key_column])):
      if len(picked_positions) == amount:
        break
      if epitope_counts.get(epitope, 0) > 1 and tcr_counts.get(tcr, 0) > 1:
        picked_positions.append(position)
        epitope_counts[epitope] -= 1
        tcr_counts[tcr] -= 1

    if len(picked_positions) < amount:
      raise Exception(
        f"Only {len(picked_positions)} of the requested {amount} rows in df_train "
        "have an epitope and a TCR that both still occur in another training row."
      )

    # Positional selection keeps this correct even if index labels repeat.
    keep_mask = np.ones(len(df_train), dtype=bool)
    keep_mask[picked_positions] = False

    moved_rows = df_train.iloc[picked_positions].copy()
    moved_rows['task'] = "TPP1"

    # One concat for all moved rows instead of one per row.
    df_test = pd.concat([df_test, moved_rows], ignore_index=True)
    df_train = df_train.loc[keep_mask].copy()

  number_of_TPP1 = (df_test['task'] == 'TPP1').sum()
  number_of_TPP2 = (df_test['task'] == 'TPP2').sum()
  print(f"df_train size after: {len(df_train.index)}")
  print(f"number of tpp1 after: {number_of_TPP1}")
  print(f"number of tpp2 after: {number_of_TPP2}")
  return df_train, df_test

In [None]:
import math

# Size of both splits together; turns the ratio gap into a number of rows.
total_entries = len(df_test) + len(df_train)

if test_ratio > aimed_test_ratio:
  # Test split is too large: hand the first surplus TPP2 rows back to train.
  abundant_test_data_count = math.ceil((test_ratio - aimed_test_ratio) * total_entries)
  print(f"{abundant_test_data_count} entries will be shifted from test to train so the train/test ratio can be {1-aimed_test_ratio}/{aimed_test_ratio}")
  filtered_rows = df_test.loc[df_test["task"] == "TPP2"]
  if abundant_test_data_count > len(filtered_rows):
    raise ValueError("Not enough entries with 'task' == 'TPP2' to move.")
  rows_to_move = filtered_rows.iloc[:abundant_test_data_count]
  df_train = pd.concat([df_train, rows_to_move], ignore_index=True)
  df_test = df_test.drop(index=rows_to_move.index)
elif test_ratio < aimed_test_ratio:
  # Test split is too small: top it up with TPP1 rows taken from train.
  missing_test_data_count = math.ceil((aimed_test_ratio - test_ratio) * total_entries)
  print(f"{missing_test_data_count} entries need to be shifted from train to test so the train/test ratio can be {1-aimed_test_ratio}/{aimed_test_ratio}")
  df_train, df_test = tpp1_from_train_to_test(missing_test_data_count, df_train, df_test)
    

The test data should have the same number of TPP1 and TPP2 tasks. There is not enough data to have a proper TPP3 share, so we just take as many as we can without removing information from the training set.

In [None]:
# recalculate TPP counts
number_of_TPP3 = (df_test['task'] == 'TPP3').sum()
number_of_TPP2 = (df_test['task'] == 'TPP2').sum()
number_of_TPP1 = (df_test['task'] == 'TPP1').sum()

aimed_tpp1_ratio = 0.5
tpp1_tpp2_total = number_of_TPP1 + number_of_TPP2
# With no TPP1/TPP2 rows there is nothing to balance; treating the ratio as
# already on target avoids a 0/0 division.
tpp1_ratio = number_of_TPP1/tpp1_tpp2_total if tpp1_tpp2_total > 0 else aimed_tpp1_ratio

# Both counts are clamped at zero. A negative value only means that side is
# already within its share, and DataFrame.head() with a negative n would
# select all but the last |n| rows and move them to train.
missing_tpp1_count = int(max(0, math.ceil((aimed_tpp1_ratio - tpp1_ratio) * tpp1_tpp2_total)))
abundant_tpp2_count = int(max(0, number_of_TPP2 - math.ceil((1-aimed_tpp1_ratio) * tpp1_tpp2_total)))

# move surplus TPP2 entries back to train
print(f"{abundant_tpp2_count} entries will be shifted from test to train so the tpp1/tpp2 ratio can be {aimed_tpp1_ratio}/{1-aimed_tpp1_ratio}")
# Filter and select rows
filtered_rows = df_test[df_test["task"] == "TPP2"]
if len(filtered_rows) < abundant_tpp2_count:
    raise ValueError("Not enough entries with 'task' == 'TPP2' to move.")
rows_to_move = filtered_rows.head(abundant_tpp2_count)
# Append to df_train
df_train = pd.concat([df_train, rows_to_move], ignore_index=True)
# Remove from df_test
df_test = df_test.drop(rows_to_move.index)

# get TPP1 tasks from train and move them to test
print(f"{missing_tpp1_count} entries need to be shifted from train to test so the tpp1/tpp2 ratio can be {aimed_tpp1_ratio}/{1-aimed_tpp1_ratio}")
df_train, df_test = tpp1_from_train_to_test(missing_tpp1_count, df_train, df_test)

In [None]:
# Per-task counts and train/test ratio after rebalancing.
test_tasks = df_test['task']
number_of_TPP1 = test_tasks.eq('TPP1').sum()
number_of_TPP2 = test_tasks.eq('TPP2').sum()
number_of_TPP3 = test_tasks.eq('TPP3').sum()
test_ratio = len(df_test)/(len(df_test) + len(df_train))

print(f"train data has {len(df_train)} entries")
print(f"test data has {len(df_test)} entries")
print(f"test data has {number_of_TPP1} TPP1 tasks (seen tcr & seen epitopes).")
print(f"test data has {number_of_TPP2} TPP2 tasks (unseen tcr & seen epitopes).")
print(f"test data has {number_of_TPP3} TPP3 tasks (unseen tcr & unseen epitope).")
print(f"the train/test ratio is {(1-test_ratio)}/{test_ratio}")

# Remove the helper columns; errors='ignore' tolerates a frame that lacks one.
helper_columns = ["_merge", "tcr_key"]
df_test = df_test.drop(columns=helper_columns, errors='ignore')
df_train = df_train.drop(columns=helper_columns, errors='ignore')

In [None]:
# Function to verify each row
def calculate_task(row, known_epitopes, known_tcr):
    """Classify one row as TPP1, TPP2 or TPP3.

    Args:
        row: mapping-like row providing 'Epitope', 'TRA_CDR3' and 'TRB_CDR3'.
        known_epitopes: collection of epitopes counted as seen.
        known_tcr: collection of TCR keys ("<TRA_CDR3>_<TRB_CDR3>") counted as seen.

    Returns:
        'TPP1' if epitope and TCR are both seen, 'TPP2' if only the epitope is
        seen, 'TPP3' if neither is seen.

    Raises:
        Exception: if the TCR is seen but the epitope is not.
    """
    epitope_exists = row['Epitope'] in known_epitopes
    # str() mirrors the astype(str) used where this notebook builds the TCR key
    # column, so a missing CDR3 value gives the same key text instead of a
    # TypeError. The key is kept local; the row itself is not modified.
    row_tcr_key = str(row['TRA_CDR3']) + '_' + str(row['TRB_CDR3'])
    cdr3_exists = row_tcr_key in known_tcr

    if epitope_exists:
        return 'TPP1' if cdr3_exists else 'TPP2'
    if not cdr3_exists:
        return 'TPP3'
    raise Exception("Something seems wrong")  # seen TCR with unseen epitope is not expected

In [None]:
from sklearn.utils import shuffle


def _shuffle_and_halve(frame):
  """Shuffle `frame` with a fixed seed and cut it at the midpoint.

  Returns the shuffled frame, its first half and the remaining rows; the
  remainder holds the extra row when the length is odd.
  """
  shuffled = shuffle(frame, random_state=42)
  midpoint = len(shuffled) // 2
  return shuffled, shuffled[:midpoint], shuffled[midpoint:]


# Each task type is split separately so validation and test get the same mix.
tpp1_df, val_tpp1, test_tpp1 = _shuffle_and_halve(df_test[df_test['task'] == 'TPP1'])
tpp2_df, val_tpp2, test_tpp2 = _shuffle_and_halve(df_test[df_test['task'] == 'TPP2'])
tpp3_df, val_tpp3, test_tpp3 = _shuffle_and_halve(df_test[df_test['task'] == 'TPP3'])

df_validation = pd.concat([val_tpp1, val_tpp2, val_tpp3])
df_test = pd.concat([test_tpp1, test_tpp2, test_tpp3])

# Re-label the test rows against everything that is now in train or validation.
df_train_val = pd.concat([df_train, df_validation])
df_train_val[tcr_key] = df_train_val['TRA_CDR3'].astype(str) + '_' + df_train_val['TRB_CDR3'].astype(str)

seen_epitopes = set(df_train_val["Epitope"])
seen_tcr = set(df_train_val[tcr_key])
df_test["task"] = df_test.apply(calculate_task, axis=1, args=(seen_epitopes, seen_tcr))

In [None]:
# Summary of the final train/validation/test split.
final_test_tasks = df_test['task']
number_of_TPP1 = final_test_tasks.eq('TPP1').sum()
number_of_TPP2 = final_test_tasks.eq('TPP2').sum()
number_of_TPP3 = final_test_tasks.eq('TPP3').sum()

# Ratios are relative to all three splits together.
all_entries = len(df_test) + len(df_train) + len(df_validation)
test_ratio = len(df_test)/all_entries
validation_ratio = len(df_validation)/all_entries

print(f"test data has {len(df_test)} entries")
print(f"validation data has {len(df_validation)} entries")
print(f"train data has {len(df_train)} entries")
print(f"test data has {number_of_TPP1} TPP1 tasks (seen tcr & seen epitopes).")
print(f"test data has {number_of_TPP2} TPP2 tasks (unseen tcr & seen epitopes).")
print(f"test data has {number_of_TPP3} TPP3 tasks (unseen tcr & unseen epitope).")
print(f"the test ratio is {(1-test_ratio)}/{test_ratio}")
print(f"the validation ratio is {(1-validation_ratio)}/{validation_ratio}")

In [None]:
# errors='ignore' keeps this cell re-runnable: without it a second run (or an
# input without the column) raises KeyError because the column is already gone.
df_train.drop(columns=["is_duplicated"], inplace=True, errors='ignore')
# Training rows are written without a task label.
df_train["task"] = np.nan
df_train

In [None]:
# errors='ignore' keeps this cell re-runnable: without it a second run (or an
# input without the column) raises KeyError because the column is already gone.
df_validation.drop(columns=["is_duplicated"], inplace=True, errors='ignore')
# Validation rows are written without a task label.
df_validation["task"] = np.nan
df_validation

In [None]:
# errors='ignore' keeps this cell re-runnable: without it a second run (or an
# input without the column) raises KeyError because the column is already gone.
# The test rows keep their task label.
df_test.drop(columns=["is_duplicated"], inplace=True, errors='ignore')
df_test

In [None]:
# Create the output folder first; to_csv does not create missing directories.
os.makedirs(paired_output_folder, exist_ok=True)

# Write the three splits as tab-separated files without the index column.
df_validation.to_csv(f"{paired_output_folder}/{validation_file_name}", sep="\t", index=False)
df_test.to_csv(f"{paired_output_folder}/{test_file_name}", sep="\t", index=False)
df_train.to_csv(f"{paired_output_folder}/{train_file_name}", sep="\t", index=False)