In [None]:
import pandas as pd

# Folder that will receive the train/test split produced by this notebook.
paired_output_folder = "../../data/splitted_data/beta"

# Read the concatenated, tab-separated input table in one go; low_memory=False
# makes pandas parse each column as a whole instead of chunk by chunk.
df = pd.read_csv(
    "../../data/customDatasets/beta_concatenated.tsv",
    sep='\t',
    low_memory=False,
)

First, the data entries (without negative data) are analysed.

In [None]:
# Column holding the TCR sequence that the split is based on.
tcr_key = 'TRB_CDR3'

# distinct_*: one representative row per value (first occurrence is kept).
# unique_*:   only rows whose value occurs exactly once in the whole table
#             (keep=False drops every member of a duplicated group).
distinct_tcrs = df.drop_duplicates(subset=[tcr_key])
unique_epitopes = df.drop_duplicates(subset=["Epitope"], keep=False)
unique_tcrs = df.drop_duplicates(subset=[tcr_key], keep=False)

print(f"distinct tcr's: {len(distinct_tcrs)} from {len(df)}")
print(f"unique tcr's: {len(unique_tcrs)} from {len(df)}")
# len() of the frame equals len() of any of its columns; avoiding the nested
# double quotes keeps this f-string valid on Python versions before 3.12.
print(f"unique epitopes: {len(unique_epitopes)} from {len(df)}")

Now a train set and a test set are created. The test set should consist only of TPP2 and TPP3 tasks (TPP => TCR–Peptide/Epitope Pairing).
TPP2 means the epitope is seen in training but the TCR is unseen.
TPP3 means neither the TCR nor the epitope is seen in training.

In [None]:
# Anti-join of df against unique_tcrs: the merge uses all shared columns as keys
# and the indicator column `_merge` tells whether a row of df also exists in
# unique_tcrs. Rows marked 'left_only' have no counterpart there and form the
# training set.
df_train = pd.merge(df, unique_tcrs, how='left', indicator=True)
# .copy() turns the filtered slice into an independent frame, so adding the
# "task" column below does not raise pandas' SettingWithCopyWarning.
# Note: the helper column `_merge` stays part of df_train.
df_train = df_train[df_train['_merge'] == 'left_only'].copy()
df_train["task"] = ""
train_epitopes = set(df_train["Epitope"])

# Every row with a unique TCR goes to the test set. It is labelled TPP2 when its
# epitope also occurs in the training set and TPP3 otherwise.
df_test = unique_tcrs.copy()
df_test["task"] = df_test["Epitope"].apply(lambda x: 'TPP3' if x not in train_epitopes else 'TPP2')

number_of_TPP3 = (df_test['task'] == 'TPP3').sum()
number_of_TPP2 = (df_test['task'] == 'TPP2').sum()
# Share of all entries that ended up in the test set.
test_ratio = len(df_test)/(len(df_test) + len(df_train))

print(f"train data has {len(df_train)} entries")
print(f"test data has {len(df_test)} entries")
print(f"test data has {number_of_TPP3} TPP3 tasks (unseen tcr & unseen epitope).")
print(f"test data has {number_of_TPP2} TPP2 tasks (unseen tcr & seen epitopes).")
print(f"the train/test ratio is {(1-test_ratio)}/{test_ratio}")

If the test ratio is below 0.3, the test data is filled up with TPP1 tasks (seen TCR & seen epitope). If the ratio is higher than 0.3, the surplus TPP2 entries are moved from the test set to the training set; an exception is only raised when there are not enough TPP2 entries to move.

In [None]:
import math

# Target share of the test set in the final split.
aimed_test_ratio = 0.3

# Total number of entries before any row is shifted between the two sets.
total_entries = len(df_test) + len(df_train)

if test_ratio > aimed_test_ratio:
    # Test set is too large: hand the surplus back to the training set.
    abundant_test_data_count = math.ceil((test_ratio - aimed_test_ratio) * total_entries)
    print(f"{abundant_test_data_count} entries need to be shifted from test to train so the train/test ratio can be {1-aimed_test_ratio}/{aimed_test_ratio}")

    # Only TPP2 rows are candidates for the move.
    filtered_rows = df_test[df_test["task"] == "TPP2"]
    if len(filtered_rows) < abundant_test_data_count:
        raise ValueError("Not enough entries with 'task' == 'TPP2' to move.")
    rows_to_move = filtered_rows.head(abundant_test_data_count)

    # NOTE(review): the moved rows keep their "TPP2" label inside df_train,
    # while all other training rows carry an empty task - confirm this is intended.
    df_train = pd.concat([df_train, rows_to_move], ignore_index=True)
    df_test = df_test.drop(rows_to_move.index)

elif test_ratio < aimed_test_ratio:
    # Test set is too small: fill it up with TPP1 rows taken from the training set.
    missing_test_data_count = math.ceil((aimed_test_ratio - test_ratio) * total_entries)
    print(f"{missing_test_data_count} entries need to be shifted from train to test so the train/test ratio can be {1-aimed_test_ratio}/{aimed_test_ratio}")

    # A row may only leave the training set while both its epitope and its TCR
    # still occur more than once there; otherwise the value would become unseen.
    # value_counts() skips missing values, so rows with a missing epitope or
    # TCR get a count of 0 below and are never selected.
    remaining_epitopes = df_train['Epitope'].value_counts().to_dict()
    remaining_tcrs = df_train['TRB_CDR3'].value_counts().to_dict()

    # Single pass with running counters instead of recomputing value_counts()
    # for every moved row. Counts only ever decrease, so a row that was skipped
    # can never become eligible later - the rows picked here are the same ones,
    # in the same order, as repeatedly taking the first eligible row.
    selected_labels = []
    for label, epitope, tcr in zip(df_train.index, df_train['Epitope'], df_train['TRB_CDR3']):
        if len(selected_labels) >= missing_test_data_count:
            break
        if remaining_epitopes.get(epitope, 0) > 1 and remaining_tcrs.get(tcr, 0) > 1:
            selected_labels.append(label)
            remaining_epitopes[epitope] -= 1
            remaining_tcrs[tcr] -= 1

    # Fail before touching either frame, so a failed run leaves them unchanged.
    if len(selected_labels) < missing_test_data_count:
        raise ValueError(
            f"Only {len(selected_labels)} of the required {missing_test_data_count} "
            "TPP1 rows (repeated TCR and repeated epitope) are available in df_train."
        )

    # Move all selected rows at once instead of concatenating row by row.
    tpp1_rows = df_train.loc[selected_labels].copy()
    tpp1_rows['task'] = "TPP1"
    df_test = pd.concat([df_test, tpp1_rows], ignore_index=True)
    df_train = df_train.drop(selected_labels)

In [None]:
# Recount the task labels in the test set and recompute its share of all entries.
test_tasks = df_test['task']
number_of_TPP1 = test_tasks.eq('TPP1').sum()
number_of_TPP2 = test_tasks.eq('TPP2').sum()
number_of_TPP3 = test_tasks.eq('TPP3').sum()
test_ratio = len(df_test) / (len(df_train) + len(df_test))

# Report the sizes of both sets, the task breakdown and the resulting ratio.
print(
    f"train data has {len(df_train)} entries",
    f"test data has {len(df_test)} entries",
    f"test data has {number_of_TPP1} TPP1 tasks (seen tcr & seen epitopes).",
    f"test data has {number_of_TPP2} TPP2 tasks (unseen tcr & seen epitopes).",
    f"test data has {number_of_TPP3} TPP3 tasks (unseen tcr & unseen epitope).",
    f"the train/test ratio is {(1-test_ratio)}/{test_ratio}",
    sep="\n",
)

In [None]:
import os

# Make sure the target folder exists; to_csv does not create missing directories.
os.makedirs(paired_output_folder, exist_ok=True)

# `_merge` is only a helper column left over from the merge that built df_train.
# Whether df_test carries it depends on how the sets were rebalanced, so it is
# dropped with errors="ignore" to give both files the same columns. The frames
# themselves are left untouched, which keeps this cell safe to re-run.
df_test.drop(columns=["_merge"], errors="ignore").to_csv(paired_output_folder+"/test.tsv", sep="\t")
df_train.drop(columns=["_merge"], errors="ignore").to_csv(paired_output_folder+"/train.tsv", sep="\t")