In [None]:
import pandas as pd

# Directory with the pre-split TSV files, given relative to the working directory.
splitted_data_path = "../../data/splitted_data/beta"

# Read the train split first, then the test split; both files are tab-separated.
df_train, df_test = (
    pd.read_csv(tsv_file, sep="\t")
    for tsv_file in (
        f"{splitted_data_path}/train.tsv",
        f"{splitted_data_path}/test.tsv",
    )
)

In [None]:
# Distinct training values per column, stored as sets so that the membership
# tests done for every test row stay cheap.
epitopes_in_train, trb_cdr3_in_train = (
    set(df_train[column]) for column in ('Epitope', 'TRB_CDR3')
)

In [None]:
def verify_task(row, epitopes=None, trb_cdr3s=None):
    """Check that a row's ``task`` label matches its overlap with the reference data.

    The expected label is derived from whether the row's epitope and TRB CDR3
    value are members of the reference collections:

    * epitope seen,   CDR3 seen   -> ``'TPP1'``
    * epitope seen,   CDR3 unseen -> ``'TPP2'``
    * epitope unseen, CDR3 unseen -> ``'TPP3'``

    The remaining combination (epitope unseen, CDR3 seen) has no expected
    label here, so such rows are always reported as incorrect.
    NOTE(review): confirm whether that combination should map to a task
    label of its own instead of being treated as an error.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing the
            keys ``'Epitope'``, ``'TRB_CDR3'`` and ``'task'``.
        epitopes: Collection of reference epitopes. When omitted (``None``),
            the module-level ``epitopes_in_train`` is used.
        trb_cdr3s: Collection of reference TRB CDR3 values. When omitted
            (``None``), the module-level ``trb_cdr3_in_train`` is used.

    Returns:
        The result of comparing the expected label with ``row['task']``, or
        ``False`` when the seen/unseen combination has no expected label.
    """
    # Compare against None explicitly so that an intentionally empty
    # collection passed by the caller is respected instead of replaced.
    if epitopes is None:
        epitopes = epitopes_in_train
    if trb_cdr3s is None:
        trb_cdr3s = trb_cdr3_in_train

    # Keyed by (epitope seen?, CDR3 seen?).
    expected_task_by_overlap = {
        (True, True): 'TPP1',
        (True, False): 'TPP2',
        (False, False): 'TPP3',
    }

    overlap = (row['Epitope'] in epitopes, row['TRB_CDR3'] in trb_cdr3s)
    expected_task = expected_task_by_overlap.get(overlap)
    if expected_task is None:
        return False  # Combination without an expected label.
    return expected_task == row['task']

# Evaluate every test row with verify_task and store the verdict alongside it.
df_test['is_correct'] = df_test.apply(verify_task, axis='columns')

In [None]:
# Collect the rows whose task label failed verification.
incorrect_rows = df_test[df_test['is_correct'].eq(False)]
if incorrect_rows.empty:
    print("Task property seems to be right")
else:
    # Show only the columns needed to judge why a label was rejected.
    print("Incorrectly set tasks:")
    print(incorrect_rows[['Epitope', 'TRB_CDR3', 'task']])

# Tally of passed versus failed rows across the whole test split.
correctness_summary = df_test['is_correct'].value_counts()
print("Correctness summary:")
print(correctness_summary)