In [None]:
import pandas as pd
from Levenshtein import distance
import wandb
import os
from dotenv import load_dotenv
from pandarallel import pandarallel

In [None]:
# -----------------------------------------------------------------------------
# W&B Setup
# -----------------------------------------------------------------------------
# Load variables from a .env file so that MAIN_PROJECT_NAME can be read from the environment.
load_dotenv()
PROJECT_NAME = os.getenv("MAIN_PROJECT_NAME")  # None when the variable is not set
print(f"PROJECT_NAME: {PROJECT_NAME}")
# Open a W&B run that is only used to fetch the dataset artifact; it is closed again below.
run = wandb.init(project=PROJECT_NAME, job_type=f"download dataset", entity="ba-zhaw")
config = wandb.config  # not read anywhere else in this section


# Download corresponding artifact (= dataset) from W&B
precision = "allele" # gene or allele
download_path = "/home/ubuntu/BA_ZHAW/data/WnB_Download/paired"
# NOTE: output_path ends with "/" and is later joined with another "/" when the result is written.
output_path = f'/home/ubuntu/BA_ZHAW/data/EDA/paired/'
output_file_name = f'paired_{precision}_levenshtein.tsv'
dataset_name = f"paired_{precision}"
artifact = run.use_artifact(f"{dataset_name}:latest")
# data_dir is the value returned by artifact.download(); the file paths below are built from it.
data_dir = artifact.download(download_path)

# The run is only needed for the download; everything after this line works on local files.
run.finish()

train_file_path = f"{data_dir}/{precision}/train.tsv"
test_file_path = f"{data_dir}/{precision}/test_reclassified_paired_specific.tsv" # because seen in this case is either alpha or beta chain
val_file_path = f"{data_dir}/{precision}/validation.tsv"

# Subsampling step for the test file: only every n-th line is read (see skiprows below).
n = 2000
df_train = pd.read_csv(train_file_path, sep="\t")
# Full (non-subsampled) alternative, currently disabled:
#df_test = pd.read_csv(test_file_path, sep="\t")
# The callable is evaluated per line index; index 0 (0 % n == 0) is kept, so the header
# line survives, and of the remaining lines only those whose index is a multiple of n are read.
df_test = pd.read_csv(test_file_path, sep="\t", skiprows=lambda i: i % n != 0)
df_validation = pd.read_csv(val_file_path, sep="\t")
# "Seen" data = train + validation rows. The original row indices are kept (no ignore_index),
# so index labels may repeat in df_seen.
df_seen = pd.concat([df_train, df_validation])

In [None]:
# Display the combined train + validation ("seen") frame for a visual sanity check.
df_seen

In [None]:
# Display the subsampled test frame before any Levenshtein columns are added.
df_test

In [None]:
# Names of the result columns that hold, per test row, the smallest Levenshtein
# distance to any seen CDR3 alpha / CDR3 beta / epitope sequence.
(
  levenshtein_min_column_cdr_alpha,
  levenshtein_min_column_cdr_beta,
  levenshtein_min_column_epitope,
) = (
  'min_levenshtein_cdr_alpha_to_seen',
  'min_levenshtein_cdr_beta_to_seen',
  'min_levenshtein_epitope_to_seen',
)

# Same three targets, but for the largest distance to any seen sequence.
(
  levenshtein_max_column_cdr_alpha,
  levenshtein_max_column_cdr_beta,
  levenshtein_max_column_epitope,
) = (
  'max_levenshtein_cdr_alpha_to_seen',
  'max_levenshtein_cdr_beta_to_seen',
  'max_levenshtein_epitope_to_seen',
)

In [None]:
# Create the six result columns on df_test up front, filled with 0, so that they
# already exist when the per-row results are written into them further down.
# The order below determines the order in which the columns are appended.
for _column in (
  levenshtein_min_column_cdr_alpha,
  levenshtein_min_column_cdr_beta,
  levenshtein_min_column_epitope,
  levenshtein_max_column_cdr_alpha,
  levenshtein_max_column_cdr_beta,
  levenshtein_max_column_epitope,
):
  df_test[_column] = 0
del _column  # do not leave the loop variable behind in the notebook namespace

def _min_max_distance(query, candidates):
  """Return (smallest, largest) Levenshtein distance from `query` to `candidates`.

  `candidates` must contain at least one value.
  """
  distances = [distance(query, candidate) for candidate in candidates]
  return min(distances), max(distances)


def set_levenshtein(row, seen=None):
  """Compute min/max Levenshtein distances of one test row to the seen data.

  For the row's `TRA_CDR3`, `TRB_CDR3` and `Epitope` values, the distance to
  every distinct value of the same column in the seen data is computed and the
  minimum and maximum per column are returned. A minimum of 0 means the exact
  sequence occurs in the seen data; the row's `task` label is validated
  against that (see the checks at the end of the function).

  Args:
    row: Mapping-like test row providing `TRA_CDR3`, `TRB_CDR3`, `Epitope`
      and `task`.
    seen: Optional DataFrame with the columns `TRA_CDR3`, `TRB_CDR3` and
      `Epitope` to compare against. Defaults to the notebook-level `df_seen`.

  Returns:
    pd.Series with six values in this order: min alpha, max alpha, min beta,
    max beta, min epitope, max epitope.

  Raises:
    ValueError: If the seen data has no rows (no distance can be computed).
    Exception: If the row's `task` label contradicts the computed minima.
  """
  reference = df_seen if seen is None else seen
  if len(reference) == 0:
    raise ValueError("Cannot compute Levenshtein distances: the seen data set is empty")

  # Duplicated sequences cannot change a minimum or maximum, so each distinct
  # seen value is compared only once. The true min/max is taken instead of
  # starting from a fixed sentinel, so distances above 100 are reported correctly.
  min_levenshtein_cdr_alpha, max_levenshtein_cdr_alpha = _min_max_distance(
    row['TRA_CDR3'], reference['TRA_CDR3'].unique())
  min_levenshtein_cdr_beta, max_levenshtein_cdr_beta = _min_max_distance(
    row['TRB_CDR3'], reference['TRB_CDR3'].unique())
  min_levenshtein_epitope, max_levenshtein_epitope = _min_max_distance(
    row['Epitope'], reference['Epitope'].unique())

  print(f"Levenshtein distance (Task {row['task']}) min_cdr3_alpha={min_levenshtein_cdr_alpha}, min_cdr3_beta={min_levenshtein_cdr_beta}, max_cdr3_alpha={max_levenshtein_cdr_alpha}, max_cdr3_beta={max_levenshtein_cdr_beta}, epitope={min_levenshtein_epitope}")

  # The TCR counts as seen when at least one of its chains occurs verbatim in
  # the seen data; the epitope counts as seen when it occurs verbatim.
  tcr_seen = min_levenshtein_cdr_alpha == 0 or min_levenshtein_cdr_beta == 0
  epitope_seen = min_levenshtein_epitope == 0

  # Sanity-check the task label of the row against what was actually found.
  task = row['task']
  if task == 'TPP1' and not (tcr_seen and epitope_seen):
    raise Exception("Levenshtein distance from seen data is not 0!")
  if task == 'TPP2' and (tcr_seen or not epitope_seen):
    raise Exception("Error in TPP2 classification!")
  if task == 'TPP3' and (tcr_seen or epitope_seen):
    raise Exception("Error in TPP3 classification!")
  if task == 'TPP4' and (not tcr_seen or epitope_seen):
    raise Exception("Levenshtein distance from unseen data is 0!")

  return pd.Series([min_levenshtein_cdr_alpha, max_levenshtein_cdr_alpha, min_levenshtein_cdr_beta, max_levenshtein_cdr_beta, min_levenshtein_epitope, max_levenshtein_epitope])

# Start the pandarallel workers (with a progress bar) and compute the distances
# for every test row in parallel. The target column order below must match the
# order of the values in the Series returned by set_levenshtein.
pandarallel.initialize(progress_bar=True)
df_test[[
  levenshtein_min_column_cdr_alpha,
  levenshtein_max_column_cdr_alpha,
  levenshtein_min_column_cdr_beta,
  levenshtein_max_column_cdr_beta,
  levenshtein_min_column_epitope,
  levenshtein_max_column_epitope,
]] = df_test.parallel_apply(set_levenshtein, axis=1)

In [None]:
# Display the test frame again, now including the min/max Levenshtein columns.
df_test

In [None]:
# Write the annotated test set as a tab-separated file.
# exist_ok=True creates the directory only when it is missing, which avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_path, exist_ok=True)
# os.path.join avoids the doubled "/" that plain string concatenation produces
# when output_path already ends with a separator.
df_test.to_csv(os.path.join(output_path, output_file_name), sep="\t", index=False)
print("FINISHED SKRIPT")