In [57]:
import pandas as pd
from Levenshtein import distance
import wandb
import os
from dotenv import load_dotenv

In [58]:
# -----------------------------------------------------------------------------
# W&B Setup
# -----------------------------------------------------------------------------
load_dotenv()
PROJECT_NAME = os.getenv("MAIN_PROJECT_NAME")
print(f"PROJECT_NAME: {PROJECT_NAME}")
run = wandb.init(project=PROJECT_NAME, job_type=f"download dataset", entity="ba-zhaw")
config = wandb.config


# Download corresponding artifact (= dataset) from W&B
precision = "allele" # gene or allele
dataset_name = f"paired_{precision}"
artifact = run.use_artifact(f"{dataset_name}:latest")
data_dir = artifact.download(f"/teamspace/studios/this_studio/BA_ZHAW/data/WnB_Download/paired")

run.finish()

train_file_path = f"{data_dir}/{precision}/train.tsv"
test_file_path = f"{data_dir}/{precision}/test_reclassified_paired_specific.tsv" # because seen in this case is either alpha or beta chain
val_file_path = f"{data_dir}/{precision}/validation.tsv"

n = 2000
df_train = pd.read_csv(train_file_path, sep="\t")
#df_test = pd.read_csv(test_file_path, sep="\t")
df_test = pd.read_csv(test_file_path, sep="\t", skiprows=lambda i: i % n != 0)
df_validation = pd.read_csv(val_file_path, sep="\t")
df_seen = pd.concat([df_train, df_validation])

PROJECT_NAME: BA_Project_ZHAW


[34m[1mwandb[0m:   4 of 4 files downloaded.  


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [59]:
df_seen

Unnamed: 0,TCR_name,TRAV,TRAJ,TRA_CDR3,TRBV,TRBJ,TRB_CDR3,TRAC,TRBC,Epitope,MHC,Binding,task
0,1,TRAV19*01,TRAJ20*01,CALRDYKLSF,TRBV3-1*01,TRBJ1-4*01,CASSQTMTEGTPEKLFF,,,AVFDRKSDAK,HLA-A*11:01,1,
1,2,TRAV17*01,TRAJ36*01,CARDQTGANNLFF,TRBV19*01,TRBJ1-1*01,CASSIGTGNTEAFF,,,KLGGALQAK,HLA-A*03:01,1,
2,3,TRAV13-1*01,TRAJ44*01,CAASMGTGTASKLTF,TRBV9*01,TRBJ1-2*01,CASSPRGPAYGYTF,,,KLGGALQAK,HLA-A*03:01,1,
3,4,TRAV24*01,TRAJ4*01,CAFASFLGGYNKLIF,TRBV15*01,TRBJ2-5*01,CATSGLKETQYF,,,KLGGALQAK,HLA-A*03:01,1,
4,5,TRAV14/DV4*01,TRAJ32*01,CAMGGGYGGATNKLIF,TRBV9*01,TRBJ1-1*01,CASSASQFAEAFF,,,IVTDFSVIK,HLA-A*11:01,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15603,15604,TRAV6*01,,CALSGYSTLTF,TRBV4-2*01,,CASSPYSNQPQHF,,,LLLDRLNQL,HLA-A*02:01,0,
15604,15605,,,CAVNSYGKLTF,,,CASSDGTLGNSPLHF,,,AVFDRKSDAK,HLA-A*11:01,0,
15605,15606,TRAV6,TRAJ6,CALPSGGSYIPTF,TRBV5-1,TRBJ2-1,CASYPSDGYNEQFF,,,SPRWYFYYL,HLA-B*07:02,0,
15606,15607,TRAV20,,CAVQAADSSASKIIF,TRBV7-2,,CASSFWAGGWTEAFF,,,ELAGIGILTV,HLA-A*02:01,0,


In [60]:
df_test

Unnamed: 0,TCR_name,TRAV,TRAJ,TRA_CDR3,TRBV,TRBJ,TRB_CDR3,TRAC,TRBC,Epitope,MHC,Binding,task
0,2000,TRAV27*01,TRAJ39*01,CAGGGAGNMLTF,TRBV7-7*01,TRBJ1-3*01,CASSLAGSGNTIYF,,,KLGGALQAK,HLA-A*03:01,1,TPP1
1,4000,TRAV1-2*01,TRAJ33*01,CAVSDSNYQLIW,TRBV6-5*01,TRBJ2-7*01,CASSYRSSGSYEQYF,,,KLGGALQAK,HLA-A*03:01,1,TPP1
2,6000,,,CCALDMWKF,,,CASSTNRGLFSGDTEAFF,,,KLGGALQAK,HLA-A*03:01,1,TPP2
3,8000,TRAV5*01,TRAJ6*01,CAEISPSGGSYIPTF,TRBV24-1*01,TRBJ1-2*01,CATSDLKVHSGNYGYTF,,,KVLEYVIKV,HLA-A*02:01,0,TPP1
4,10000,TRAV13-1*01,TRAJ45*01,CAAYYSGGGADGLTF,TRBV27*01,TRBJ2-3*01,CASSFLAGGYGDTQYF,,,SPRWYFYYL,HLA-B*07:02,0,TPP1
5,12000,TRAV29/DV5*01,,CAASAQGGTSYGKLTF,TRBV19*01,,CASRMGTSGSTDTQYF,,,AVFDRKSDAK,HLA-A*11:01,0,TPP1
6,14000,TRAV12-2,TRAJ49,CAVTGNQFYF,TRBV9,TRBJ2-7,CASSAGTGAYEQYF,,,KLWASPLHV,HLA-A*02:01,0,TPP1


In [61]:
levenshtein_min_column_cdr_alpha = 'min_levenshtein_cdr_alpha_to_seen'
levenshtein_min_column_cdr_beta = 'min_levenshtein_cdr_beta_to_seen'
levenshtein_min_column_epitope = 'min_levenshtein_epitope_to_seen'
levenshtein_max_column_cdr_alpha = 'max_levenshtein_cdr_alpha_to_seen'
levenshtein_max_column_cdr_beta = 'max_levenshtein_cdr_beta_to_seen'
levenshtein_max_column_epitope = 'max_levenshtein_epitope_to_seen'

In [62]:
df_test[levenshtein_min_column_cdr_alpha] = 0
df_test[levenshtein_min_column_cdr_beta] = 0
df_test[levenshtein_min_column_epitope] = 0
df_test[levenshtein_max_column_cdr_alpha] = 0
df_test[levenshtein_max_column_cdr_beta] = 0
df_test[levenshtein_max_column_epitope] = 0

def set_levenshtein(row):
  min_levenshtein_cdr_alpha = 100 # start value
  min_levenshtein_cdr_beta = 100 # start value
  min_levenshtein_epitope = 100 # start value
  max_levenshtein_cdr_alpha = 0 # start value
  max_levenshtein_cdr_beta = 0 # start value
  max_levenshtein_epitope = 0 # start value

  alpha_val = row['TRA_CDR3']
  beta_val = row['TRB_CDR3']

  for index, row_seen in df_seen.iterrows():
    # cdr3 distance 
    cdr_alpha_distance = distance(alpha_val, row_seen['TRA_CDR3'])
    cdr_beta_distance = distance(beta_val, row_seen['TRB_CDR3'])

    if min_levenshtein_cdr_alpha > cdr_alpha_distance:
      min_levenshtein_cdr_alpha = cdr_alpha_distance

    if min_levenshtein_cdr_beta > cdr_beta_distance:
      min_levenshtein_cdr_beta = cdr_beta_distance

    if max_levenshtein_cdr_alpha < cdr_alpha_distance:
      max_levenshtein_cdr_alpha = cdr_alpha_distance

    if max_levenshtein_cdr_beta < cdr_beta_distance:
      max_levenshtein_cdr_beta = cdr_beta_distance
    
    # epitope distance
    epitope_distance = distance(row['Epitope'], row_seen['Epitope'])
    
    if min_levenshtein_epitope > epitope_distance:
      min_levenshtein_epitope = epitope_distance
  
    if max_levenshtein_epitope < epitope_distance:
      max_levenshtein_epitope = epitope_distance
  
  print(f"Levenshtein distance (Task {row['task']}) min_cdr3_alpha={min_levenshtein_cdr_alpha}, min_cdr3_beta={min_levenshtein_cdr_beta}, max_cdr3_alpha={max_levenshtein_cdr_alpha}, max_cdr3_beta={max_levenshtein_cdr_beta}, epitope={min_levenshtein_epitope}")

  if row['task'] == 'TPP1' and ((min_levenshtein_cdr_alpha != 0 and min_levenshtein_cdr_beta != 0) or min_levenshtein_epitope != 0):
    raise Exception("Levenshtein distance from seen data is not 0!")
  if row['task'] == 'TPP2' and ((min_levenshtein_cdr_alpha == 0 or min_levenshtein_cdr_beta == 0) or min_levenshtein_epitope != 0):
    raise Exception("Error in TPP2 classification!")
  if row['task'] == 'TPP3' and ((min_levenshtein_cdr_alpha == 0 or min_levenshtein_cdr_beta == 0) or min_levenshtein_epitope == 0):
    raise Exception("Error in TPP3 classification!")
  if row['task'] == 'TPP4' and (min_levenshtein_cdr_alpha != 0 and min_levenshtein_cdr_beta != 0 or min_levenshtein_epitope == 0):
    raise Exception("Levenshtein distance from unseen data is 0!")
  
  return pd.Series([min_levenshtein_cdr_alpha, max_levenshtein_cdr_alpha, min_levenshtein_cdr_beta, max_levenshtein_cdr_beta, min_levenshtein_epitope, max_levenshtein_epitope])


df_test[[levenshtein_min_column_cdr_alpha, levenshtein_max_column_cdr_alpha, levenshtein_min_column_cdr_beta, levenshtein_max_column_cdr_beta, levenshtein_min_column_epitope, levenshtein_max_column_epitope]] = df_test.apply(lambda x: set_levenshtein(x), axis=1)

Levenshtein distance (Task TPP1) min_cdr3_alpha=0, min_cdr3_beta=0, max_cdr3_alpha=21, max_cdr3_beta=20, epitope=0
Levenshtein distance (Task TPP1) min_cdr3_alpha=0, min_cdr3_beta=1, max_cdr3_alpha=21, max_cdr3_beta=20, epitope=0
Levenshtein distance (Task TPP2) min_cdr3_alpha=4, min_cdr3_beta=5, max_cdr3_alpha=23, max_cdr3_beta=19, epitope=0
Levenshtein distance (Task TPP1) min_cdr3_alpha=0, min_cdr3_beta=0, max_cdr3_alpha=23, max_cdr3_beta=19, epitope=0
Levenshtein distance (Task TPP1) min_cdr3_alpha=0, min_cdr3_beta=0, max_cdr3_alpha=21, max_cdr3_beta=19, epitope=0
Levenshtein distance (Task TPP1) min_cdr3_alpha=0, min_cdr3_beta=2, max_cdr3_alpha=20, max_cdr3_beta=20, epitope=0
Levenshtein distance (Task TPP1) min_cdr3_alpha=0, min_cdr3_beta=0, max_cdr3_alpha=23, max_cdr3_beta=19, epitope=0


In [63]:
df_test

Unnamed: 0,TCR_name,TRAV,TRAJ,TRA_CDR3,TRBV,TRBJ,TRB_CDR3,TRAC,TRBC,Epitope,MHC,Binding,task,min_levenshtein_cdr_alpha_to_seen,min_levenshtein_cdr_beta_to_seen,min_levenshtein_epitope_to_seen,max_levenshtein_cdr_alpha_to_seen,max_levenshtein_cdr_beta_to_seen,max_levenshtein_epitope_to_seen
0,2000,TRAV27*01,TRAJ39*01,CAGGGAGNMLTF,TRBV7-7*01,TRBJ1-3*01,CASSLAGSGNTIYF,,,KLGGALQAK,HLA-A*03:01,1,TPP1,0,0,0,21,20,21
1,4000,TRAV1-2*01,TRAJ33*01,CAVSDSNYQLIW,TRBV6-5*01,TRBJ2-7*01,CASSYRSSGSYEQYF,,,KLGGALQAK,HLA-A*03:01,1,TPP1,0,1,0,21,20,21
2,6000,,,CCALDMWKF,,,CASSTNRGLFSGDTEAFF,,,KLGGALQAK,HLA-A*03:01,1,TPP2,4,5,0,23,19,21
3,8000,TRAV5*01,TRAJ6*01,CAEISPSGGSYIPTF,TRBV24-1*01,TRBJ1-2*01,CATSDLKVHSGNYGYTF,,,KVLEYVIKV,HLA-A*02:01,0,TPP1,0,0,0,23,19,20
4,10000,TRAV13-1*01,TRAJ45*01,CAAYYSGGGADGLTF,TRBV27*01,TRBJ2-3*01,CASSFLAGGYGDTQYF,,,SPRWYFYYL,HLA-B*07:02,0,TPP1,0,0,0,21,19,21
5,12000,TRAV29/DV5*01,,CAASAQGGTSYGKLTF,TRBV19*01,,CASRMGTSGSTDTQYF,,,AVFDRKSDAK,HLA-A*11:01,0,TPP1,0,2,0,20,20,20
6,14000,TRAV12-2,TRAJ49,CAVTGNQFYF,TRBV9,TRBJ2-7,CASSAGTGAYEQYF,,,KLWASPLHV,HLA-A*02:01,0,TPP1,0,0,0,23,19,21


In [6]:
path = f'/teamspace/studios/this_studio/BA_ZHAW/data/EDA/paired/paired_{precision}_levenshtein.tsv'
if not os.path.exists(path):
  os.makedirs(path)
df_test.to_csv(path, sep="\t")
print("FINISHED SKRIPT")

OSError: Cannot save file into a non-existent directory: '/teamspace/studios/this_studio/BA_ZHAW/data/EDA/beta'