In [1]:
import pandas as pd
from Levenshtein import distance
import wandb
import os
from dotenv import load_dotenv

In [2]:
# -----------------------------------------------------------------------------
# W&B Setup
# -----------------------------------------------------------------------------
load_dotenv()
PROJECT_NAME = os.getenv("MAIN_PROJECT_NAME")
print(f"PROJECT_NAME: {PROJECT_NAME}")
run = wandb.init(project=PROJECT_NAME, job_type=f"download dataset", entity="ba-zhaw")
config = wandb.config


# Download corresponding artifact (= dataset) from W&B
precision = "allele" # gene or allele
dataset_name = f"beta_{precision}"
artifact = run.use_artifact(f"{dataset_name}:latest")
data_dir = artifact.download(f"/teamspace/studios/this_studio/BA_ZHAW/data/WnB_Download/beta")

run.finish()

train_file_path = f"{data_dir}/{precision}/train.tsv"
test_file_path = f"{data_dir}/{precision}/test.tsv"
val_file_path = f"{data_dir}/{precision}/validation.tsv"

n = 10000
df_train = pd.read_csv(train_file_path, sep="\t")
df_test = pd.read_csv(test_file_path, sep="\t")
#df_test = pd.read_csv(test_file_path, sep="\t", skiprows=lambda i: i % n != 0)
df_validation = pd.read_csv(val_file_path, sep="\t")
df_seen = pd.concat([df_train, df_validation])

PROJECT_NAME: BA_Project_ZHAW


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgabricyr[0m ([33mba-zhaw[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   3 of 3 files downloaded.  


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [3]:
levenshtein_min_column_cdr = 'min_levenshtein_cdr_to_seen'
levenshtein_min_column_epitope = 'min_levenshtein_epitope_to_seen'
levenshtein_max_column_cdr = 'max_levenshtein_cdr_to_seen'
levenshtein_max_column_epitope = 'max_levenshtein_epitope_to_seen'

In [4]:
df_test[levenshtein_min_column_cdr] = 0
df_test[levenshtein_min_column_epitope] = 0
df_test[levenshtein_max_column_cdr] = 0
df_test[levenshtein_max_column_epitope] = 0

def set_levenshtein(row):
  min_levenshtein_cdr = 100 # start value
  min_levenshtein_epitope = 100 # start value
  max_levenshtein_cdr = 0 # start value
  max_levenshtein_epitope = 0 # start value

  for index, row_seen in df_seen.iterrows():
    # cdr3 distance 
    cdr_distance = distance(row['TRB_CDR3'], row_seen['TRB_CDR3'])

    if min_levenshtein_cdr > cdr_distance:
      min_levenshtein_cdr = cdr_distance

    if max_levenshtein_cdr < cdr_distance:
      max_levenshtein_cdr = cdr_distance
    
    # epitope distance
    epitope_distance = distance(row['Epitope'], row_seen['Epitope'])
    
    if min_levenshtein_epitope > epitope_distance:
      min_levenshtein_epitope = epitope_distance
  
    if max_levenshtein_epitope < epitope_distance:
      max_levenshtein_epitope = epitope_distance
  
  print(f"Levenshtein distance (Task {row['task']}) cdr3={min_levenshtein_cdr}, epitope={min_levenshtein_epitope}")
  if row['task'] == 'TPP1' and (min_levenshtein_cdr != 0 or min_levenshtein_epitope != 0):
    raise Exception("Levenshtein distance from seen data is not 0!")
  if row['task'] == 'TPP2' and (min_levenshtein_cdr == 0 or min_levenshtein_epitope != 0):
    raise Exception("Error in TPP2 classification!")
  if row['task'] == 'TPP3' and (min_levenshtein_cdr == 0 or min_levenshtein_epitope == 0):
    raise Exception("Error in TPP3 classification!")
  if row['task'] == 'TPP4' and (min_levenshtein_cdr != 0 or min_levenshtein_epitope == 0):
    raise Exception("Levenshtein distance from unseen data is 0!")
  
  return pd.Series([min_levenshtein_cdr, max_levenshtein_cdr, min_levenshtein_epitope, max_levenshtein_epitope])


df_test[[levenshtein_min_column_cdr, levenshtein_max_column_cdr, levenshtein_min_column_epitope, levenshtein_max_column_epitope]] = df_test.apply(lambda x: set_levenshtein(x), axis=1)

Levenshtein distance (Task TPP1) cdr3=0, epitope=0
Levenshtein distance (Task TPP2) cdr3=3, epitope=0
Levenshtein distance (Task TPP1) cdr3=0, epitope=0
Levenshtein distance (Task TPP2) cdr3=2, epitope=0
Levenshtein distance (Task TPP2) cdr3=3, epitope=0


In [5]:
df_test

Unnamed: 0.1,Unnamed: 0,TCR_name,TRBV,TRBJ,TRB_CDR3,TRBC,Epitope,MHC,Binding,task,min_levenshtein_cdr_to_seen,min_levenshtein_epitope_to_seen,max_levenshtein_cdr_to_seen,max_levenshtein_epitope_to_seen
0,9999,10000,TRBV7-8,TRBJ2-7,CASSFGAGLTYEQYF,,LPRRSGAAGA,HLA-B,1,TPP1,0,0,30,39
1,19999,20000,TRBV19,TRBJ2-2,CASSAMGTALNTGELFF,,YLDAYNMMI,,1,TPP2,3,0,28,39
2,29999,30000,TRBV12-4,TRBJ2-7,CASSPRQAAYEQYF,,LPRRSGAAGA,HLA-B,0,TPP1,0,0,31,39
3,39999,40000,TRBV7-9,TRBJ1-5,CASSLGFSGNQPQHF,,KLGGALQAK,HLA-A,0,TPP2,2,0,27,39
4,49999,50000,TRBV15,TRBJ2-3,CATSRDAPPRVSTDTQYF,,MIELSLIDFYLCFLAFLLFLVLIML,,0,TPP2,3,0,27,37


In [6]:
path = f'/teamspace/studios/this_studio/BA_ZHAW/data/EDA/beta/beta_{precision}_levenshtein.tsv'
if not os.path.exists(path):
  os.makedirs(path)
df_test.to_csv(path, sep="\t")

OSError: Cannot save file into a non-existent directory: '/teamspace/studios/this_studio/BA_ZHAW/data/EDA/beta'