In [None]:
import os

In [None]:
# Precision of the MHC and V/J values: 'gene' or 'allele'.
# The value is also used further down as a sub-folder name for the concatenated
# and splitted datasets and is passed to generatePhysicoParallel.py.
precision = 'allele'

In [None]:
def create_folders_if_not_exists(folders):
  """Create every directory in ``folders``, including missing parents.

  Directories that already exist are left untouched, so the call is
  idempotent and can be repeated safely.

  Args:
    folders: iterable of directory paths (str or path-like).

  Raises:
    FileExistsError: if one of the paths exists but is not a directory.
  """
  for path in folders:
    # exist_ok=True replaces the former "exists? -> makedirs" sequence, which
    # could fail when another process created the folder in between.
    os.makedirs(path, exist_ok=True)

In [None]:
# Root folder of the pipeline; every stage below gets its own sub-folder.
pipeline_data = './data'
pipeline_data_plain = f'{pipeline_data}/plain_datasets'
pipeline_data_cleaned = f'{pipeline_data}/cleaned_datasets'
pipeline_data_concatenated = f'{pipeline_data}/concatenated_datasets'
pipeline_data_splitted = f'{pipeline_data}/splitted_datasets'
# holds intermediate files (fitted datasets, negative-sample temp data)
pipeline_data_temp_bucket = f'{pipeline_data}/temp'

pipeline_folders = [pipeline_data, pipeline_data_plain, pipeline_data_cleaned, pipeline_data_concatenated, pipeline_data_splitted, pipeline_data_temp_bucket]

# make sure the whole folder skeleton exists before any stage runs
create_folders_if_not_exists(pipeline_folders)

## Data Preparation

### IEDB

In [None]:
# prepare directories
# IEDB sub-folders: plain input, cleaned output, and fitted intermediate files
# (the fitted files live in the temp bucket).
IEDB_data_plain = f'{pipeline_data_plain}/IEDB'
IEDB_data_cleaned = f'{pipeline_data_cleaned}/IEDB'
IEDB_data_fitted = f'{pipeline_data_temp_bucket}/IEDB'

IEDB_folders = [IEDB_data_plain, IEDB_data_cleaned, IEDB_data_fitted]
create_folders_if_not_exists(IEDB_folders)

In [None]:
# prepare parameters for notebook IEDB fit data
# The globals below are presumably read by the child notebook: `%run` executes
# an .ipynb in this kernel's namespace (child notebook not visible here).
path_prefix_plain = IEDB_data_plain
path_prefix_fitted = IEDB_data_fitted
mhc_I_input_beta = f"{path_prefix_plain}/MHCI_IEDB_beta_export.csv"
mhc_I_output_beta = f"{path_prefix_fitted}/IEDB_beta_fitted.csv"
mhc_I_input_paired = f"{path_prefix_plain}/MHCI_IEDB_paired_export.csv"
mhc_I_output_paired = f"{path_prefix_fitted}/IEDB_paired_fitted.csv"

# fit IEDB data
%run ./data_scripts/IEDB/IEDB_fitted_dataset.ipynb

In [None]:
# prepare parameters for notebook IEDB clean data
# The fitted file names match the outputs written to IEDB_data_fitted by the
# IEDB fit cell; the child notebook presumably joins prefix and file name.
path_prefix_fitted = IEDB_data_fitted
path_prefix_cleaned =  IEDB_data_cleaned
fitted_file_beta = "IEDB_beta_fitted.csv"
fitted_file_paired = "IEDB_paired_fitted.csv"
cleaned_file_beta = "IEDB_cleaned_data_beta.csv"
cleaned_file_paired = "IEDB_cleaned_data_paired.csv"

# clean IEDB data
%run ./data_scripts/IEDB/IEDB_clean_dataset.ipynb

In [None]:
# Remember the full paths of the cleaned IEDB files for the concatenation step.
# NOTE: cleaned_file_beta / cleaned_file_paired are reassigned by the McPAS and
# VDJdb cells below, so this cell must be run before those cells.
IEDB_cleaned_beta_output = f'{IEDB_data_cleaned}/{cleaned_file_beta}'
IEDB_cleaned_paired_output = f'{IEDB_data_cleaned}/{cleaned_file_paired}'

### McPAS

In [None]:
# prepare directories
# McPAS sub-folders: plain input, cleaned output, and fitted intermediate files
# (the fitted files live in the temp bucket).
McPas_data_plain = f'{pipeline_data_plain}/McPas'
McPas_data_cleaned = f'{pipeline_data_cleaned}/McPas'
McPas_data_fitted = f'{pipeline_data_temp_bucket}/McPas'

McPas_folders = [McPas_data_plain, McPas_data_cleaned, McPas_data_fitted]
create_folders_if_not_exists(McPas_folders)

In [None]:
# prepare parameters for notebook McPAS fit data
# NOTE: input_file, path_prefix_fitted and fitted_file are generic names that
# the VDJdb cells reuse later; set them immediately before each %run.
input_file = f'{McPas_data_plain}/McPAS-TCR.csv'
path_prefix_fitted = McPas_data_fitted
fitted_file = 'McPAS_fitted.tsv'

# fit McPAS data
%run ./data_scripts/McPas-TCR/fit_data_mcpastcr_both.ipynb

In [None]:
# prepare parameters for notebook McPAS clean data
# fitted_file still holds the McPAS value from the fit cell above, so this cell
# has to run before the VDJdb cells overwrite it.
fitted_input_file = f'{McPas_data_fitted}/{fitted_file}'
path_prefix_cleaned = McPas_data_cleaned
cleaned_file_paired = 'McPAS_cleaned_data_paired.tsv'
cleaned_file_beta = 'McPAS_cleaned_data_beta.tsv'

# clean McPAS data
%run ./data_scripts/McPas-TCR/clean_data_mcpastcr_both.ipynb

In [None]:
# Remember the full paths of the cleaned McPAS files for the concatenation step.
# NOTE: cleaned_file_beta / cleaned_file_paired are reassigned by the VDJdb
# cells below, so this cell must be run before those cells.
McPAS_cleaned_beta_output = f'{McPas_data_cleaned}/{cleaned_file_beta}'
McPAS_cleaned_paired_output = f'{McPas_data_cleaned}/{cleaned_file_paired}'

### VDJdb

In [None]:
# prepare directories
# VDJdb sub-folders: plain input, cleaned output, and fitted intermediate files
# (the fitted files live in the temp bucket).
VDJdb_data_plain = f'{pipeline_data_plain}/VDJdb'
VDJdb_data_cleaned = f'{pipeline_data_cleaned}/VDJdb'
VDJdb_data_fitted = f'{pipeline_data_temp_bucket}/VDJdb'

VDJdb_folders = [VDJdb_data_plain, VDJdb_data_cleaned, VDJdb_data_fitted]
create_folders_if_not_exists(VDJdb_folders)

# file names of the fitted datasets; shared by the VDJdb fit and clean cells
fitted_beta_file = 'VDJdb_beta_fitted.tsv'
fitted_paired_file = 'VDJdb_paired_fitted.tsv'

In [None]:
# prepare parameters for notebook VDJdb fit data paired
# (generic globals, overwritten again by the beta fit cell that follows)
input_file = f'{VDJdb_data_plain}/VDJdb_paired_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_paired_file

# fit paired VDJdb data
%run ./data_scripts/VDJdb/fit_data_vdjdb_paired.ipynb

In [None]:
# prepare parameters for notebook VDJdb fit data beta
# (same global names as the paired fit cell, now pointing at the beta files)
input_file = f'{VDJdb_data_plain}/VDJdb_beta_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_beta_file

# fit beta VDJdb data
%run ./data_scripts/VDJdb/fit_data_vdjdb_beta.ipynb

In [None]:
# prepare parameters for notebook VDJdb clean data paired
# input_file is the fitted paired file named in the VDJdb directory cell;
# output_file is where the cleaned dataset is presumably written by the child notebook.
input_file = f'{VDJdb_data_fitted}/{fitted_paired_file}'
cleaned_file_paired = 'VDJdb_cleaned_data_paired.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

# clean paired VDJdb data
%run ./data_scripts/VDJdb/clean_data_vdjdb_paired.ipynb

In [None]:
# prepare parameters for notebook VDJdb clean data beta
# input_file is the fitted beta file named in the VDJdb directory cell;
# output_file is where the cleaned dataset is presumably written by the child notebook.
input_file = f'{VDJdb_data_fitted}/{fitted_beta_file}'
cleaned_file_beta = 'VDJdb_cleaned_data_beta.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'

# clean beta VDJdb data
%run ./data_scripts/VDJdb/clean_data_vdjdb_beta.ipynb

In [None]:
# Remember the full paths of the cleaned VDJdb files for the concatenation step.
# These equal the output_file values used by the two VDJdb clean cells.
VDJdb_cleaned_beta_output = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'
VDJdb_cleaned_paired_output = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

## Data Concatenation
The concatenation includes further cleaning and advanced removal of duplicated rows.

In [None]:
# prepare parameters for concatenation
# Output folder depends on the chosen precision. Note the trailing '/':
# presumably the child notebook appends the file name directly — TODO confirm.
custom_dataset_path = f'{pipeline_data_concatenated}/{precision}/'

# beta input files
vdjdb_beta_read_path = VDJdb_cleaned_beta_output
mcpastcr_beta_read_path = McPAS_cleaned_beta_output
iedb_beta_read_path = IEDB_cleaned_beta_output
# paired input files
vdjdb_paired_read_path = VDJdb_cleaned_paired_output
mcpastcr_paired_read_path = McPAS_cleaned_paired_output
iedb_paired_read_path = IEDB_cleaned_paired_output
# output files
output_file_beta = 'beta_concatenated.tsv'
output_file_paired = 'paired_concatenated.tsv'

create_folders_if_not_exists([custom_dataset_path])

# run the concatenation notebook with the globals defined above
%run ./data_scripts/concatDatasets.ipynb

In [None]:
# Full paths of the concatenated datasets, used as input for the split cells.
# custom_dataset_path already ends with '/', so os.path.join is used to avoid
# the doubled separator ('.../allele//paired_concatenated.tsv') that plain
# string formatting would produce.
concatenated_paired = os.path.join(custom_dataset_path, output_file_paired)
concatenated_beta = os.path.join(custom_dataset_path, output_file_beta)

## Data split
The split creates three datasets: train, validation, and test.

In [None]:
# prepare parameters for split of paired dataset
input_file = concatenated_paired
paired_output_folder = f'{pipeline_data_splitted}/{precision}/paired'
# File names of the three splits. The beta split cell does not set them again,
# and the negative-sample and classification cells reuse them as well.
validation_file_name = 'validation.tsv'
test_file_name = 'test.tsv'
train_file_name = 'train.tsv'
aimed_test_ratio = 0.3 # this means 30% of the concatenated dataset will be for test and validation (fifty/fifty)

create_folders_if_not_exists([paired_output_folder])

# do the split
%run ./data_scripts/data_preparation/split_paired.ipynb

In [None]:
# prepare parameters for split of beta dataset
# validation_file_name / test_file_name / train_file_name are not set here;
# they keep the values assigned in the paired split cell, which must run first.
input_file = concatenated_beta
beta_output_folder = f'{pipeline_data_splitted}/{precision}/beta'
aimed_test_ratio = 0.3 # this means 30% of the concatenated dataset will be for test and validation (fifty/fifty)

create_folders_if_not_exists([beta_output_folder])

# do the split
%run ./data_scripts/data_preparation/split_beta.ipynb

## Negative Data

In [None]:
# prepare parameters for paired dataset
# inputs: the three paired split files
read_path_train = f'{paired_output_folder}/{train_file_name}'
read_path_test = f'{paired_output_folder}/{test_file_name}'
read_path_validation = f'{paired_output_folder}/{validation_file_name}'
# working folder inside the temp bucket (note the trailing '/')
temp_path = f'{pipeline_data_temp_bucket}/negative_samples/paired/'
output_path = paired_output_folder  # we are not interested in the positive only data so we override them with positive/negative dataset
# output names equal the input names, i.e. the split files are replaced in place
train_output_name = train_file_name
validation_output_name = validation_file_name
test_output_name = test_file_name

create_folders_if_not_exists([temp_path])

%run ./data_scripts/negative_samples/negative_samples_paired.ipynb

In [None]:
# prepare parameters for beta dataset
# inputs: the three beta split files
read_path_train = f'{beta_output_folder}/{train_file_name}'
read_path_test = f'{beta_output_folder}/{test_file_name}'
read_path_validation = f'{beta_output_folder}/{validation_file_name}'
# working folder inside the temp bucket (note the trailing '/')
temp_path = f'{pipeline_data_temp_bucket}/negative_samples/beta/'
output_path = beta_output_folder  # we are not interested in the positive only data so we override them with positive/negative dataset
# output names equal the input names, i.e. the split files are replaced in place
train_output_name = train_file_name
validation_output_name = validation_file_name
test_output_name = test_file_name

create_folders_if_not_exists([temp_path])

%run ./data_scripts/negative_samples/negative_samples_beta.ipynb

## Task Classification
The classification done in the split notebook is only correct for positive-only data. After adding negative data, some classifications might be wrong, so the task classification is redone here.

In [None]:
# do the classification for paired data
# `paired` selects the dataset type for the shared classification notebook,
# which is run again further down with paired = False for the beta data.
paired = True
train_data_path = f'{paired_output_folder}/{train_file_name}'
test_data_path = f'{paired_output_folder}/{test_file_name}'
validation_data_path = f'{paired_output_folder}/{validation_file_name}'

%run ./data_scripts/data_preparation/classification.ipynb

In [None]:
# extended classification for paired data
test_path = f'{paired_output_folder}/{test_file_name}'
train_path = f'{paired_output_folder}/{train_file_name}'
validation_path = f'{paired_output_folder}/{validation_file_name}'
# NOTE: output_path is a file path here, whereas the negative-sample cells
# use the same global name for a folder.
output_path = f'{paired_output_folder}/test_reclassified_paired_specific.tsv'
paired_data_path = paired_output_folder
# column names handed to the child notebook (presumably columns of the split files — TODO confirm)
alpha_cdr3_name = 'TRA_CDR3'
beta_cdr3_name = 'TRB_CDR3'
epitope_name = 'Epitope'
task_name = 'task'

%run ./data_scripts/data_preparation/paired_reclassification.ipynb

In [None]:
# do the classification for beta data
# same notebook as for the paired data, now with paired = False and beta paths
paired = False
train_data_path = f'{beta_output_folder}/{train_file_name}'
test_data_path = f'{beta_output_folder}/{test_file_name}'
validation_data_path = f'{beta_output_folder}/{validation_file_name}'

%run ./data_scripts/data_preparation/classification.ipynb

In the next two cells the classification is checked. If the output says "Classification is correct", everything is fine.

In [None]:
# check task classification paired
# splitted_data_path is reused by the beta check cell; set it right before %run
splitted_data_path = paired_output_folder

%run ./data_scripts/data_preparation/check_task_classification_paired.ipynb

In [None]:
# check task classification beta
# same global name as in the paired check cell, now pointing at the beta folder
splitted_data_path = beta_output_folder

%run ./data_scripts/data_preparation/check_task_classification_beta.ipynb

## Upload dataset

In [None]:
# NOTE(review): the whole upload cell is commented out, so nothing is uploaded
# when the notebook is run; presumably disabled on purpose — confirm before enabling.
# raise Exception("Prevent upload")
# from dotenv import load_dotenv, find_dotenv
# load_dotenv()

# # upload paired data
# path_to_data = f'{pipeline_data_splitted}/{precision}/paired'
# dataset_name = f'paired_{precision}'
# main_project_name = os.getenv("MAIN_PROJECT_NAME")

# %run ./data_scripts/upload_datasets.ipynb

In [None]:
# NOTE(review): disabled like the paired upload cell. main_project_name is only
# assigned in the paired upload cell, so if the upload notebook needs it, that
# cell has to be enabled and run first — confirm.
# # upload beta data
# path_to_data = f'{pipeline_data_splitted}/{precision}/beta'
# dataset_name = f'beta_{precision}'

# %run ./data_scripts/upload_datasets.ipynb

## Create Embeddings

In [None]:
path_paired_test = f"data/splitted_datasets/{precision}/paired/test.tsv"
path_paired_validation = f"data/splitted_datasets/{precision}/paired/validation.tsv"
path_paired_train = f"data/splitted_datasets/{precision}/paired/train.tsv"
path_beta_test = f"data/splitted_datasets/{precision}/beta/test.tsv"
path_beta_validation = f"data/splitted_datasets/{precision}/beta/validation.tsv"
path_beta_train = f"data/splitted_datasets/{precision}/beta/train.tsv"

# paired test
%run ./data_scripts/generateEmbeddings.py paired {path_paired_test} TRA_paired_embeddings.npz TRA_CDR3 embeddings_
%run ./data_scripts/generateEmbeddings.py paired {path_paired_test} TRB_paired_embeddings.npz TRB_CDR3 embeddings_
%run ./data_scripts/generateEmbeddings.py paired {path_paired_test} Epitope_paired_embeddings.npz Epitope embeddings_

# paired validation
%run ./data_scripts/generateEmbeddings.py paired {path_paired_validation} TRA_paired_embeddings.npz TRA_CDR3 embeddings_
%run ./data_scripts/generateEmbeddings.py paired {path_paired_validation} TRB_paired_embeddings.npz TRB_CDR3 embeddings_
%run ./data_scripts/generateEmbeddings.py paired {path_paired_validation} Epitope_paired_embeddings.npz Epitope embeddings_

# paired train
%run ./data_scripts/generateEmbeddings.py paired {path_paired_train} TRA_paired_embeddings.npz TRA_CDR3 embeddings_
%run ./data_scripts/generateEmbeddings.py paired {path_paired_train} TRB_paired_embeddings.npz TRB_CDR3 embeddings_
%run ./data_scripts/generateEmbeddings.py paired {path_paired_train} Epitope_paired_embeddings.npz Epitope embeddings_

# beta test
%run beta ./data_scripts/generateEmbeddings.py {path_beta_test} TRB_beta_embeddings.npz TRB_CDR3 embeddings_
%run beta ./data_scripts/generateEmbeddings.py {path_beta_test} Epitope_beta_embeddings.npz Epitope embeddings_

# beta validation
%run beta ./data_scripts/generateEmbeddings.py {path_beta_validation} TRB_beta_embeddings.npz TRB_CDR3 embeddings_
%run beta ./data_scripts/generateEmbeddings.py {path_beta_validation} Epitope_beta_embeddings.npz Epitope embeddings_

# beta train
%run beta ./data_scripts/generateEmbeddings.py {path_beta_train} TRB_beta_embeddings.npz TRB_CDR3 embeddings_
%run beta ./data_scripts/generateEmbeddings.py {path_beta_train} Epitope_beta_embeddings.npz Epitope embeddings_



## Create Physicochemical Properties

In [None]:
# Each call runs the script in a separate Python process with the arguments
#   <paired|beta> <folder with the split files> <split name> <output folder> <precision>
# (argument meaning inferred from the values passed — confirm against the script).
# paired: test, validation, train
!python ./data_scripts/generatePhysicoParallel.py paired {pipeline_data_splitted}/{precision}/paired test ./data/physicoProperties {precision}
!python ./data_scripts/generatePhysicoParallel.py paired {pipeline_data_splitted}/{precision}/paired validation ./data/physicoProperties {precision}
!python ./data_scripts/generatePhysicoParallel.py paired {pipeline_data_splitted}/{precision}/paired train ./data/physicoProperties {precision}

# beta: test, validation, train
!python ./data_scripts/generatePhysicoParallel.py beta {pipeline_data_splitted}/{precision}/beta test ./data/physicoProperties {precision}
!python ./data_scripts/generatePhysicoParallel.py beta {pipeline_data_splitted}/{precision}/beta validation ./data/physicoProperties {precision}
!python ./data_scripts/generatePhysicoParallel.py beta {pipeline_data_splitted}/{precision}/beta train ./data/physicoProperties {precision}

### Scale Physicochemical Properties

In [None]:
# Same folder that is passed as output folder to generatePhysicoParallel.py.
base_path = "./data/physicoProperties"
# NOTE(review): the %run for the paired chain is commented out, so only the
# beta properties are scaled on a fresh "Run All" and this assignment has no
# effect. Confirm whether skipping the paired scaling is intended.
chain = "paired"
#%run ./data_scripts/scale_physicos.ipynb

chain = "beta"
%run ./data_scripts/scale_physicos.ipynb