In [1]:
import os
import pandas as pd

In [2]:
# Precision of the MHC and V/J annotations: 'gene' or 'allele'.
# The value is also used as a sub-folder name for the concatenated and split
# datasets and as part of the uploaded dataset names.
precision = 'gene'

In [3]:
def create_folders_if_not_exists(folders):
  """Create every directory in `folders`, including missing parent directories.

  Directories that already exist are left untouched. `exist_ok=True` lets
  `os.makedirs` handle the "already exists" case itself, so there is no gap
  between an existence check and the creation in which another process or
  thread could create the directory and make the call fail.

  Args:
    folders: iterable of directory paths (str or path-like).

  Raises:
    FileExistsError: if a path exists but is not a directory.
  """
  for path in folders:
    os.makedirs(path, exist_ok=True)

In [4]:
# Root directory of the pipeline and one sub-directory per pipeline stage.
pipeline_data = './data'
pipeline_data_plain = f'{pipeline_data}/plain_datasets'  # raw database exports (inputs of the fit notebooks)
pipeline_data_cleaned = f'{pipeline_data}/cleaned_datasets'  # per-database cleaned files
pipeline_data_concatenated = f'{pipeline_data}/concatenated_datasets'  # merged beta / paired datasets
pipeline_data_splitted = f'{pipeline_data}/splitted_datasets'  # train / validation / test files
pipeline_data_temp_bucket = f'{pipeline_data}/temp'  # intermediate files (fitted data, negative samples)

pipeline_folders = [pipeline_data, pipeline_data_plain, pipeline_data_cleaned, pipeline_data_concatenated, pipeline_data_splitted, pipeline_data_temp_bucket]

# Create all stage folders up front so later cells can write into them.
create_folders_if_not_exists(pipeline_folders)

## Data Preparation

### IEDB

In [6]:
# Prepare the IEDB directories: raw exports, cleaned output and
# fitted intermediates (the latter live in the temp bucket).
IEDB_data_plain = f'{pipeline_data_plain}/IEDB'
IEDB_data_cleaned = f'{pipeline_data_cleaned}/IEDB'
IEDB_data_fitted = f'{pipeline_data_temp_bucket}/IEDB'

IEDB_folders = [IEDB_data_plain, IEDB_data_cleaned, IEDB_data_fitted]
create_folders_if_not_exists(IEDB_folders)

In [7]:
# Parameters for the IEDB fit notebook: raw MHC-I exports as input,
# fitted CSV files in the temp bucket as output.
# (presumably read by the notebook from the shared kernel namespace - verify against the notebook)
path_prefix_plain = IEDB_data_plain
path_prefix_fitted = IEDB_data_fitted
mhc_I_input_beta = f"{path_prefix_plain}/MHCI_IEDB_beta_export.csv"
mhc_I_output_beta = f"{path_prefix_fitted}/IEDB_beta_fitted.csv"
mhc_I_input_paired = f"{path_prefix_plain}/MHCI_IEDB_paired_export.csv"
mhc_I_output_paired = f"{path_prefix_fitted}/IEDB_paired_fitted.csv"

# fit IEDB data
# NOTE(review): the run is commented out, so this cell only sets variables.
# The fitted IEDB files are not regenerated here - confirm this is intended.
#%run ./data_scripts/IEDB/IEDB_fitted_dataset.ipynb

In [8]:
# Parameters for the IEDB clean notebook: folder prefixes plus the file
# names of the fitted inputs and cleaned outputs.
path_prefix_fitted = IEDB_data_fitted
path_prefix_cleaned =  IEDB_data_cleaned
fitted_file_beta = "IEDB_beta_fitted.csv"
fitted_file_paired = "IEDB_paired_fitted.csv"
# cleaned_file_beta / cleaned_file_paired are reassigned by the McPAS and VDJdb cells below.
cleaned_file_beta = "IEDB_cleaned_data_beta.csv"
cleaned_file_paired = "IEDB_cleaned_data_paired.csv"

# clean IEDB data
# NOTE(review): the run is commented out; the cleaned IEDB files used by the
# concatenation step must therefore already exist on disk - confirm.
#%run ./data_scripts/IEDB/IEDB_clean_dataset.ipynb

In [9]:
# Full paths of the cleaned IEDB files, consumed by the concatenation step.
# Must run before the McPAS / VDJdb cells, which overwrite
# cleaned_file_beta and cleaned_file_paired with their own file names.
IEDB_cleaned_beta_output = f'{IEDB_data_cleaned}/{cleaned_file_beta}'
IEDB_cleaned_paired_output = f'{IEDB_data_cleaned}/{cleaned_file_paired}'

### McPAS

In [10]:
# Prepare the McPAS directories: raw export, cleaned output and
# fitted intermediates (the latter live in the temp bucket).
McPas_data_plain = f'{pipeline_data_plain}/McPas'
McPas_data_cleaned = f'{pipeline_data_cleaned}/McPas'
McPas_data_fitted = f'{pipeline_data_temp_bucket}/McPas'

McPas_folders = [McPas_data_plain, McPas_data_cleaned, McPas_data_fitted]
create_folders_if_not_exists(McPas_folders)

In [11]:
# Parameters for the McPAS fit notebook: raw McPAS-TCR export as input,
# fitted TSV in the temp bucket as output.
# input_file, path_prefix_fitted and fitted_file are generic names that the
# VDJdb cells reassign later, so they must be set right before each run.
input_file = f'{McPas_data_plain}/McPAS-TCR.csv'
path_prefix_fitted = McPas_data_fitted
fitted_file = 'McPAS_fitted.tsv'

# fit McPAS data
%run ./data_scripts/McPas-TCR/fit_data_mcpastcr_both.ipynb

In [12]:
# Parameters for the McPAS clean notebook: the fitted file from the previous
# cell as input, cleaned paired / beta TSV files as output.
fitted_input_file = f'{McPas_data_fitted}/{fitted_file}'
path_prefix_cleaned = McPas_data_cleaned
# These overwrite the IEDB values of cleaned_file_paired / cleaned_file_beta.
cleaned_file_paired = 'McPAS_cleaned_data_paired.tsv'
cleaned_file_beta = 'McPAS_cleaned_data_beta.tsv'

# clean McPAS data
%run ./data_scripts/McPas-TCR/clean_data_mcpastcr_both.ipynb

MHC Class I has 10078 entries
whole dataframe has 13701 entries
filtered to only use MHC Class I. Length of dataset: 10078


  mcpastcr_cleaned_both_df = mcpastcr_cleaned_both_df[~mask]


In [13]:
# Full paths of the cleaned McPAS files, consumed by the concatenation step.
# Must run before the VDJdb clean cells overwrite cleaned_file_beta / cleaned_file_paired.
McPAS_cleaned_beta_output = f'{McPas_data_cleaned}/{cleaned_file_beta}'
McPAS_cleaned_paired_output = f'{McPas_data_cleaned}/{cleaned_file_paired}'

### VDJdb

In [14]:
# Prepare the VDJdb directories: raw exports, cleaned output and
# fitted intermediates (the latter live in the temp bucket).
VDJdb_data_plain = f'{pipeline_data_plain}/VDJdb'
VDJdb_data_cleaned = f'{pipeline_data_cleaned}/VDJdb'
VDJdb_data_fitted = f'{pipeline_data_temp_bucket}/VDJdb'

VDJdb_folders = [VDJdb_data_plain, VDJdb_data_cleaned, VDJdb_data_fitted]
create_folders_if_not_exists(VDJdb_folders)

# File names of the fitted intermediates, shared by the fit and clean cells below.
fitted_beta_file = 'VDJdb_beta_fitted.tsv'
fitted_paired_file = 'VDJdb_paired_fitted.tsv'

In [15]:
# Parameters for the VDJdb fit notebook (paired data): raw paired export as
# input, fitted paired TSV in the temp bucket as output.
input_file = f'{VDJdb_data_plain}/VDJdb_paired_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_paired_file

# fit paired VDJdb data
%run ./data_scripts/VDJdb/fit_data_vdjdb_paired.ipynb

In [16]:
# Parameters for the VDJdb fit notebook (beta data): raw beta export as
# input, fitted beta TSV in the temp bucket as output.
# Reuses the same variable names as the paired fit cell above.
input_file = f'{VDJdb_data_plain}/VDJdb_beta_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_beta_file

# fit beta VDJdb data
%run ./data_scripts/VDJdb/fit_data_vdjdb_beta.ipynb

In [17]:
# Parameters for the VDJdb clean notebook (paired data): fitted paired file
# as input, cleaned paired TSV as output.
input_file = f'{VDJdb_data_fitted}/{fitted_paired_file}'
cleaned_file_paired = 'VDJdb_cleaned_data_paired.tsv'  # overwrites the McPAS value
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

# clean paired VDJdb data
%run ./data_scripts/VDJdb/clean_data_vdjdb_paired.ipynb

MHC Class I has 27414 entries
whole dataframe has 28119 entries
filtered to only use MHC Class I. Length of dataset: 27414


In [18]:
# Parameters for the VDJdb clean notebook (beta data): fitted beta file
# as input, cleaned beta TSV as output.
input_file = f'{VDJdb_data_fitted}/{fitted_beta_file}'
cleaned_file_beta = 'VDJdb_cleaned_data_beta.tsv'  # overwrites the McPAS value
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'

# clean beta VDJdb data
%run ./data_scripts/VDJdb/clean_data_vdjdb_beta.ipynb

MHC Class I has 46507 entries
whole dataframe has 49042 entries
filtered to only use MHC Class I. Length of dataset: 46507


In [19]:
# Full paths of the cleaned VDJdb files, consumed by the concatenation step.
# Relies on cleaned_file_beta / cleaned_file_paired as set by the two VDJdb clean cells.
VDJdb_cleaned_beta_output = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'
VDJdb_cleaned_paired_output = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

## Data Concatenation
The concatenation includes further cleaning and advanced removal of duplicated rows.

In [20]:
# Parameters for the concatenation notebook.
# Output folder, one per precision level; note the trailing slash.
custom_dataset_path = f'{pipeline_data_concatenated}/{precision}/'

# beta input files (cleaned outputs of the three databases)
vdjdb_beta_read_path = VDJdb_cleaned_beta_output
mcpastcr_beta_read_path = McPAS_cleaned_beta_output
iedb_beta_read_path = IEDB_cleaned_beta_output
# paired input files (cleaned outputs of the three databases)
vdjdb_paired_read_path = VDJdb_cleaned_paired_output
mcpastcr_paired_read_path = McPAS_cleaned_paired_output
iedb_paired_read_path = IEDB_cleaned_paired_output
# output file names (presumably written below custom_dataset_path - verify against the notebook)
output_file_beta = 'beta_concatenated.tsv'
output_file_paired = 'paired_concatenated.tsv'

create_folders_if_not_exists([custom_dataset_path])

# merge the databases into one beta and one paired dataset
%run ./data_scripts/concatDatasets.ipynb

length of beta_df: 231627




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 36836 entries removed.
removed all duplicates (CDR3, Epitope) from distinct values (most_important_columns, keep=False). 47581 entries removed.
beta removed entries df length: 47581


Number of groups formed: 18337


  duplicates_to_add = pd.concat([duplicates_to_add, group[group['is_duplicated'] == False]])


32617 can be re-added to the no-duplicated dataframe
from the plain dataset which has 231622 entries, 51800 entries have been removed.
for beta dataset :
size difference is: 51800
  179822 information score cleaned: 5.202255563835348
  231622 information score dropout: 5.0485964200291855
final_beta_df length = 179822
length of paired_df: 54338




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 6090 entries removed.
removed all duplicates from distinct values (cultivated columns, keep=False). 32381 entries removed.
paired removed entries df length: 32381




  duplicates_to_add = pd.concat([duplicates_to_add, group[group['is_duplicated'] == False]])


32337 can be re-added to the no-duplicated dataframe
from the plain dataset which has 54295 entries, 6134 entries have been removed.
for paired dataset:
size difference is: 6134
  48161 information score cleaned: 7.147442951765952
  54295 information score dropout: 7.243300488074408
final_paired_df length: 48161


In [21]:
# Full paths of the concatenated datasets, used as input of the split step.
# custom_dataset_path already ends with '/', so joining with os.path.join
# avoids the doubled separator ('.../gene//paired_concatenated.tsv') that
# plain string formatting would produce.
concatenated_paired = os.path.join(custom_dataset_path, output_file_paired)
concatenated_beta = os.path.join(custom_dataset_path, output_file_beta)

## Data split
The split creates three datasets: train, validation, and test.

In [22]:
# Parameters for the split notebook (paired dataset): the concatenated paired
# file as input, one output folder per precision level.
input_file = concatenated_paired
paired_output_folder = f'{pipeline_data_splitted}/{precision}/paired'
# File names of the three splits; reused by the beta split and all later steps.
validation_file_name = 'validation.tsv'
test_file_name = 'test.tsv'
train_file_name = 'train.tsv'
aimed_test_ratio = 0.3 # this means 30% of the concatenated dataset will be for test and validation (fifty/fifty)

create_folders_if_not_exists([paired_output_folder])

# do the split
%run ./data_scripts/data_preparation/split_paired.ipynb

distinct tcr's: 29339 from 48161
unique tcr's: 13525 from 48161
unique epitopes: 616 from 48161
train data has 34636 entries
test data has 13525 entries
test data has 0 TPP1 tasks (unseen tcr & seen epitopes).
test data has 11253 TPP2 tasks (unseen tcr & seen epitopes).
test data has 2272 TPP3 tasks (unseen tcr & unseen epitope).
the train/test ratio is 0.7191711135566121/0.2808288864433878
924 entries need to be shifted from train to test so the train/test ratio can be 0.7/0.3
924 entries from train will be moved to test (TPP1)
df_train size before: 34636
number of tpp1 before: 0
number of tpp2 before: 11253
df_train size after: 33712
number of tpp1 after: 924
number of tpp2 after: 11253
5164 entries will be shifted from test to train so the tpp1/tpp2 ratio can be 0.5/0.5
5165 entries need to be shifted from train to test so the tpp1/tpp2 ratio can be 0.5/0.5
5165 entries from train will be moved to test (TPP1)
df_train size before: 38876
number of tpp1 before: 924
number of tpp2 befo

  df_train = pd.concat([df_train, rows_to_move], ignore_index=True)


df_train size after: 33711
number of tpp1 after: 6089
number of tpp2 after: 6089
train data has 33711 entries
test data has 14450 entries
test data has 6089 TPP1 tasks (seen tcr & seen epitopes).
test data has 6089 TPP2 tasks (unseen tcr & seen epitopes).
test data has 2272 TPP3 tasks (unseen tcr & unseen epitope).
the train/test ratio is 0.6999647017296152/0.3000352982703848
test data has 7226 entries
validation data has 7224 entries
train data has 33711 entries
test data has 3045 TPP1 tasks (seen tcr & seen epitopes).
test data has 3891 TPP2 tasks (unseen tcr & seen epitopes).
test data has 290 TPP3 tasks (unseen tcr & unseen epitope).
the test ratio is 0.849961587176346/0.150038412823654
the validation ratio is 0.8500031145532693/0.14999688544673076


In [23]:
# Parameters for the split notebook (beta dataset): the concatenated beta
# file as input, one output folder per precision level.
# The split file names (train/test/validation) are not set here; they are
# still in the namespace from the paired split cell.
input_file = concatenated_beta
beta_output_folder = f'{pipeline_data_splitted}/{precision}/beta'
aimed_test_ratio = 0.3 # this means 30% of the concatenated dataset will be for test and validation (fifty/fifty)

create_folders_if_not_exists([beta_output_folder])

# do the split
%run ./data_scripts/data_preparation/split_beta.ipynb

distinct tcr's: 152160 from 179822
unique tcr's: 139540 from 179822
unique epitopes: 678 from 179822
train data has 40282 entries
test data has 139540 entries
test data has 0 TPP1 tasks (unseen tcr & seen epitopes).
test data has 137217 TPP2 tasks (unseen tcr & seen epitopes).
test data has 2323 TPP3 tasks (unseen tcr & unseen epitope).
the train/test ratio is 0.22401041029462465/0.7759895897053753
85594 entries will be shifted from test to train so the train/test ratio can be 0.7/0.3
25811 entries will be shifted from test to train so the tpp1/tpp2 ratio can be 0.5/0.5
25812 entries need to be shifted from train to test so the tpp1/tpp2 ratio can be 0.5/0.5
train data has 125875 entries
test data has 53947 entries
test data has 25812 TPP1 tasks (seen tcr & seen epitopes).
test data has 25812 TPP2 tasks (unseen tcr & seen epitopes).
test data has 2323 TPP3 tasks (unseen tcr & unseen epitope).
the train/test ratio is 0.6999977755780716/0.30000222442192837
test data has 26974 entries
val

## Negative Data

In [25]:
# pip install transformers

In [26]:
# pip install sentencepiece

In [27]:
#pip install --upgrade transformers


In [28]:
#from transformers import T5Tokenizer
#tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)


In [24]:
# Parameters for the negative-sample notebook (paired dataset).
# Inputs are the positive-only split files written by the split step.
read_path_train = f'{paired_output_folder}/{train_file_name}'
read_path_test = f'{paired_output_folder}/{test_file_name}'
read_path_validation = f'{paired_output_folder}/{validation_file_name}'
temp_path = f'{pipeline_data_temp_bucket}/negative_samples/paired/'
# Output folder and file names equal the inputs, so the split files are replaced in place.
output_path = paired_output_folder  # we are not interested in the positive only data so we override them with positive/negative dataset
train_output_name = train_file_name
validation_output_name = validation_file_name
test_output_name = test_file_name

create_folders_if_not_exists([temp_path])

# generate negative samples for the paired dataset
%run ./data_scripts/negative_samples/negative_samples_paired.ipynb

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu
Loading: Rostlab/prot_t5_xl_half_uniref50-enc
Casting model to full precision for running on CPU ...


In [25]:
# Parameters for the negative-sample notebook (beta dataset).
# Inputs are the positive-only split files written by the split step.
read_path_train = f'{beta_output_folder}/{train_file_name}'
read_path_test = f'{beta_output_folder}/{test_file_name}'
read_path_validation = f'{beta_output_folder}/{validation_file_name}'
temp_path = f'{pipeline_data_temp_bucket}/negative_samples/beta/'
# Output folder and file names equal the inputs, so the split files are replaced in place.
output_path = beta_output_folder  # we are not interested in the positive only data so we override them with positive/negative dataset
train_output_name = train_file_name
validation_output_name = validation_file_name
test_output_name = test_file_name

create_folders_if_not_exists([temp_path])

# generate negative samples for the beta dataset
%run ./data_scripts/negative_samples/negative_samples_beta.ipynb

Using device: cpu
Loading: Rostlab/prot_t5_xl_half_uniref50-enc
Casting model to full precision for running on CPU ...


## Task Classification
The classification done in the split notebook is only correct for positive-only data. After adding negative data, some classifications might be wrong, so the tasks are classified again here.

In [5]:
# Same folders and file names as in the split cells, repeated here so the
# task classification section does not depend on the split cells having run.
paired_output_folder = f'{pipeline_data_splitted}/{precision}/paired'
validation_file_name = 'validation.tsv'
test_file_name = 'test.tsv'
train_file_name = 'train.tsv'
beta_output_folder = f'{pipeline_data_splitted}/{precision}/beta'

In [6]:
# Do the classification for paired data.
# `paired` selects the dataset type; the same notebook is run again
# with paired = False for the beta data.
paired = True
train_data_path = f'{paired_output_folder}/{train_file_name}'
test_data_path = f'{paired_output_folder}/{test_file_name}'
validation_data_path = f'{paired_output_folder}/{validation_file_name}'

%run ./data_scripts/data_preparation/classification.ipynb

In [7]:
# Extended classification for paired data.
test_path = f'{paired_output_folder}/{test_file_name}'
train_path = f'{paired_output_folder}/{train_file_name}'
validation_path = f'{paired_output_folder}/{validation_file_name}'
# NOTE: output_path is a file path here, whereas the negative-sample cells
# set the same variable to a folder.
output_path = f'{paired_output_folder}/test_reclassified_paired_specific.tsv'
paired_data_path = paired_output_folder
# Column names passed to the notebook.
alpha_cdr3_name = 'TRA_CDR3'
beta_cdr3_name = 'TRB_CDR3'
epitope_name = 'Epitope'
task_name = 'task'

%run ./data_scripts/data_preparation/paired_reclassification.ipynb

gene
./data/splitted_datasets/gene/paired/train.tsv
       TCR_name          TRAV    TRAJ             TRA_CDR3      TRBV     TRBJ  \
0             1    TRAV29/DV5  TRAJ37       CAASALGNTGKLIF   TRBV4-1  TRBJ1-1   
1             2         TRAV5  TRAJ20        CAEIRANDYKLSF   TRBV5-1  TRBJ1-2   
2             3        TRAV17  TRAJ17     CAALDGIKAAGNKLTF    TRBV19  TRBJ1-1   
3             4           NaN     NaN         CAFLGGANNLFF       NaN      NaN   
4             5        TRAV17   TRAJ7       CATGLYYGNNRLAF  TRBV11-2  TRBJ2-2   
...         ...           ...     ...                  ...       ...      ...   
14410     14411       TRAV1-2   TRAJ3       CAVKTPSSASKIIF  TRBV20-1  TRBJ1-1   
14411     14412         TRAV3  TRAJ13       CAVSLSGGYQKVTF    TRBV19  TRBJ1-2   
14412     14413      TRAV38-1  TRAJ40         CAYTSGTYKYIF    TRBV27  TRBJ2-3   
14413     14414  TRAV38-2/DV8  TRAJ39          CAYSAGNMLTF    TRBV27  TRBJ2-3   
14414     14415        TRAV19  TRAJ57  CALSEEKVITQGGSEKLV

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mfrohoari[0m ([33mpa_cancerimmunotherapy[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (./data/splitted_datasets/gene/paired)... Done. 0.0s


VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [8]:
# Do the classification for beta data.
# Same notebook as for the paired data, with paired = False and the beta split files.
paired = False
train_data_path = f'{beta_output_folder}/{train_file_name}'
test_data_path = f'{beta_output_folder}/{test_file_name}'
validation_data_path = f'{beta_output_folder}/{validation_file_name}'

%run ./data_scripts/data_preparation/classification.ipynb

In the next two cells the classification is checked. If the output says "Classification is correct", everything is fine.

In [29]:
# Check the task classification of the paired split files.
# Expected output: "Classification is correct."
splitted_data_path = paired_output_folder

%run ./data_scripts/data_preparation/check_task_classification_paired.ipynb

train+validate data has 81381 entries
test data has 14415 entries
test data has 5879 TPP1 tasks (seen tcr & seen epitopes).
test data has 7816 TPP2 tasks (unseen tcr & seen epitopes).
test data has 546 TPP3 tasks (unseen tcr & unseen epitope).
test data has 174 TPP4 tasks (seen tcr & unseen epitope).
the train/test ratio is 0.8495239884755105/0.15047601152448953
Classification is correct.
Correctness summary:
is_correct
True    14415
Name: count, dtype: int64


In [30]:
# Check the task classification of the beta split files.
# Expected output: "Classification is correct."
splitted_data_path = beta_output_folder

%run ./data_scripts/data_preparation/check_task_classification_beta.ipynb

train data has 251686 entries
test data has 53858 entries
test data has 25602 TPP1 tasks (seen tcr & seen epitopes).
test data has 27699 TPP2 tasks (unseen tcr & seen epitopes).
test data has 437 TPP3 tasks (unseen tcr & unseen epitope).
test data has 120 TPP4 tasks (seen tcr & unseen epitope).
the train/test ratio is 0.8501459364557336/0.14985406354426647
Classification is correct.
Correctness summary:
is_correct
True    53858
Name: count, dtype: int64


## Upload dataset

In [9]:
# NOTE(review): find_dotenv is imported but not used in this cell.
from dotenv import load_dotenv, find_dotenv
# Presumably loads the credentials needed for the upload from a .env file - verify.
load_dotenv()

# upload paired data
path_to_data = f'{pipeline_data_splitted}/{precision}/paired'
dataset_name = f'paired_{precision}'
# The project name is derived from the precision; the environment variable
# MAIN_PROJECT_NAME is deliberately not used (kept below for reference).
#main_project_name = os.getenv("MAIN_PROJECT_NAME")
main_project_name = f"dataset-{precision}"

%run ./data_scripts/upload_datasets.ipynb

uploading dataset to dataset-gene


[34m[1mwandb[0m: Adding directory to artifact (./data/splitted_datasets/gene/paired)... Done. 0.0s


In [10]:
# Upload beta data.
# main_project_name is not set here; it is still in the namespace from the
# paired upload cell, which therefore has to run first.
path_to_data = f'{pipeline_data_splitted}/{precision}/beta'
dataset_name = f'beta_{precision}'

%run ./data_scripts/upload_datasets.ipynb

uploading dataset to dataset-gene


[34m[1mwandb[0m: Adding directory to artifact (./data/splitted_datasets/gene/beta)... Done. 0.1s


VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

## Create Embeddings

In [1]:
import torch

# Report the GPU setup before generating embeddings:
#   line 1 - whether CUDA is usable (should be True)
#   line 2 - the CUDA version torch reports (should be the expected version)
print(torch.cuda.is_available(), torch.version.cuda, sep="\n")


True
11.8


In [5]:
path_paired_test = f"data/splitted_datasets/{precision}/paired/test.tsv"
path_paired_validation = f"data/splitted_datasets/{precision}/paired/validation.tsv"
path_paired_train = f"data/splitted_datasets/{precision}/paired/train.tsv"
path_beta_test = f"data/splitted_datasets/{precision}/beta/test.tsv"
path_beta_validation = f"data/splitted_datasets/{precision}/beta/validation.tsv"
path_beta_train = f"data/splitted_datasets/{precision}/beta/train.tsv"

path_paired = f"{pipeline_data}/embeddings/temp/{precision}/paired_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_paired)])
df_paired_test = pd.read_csv(path_paired_test, sep="\t", index_col=False)
df_paired_validation = pd.read_csv(path_paired_validation, sep="\t", index_col=False)
df_paired_train = pd.read_csv(path_paired_train, sep="\t", index_col=False)
df_paired = pd.concat([df_paired_test, df_paired_validation, df_paired_train])
df_paired.to_csv(path_paired, sep="\t", index=False)

# # paired
#%run ./data_scripts/generateEmbeddings.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRA_paired_embeddings.npz TRA_CDR3
#%run ./data_scripts/generateEmbeddings.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRB_paired_embeddings.npz TRB_CDR3
#%run ./data_scripts/generateEmbeddings.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/Epitope_paired_embeddings.npz Epitope

path_beta = f"{pipeline_data}/embeddings/temp/{precision}/beta_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_beta)])
df_beta_test = pd.read_csv(path_beta_test, sep="\t", index_col=False)
df_beta_validation = pd.read_csv(path_beta_validation, sep="\t", index_col=False)
df_beta_train = pd.read_csv(path_beta_train, sep="\t", index_col=False)
df_beta = pd.concat([df_beta_test, df_beta_validation, df_beta_train])
df_beta.to_csv(path_beta, sep="\t", index=False)

# beta
#%run ./data_scripts/generateEmbeddings.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/TRB_beta_embeddings.npz TRB_CDR3
%run ./data_scripts/generateEmbeddings.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/Epitope_beta_embeddings.npz Epitope


Using GPU: Tesla T4
Loading: Rostlab/prot_t5_xl_half_uniref50-enc


  return torch.load(checkpoint_file, map_location="cpu")


Model is on device: cuda:0


## Create Physicochemical Properties

In [5]:
!python ./data_scripts/generatePhysicoParallel.py paired {pipeline_data_splitted}/{precision}/paired test ./data/physicoProperties {precision}
!python ./data_scripts/generatePhysicoParallel.py paired {pipeline_data_splitted}/{precision}/paired validation ./data/physicoProperties {precision}
!python ./data_scripts/generatePhysicoParallel.py paired {pipeline_data_splitted}/{precision}/paired train ./data/physicoProperties {precision}

!python ./data_scripts/generatePhysicoParallel.py beta {pipeline_data_splitted}/{precision}/beta test ./data/physicoProperties {precision}
!python ./data_scripts/generatePhysicoParallel.py beta {pipeline_data_splitted}/{precision}/beta validation ./data/physicoProperties {precision}
!python ./data_scripts/generatePhysicoParallel.py beta {pipeline_data_splitted}/{precision}/beta train ./data/physicoProperties {precision}

/bin/bash: line 1: python: command not found
/bin/bash: line 1: python: command not found
/bin/bash: line 1: python: command not found
/bin/bash: line 1: python: command not found
/bin/bash: line 1: python: command not found
/bin/bash: line 1: python: command not found


### Scale Physicochemical Properties

In [7]:
!pip install peptides


[1;31merror[0m: [1mexternally-managed-environment[0m

[31m×[0m This environment is externally managed
[31m╰─>[0m To install Python packages system-wide, try apt install
[31m   [0m python3-xyz, where xyz is the package you are trying to
[31m   [0m install.
[31m   [0m 
[31m   [0m If you wish to install a non-Debian-packaged Python package,
[31m   [0m create a virtual environment using python3 -m venv path/to/venv.
[31m   [0m Then use path/to/venv/bin/python and path/to/venv/bin/pip. Make
[31m   [0m sure you have python3-full installed.
[31m   [0m 
[31m   [0m If you wish to install a non-Debian packaged Python application,
[31m   [0m it may be easiest to use pipx install xyz, which will manage a
[31m   [0m virtual environment for you. Make sure you have pipx installed.
[31m   [0m 
[31m   [0m See /usr/share/doc/python3.12/README.venv for more information.

[1;35mnote[0m: If you believe this is a mistake, please contact your Python installation or OS dist

In [6]:
base_path = "./data/physicoProperties"
chain = "paired"
%run ./data_scripts/scale_physicos.ipynb

chain = "beta"
%run ./data_scripts/scale_physicos.ipynb

ModuleNotFoundError: No module named 'peptides'

ModuleNotFoundError: No module named 'peptides'

In [None]:
# Sanity check: the epitope scaler must have been fitted on the expected
# number of physicochemical features.
# epitope_scaler and number_of_pyhsico_features are not defined in this
# notebook; presumably they are left in the namespace by scale_physicos.ipynb - verify.
print(epitope_scaler.n_features_in_)
assert epitope_scaler.n_features_in_ == number_of_pyhsico_features, (
    f"epitope scaler was fitted on {epitope_scaler.n_features_in_} features, "
    f"expected {number_of_pyhsico_features}"
)


In [10]:
# Data length check: row counts of the concatenated datasets.

## concatenated_datasets
data_path_paired_gene = 'data/concatenated_datasets/gene/paired_concatenated.tsv'
data_path_paired_allele = 'data/concatenated_datasets/allele/paired_concatenated.tsv'
data_path_beta_gene = 'data/concatenated_datasets/gene/beta_concatenated.tsv'
data_path_beta_allele = 'data/concatenated_datasets/allele/beta_concatenated.tsv'

# Load the datasets.
# All four files are tab-separated, so every read passes sep='\t'. Reading the
# gene files with the default comma separator would collapse each row into a
# single column and fail or miscount as soon as a field contains a comma.
# low_memory=False reads each file in one pass; the recorded run emitted a
# pandas warning for the beta allele file (presumably a mixed-dtype warning - verify).
paired_gene_df = pd.read_csv(data_path_paired_gene, sep='\t', low_memory=False)
paired_allele_df = pd.read_csv(data_path_paired_allele, sep='\t', low_memory=False)
beta_gene_df = pd.read_csv(data_path_beta_gene, sep='\t', low_memory=False)
beta_allele_df = pd.read_csv(data_path_beta_allele, sep='\t', low_memory=False)

# Compute the number of rows
paired_gene_length = len(paired_gene_df)
paired_allele_length = len(paired_allele_df)
beta_gene_length = len(beta_gene_df)
beta_allele_length = len(beta_allele_df)

# Show the results
print(f'Anzahl der Zeilen in Paired Gene: {paired_gene_length}')
print(f'Anzahl der Zeilen in Paired Allele: {paired_allele_length}')
print(f'Anzahl der Zeilen in Beta Gene: {beta_gene_length}')
print(f'Anzahl der Zeilen in Beta Allele: {beta_allele_length}')

Anzahl der Zeilen in Paired Gene: 48161
Anzahl der Zeilen in Paired Allele: 52167
Anzahl der Zeilen in Beta Gene: 179822
Anzahl der Zeilen in Beta Allele: 199492


  beta_allele_df = pd.read_csv(data_path_beta_allele, sep='\t')


In [12]:
# Data length check: row counts of the split datasets (train / test /
# validation) for every combination of chain type and precision.

## splitted datasets

import pandas as pd

# Cell-specific names are used on purpose: the original loop variable
# `dataset_name` and the variable `base_path` are parameters of the upload
# and scaling notebooks, and overwriting them here would silently change
# what a later re-run of those notebooks sees.
split_check_base_path = 'data/splitted_datasets'
split_check_names = ("train", "test", "validation")

# Paths of the three split files for all four categories.
datasets = {
    f"{chain_type}_{precision_level}": {
        split_name: f"{split_check_base_path}/{precision_level}/{chain_type}/{split_name}.tsv"
        for split_name in split_check_names
    }
    for chain_type in ("paired", "beta")
    for precision_level in ("gene", "allele")
}

# Compute the number of rows of every split. Only the lengths are kept, so
# no large DataFrames stay behind in the kernel namespace.
results = {}
for dataset_key, paths in datasets.items():
    row_counts = {
        split_name: len(pd.read_csv(paths[split_name], sep='\t'))
        for split_name in split_check_names
    }
    results[dataset_key] = {
        "Train": row_counts["train"],
        "Test": row_counts["test"],
        "Validation": row_counts["validation"],
        "Total": sum(row_counts.values()),
    }

# Show the results
for dataset_key, lengths in results.items():
    print(f'--- {dataset_key.replace("_", " ").title()} ---')
    print(f'Anzahl der Zeilen im Trainingsdatensatz: {lengths["Train"]}')
    print(f'Anzahl der Zeilen im Testdatensatz: {lengths["Test"]}')
    print(f'Anzahl der Zeilen im Validierungsdatensatz: {lengths["Validation"]}')
    print(f'Gesamtanzahl der Zeilen (Train + Test + Validation): {lengths["Total"]}\n')

# Same numbers as an overview table (one row per dataset).
summary_df = pd.DataFrame(
    [
        {"Dataset": dataset_key.replace("_", " ").title(), **lengths}
        for dataset_key, lengths in results.items()
    ]
)
print(summary_df)



--- Paired Gene ---
Anzahl der Zeilen im Trainingsdatensatz: 66958
Anzahl der Zeilen im Testdatensatz: 14415
Anzahl der Zeilen im Validierungsdatensatz: 14423
Gesamtanzahl der Zeilen (Train + Test + Validation): 95796

--- Paired Allele ---
Anzahl der Zeilen im Trainingsdatensatz: 72623
Anzahl der Zeilen im Testdatensatz: 15609
Anzahl der Zeilen im Validierungsdatensatz: 15608
Gesamtanzahl der Zeilen (Train + Test + Validation): 103840

--- Beta Gene ---
Anzahl der Zeilen im Trainingsdatensatz: 251686
Anzahl der Zeilen im Testdatensatz: 53858
Anzahl der Zeilen im Validierungsdatensatz: 53859
Gesamtanzahl der Zeilen (Train + Test + Validation): 359403

--- Beta Allele ---
Anzahl der Zeilen im Trainingsdatensatz: 279012
Anzahl der Zeilen im Testdatensatz: 59798
Anzahl der Zeilen im Validierungsdatensatz: 59771
Gesamtanzahl der Zeilen (Train + Test + Validation): 398581

         Dataset   Train   Test  Validation   Total
0    Paired Gene   66958  14415       14423   95796
1  Paired Allel