In [1]:
import os
import pandas as pd

In [27]:
import sys
!"{sys.executable}" -m pip install tidytcells


Collecting tidytcells
  Using cached tidytcells-2.1.3-py3-none-any.whl.metadata (3.7 kB)
Using cached tidytcells-2.1.3-py3-none-any.whl (90 kB)
Installing collected packages: tidytcells
Successfully installed tidytcells-2.1.3


In [2]:
# set precision of mhc and V/J values (gene or allele)
precision = 'allele'

In [3]:
# this function is not thread safe
def create_folders_if_not_exists(folders):
  for path in folders:
    if not os.path.exists(path):
      os.makedirs(path)

In [4]:
pipeline_data = './data_10x'
pipeline_data_plain = f'{pipeline_data}/plain_datasets'
pipeline_data_cleaned = f'{pipeline_data}/cleaned_datasets'
pipeline_data_concatenated = f'{pipeline_data}/concatenated_datasets'
pipeline_data_splitted = f'{pipeline_data}/splitted_datasets'
pipeline_data_temp_bucket = f'{pipeline_data}/temp'

pipeline_folders = [pipeline_data, pipeline_data_plain, pipeline_data_cleaned, pipeline_data_concatenated, pipeline_data_splitted, pipeline_data_temp_bucket]

create_folders_if_not_exists(pipeline_folders)

## Data Preparation

### IEDB

In [6]:
# prepare directories
IEDB_data_plain = f'{pipeline_data_plain}/IEDB'
IEDB_data_cleaned = f'{pipeline_data_cleaned}/IEDB'
IEDB_data_fitted = f'{pipeline_data_temp_bucket}/IEDB'

IEDB_folders = [IEDB_data_plain, IEDB_data_cleaned, IEDB_data_fitted]
create_folders_if_not_exists(IEDB_folders)

In [7]:
# prepare parameters for notebook IEDB fit data
path_prefix_plain = IEDB_data_plain
path_prefix_fitted = IEDB_data_fitted
mhc_I_input_beta = f"{path_prefix_plain}/MHCI_IEDB_beta_export.csv"
mhc_I_output_beta = f"{path_prefix_fitted}/IEDB_beta_fitted.csv"
mhc_I_input_paired = f"{path_prefix_plain}/MHCI_IEDB_paired_export.csv"
mhc_I_output_paired = f"{path_prefix_fitted}/IEDB_paired_fitted.csv"

# fit IEDB data
#%run ./data_scripts/IEDB/IEDB_fitted_dataset.ipynb

In [8]:
# prepare parameters for notebook IEDB clean data
path_prefix_fitted = IEDB_data_fitted
path_prefix_cleaned =  IEDB_data_cleaned
fitted_file_beta = "IEDB_beta_fitted.csv"
fitted_file_paired = "IEDB_paired_fitted.csv"
cleaned_file_beta = "IEDB_cleaned_data_beta.csv"
cleaned_file_paired = "IEDB_cleaned_data_paired.csv"

# clean IEDB data
#%run ./data_scripts/IEDB/IEDB_clean_dataset.ipynb

In [9]:
IEDB_cleaned_beta_output = f'{IEDB_data_cleaned}/{cleaned_file_beta}'
IEDB_cleaned_paired_output = f'{IEDB_data_cleaned}/{cleaned_file_paired}'

### McPAS

In [10]:
# prepare directories
McPas_data_plain = f'{pipeline_data_plain}/McPas'
McPas_data_cleaned = f'{pipeline_data_cleaned}/McPas'
McPas_data_fitted = f'{pipeline_data_temp_bucket}/McPas'

McPas_folders = [McPas_data_plain, McPas_data_cleaned, McPas_data_fitted]
create_folders_if_not_exists(McPas_folders)

In [11]:
# prepare parameters for notebook McPAS fit data
input_file = f'{McPas_data_plain}/McPAS-TCR.csv'
path_prefix_fitted = McPas_data_fitted
fitted_file = 'McPAS_fitted.tsv'

# fit McPAS data
%run ./data_scripts/McPas-TCR/fit_data_mcpastcr_both.ipynb

In [12]:
# prepare parameters for notebook McPAS clean data
fitted_input_file = f'{McPas_data_fitted}/{fitted_file}'
path_prefix_cleaned = McPas_data_cleaned
cleaned_file_paired = 'McPAS_cleaned_data_paired.tsv'
cleaned_file_beta = 'McPAS_cleaned_data_beta.tsv'

# clean McPAS data
%run ./data_scripts/McPas-TCR/clean_data_mcpastcr_both.ipynb

MHC Class I has 10078 entries
whole dataframe has 13701 entries
filtered to only use MHC Class I. Length of dataset: 10078


  mcpastcr_cleaned_both_df = mcpastcr_cleaned_both_df[~mask]


In [13]:
McPAS_cleaned_beta_output = f'{McPas_data_cleaned}/{cleaned_file_beta}'
McPAS_cleaned_paired_output = f'{McPas_data_cleaned}/{cleaned_file_paired}'

### VDJdb

In [14]:
# prepare directories
VDJdb_data_plain = f'{pipeline_data_plain}/VDJdb'
VDJdb_data_cleaned = f'{pipeline_data_cleaned}/VDJdb'
VDJdb_data_fitted = f'{pipeline_data_temp_bucket}/VDJdb'

VDJdb_folders = [VDJdb_data_plain, VDJdb_data_cleaned, VDJdb_data_fitted]
create_folders_if_not_exists(VDJdb_folders)

fitted_beta_file = 'VDJdb_beta_fitted.tsv'
fitted_paired_file = 'VDJdb_paired_fitted.tsv'

In [15]:
# prepare parameters for notebook VDJdb fit data paired
input_file = f'{VDJdb_data_plain}/VDJdb_paired_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_paired_file

# fit paired VDJdb data
%run ./data_scripts/VDJdb/fit_data_vdjdb_paired.ipynb

In [16]:
# prepare parameters for notebook VDJdb fit data beta
input_file = f'{VDJdb_data_plain}/VDJdb_beta_only.tsv'
path_prefix_fitted = VDJdb_data_fitted
fitted_file = fitted_beta_file

# fit beta VDJdb data
%run ./data_scripts/VDJdb/fit_data_vdjdb_beta.ipynb

In [17]:
# prepare parameters for notebook VDJdb clean data paired
input_file = f'{VDJdb_data_fitted}/{fitted_paired_file}'
cleaned_file_paired = 'VDJdb_cleaned_data_paired.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

# clean paired VDJdb data
%run ./data_scripts/VDJdb/clean_data_vdjdb_paired.ipynb

MHC Class I has 27414 entries
whole dataframe has 28119 entries
filtered to only use MHC Class I. Length of dataset: 27414


In [18]:
# prepare parameters for notebook VDJdb clean data beta
input_file = f'{VDJdb_data_fitted}/{fitted_beta_file}'
cleaned_file_beta = 'VDJdb_cleaned_data_beta.tsv'
output_file = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'

# clean beta VDJdb data
%run ./data_scripts/VDJdb/clean_data_vdjdb_beta.ipynb

MHC Class I has 46507 entries
whole dataframe has 49042 entries
filtered to only use MHC Class I. Length of dataset: 46507


In [19]:
VDJdb_cleaned_beta_output = f'{VDJdb_data_cleaned}/{cleaned_file_beta}'
VDJdb_cleaned_paired_output = f'{VDJdb_data_cleaned}/{cleaned_file_paired}'

### 10X

In [20]:
''' >>>>>>> alt
base_path = '../data_10x/' 

# Einlesen der CSV-Dateien für die Donors
donor1_consensus = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor1_consensus_annotations.csv')
donor2_consensus = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor2_consensus_annotations.csv')
donor3_consensus = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor3_consensus_annotations.csv')
donor4_consensus = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor4_consensus_annotations.csv')

# Daten prüfen
print("Donor 1 Consensus:", donor1_consensus.shape)
print("Donor 2 Consensus:", donor2_consensus.shape)
print("Donor 3 Consensus:", donor3_consensus.shape)
print("Donor 4 Consensus:", donor4_consensus.shape)


# Zusammenführen aller Donors in ein gemeinsames DataFrame
all_donors_consensus = pd.concat([donor1_consensus, donor2_consensus, donor3_consensus, donor4_consensus])

# Optional: Reset index, falls nötig (nicht zwingend erforderlich, aber hilft bei der Verwaltung der Zeilenindices)
all_donors_consensus.reset_index(drop=True, inplace=True)

# Gesamtstruktur anzeigen, um sicherzustellen, dass alles korrekt zusammengeführt wurde
print("Kombinierte Donors:", all_donors_consensus.shape)

# Beispielhafter Blick auf die kombinierten Daten
print(all_donors_consensus.head())'''


' >>>>>>> alt\nbase_path = \'../data_10x/\' \n\n# Einlesen der CSV-Dateien für die Donors\ndonor1_consensus = pd.read_csv(base_path + \'vdj_v1_hs_aggregated_donor1_consensus_annotations.csv\')\ndonor2_consensus = pd.read_csv(base_path + \'vdj_v1_hs_aggregated_donor2_consensus_annotations.csv\')\ndonor3_consensus = pd.read_csv(base_path + \'vdj_v1_hs_aggregated_donor3_consensus_annotations.csv\')\ndonor4_consensus = pd.read_csv(base_path + \'vdj_v1_hs_aggregated_donor4_consensus_annotations.csv\')\n\n# Daten prüfen\nprint("Donor 1 Consensus:", donor1_consensus.shape)\nprint("Donor 2 Consensus:", donor2_consensus.shape)\nprint("Donor 3 Consensus:", donor3_consensus.shape)\nprint("Donor 4 Consensus:", donor4_consensus.shape)\n\n\n# Zusammenführen aller Donors in ein gemeinsames DataFrame\nall_donors_consensus = pd.concat([donor1_consensus, donor2_consensus, donor3_consensus, donor4_consensus])\n\n# Optional: Reset index, falls nötig (nicht zwingend erforderlich, aber hilft bei der Verwalt

In [21]:
'''# Speichern der kombinierten Donor-Daten in einer CSV-Datei
output_path = '../data_10x/combined_donors_consensus_annotations.csv'
all_donors_consensus.to_csv(output_path, index=False)'''

"# Speichern der kombinierten Donor-Daten in einer CSV-Datei\noutput_path = '../data_10x/combined_donors_consensus_annotations.csv'\nall_donors_consensus.to_csv(output_path, index=False)"

In [22]:
### NEU

base_path = '../data_10x/' 

# Einlesen der CSV-Dateien für die Donors
donor1_consensus = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor1_binarized_matrix.csv')
donor2_consensus = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor2_binarized_matrix.csv')
donor3_consensus = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor3_binarized_matrix.csv')
donor4_consensus = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor4_binarized_matrix.csv')

# Daten prüfen
print("Donor 1 Consensus:", donor1_consensus.shape)
print("Donor 2 Consensus:", donor2_consensus.shape)
print("Donor 3 Consensus:", donor3_consensus.shape)
print("Donor 4 Consensus:", donor4_consensus.shape)


# Zusammenführen aller Donors in ein gemeinsames DataFrame
all_donors_consensus = pd.concat([donor1_consensus, donor2_consensus, donor3_consensus, donor4_consensus])

# Optional: Reset index, falls nötig (nicht zwingend erforderlich, aber hilft bei der Verwaltung der Zeilenindices)
all_donors_consensus.reset_index(drop=True, inplace=True)

# Gesamtstruktur anzeigen, um sicherzustellen, dass alles korrekt zusammengeführt wurde
print("Kombinierte Donors:", all_donors_consensus.shape)

# Beispielhafter Blick auf die kombinierten Daten
print(all_donors_consensus.head())

# Entfernen von Duplikaten
initial_count = all_donors_consensus.shape[0]
all_donors_consensus.drop_duplicates(inplace=True)
final_count = all_donors_consensus.shape[0]
duplicates_removed = initial_count - final_count

# Anzahl der Duplikate anzeigen
print("Anzahl der entfernten Duplikate:", duplicates_removed)
print("Kombinierte Donors nach Duplikatentfernung:", all_donors_consensus.shape)


Donor 1 Consensus: (46526, 118)
Donor 2 Consensus: (77854, 118)
Donor 3 Consensus: (37824, 118)
Donor 4 Consensus: (27308, 118)
Kombinierte Donors: (189512, 118)
               barcode   donor  \
0   AAACCTGAGACAAAGG-4  donor1   
1  AAACCTGAGACTGTAA-34  donor1   
2   AAACCTGAGAGCCCAA-5  donor1   
3  AAACCTGAGAGCTGCA-24  donor1   
4   AAACCTGAGAGGGATA-8  donor1   

                                  cell_clono_cdr3_aa  \
0  TRA:CAASVSIWTGTASKLTF;TRA:CAAWDMEYGNKLVF;TRB:C...   
1                                    TRB:CASDTPVGQFF   
2                 TRA:CASYTDKLIF;TRB:CASSGGSISTDTQYF   
3                                 TRB:CASSGGQSSYEQYF   
4          TRA:CAASGYGNTGRRALTF;TRB:CASSQDPAGGYNEQFF   

                                  cell_clono_cdr3_nt     CD3  CD19  CD45RA  \
0  TRA:TGTGCAGCAAGCGTTAGTATTTGGACCGGCACTGCCAGTAAA...  2125.0   0.0   912.0   
1              TRB:TGTGCCAGCGATACCCCGGTTGGGCAGTTCTTC  1023.0   0.0  2028.0   
2  TRA:TGTGCTTCCTACACCGACAAGCTCATCTTT;TRB:TGCGCCA...  1598.0  

In [23]:
# Speichern der kombinierten Donor-Daten in einer CSV-Datei
output_path = '../data_10x/combined_donors_consensus_annotations.csv'
all_donors_consensus.to_csv(output_path, index=False)

In [24]:
#zweiter Datensatz v, d, j, c Infos

base_path = '../data_10x/' 

# Einlesen der CSV-Dateien für die Donors
donor1_meta = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor1_all_contig_annotations.csv')
donor2_meta = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor2_all_contig_annotations.csv')
donor3_meta = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor3_all_contig_annotations.csv')
donor4_meta = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor4_all_contig_annotations.csv')

# Daten prüfen
print("Donor 1:", donor1_meta.shape)
print("Donor 2:", donor2_meta.shape)
print("Donor 3:", donor3_meta.shape)
print("Donor 4:", donor4_meta.shape)


# Zusammenführen aller Donors in ein gemeinsames DataFrame
all_donors_meta = pd.concat([donor1_meta, donor2_meta, donor3_meta, donor4_meta])

# Optional: Reset index, falls nötig (nicht zwingend erforderlich, aber hilft bei der Verwaltung der Zeilenindices)
all_donors_meta.reset_index(drop=True, inplace=True)

# Gesamtstruktur anzeigen, um sicherzustellen, dass alles korrekt zusammengeführt wurde
print("Kombinierte Donors:", all_donors_meta.shape)

# Beispielhafter Blick auf die kombinierten Daten
print(all_donors_meta.head())

# Entfernen von Duplikaten
initial_count = all_donors_meta.shape[0]
all_donors_meta.drop_duplicates(inplace=True)
final_count = all_donors_meta.shape[0]
duplicates_removed = initial_count - final_count

# Anzahl der Duplikate anzeigen
print("Anzahl der entfernten Duplikate:", duplicates_removed)
print("Kombinierte Donors nach Duplikatentfernung:", all_donors_meta.shape)

  donor2_meta = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor2_all_contig_annotations.csv')
  donor3_meta = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor3_all_contig_annotations.csv')
  donor4_meta = pd.read_csv(base_path + 'vdj_v1_hs_aggregated_donor4_all_contig_annotations.csv')


Donor 1: (187144, 18)
Donor 2: (299037, 18)
Donor 3: (429296, 18)
Donor 4: (309855, 18)
Kombinierte Donors: (1225332, 18)
              barcode  is_cell                    contig_id  high_confidence  \
0  AAACCTGAGACAAAGG-4     True  AAACCTGAGACAAAGG-4_contig_1             True   
1  AAACCTGAGACAAAGG-4     True  AAACCTGAGACAAAGG-4_contig_2             True   
2  AAACCTGAGACAAAGG-4     True  AAACCTGAGACAAAGG-4_contig_3             True   
3  AAACCTGAGACAAAGG-4     True  AAACCTGAGACAAAGG-4_contig_4            False   
4  AAACCTGAGACAAAGG-4     True  AAACCTGAGACAAAGG-4_contig_5            False   

   length chain     v_gene d_gene   j_gene c_gene  full_length productive  \
0     722   TRB   TRBV10-3  TRBD2  TRBJ2-1  TRBC2         True       True   
1     605   TRA  TRAV29DV5    NaN   TRAJ44   TRAC         True       True   
2     738   TRA    TRAV8-6    NaN   TRAJ47   TRAC         True       True   
3     468   TRB        NaN    NaN  TRBJ2-3  TRBC2        False        NaN   
4     488   

In [25]:
# Speichern der kombinierten Donor-Daten in einer CSV-Datei
output_path = '../data_10x/meta.csv'
all_donors_meta.to_csv(output_path, index=False)

## Data Concatenation
The concatenation includes further cleaning and advanced removal of duplicated rows.

In [28]:
# prepare parameters for concatenation
custom_dataset_path = f'{pipeline_data_concatenated}/{precision}/'

# beta input files
vdjdb_beta_read_path = VDJdb_cleaned_beta_output
mcpastcr_beta_read_path = McPAS_cleaned_beta_output
iedb_beta_read_path = IEDB_cleaned_beta_output
# paired input files
vdjdb_paired_read_path = VDJdb_cleaned_paired_output
mcpastcr_paired_read_path = McPAS_cleaned_paired_output
iedb_paired_read_path = IEDB_cleaned_paired_output
# output files
output_file_beta = 'beta_concatenated.tsv'
output_file_paired = 'paired_concatenated.tsv'

create_folders_if_not_exists([custom_dataset_path])

%run ./data_scripts/concatDatasets.ipynb

length of beta_df: 231627




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 36836 entries removed.
removed all duplicates (CDR3, Epitope) from distinct values (most_important_columns, keep=False). 47581 entries removed.
beta removed entries df length: 47581


Number of groups formed: 18337


  duplicates_to_add = pd.concat([duplicates_to_add, group[group['is_duplicated'] == False]])


32617 can be re-added to the no-duplicated dataframe
from the plain dataset which has 231622 entries, 51800 entries have been removed.
for beta dataset :
size difference is: 51800
  179822 information score cleaned: 5.202255563835348
  231622 information score dropout: 5.0485964200291855
final_beta_df length = 179822
length of paired_df: 54338




The following script removes a lot of rows. They are kept and some of them get added again later
distinct entries (all columns, keep=first). 6090 entries removed.
removed all duplicates from distinct values (cultivated columns, keep=False). 32381 entries removed.
paired removed entries df length: 32381




  duplicates_to_add = pd.concat([duplicates_to_add, group[group['is_duplicated'] == False]])


32337 can be re-added to the no-duplicated dataframe
from the plain dataset which has 54295 entries, 6134 entries have been removed.
for paired dataset:
size difference is: 6134
  48161 information score cleaned: 7.147442951765952
  54295 information score dropout: 7.243300488074408
final_paired_df length: 48161


In [29]:
# prepare parameters for concatenation
custom_dataset_path = f'{pipeline_data_concatenated}/{precision}/'
# output files
output_file_beta = 'beta_concatenated.tsv'
output_file_paired = 'paired_concatenated.tsv'

concatenated_paired = f'{custom_dataset_path}/{output_file_paired}'
concatenated_beta = f'{custom_dataset_path}/{output_file_beta}'

## Data split
The split creates 3 datasets. Train, Validation and Test. 

In [30]:
# prepare parameters for split of paired dataset
input_file = concatenated_paired
paired_output_folder = f'{pipeline_data_splitted}/{precision}/paired'
validation_file_name = 'validation.tsv'
test_file_name = 'test.tsv'
train_file_name = 'train.tsv'
aimed_test_ratio = 0.3 # this means 30% of the concatenated dataset will be for test and validation (fifty/fifty)

create_folders_if_not_exists([paired_output_folder])

# do the split
%run ./data_scripts/data_preparation/split_paired.ipynb

distinct tcr's: 29339 from 48161
unique tcr's: 13525 from 48161
unique epitopes: 616 from 48161
train data has 34636 entries
test data has 13525 entries
test data has 0 TPP1 tasks (unseen tcr & seen epitopes).
test data has 11253 TPP2 tasks (unseen tcr & seen epitopes).
test data has 2272 TPP3 tasks (unseen tcr & unseen epitope).
the train/test ratio is 0.7191711135566121/0.2808288864433878
924 entries need to be shifted from train to test so the train/test ratio can be 0.7/0.3
924 entries from train will be moved to test (TPP1)
df_train size before: 34636
number of tpp1 before: 0
number of tpp2 before: 11253
df_train size after: 33712
number of tpp1 after: 924
number of tpp2 after: 11253
5164 entries will be shifted from test to train so the tpp1/tpp2 ratio can be 0.5/0.5
5165 entries need to be shifted from train to test so the tpp1/tpp2 ratio can be 0.5/0.5
5165 entries from train will be moved to test (TPP1)
df_train size before: 38876
number of tpp1 before: 924
number of tpp2 befo

  df_train = pd.concat([df_train, rows_to_move], ignore_index=True)


df_train size after: 33711
number of tpp1 after: 6089
number of tpp2 after: 6089
train data has 33711 entries
test data has 14450 entries
test data has 6089 TPP1 tasks (seen tcr & seen epitopes).
test data has 6089 TPP2 tasks (unseen tcr & seen epitopes).
test data has 2272 TPP3 tasks (unseen tcr & unseen epitope).
the train/test ratio is 0.6999647017296152/0.3000352982703848
test data has 7226 entries
validation data has 7224 entries
train data has 33711 entries
test data has 3045 TPP1 tasks (seen tcr & seen epitopes).
test data has 3891 TPP2 tasks (unseen tcr & seen epitopes).
test data has 290 TPP3 tasks (unseen tcr & unseen epitope).
the test ratio is 0.849961587176346/0.150038412823654
the validation ratio is 0.8500031145532693/0.14999688544673076


In [31]:
# prepare parameters for split of beta dataset
input_file = concatenated_beta
beta_output_folder = f'{pipeline_data_splitted}/{precision}/beta'
aimed_test_ratio = 0.3 # this means 30% of the concatenated dataset will be for test and validation (fifty/fifty)

create_folders_if_not_exists([beta_output_folder])

# do the split
%run ./data_scripts/data_preparation/split_beta.ipynb

distinct tcr's: 152160 from 179822
unique tcr's: 139540 from 179822
unique epitopes: 678 from 179822
train data has 40282 entries
test data has 139540 entries
test data has 0 TPP1 tasks (unseen tcr & seen epitopes).
test data has 137217 TPP2 tasks (unseen tcr & seen epitopes).
test data has 2323 TPP3 tasks (unseen tcr & unseen epitope).
the train/test ratio is 0.22401041029462465/0.7759895897053753
85594 entries will be shifted from test to train so the train/test ratio can be 0.7/0.3
25811 entries will be shifted from test to train so the tpp1/tpp2 ratio can be 0.5/0.5
25812 entries need to be shifted from train to test so the tpp1/tpp2 ratio can be 0.5/0.5
train data has 125875 entries
test data has 53947 entries
test data has 25812 TPP1 tasks (seen tcr & seen epitopes).
test data has 25812 TPP2 tasks (unseen tcr & seen epitopes).
test data has 2323 TPP3 tasks (unseen tcr & unseen epitope).
the train/test ratio is 0.6999977755780716/0.30000222442192837
test data has 26974 entries
val

## Negative Data

In [32]:
#Daten einlesen

combined_donors_path = '../data_10x/combined_donors_consensus_annotations.csv'
all_donors_consensus = pd.read_csv(combined_donors_path, sep=',')

#print("Consensus: ", all_donors_consensus.head())

all_donors_meta_path = '../data_10x/meta.csv'
all_donors_meta = pd.read_csv(all_donors_meta_path, sep=',')

#print("Meta: ", all_donors_meta.head())

  all_donors_meta = pd.read_csv(all_donors_meta_path, sep=',')


### Beta

In [33]:
#Dieser Code für ganzen Datensatz laufen lassen
import re
import pandas as pd

# Annahme: all_donors_consensus und all_donors_meta sind bereits geladen und gefiltert

# Festlegen der Batch-Größe für die Verarbeitung
batch_size = 1000  # Passe diese Zahl je nach Speicherressourcen an

# Identifizieren von Epitope-Spalten, aber ohne "NR(B0801)_AAKGRGAAL_NC_binder"
epitope_columns = [col for col in all_donors_consensus.columns if 'Cancer_binder' in col and col != "NR(B0801)_AAKGRGAAL_NC_binder"]

# Liste für alle Batch-Ergebnisse
all_batches = []

# Verarbeite `all_donors_consensus` in Batches
for batch_start in range(0, len(all_donors_consensus), batch_size):
    print("Batch Start: ", batch_start)
    # Batch definieren
    batch = all_donors_consensus.iloc[batch_start:batch_start + batch_size]
    
    # Filtern auf Zeilen, die 'TRB:' enthalten
    batch_trb = batch[batch['cell_clono_cdr3_aa'].str.contains("TRB:", na=False)]

    # Liste, um Zeilen für diesen Batch zu speichern
    expanded_rows = []
    
    # Iteriere durch jede Zeile im Batch
    for _, row in batch_trb.iterrows():
        for col in epitope_columns:
            # Extrahiere MHC und Epitope
            match = re.match(r'([A-Z0-9]+)_([A-Z]+)_.*_binder', col)
            if match:
                mhc_raw, epitope = match.groups()
                mhc_formatted = f'HLA-{mhc_raw[0]}*{mhc_raw[1:3]}:{mhc_raw[3:]}'

                # Füge `Epitope` und `MHC` zur Zeile hinzu
                new_row = row.copy()
                new_row['Epitope'] = epitope
                new_row['MHC'] = mhc_formatted

                # Füge neue Zeile zur Batch-Liste hinzu
                expanded_rows.append(new_row)
    
    # Erstelle einen DataFrame aus dem Batch
    batch_df = pd.DataFrame(expanded_rows)
    all_batches.append(batch_df)  # Speichere den Batch in der Liste

# Kombiniere alle Batch-Ergebnisse zu einem DataFrame
expanded_df = pd.concat(all_batches, ignore_index=True)

# Nur die TRB-Chain-Einträge in `all_donors_meta` beibehalten
all_donors_meta_trb = all_donors_meta[all_donors_meta['chain'] == 'TRB']

# Zusammenführen der beiden DataFrames basierend auf der 'barcode' Spalte
merged_df = pd.merge(all_donors_meta_trb, expanded_df[['barcode', 'Epitope', 'MHC']], on='barcode', how='inner')

# Spalten umbenennen und Format anpassen
merged_df = merged_df.rename(columns={
    'barcode': 'TCR_name',
    'v_gene': 'TRBV',
    'j_gene': 'TRBJ',
    'c_gene': 'TRBC',
    'cdr3': 'TRB_CDR3'
})

# Fehlende Spalten auffüllen
desired_columns = ['TCR_name', 'TRBV', 'TRBJ', 'TRB_CDR3', 'TRBC', 'Epitope', 'MHC', 'Binding', 'task']
for col in desired_columns:
    if col not in merged_df.columns:
        merged_df[col] = 'nan' if col == 'task' else '0'

# Nur die gewünschten Spalten beibehalten und Zeilen mit `None` in `TRB_CDR3` entfernen
final_df = merged_df[desired_columns]
final_df = final_df[final_df['TRB_CDR3'] != 'None']

final_df = final_df[final_df['TRB_CDR3'].notna() & (final_df['TRB_CDR3'] != '')]

# Ausgabe des ersten Teils des Ergebnisses zur Überprüfung
print(final_df.head())

# Speichern des kombinierten DataFrames
output_path = '../data_10x/combined_output_with_epitope_mhc_TRB_only_expanded-gene.csv'
final_df.to_csv(output_path, index=False)


Batch Start:  0
Batch Start:  1000
Batch Start:  2000
Batch Start:  3000
Batch Start:  4000
Batch Start:  5000
Batch Start:  6000
Batch Start:  7000
Batch Start:  8000
Batch Start:  9000
Batch Start:  10000
Batch Start:  11000
Batch Start:  12000
Batch Start:  13000
Batch Start:  14000
Batch Start:  15000
Batch Start:  16000
Batch Start:  17000
Batch Start:  18000
Batch Start:  19000
Batch Start:  20000
Batch Start:  21000
Batch Start:  22000
Batch Start:  23000
Batch Start:  24000
Batch Start:  25000
Batch Start:  26000
Batch Start:  27000
Batch Start:  28000
Batch Start:  29000
Batch Start:  30000
Batch Start:  31000
Batch Start:  32000
Batch Start:  33000
Batch Start:  34000
Batch Start:  35000
Batch Start:  36000
Batch Start:  37000
Batch Start:  38000
Batch Start:  39000
Batch Start:  40000
Batch Start:  41000
Batch Start:  42000
Batch Start:  43000
Batch Start:  44000
Batch Start:  45000
Batch Start:  46000
Batch Start:  47000
Batch Start:  48000
Batch Start:  49000
Batch Start: 

In [34]:
# Mit bestehendem Train, Validation und Test File zusammenführen
import pandas as pd
from sklearn.model_selection import train_test_split

# Lade den `combined_output_with_epitope_mhc_TRB_only_expanded.csv`-Datensatz
beta_10x = pd.read_csv("../data_10x/combined_output_with_epitope_mhc_TRB_only_expanded.csv", sep=',')

# Schritt 1: Splitte `beta_10x` in train, validation und test
train_split, test_split = train_test_split(beta_10x, test_size=0.3, random_state=42)
validation_split, test_split = train_test_split(test_split, test_size=0.5, random_state=42)

# Schritt 2: Lade die existierenden positiven Daten aus train, validation und test Dateien
train_preneg = pd.read_csv(f"./data_10x/splitted_datasets/{precision}/beta/train_prenegsamples.tsv", sep='\t')
validation_preneg = pd.read_csv(f"./data_10x/splitted_datasets/{precision}/beta/validation_prenegsamples.tsv", sep='\t')
test_preneg = pd.read_csv(f"./data_10x/splitted_datasets/{precision}/beta/test_prenegsamples.tsv", sep='\t')

# Anzahl positiver Samples in train_split, validation_split und test_split
num_train_pos = len(train_preneg)
num_validation_pos = len(validation_preneg)
num_test_pos = len(test_preneg)

# Schritt 3: Berechne die Anzahl negativer Samples, um das Verhältnis zu erreichen
# Training set: 1:1 ratio
train_neg_needed = num_train_pos
train_neg_sampled = train_split.sample(n=train_neg_needed, random_state=42, replace=False)

# Validation set: 1:5 ratio
validation_neg_needed = 5* num_validation_pos
validation_neg_sampled = validation_split.sample(n=validation_neg_needed, random_state=42, replace=False)

# Test set: 1:5 ratio
test_neg_needed = 5* num_test_pos
test_neg_sampled = test_split.sample(n=test_neg_needed, random_state=42, replace=False)

# Schritt 4: Kombiniere die negativen und positiven Daten für train, validation und test
train_combined = pd.concat([train_preneg, train_neg_sampled], ignore_index=True)
validation_combined = pd.concat([validation_preneg, validation_neg_sampled], ignore_index=True)
test_combined = pd.concat([test_preneg, test_neg_sampled], ignore_index=True)

# Schritt 5: Speichern der kombinierten Datensätze
output_dir = f'./data_10x/splitted_datasets/{precision}/beta/'
train_combined.to_csv(output_dir + "train.tsv", sep='\t', index=False)
validation_combined.to_csv(output_dir + "validation.tsv", sep='\t', index=False)
test_combined.to_csv(output_dir + "test.tsv", sep='\t', index=False)

print("Datensätze wurden erfolgreich kombiniert und gespeichert.")

Datensätze wurden erfolgreich kombiniert und gespeichert.


In [35]:
print("TRB_CDR3 value types:")
print(final_df["TRB_CDR3"].apply(type).value_counts())


TRB_CDR3 value types:
TRB_CDR3
<class 'str'>    2126313
Name: count, dtype: int64


### Paired

In [36]:
#Dieser Code für ganzen Datensatz laufen lassen
import re
import pandas as pd

# Annahme: all_donors_consensus und all_donors_meta sind bereits geladen und gefiltert

# Festlegen der Batch-Größe für die Verarbeitung
batch_size = 1000

# Identifizieren von Epitope-Spalten, aber ohne "NR(B0801)_AAKGRGAAL_NC_binder"
epitope_columns = [col for col in all_donors_consensus.columns if 'Cancer_binder' in col and col != "NR(B0801)_AAKGRGAAL_NC_binder"]

# Liste für alle Batch-Ergebnisse
all_batches = []

# Verarbeite `all_donors_consensus` in Batches
for batch_start in range(0, len(all_donors_consensus), batch_size):
    print("Batch Start: ", batch_start)
    # Definiere Batch
    batch = all_donors_consensus.iloc[batch_start:batch_start + batch_size]
    
    # Filtern auf Zeilen, die sowohl 'TRA:' als auch 'TRB:' in 'cell_clono_cdr3_aa' enthalten
    batch_paired = batch[
        batch['cell_clono_cdr3_aa'].str.contains("TRA:", na=False) &
        batch['cell_clono_cdr3_aa'].str.contains("TRB:", na=False)
    ]

    # Liste, um Zeilen für diesen Batch zu speichern
    expanded_rows = []
    
    # Iteriere durch jede Zeile im Batch
    for _, row in batch_paired.iterrows():
        for col in epitope_columns:
            # Extrahiere MHC und Epitope
            match = re.match(r'([A-Z0-9]+)_([A-Z]+)_.*_binder', col)
            if match:
                mhc_raw, epitope = match.groups()
                mhc_formatted = f'HLA-{mhc_raw[0]}*{mhc_raw[1:3]}:{mhc_raw[3:]}'

                # Füge `Epitope` und `MHC` zur Zeile hinzu
                new_row = row.copy()
                new_row['Epitope'] = epitope
                new_row['MHC'] = mhc_formatted

                # Neue Zeile zur Batch-Liste hinzufügen
                expanded_rows.append(new_row)

    # Erstelle einen DataFrame aus dem Batch und füge ihn zur Gesamtliste hinzu
    batch_df = pd.DataFrame(expanded_rows)
    all_batches.append(batch_df)

# Kombiniere alle Batch-Ergebnisse zu einem DataFrame
expanded_df = pd.concat(all_batches, ignore_index=True)

# Nur die Paired-Einträge in `all_donors_meta` beibehalten
# Filtern auf Barcodes, die sowohl eine TRA- als auch eine TRB-Kette haben
paired_barcodes = all_donors_meta.groupby('barcode').filter(
    lambda x: set(x['chain']) == {'TRA', 'TRB'}
)['barcode'].unique()
all_donors_meta_paired = all_donors_meta[all_donors_meta['barcode'].isin(paired_barcodes)]

# Split `all_donors_meta_paired` nach `chain` in separate DataFrames für TRA und TRB
alpha_chain = all_donors_meta_paired[all_donors_meta_paired['chain'] == 'TRA'].rename(
    columns={'v_gene': 'TRAV', 'j_gene': 'TRAJ', 'cdr3': 'TRA_CDR3', 'c_gene': 'TRAC'}
)
beta_chain = all_donors_meta_paired[all_donors_meta_paired['chain'] == 'TRB'].rename(
    columns={'v_gene': 'TRBV', 'j_gene': 'TRBJ', 'cdr3': 'TRB_CDR3', 'c_gene': 'TRBC'}
)

# Zusammenführen von alpha_chain und beta_chain anhand der gemeinsamen 'barcode'-Spalte
paired_meta = pd.merge(alpha_chain, beta_chain, on='barcode', suffixes=('_alpha', '_beta'))

# Zusammenführen von `paired_meta` mit `expanded_df` anhand der 'barcode'-Spalte
merged_df = pd.merge(paired_meta, expanded_df[['barcode', 'Epitope', 'MHC']], on='barcode', how='inner')

# Spalten umbenennen und Format anpassen
merged_df = merged_df.rename(columns={'barcode': 'TCR_name'})

# Fehlende Spalten auffüllen
desired_columns = [
    'TCR_name', 'TRAV', 'TRAJ', 'TRA_CDR3', 'TRBV', 'TRBJ', 'TRB_CDR3', 'TRAC', 'TRBC', 
    'Epitope', 'MHC', 'Binding', 'task'
]
for col in desired_columns:
    if col not in merged_df.columns:
        merged_df[col] = 'nan' if col == 'task' else '0'

# Nur die gewünschten Spalten beibehalten und Zeilen mit `None` in `TRB_CDR3` entfernen
final_df = merged_df[desired_columns]
final_df = final_df[final_df['TRB_CDR3'] != 'None']

final_df = final_df[
    final_df['TRB_CDR3'].notna() & (final_df['TRB_CDR3'] != '') &
    final_df['TRA_CDR3'].notna() & (final_df['TRA_CDR3'] != '')
]

# Ausgabe des ersten Teils des Ergebnisses zur Überprüfung
print(final_df.head())

# Optional: Speichern des kombinierten DataFrames
output_path = '../data_10x/combined_output_with_epitope_mhc_paired_only_expanded-gene.csv'
final_df.to_csv(output_path, index=False)


Batch Start:  0
Batch Start:  1000
Batch Start:  2000
Batch Start:  3000
Batch Start:  4000
Batch Start:  5000
Batch Start:  6000
Batch Start:  7000
Batch Start:  8000
Batch Start:  9000
Batch Start:  10000
Batch Start:  11000
Batch Start:  12000
Batch Start:  13000
Batch Start:  14000
Batch Start:  15000
Batch Start:  16000
Batch Start:  17000
Batch Start:  18000
Batch Start:  19000
Batch Start:  20000
Batch Start:  21000
Batch Start:  22000
Batch Start:  23000
Batch Start:  24000
Batch Start:  25000
Batch Start:  26000
Batch Start:  27000
Batch Start:  28000
Batch Start:  29000
Batch Start:  30000
Batch Start:  31000
Batch Start:  32000
Batch Start:  33000
Batch Start:  34000
Batch Start:  35000
Batch Start:  36000
Batch Start:  37000
Batch Start:  38000
Batch Start:  39000
Batch Start:  40000
Batch Start:  41000
Batch Start:  42000
Batch Start:  43000
Batch Start:  44000
Batch Start:  45000
Batch Start:  46000
Batch Start:  47000
Batch Start:  48000
Batch Start:  49000
Batch Start: 

In [37]:
# Mit bestehendem Train, Validation und Test File zusammenführen
import pandas as pd
from sklearn.model_selection import train_test_split

# Lade den `combined_output_with_epitope_mhc_TRB_only_expanded.csv`-Datensatz
beta_10x = pd.read_csv("../data_10x/combined_output_with_epitope_mhc_paired_only_expanded.csv", sep=',')

# Schritt 1: Splitte negativen Samples `beta_10x` in train, validation und test
train_split, test_split = train_test_split(beta_10x, test_size=0.3, random_state=42)
validation_split, test_split = train_test_split(test_split, test_size=0.5, random_state=42)

# Schritt 2: Lade die existierenden positiven Samples aus train, validation und test Dateien
train_preneg = pd.read_csv(f"./data_10x/splitted_datasets/{precision}/paired/train_prenegsamples.tsv", sep='\t')
validation_preneg = pd.read_csv(f"./data_10x/splitted_datasets/{precision}/paired/validation_prenegsamples.tsv", sep='\t')
test_preneg = pd.read_csv(f"./data_10x/splitted_datasets/{precision}/paired/test_prenegsamples.tsv", sep='\t')

# Anzahl positiver Samples in train_split, validation_split und test_split
num_train_pos = len(train_preneg)
num_validation_pos = len(validation_preneg)
num_test_pos = len(test_preneg)

# Schritt 3: Berechne die Anzahl negativer Samples, um das Verhältnis zu erreichen
# Training set: 1:1 ratio
train_neg_needed = num_train_pos
train_neg_sampled = train_split.sample(n=train_neg_needed, random_state=42, replace=False)

# Validation set: 1:5 ratio
validation_neg_needed = 5* num_validation_pos
validation_neg_sampled = validation_split.sample(n=validation_neg_needed, random_state=42, replace=False)

# Test set: 1:5 ratio
test_neg_needed = 5* num_test_pos
test_neg_sampled = test_split.sample(n=test_neg_needed, random_state=42, replace=False)

# Schritt 4: Kombiniere die negativen und positiven Daten für train, validation und test
train_combined = pd.concat([train_preneg, train_neg_sampled], ignore_index=True)
validation_combined = pd.concat([validation_preneg, validation_neg_sampled], ignore_index=True)
test_combined = pd.concat([test_preneg, test_neg_sampled], ignore_index=True)

# Schritt 5: Speichern der kombinierten Datensätze
output_dir = f'./data_10x/splitted_datasets/{precision}/paired/'
train_combined.to_csv(output_dir + "train.tsv", sep='\t', index=False)
validation_combined.to_csv(output_dir + "validation.tsv", sep='\t', index=False)
test_combined.to_csv(output_dir + "test.tsv", sep='\t', index=False)

print("Datensätze wurden erfolgreich kombiniert und gespeichert.")

Datensätze wurden erfolgreich kombiniert und gespeichert.


In [38]:
print(final_df.dtypes)


TCR_name    object
TRAV        object
TRAJ        object
TRA_CDR3    object
TRBV        object
TRBJ        object
TRB_CDR3    object
TRAC        object
TRBC        object
Epitope     object
MHC         object
Binding     object
task        object
dtype: object


In [39]:
print("TRA_CDR3 value types:")
print(final_df["TRA_CDR3"].apply(type).value_counts())

print("TRB_CDR3 value types:")
print(final_df["TRB_CDR3"].apply(type).value_counts())


TRA_CDR3 value types:
TRA_CDR3
<class 'str'>    2028672
Name: count, dtype: int64
TRB_CDR3 value types:
TRB_CDR3
<class 'str'>    2028672
Name: count, dtype: int64


## Mix generierte negative Samples mit 10X 1:1 ratio

In [None]:
# Datei-Paths 10X
train_file_10x_path = f"./splitted_datasets/{precision}/paired/train.tsv"
validation_file_10x_path = f"./splitted_datasets/{precision}/paired/validation.tsv"
test_file_10x_path = f"./splitted_datasets/{precision}/paired/test.tsv"

# Datei-Paths BA
train_file_path = f'./../../BA/BA_ZHAW/data/splitted_datasets/allele/paired/train.tsv'
validation_file_path = f'./../../BA/BA_ZHAW/data/splitted_datasets/allele/paired/validation.tsv'
test_file_path = f'./../../BA/BA_ZHAW/data/splitted_datasets/allele/paired/test.tsv'

## Task Classification >>> wichtig: bei paired_reclassification.ipynb _10x zu unterst hinzufügen bei main_project...
The classification in the split notebook correct for positive only data. After adding negative data, some classifications might be wrong.

In [40]:
paired_output_folder = f'{pipeline_data_splitted}/{precision}/paired'
validation_file_name = 'validation.tsv'
test_file_name = 'test.tsv'
train_file_name = 'train.tsv'
beta_output_folder = f'{pipeline_data_splitted}/{precision}/beta'

In [41]:
# do the classification for paired data
paired = True
train_data_path = f'{paired_output_folder}/{train_file_name}'
test_data_path = f'{paired_output_folder}/{test_file_name}'
validation_data_path = f'{paired_output_folder}/{validation_file_name}'

%run ./data_scripts/data_preparation/classification.ipynb

In [42]:
# extended classification for paired data
test_path = f'{paired_output_folder}/{test_file_name}'
train_path = f'{paired_output_folder}/{train_file_name}'
validation_path = f'{paired_output_folder}/{validation_file_name}'
output_path = f'{paired_output_folder}/test_reclassified_paired_specific.tsv'
paired_data_path = paired_output_folder
alpha_cdr3_name = 'TRA_CDR3'
beta_cdr3_name = 'TRB_CDR3'
epitope_name = 'Epitope'
task_name = 'task'

%run ./data_scripts/data_preparation/paired_reclassification.ipynb

gene
./data_10x/splitted_datasets/gene/paired/train.tsv
                  TCR_name        TRAV    TRAJ          TRA_CDR3      TRBV  \
0                    17414  TRAV29/DV5  TRAJ37    CAASALGNTGKLIF   TRBV4-1   
1                    20147       TRAV5  TRAJ20     CAEIRANDYKLSF   TRBV5-1   
2                    16532      TRAV17  TRAJ17  CAALDGIKAAGNKLTF    TRBV19   
3                    21795         NaN     NaN      CAFLGGANNLFF       NaN   
4                     8426      TRAV17   TRAJ7    CATGLYYGNNRLAF  TRBV11-2   
...                    ...         ...     ...               ...       ...   
43351   TAGAGCTCATTCCTCG-5      TRAV20  TRAJ52  CAVRRAGGTSYGKLTF   TRBV3-1   
43352  TCTTCGGCAAACGTGG-40    TRAV38-1  TRAJ26   CAFNERDNYGQNFVF  TRBV10-1   
43353  GTCACGGAGGTAGCCA-20     TRAV1-2   TRAJ4       CAVRNYNKLIF   TRBV4-3   
43354  ACGTCAACATACTCTT-23     TRAV8-3   TRAJ7     CAVFLYGNNRLAF  TRBV29-1   
43355  GAGTCCGAGGAGTAGA-25       TRAV5   TRAJ6     CAEMEGGSYIPTF   TRBV4-2   

       

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mfrohoari[0m ([33mpa_cancerimmunotherapy[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (./data_10x/splitted_datasets/gene/paired)... Done. 0.5s


VBox(children=(Label(value='1.338 MB of 19.980 MB uploaded\r'), FloatProgress(value=0.06694487096815765, max=1…

In [43]:
# do the classification for beta data
paired = False
train_data_path = f'{beta_output_folder}/{train_file_name}'
test_data_path = f'{beta_output_folder}/{test_file_name}'
validation_data_path = f'{beta_output_folder}/{validation_file_name}'

%run ./data_scripts/data_preparation/classification.ipynb

  df_train = pd.read_csv(train_data_path, sep="\t")
  df_test = pd.read_csv(test_data_path, sep="\t")


In the next two cells the classification is checked. If the output says "Classification is correct", everything is fine.

In [44]:
# check task classification paired
splitted_data_path = paired_output_folder

%run ./data_scripts/data_preparation/check_task_classification_paired.ipynb

train+validate data has 110766 entries
test data has 43356 entries
test data has 22545 TPP1 tasks (seen tcr & seen epitopes).
test data has 20522 TPP2 tasks (unseen tcr & seen epitopes).
test data has 289 TPP3 tasks (unseen tcr & unseen epitope).
test data has 0 TPP4 tasks (seen tcr & unseen epitope).
the train/test ratio is 0.7186903881340756/0.2813096118659244
Incorrectly set tasks:
          Epitope TRA_CDR3           TRB_CDR3  task
7228    SLLMWITQV      NaN  CASSLDLSGGITDTQYF  TPP2
7248   YLNDHLEPWI      NaN     CSARDGRREYEQYF  TPP2
7275    RIAAWMATY      NaN      CASSIRSSYEQYF  TPP2
7281   CLLWSFQTSA      NaN     CASSSRTGYDGYTF  TPP2
7292   ELAGIGILTV      NaN   CASSPGARHLEETQYF  TPP2
...           ...      ...                ...   ...
43291   KTWGQYWQV      NaN      CSVDLEANYGYTF  TPP2
43316   KVLEYVIKV      NaN        CASRGAGELFF  TPP2
43331   RIAAWMATY      NaN     CASSFSGNTGELFF  TPP2
43338   SLLMWITQV      NaN    CASSQRPSEVGELFF  TPP2
43345  YLNDHLEPWI      NaN   CASSPGGLAGA

In [45]:
# check task classification beta
splitted_data_path = beta_output_folder

%run ./data_scripts/data_preparation/check_task_classification_beta.ipynb

  df_train = pd.read_csv(f"{splitted_data_path}/{train_file_name}", sep="\t")


train data has 251750 entries
test data has 161844 entries
test data has 140580 TPP1 tasks (seen tcr & seen epitopes).
test data has 20961 TPP2 tasks (unseen tcr & seen epitopes).
test data has 298 TPP3 tasks (unseen tcr & unseen epitope).
test data has 5 TPP4 tasks (seen tcr & unseen epitope).
the train/test ratio is 0.7187434831570021/0.28125651684299796
Classification is correct.
Correctness summary:
is_correct
True    161844
Name: count, dtype: int64


In [None]:
import pandas as pd

# Datei-Paths
train_file_path = f"./splitted_datasets/{precision}/paired/train.tsv"
validation_file_path = f"./splitted_datasets/{precision}/paired/validation.tsv"
test_file_path = f"./splitted_datasets/{precision}/paired/test.tsv"

# Spalten, die überprüft und bereinigt werden sollen
columns_to_check = ["TCR_name", "TRA_CDR3", "TRB_CDR3"]

# Funktion zur Bereinigung von NaN-Werten
def remove_nan_values(file_path, columns):
    print(f"Cleaning file: {file_path}")
    df = pd.read_csv(file_path, sep='\t')
    
    # Entferne Zeilen mit NaN-Werten in den angegebenen Spalten
    for column in columns:
        if column in df.columns:
            original_length = len(df)
            df = df[df[column].notna()]
            cleaned_length = len(df)
            print(f"Column '{column}' - Removed {original_length - cleaned_length} rows with NaN values")
        else:
            print(f"Column {column} not found in {file_path}")
    
    # Speichere die bereinigte Datei
    df.to_csv(file_path, sep='\t', index=False)
    print(f"Cleaned file saved: {file_path}")

# Dateien bereinigen
for file_path in [train_file_path, validation_file_path, test_file_path]:
    remove_nan_values(file_path, columns_to_check)


In [None]:
import pandas as pd

# Datei-Paths
train_file_path = f"./splitted_datasets/{precision}/paired/train.tsv"
validation_file_path = f"./splitted_datasets/{precision}/paired/validation.tsv"
test_file_path = f"./splitted_datasets/{precision}/paired/test.tsv"

# Spalten, die überprüft werden sollen
columns_to_check = ["TCR_name", "TRA_CDR3", "TRB_CDR3"]

# Funktion zur Identifikation von nicht-String-Werten
def find_non_string_values(file_path, columns):
    print(f"Checking file: {file_path}")
    df = pd.read_csv(file_path, sep='\t')
    
    for column in columns:
        if column in df.columns:
            non_string_values = df[~df[column].apply(lambda x: isinstance(x, str))][column]
            print(f"Column '{column}' - Non-string values found: {len(non_string_values)}")
            if len(non_string_values) > 0:
                print(non_string_values)
        else:
            print(f"Column {column} not found in {file_path}")

# Dateien überprüfen
for file_path in [train_file_path, validation_file_path, test_file_path]:
    find_non_string_values(file_path, columns_to_check)


## Upload dataset

In [56]:
import os
print(os.listdir(path_to_data))


['.ipynb_checkpoints', 'validation_prenegsamples.tsv', 'test.tsv', 'train.tsv', 'validation.tsv', 'train_prenegsamples.tsv', 'test_prenegsamples.tsv']


In [57]:
from dotenv import load_dotenv, find_dotenv
load_dotenv()

# upload paired data
path_to_data = f'{pipeline_data_splitted}/{precision}/paired'
dataset_name = f'paired_{precision}'
#main_project_name = os.getenv("MAIN_PROJECT_NAME")
main_project_name = f"dataset-{precision}_10x"

%run ./data_scripts/upload_datasets.ipynb

uploading dataset to dataset-gene_10x


[34m[1mwandb[0m: Adding directory to artifact (./data_10x/splitted_datasets/gene/paired)... Done. 0.2s


VBox(children=(Label(value='1.181 MB of 13.461 MB uploaded\r'), FloatProgress(value=0.08776903208456056, max=1…

In [58]:
# upload beta data
path_to_data = f'{pipeline_data_splitted}/{precision}/beta'
dataset_name = f'beta_{precision}'

%run ./data_scripts/upload_datasets.ipynb

uploading dataset to dataset-gene_10x


[34m[1mwandb[0m: Adding directory to artifact (./data_10x/splitted_datasets/gene/beta)... Done. 0.1s


## Create Embeddings >> vorher noch datacheck laufen lassen, um NAs Werte rauslöschen

In [54]:
import torch
print(torch.cuda.is_available())  # Sollte True zurückgeben
print(torch.version.cuda)  # Sollte die richtige CUDA-Version anzeigen

True
12.4


In [55]:
path_paired_test = f"data_10x/splitted_datasets/{precision}/paired/test.tsv"
path_paired_validation = f"data_10x/splitted_datasets/{precision}/paired/validation.tsv"
path_paired_train = f"data_10x/splitted_datasets/{precision}/paired/train.tsv"
path_beta_test = f"data_10x/splitted_datasets/{precision}/beta/test.tsv"
path_beta_validation = f"data_10x/splitted_datasets/{precision}/beta/validation.tsv"
path_beta_train = f"data_10x/splitted_datasets/{precision}/beta/train.tsv"

path_paired = f"{pipeline_data}/embeddings/temp/{precision}/paired_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_paired)])
df_paired_test = pd.read_csv(path_paired_test, sep="\t", index_col=False)
df_paired_validation = pd.read_csv(path_paired_validation, sep="\t", index_col=False)
df_paired_train = pd.read_csv(path_paired_train, sep="\t", index_col=False)
df_paired = pd.concat([df_paired_test, df_paired_validation, df_paired_train])
df_paired.to_csv(path_paired, sep="\t", index=False)

# paired
%run ./data_scripts/generateEmbeddings.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRA_paired_embeddings.npz TRA_CDR3
%run ./data_scripts/generateEmbeddings.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/TRB_paired_embeddings.npz TRB_CDR3
%run ./data_scripts/generateEmbeddings.py paired {path_paired} {pipeline_data}/embeddings/paired/{precision}/Epitope_paired_embeddings.npz Epitope

path_beta = f"{pipeline_data}/embeddings/temp/{precision}/beta_concatenated.tsv"
create_folders_if_not_exists([os.path.dirname(path_beta)])
df_beta_test = pd.read_csv(path_beta_test, sep="\t", index_col=False)
df_beta_validation = pd.read_csv(path_beta_validation, sep="\t", index_col=False)
df_beta_train = pd.read_csv(path_beta_train, sep="\t", index_col=False)
df_beta = pd.concat([df_beta_test, df_beta_validation, df_beta_train])
df_beta.to_csv(path_beta, sep="\t", index=False)

# beta
#%run ./data_scripts/generateEmbeddings.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/TRB_beta_embeddings.npz TRB_CDR3
#%run ./data_scripts/generateEmbeddings.py beta {path_beta} {pipeline_data}/embeddings/beta/{precision}/Epitope_beta_embeddings.npz Epitope

Using GPU: Tesla T4
Loading: Rostlab/prot_t5_xl_half_uniref50-enc
Model is on device: cuda:0
Processing Batch:  0 64
Processing Batch:  64 128
Processing Batch:  128 192
Processing Batch:  192 256
Processing Batch:  256 320
Processing Batch:  320 384
Processing Batch:  384 448
Processing Batch:  448 512
Processing Batch:  512 576
Processing Batch:  576 640
Processing Batch:  640 704
Processing Batch:  704 768
Processing Batch:  768 832
Processing Batch:  832 896
Processing Batch:  896 960
Processing Batch:  960 1024
Processing Batch:  1024 1088
Processing Batch:  1088 1152
Processing Batch:  1152 1216
Processing Batch:  1216 1280
Processing Batch:  1280 1344
Processing Batch:  1344 1408
Processing Batch:  1408 1472
Processing Batch:  1472 1536
Processing Batch:  1536 1600
Processing Batch:  1600 1664
Processing Batch:  1664 1728
Processing Batch:  1728 1792
Processing Batch:  1792 1856
Processing Batch:  1856 1920
Processing Batch:  1920 1984
Processing Batch:  1984 2048
Processing Bat

  df_beta_train = pd.read_csv(path_beta_train, sep="\t", index_col=False)


In [51]:
from transformers import T5Tokenizer, T5EncoderModel


## Create Physicochemical Properties

In [52]:
!python ./data_scripts/generatePhysicoParallel.py paired {pipeline_data_splitted}/{precision}/paired test ./data_10x/physicoProperties {precision}
!python ./data_scripts/generatePhysicoParallel.py paired {pipeline_data_splitted}/{precision}/paired validation ./data_10x/physicoProperties {precision}
!python ./data_scripts/generatePhysicoParallel.py paired {pipeline_data_splitted}/{precision}/paired train ./data_10x/physicoProperties {precision}

!python ./data_scripts/generatePhysicoParallel.py beta {pipeline_data_splitted}/{precision}/beta test ./data_10x/physicoProperties {precision}
!python ./data_scripts/generatePhysicoParallel.py beta {pipeline_data_splitted}/{precision}/beta validation ./data_10x/physicoProperties {precision}
!python ./data_scripts/generatePhysicoParallel.py beta {pipeline_data_splitted}/{precision}/beta train ./data_10x/physicoProperties {precision}

Traceback (most recent call last):
  File "/home/ubuntu/PA-Cancer-Immunotherapy-Transformer/BA_ZHAW/./data_scripts/generatePhysicoParallel.py", line 1, in <module>
    import peptides
ModuleNotFoundError: No module named 'peptides'
Traceback (most recent call last):
  File "/home/ubuntu/PA-Cancer-Immunotherapy-Transformer/BA_ZHAW/./data_scripts/generatePhysicoParallel.py", line 1, in <module>
    import peptides
ModuleNotFoundError: No module named 'peptides'
Traceback (most recent call last):
  File "/home/ubuntu/PA-Cancer-Immunotherapy-Transformer/BA_ZHAW/./data_scripts/generatePhysicoParallel.py", line 1, in <module>
    import peptides
ModuleNotFoundError: No module named 'peptides'
Traceback (most recent call last):
  File "/home/ubuntu/PA-Cancer-Immunotherapy-Transformer/BA_ZHAW/./data_scripts/generatePhysicoParallel.py", line 1, in <module>
    import peptides
ModuleNotFoundError: No module named 'peptides'
Traceback (most recent call last):
  File "/home/ubuntu/PA-Cancer-Immuno

### Scale Physicochemical Properties

In [53]:
base_path = "./data_10x/physicoProperties"
chain = "paired"
%run ./data_scripts/scale_physicos.ipynb

chain = "beta"
%run ./data_scripts/scale_physicos.ipynb

ModuleNotFoundError: No module named 'peptides'

ModuleNotFoundError: No module named 'peptides'

In [8]:
import pandas as pd

# Beispielpfade für Train-, Test-, und Validierungsdatensätze für alle vier Kategorien
base_path = 'data_10x/splitted_datasets'

# Definierte Pfade für alle vier Kategorien
datasets = {
    "paired_gene": {
        "train": f"{base_path}/gene/paired/train.tsv",
        "test": f"{base_path}/gene/paired/test.tsv",
        "validation": f"{base_path}/gene/paired/validation.tsv"
    },
    "paired_allele": {
        "train": f"{base_path}/allele/paired/train.tsv",
        "test": f"{base_path}/allele/paired/test.tsv",
        "validation": f"{base_path}/allele/paired/validation.tsv"
    },
    "beta_gene": {
        "train": f"{base_path}/gene/beta/train.tsv",
        "test": f"{base_path}/gene/beta/test.tsv",
        "validation": f"{base_path}/gene/beta/validation.tsv"
    },
    "beta_allele": {
        "train": f"{base_path}/allele/beta/train.tsv",
        "test": f"{base_path}/allele/beta/test.tsv",
        "validation": f"{base_path}/allele/beta/validation.tsv"
    }
}

# Berechnung der Anzahl der Zeilen für jedes Set
results = {}
for dataset_name, paths in datasets.items():
    # Daten laden
    train_df = pd.read_csv(paths["train"], sep='\t')
    test_df = pd.read_csv(paths["test"], sep='\t')
    validation_df = pd.read_csv(paths["validation"], sep='\t')
    
    # Anzahl der Zeilen berechnen
    train_length = len(train_df)
    test_length = len(test_df)
    validation_length = len(validation_df)
    total_length = train_length + test_length + validation_length
    
    # Zähle die Anzahl der Bindings 1 und 0 in jedem Datensatz
    train_binding_counts = train_df['Binding'].value_counts()
    test_binding_counts = test_df['Binding'].value_counts()
    validation_binding_counts = validation_df['Binding'].value_counts()
    
    # Zähle die Anzahl der TPP1, TPP2, TPP3 Einträge in jedem Datensatz
    train_task_counts = train_df['task'].value_counts()
    test_task_counts = test_df['task'].value_counts()
    validation_task_counts = validation_df['task'].value_counts()

    # Ergebnisse speichern
    results[dataset_name] = {
        "Train": train_length,
        "Train_Binding_1": train_binding_counts.get(1, 0),
        "Train_Binding_0": train_binding_counts.get(0, 0),
        "Train_TPP1": train_task_counts.get("TPP1", 0),
        "Train_TPP2": train_task_counts.get("TPP2", 0),
        "Train_TPP3": train_task_counts.get("TPP3", 0),
        "Train_TPP4": train_task_counts.get("TPP4", 0),
        "Test": test_length,
        "Test_Binding_1": test_binding_counts.get(1, 0),
        "Test_Binding_0": test_binding_counts.get(0, 0),
        "Test_TPP1": test_task_counts.get("TPP1", 0),
        "Test_TPP2": test_task_counts.get("TPP2", 0),
        "Test_TPP3": test_task_counts.get("TPP3", 0),
        "Test_TPP4": test_task_counts.get("TPP4", 0),
        "Validation": validation_length,
        "Validation_Binding_1": validation_binding_counts.get(1, 0),
        "Validation_Binding_0": validation_binding_counts.get(0, 0),
        "Validation_TPP1": validation_task_counts.get("TPP1", 0),
        "Validation_TPP2": validation_task_counts.get("TPP2", 0),
        "Validation_TPP3": validation_task_counts.get("TPP3", 0),
        "Validation_TPP4": validation_task_counts.get("TPP4", 0),
        "Total": total_length
    }

# Ergebnisse anzeigen
for dataset, lengths in results.items():
    print(f'--- {dataset.replace("_", " ").title()} ---')
    print(f'Anzahl der Zeilen im Trainingsdatensatz: {lengths["Train"]} (Binding=1: {lengths["Train_Binding_1"]}, Binding=0: {lengths["Train_Binding_0"]}, TPP1: {lengths["Train_TPP1"]}, TPP2: {lengths["Train_TPP2"]}, TPP3: {lengths["Train_TPP3"]})')
    print(f'Anzahl der Zeilen im Testdatensatz: {lengths["Test"]} (Binding=1: {lengths["Test_Binding_1"]}, Binding=0: {lengths["Test_Binding_0"]}, TPP1: {lengths["Test_TPP1"]}, TPP2: {lengths["Test_TPP2"]}, TPP3: {lengths["Test_TPP3"]})')
    print(f'Anzahl der Zeilen im Validierungsdatensatz: {lengths["Validation"]} (Binding=1: {lengths["Validation_Binding_1"]}, Binding=0: {lengths["Validation_Binding_0"]}, TPP1: {lengths["Validation_TPP1"]}, TPP2: {lengths["Validation_TPP2"]}, TPP3: {lengths["Validation_TPP3"]})')
    print(f'Gesamtanzahl der Zeilen (Train + Test + Validation): {lengths["Total"]}\n')

# Optional: Ergebnisse in einer Übersichtstabelle darstellen
summary_data = []
for dataset, lengths in results.items():
    summary_data.append({
        "Dataset": dataset.replace("_", " ").title(),
        "Train": lengths["Train"],
        "Train_Binding_1": lengths["Train_Binding_1"],
        "Train_Binding_0": lengths["Train_Binding_0"],
        "Train_TPP1": lengths["Train_TPP1"],
        "Train_TPP2": lengths["Train_TPP2"],
        "Train_TPP3": lengths["Train_TPP3"],
        "Train_TPP4": lengths["Train_TPP4"],
        "Test": lengths["Test"],
        "Test_Binding_1": lengths["Test_Binding_1"],
        "Test_Binding_0": lengths["Test_Binding_0"],
        "Test_TPP1": lengths["Test_TPP1"],
        "Test_TPP2": lengths["Test_TPP2"],
        "Test_TPP3": lengths["Test_TPP3"],
        "Test_TPP4": lengths["Test_TPP4"],
        "Validation": lengths["Validation"],
        "Validation_Binding_1": lengths["Validation_Binding_1"],
        "Validation_Binding_0": lengths["Validation_Binding_0"],
        "Validation_TPP1": lengths["Validation_TPP1"],
        "Validation_TPP2": lengths["Validation_TPP2"],
        "Validation_TPP3": lengths["Validation_TPP3"],
        "Validation_TPP4": lengths["Validation_TPP4"],
        "Total": lengths["Total"]
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df)


  train_df = pd.read_csv(paths["train"], sep='\t')
  train_df = pd.read_csv(paths["train"], sep='\t')


--- Paired Gene ---
Anzahl der Zeilen im Trainingsdatensatz: 61498 (Binding=1: 33711, Binding=0: 27787, TPP1: 0, TPP2: 0, TPP3: 0)
Anzahl der Zeilen im Testdatensatz: 36806 (Binding=1: 7226, Binding=0: 29580, TPP1: 22545, TPP2: 13972, TPP3: 289)
Anzahl der Zeilen im Validierungsdatensatz: 36934 (Binding=1: 7224, Binding=0: 29710, TPP1: 0, TPP2: 0, TPP3: 0)
Gesamtanzahl der Zeilen (Train + Test + Validation): 135238

--- Paired Allele ---
Anzahl der Zeilen im Trainingsdatensatz: 66590 (Binding=1: 36515, Binding=0: 30075, TPP1: 0, TPP2: 0, TPP3: 0)
Anzahl der Zeilen im Testdatensatz: 39891 (Binding=1: 7826, Binding=0: 32065, TPP1: 25231, TPP2: 14579, TPP3: 81)
Anzahl der Zeilen im Validierungsdatensatz: 40021 (Binding=1: 7826, Binding=0: 32195, TPP1: 0, TPP2: 0, TPP3: 0)
Gesamtanzahl der Zeilen (Train + Test + Validation): 146502

--- Beta Gene ---
Anzahl der Zeilen im Trainingsdatensatz: 251750 (Binding=1: 125875, Binding=0: 125875, TPP1: 0, TPP2: 0, TPP3: 0)
Anzahl der Zeilen im Testda