In [2]:
import os
import pickle

import pandas as pd
from sklearn.model_selection import KFold
from tcrpeg.TCRpeg import TCRpeg


In [3]:
# Create 5 folds each from the allele and gene data of the PA-Transformers project.
# IMPORTANT: only test.tsv is used here.
#
# For each input file this cell:
#   1. loads the TSV and drops the columns that are not needed downstream,
#   2. renames the CDR3 / binding columns to 'CDR3.beta' / 'Label',
#   3. removes rows whose CDR3.beta or Epitope is longer than 30 characters,
#   4. removes rows whose CDR3.beta contains a character missing from TCRpeg's `aa2idx`,
#   5. writes five disjoint folds to processed_data/PA/<precision>_fold_<k>.csv.

# Input files and, in the same order, the label used to name each file's folds.
file_paths = [
    "data_for_inference/allele/beta/test.tsv",
    "data_for_inference/gene/beta/test.tsv"
]
precisions = ['allele', 'gene']

# Make sure the output directory exists; otherwise to_csv raises on a fresh checkout.
output_dir = "processed_data/PA"
os.makedirs(output_dir, exist_ok=True)

# Load, process, and split each file
for i, file_path in enumerate(file_paths):
    precision = precisions[i]
    print(f"Processing {file_path}")

    # 1. Load the .tsv file
    df = pd.read_csv(file_path, sep='\t')

    # 2. Drop columns that are not needed; errors='ignore' tolerates files
    #    that lack some of them.
    columns_to_delete = ['TCR_name', 'TRBV', 'TRBJ', 'TRBC', 'MHC', 'task']
    df = df.drop(columns=columns_to_delete, errors='ignore')

    # 3. Rename columns to the names used in the rest of this cell.
    columns_to_rename = {
        'TRB_CDR3': 'CDR3.beta',
        'Binding': 'Label'
    }
    df = df.rename(columns=columns_to_rename)

    # 4. Keep only sequences of at most 30 characters. `.str.len()` yields NaN for
    #    missing values, which compare as False and are therefore dropped as well.
    print('Satarting length= ', len(df))
    df = df[df['CDR3.beta'].str.len() <= 30]
    print('Length after removing CDR3.beta len > 30 = ', len(df))
    df = df[df['Epitope'].str.len() <= 30]
    print('Length after removing Epitope len > 30 = ', len(df))

    # Every distinct character that occurs in the remaining CDR3.beta sequences.
    unique_chars = set(''.join(df['CDR3.beta']))

    # Instantiate TCRpeg on these sequences; in this cell the model is only used
    # for its `aa2idx` mapping, to detect characters it cannot map.
    df_train = df['CDR3.beta'].values
    model = TCRpeg(
        hidden_size=64,
        num_layers=3,
        load_data=True,
        embedding_path='pa_embeddings/TCRpeg/tcrpeg/data/embedding_32.txt',
        path_train=df_train,
    )
    model.create_model()

    unmapped_chars = [ch for ch in unique_chars if ch not in model.aa2idx]
    print("Unmapped characters:", unmapped_chars)
    if unmapped_chars:
        # Plain set-membership test instead of a regex character class: characters
        # such as '^', '\\', '-' or ']' would otherwise be interpreted as regex
        # syntax and either raise or match the wrong rows.
        unmapped_set = set(unmapped_chars)
        has_unmapped = df['CDR3.beta'].apply(
            lambda seq: not unmapped_set.isdisjoint(seq)
        )
        df = df[~has_unmapped]
        print('Length after removing unmapped chars= ', len(df))

    # 5. Generate 5 folds and save them. Each fold is the held-out index set of one
    #    KFold split, so the folds are disjoint and together cover every row.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (_, fold_index) in enumerate(kf.split(df)):
        fold_data = df.iloc[fold_index]
        path_save_fold = f"{output_dir}/{precision}_fold_{fold}.csv"
        fold_data.to_csv(path_save_fold, index=False)  # Saves each fold
        print(f"Saved fold {fold} as {precisions[i]}_fold_{fold}.csv in {path_save_fold}")



Processing data_for_inference/allele/beta/test.tsv
Satarting length=  59798
Length after removing CDR3.beta len > 30 =  59798
Length after removing Epitope len > 30 =  59455
Have loaded the data, total training seqs : 59455
Unmapped characters: []
Saved fold 0 as allele_fold_0.csv in processed_data/PA/allele_fold_0.csv
Saved fold 1 as allele_fold_1.csv in processed_data/PA/allele_fold_1.csv
Saved fold 2 as allele_fold_2.csv in processed_data/PA/allele_fold_2.csv
Saved fold 3 as allele_fold_3.csv in processed_data/PA/allele_fold_3.csv
Saved fold 4 as allele_fold_4.csv in processed_data/PA/allele_fold_4.csv
Processing data_for_inference/gene/beta/test.tsv
Satarting length=  53858
Length after removing CDR3.beta len > 30 =  53858
Length after removing Epitope len > 30 =  53484
Have loaded the data, total training seqs : 53484
Unmapped characters: []
Saved fold 0 as gene_fold_0.csv in processed_data/PA/gene_fold_0.csv
Saved fold 1 as gene_fold_1.csv in processed_data/PA/gene_fold_1.csv
Sav

In [None]:
# Run inference_pa.py (next cell).

In [4]:
# Run inference on the respective folds of 'test.tsv'
# (file_path = f"processed_data/PA/{precision}_fold_{i}.csv"),
# using embedding_path = f"models/PA/{precision}_embeddings.pkl".
! python inference_pa.py --split StrictTCR --dataset pMTnet --device gpu --gpu_id 0

You chose the dataset: pMTnet
The split method is: StrictTCR
  GTE.load_state_dict(torch.load(model_path, map_location=device))
Fold: 0, AUC: 0.5352, AUPR: 0.5254
  GTE.load_state_dict(torch.load(model_path, map_location=device))
Fold: 1, AUC: 0.5269, AUPR: 0.5273
  GTE.load_state_dict(torch.load(model_path, map_location=device))
Fold: 2, AUC: 0.5313, AUPR: 0.5199
  GTE.load_state_dict(torch.load(model_path, map_location=device))
Fold: 3, AUC: 0.5399, AUPR: 0.5512
  GTE.load_state_dict(torch.load(model_path, map_location=device))
Fold: 4, AUC: 0.5347, AUPR: 0.5587


In [None]:
# Experiment:
# Dataset: VDJdb
# Model: pMTnet, loaded from model_path = f"models/{fixed_model}/{split}/{fixed_model}_{train_folds}_{i}.pth"

In [1]:
# Run inference_experiment.py on the VDJdb dataset with the StrictTCR split, on GPU 0.
! python inference_experiment.py --split StrictTCR --dataset VDJdb --device gpu --gpu_id 0

You chose the dataset: VDJdb
The split method is: StrictTCR
  GTE.load_state_dict(torch.load(model_path, map_location=device))
Fold: 0, AUC: 0.7412, AUPR: 0.3907
  GTE.load_state_dict(torch.load(model_path, map_location=device))
Fold: 1, AUC: 0.7615, AUPR: 0.4089
  GTE.load_state_dict(torch.load(model_path, map_location=device))
Fold: 2, AUC: 0.7099, AUPR: 0.3680
  GTE.load_state_dict(torch.load(model_path, map_location=device))
Fold: 3, AUC: 0.7498, AUPR: 0.4023
  GTE.load_state_dict(torch.load(model_path, map_location=device))
Fold: 4, AUC: 0.7521, AUPR: 0.4103


In [2]:
# =============================================================================
# Inference on the (unseen) BA test data with the gene and allele models, which were
# themselves generated with the BA train and validation data.
# The script inference_pa_one_model.py is adapted for this purpose.

! python inference_pa_one_model.py --device gpu --gpu_id 0

PROJECT_NAME: dataset-inference_GNN
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mfrohoari[0m ([33mpa_cancerimmunotherapy[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/home/ubuntu/PA-Cancer-Immunotherapy/GNN/wandb/run-20241205_103656-ur4gvpur[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mExperiment - GNN[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/pa_cancerimmunotherapy/dataset-inference_GNN[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/pa_cancerimmunotherapy/dataset-inference_GNN/runs/ur4gvpur[0m
torch.cuda.is_available:  True
Processing file:  ./data_for_inference/allele/beta/test.tsv
  GTE.load_state_dict(torch.load(model