In [1]:
from Stitchr import stitchrfunctions as fxn
from Stitchr import stitchr as st
import pandas as pd
import wandb
import pandas as pd
from dotenv import load_dotenv
import os
import subprocess
import numpy as np

This is the official documentatin if one would like to use Stitchr in an individual pipeline: 
[Stitchr Doc](https://github.com/JamieHeather/stitchr/blob/main/docs/importing.rst)

In [2]:
chain = "TRB"
species = "HUMAN"

tcr_dat, functionality, partial = fxn.get_imgt_data(chain, st.gene_types, species)
codons = fxn.get_optimal_codons('', species)

V_segment = "TRBV9*01"
J_segment = "TRBJ2-2*01"
CDR3_sequence_beta = "CASSENTANTGELFF"

tcr_bits = {"v": V_segment, "j": J_segment, "cdr3": CDR3_sequence_beta,
            "l": "TRBV7-3*01", "c": "TRBC1*01",
            "skip_c_checks": False, "species": species, "seamless": False,
            "5_prime_seq": "", "3_prime_seq": "", "name": "TCR"}

stitched = st.stitch(tcr_bits, tcr_dat, functionality, partial, codons, 3, "")
stitched

(['TCR',
  'TRBV9*01',
  'TRBJ2-2*01',
  'TRBC1*01',
  'CASSENTANTGELFF',
  'TRBV7-3*01(L)'],
 'ATGGGCACCAGGCTCCTCTGCTGGGCAGCCCTGTGCCTCCTGGGGGCAGATCACACAGATTCTGGAGTCACACAAACCCCAAAGCACCTGATCACAGCAACTGGACAGCGAGTGACGCTGAGATGCTCCCCTAGGTCTGGAGACCTCTCTGTGTACTGGTACCAACAGAGCCTGGACCAGGGCCTCCAGTTCCTCATTCAGTATTATAATGGAGAAGAGAGAGCAAAAGGAAACATTCTTGAACGATTCTCCGCACAACAGTTCCCTGACTTGCACTCTGAACTAAACCTGAGCTCTCTGGAGCTGGGGGACTCAGCTTTGTATTTCTGTGCCAGCAGCGAGAACACCGCCAACACCGGGGAGCTGTTTTTTGGAGAAGGCTCTAGGCTGACCGTACTGGAGGACCTGAACAAGGTGTTCCCACCCGAGGTCGCTGTGTTTGAGCCATCAGAAGCAGAGATCTCCCACACCCAAAAGGCCACACTGGTGTGCCTGGCCACAGGCTTCTTCCCCGACCACGTGGAGCTGAGCTGGTGGGTGAATGGGAAGGAGGTGCACAGTGGGGTCAGCACGGACCCGCAGCCCCTCAAGGAGCAGCCCGCCCTCAATGACTCCAGATACTGCCTGAGCAGCCGCCTGAGGGTCTCGGCCACCTTCTGGCAGAACCCCCGCAACCACTTCCGCTGTCAAGTCCAGTTCTACGGGCTCTCGGAGAATGACGAGTGGACCCAGGATAGGGCCAAACCCGTCACCCAGATCGTCAGCGCCGAGGCCTGGGGTAGAGCAGACTGTGGCTTTACCTCGGTGTCCTACCAGCAAGGGGTCCTGTCTGCCACCATCCTCTATGAGATCCTGCTAGGGAAGGCCACCCTGTATGCTGTGCTGGTCAGCGCCCTTGTGTTGA

However the documentation mentions that for high throughput (as we must use) one should use [*thimble*](https://github.com/JamieHeather/stitchr/blob/main/docs/thimble.rst)

In the follwoing we try to convert a whole subset of the VDJdb using this approach:

In [3]:
load_dotenv()
PROJECT_NAME = os.getenv("MAIN_PROJECT_NAME")
MCPAS_TCR = os.getenv("MCPAS_TCR_ARTIFACT")
VDJDB_BETA = os.getenv("VDJDB_BETA_ARTIFACT")
VDJDB_PAIRED = os.getenv("VDJDB_PAIRED_ARTIFACT")
wandb.init(project=PROJECT_NAME, job_type="data-Stitchr-pre-processing")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33meggervali[0m ([33mba-zhaw[0m). Use [1m`wandb login --relogin`[0m to force relogin


Here we first create only beta-chain,only alpha-chain or only paired-chain datasets and store them in a .tsv!

This first try is by using only paired data! Then we transform the dataset according to the documentation so that it fits the [template for the documentation](https://github.com/JamieHeather/stitchr/tree/main/templates)

In [4]:
vdjdb_paired_artifact = wandb.use_artifact(f"{VDJDB_PAIRED}:latest")
vdjdb_paired_table = vdjdb_paired_artifact.get(f"{VDJDB_PAIRED}_table.table.json")
vdjdb_paired_df = pd.DataFrame(data=vdjdb_paired_table.data, columns=vdjdb_paired_table.columns)
vdjdb_paired_df

[34m[1mwandb[0m: Downloading large artifact VDJdb_paired_only:latest, 54.56MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:2.7


Unnamed: 0,complex.id,Gene,CDR3,V,J,Species,MHC A,MHC B,MHC class,Epitope,Epitope gene,Epitope species,Reference,Method,Meta,CDR3fix,Score
0,1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSYLPGQGDHYSNQPQHF"", ""cdr3_old"": ""...",2
1,2,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSFEPGQGFYSNQPQHF"", ""cdr3_old"": ""C...",2
2,3,TRB,CASSYEPGQVSHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSYEPGQVSHYSNQPQHF"", ""cdr3_old"": ""...",2
3,4,TRB,CASSALASLNEQFF,TRBV14*01,TRBJ2-1*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSALASLNEQFF"", ""cdr3_old"": ""CASSAL...",2
4,5,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEQGGL,Nef,HIV-1,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CASSYLPGQGDHYSNQPQHF"", ""cdr3_old"": ""...",2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56233,30590,TRA,CMDEGGSNYKLTF,TRAV26-1*01,TRAJ53*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,PQPELPYPQPQL,Gluten,Wheat,PMID:33927715,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD4+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CMDEGGSNYKLTF"", ""cdr3_old"": ""CMDEGGS...",0
56234,30591,TRA,CSLYNNNDMRF,TRAV26-1*01,TRAJ43*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,PQPELPYPQPQL,Gluten,Wheat,PMID:33927715,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD4+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CSLYNNNDMRF"", ""cdr3_old"": ""CSLYNNNDM...",0
56235,30592,TRA,CALSTDSWGKLQF,TRAV6*01,TRAJ24*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,PQQPFPQPEQPFP,Gluten,Wheat,PMID:33927715,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD4+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CALSTDSWGKLQF"", ""cdr3_old"": ""CALSTDS...",0
56236,30593,TRA,CAPQGATNKLIF,TRAV12-2*01,TRAJ32*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,PQQPFPQPEQPFP,Gluten,Wheat,PMID:33927715,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD4+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CAPQGATNKLIF"", ""cdr3_old"": ""CAPQGATN...",2


In [5]:
# Splitting the DataFrame into TRB and TRA based on the line transition
line_transition = 28120  # Line where information about beta chains termiante
vdjdb_TRB_df = vdjdb_paired_df.iloc[:line_transition-1].copy()  # TRB chains information
vdjdb_TRA_df = vdjdb_paired_df.iloc[line_transition-1:].copy()  # TRA chains information

# Renaming columns for TRB DataFrame
vdjdb_TRB_df.rename(columns={
    "complex.id": "TCR_name", 
    "V": "TRBV", 
    "J": 'TRBJ', 
    'CDR3': 'TRB_CDR3'
}, inplace=True)

vdjdb_TRA_df.rename(columns={
    'complex.id': 'TCR_name', 
    'V': 'TRAV', 
    'J': 'TRAJ', 
    'CDR3': 'TRA_CDR3'
}, inplace=True)

In [6]:
vdjdb_TRB_df.columns

Index(['TCR_name', 'Gene', 'TRB_CDR3', 'TRBV', 'TRBJ', 'Species', 'MHC A',
       'MHC B', 'MHC class', 'Epitope', 'Epitope gene', 'Epitope species',
       'Reference', 'Method', 'Meta', 'CDR3fix', 'Score'],
      dtype='object')

In [7]:
vdjdb_TRA_df.columns

Index(['TCR_name', 'Gene', 'TRA_CDR3', 'TRAV', 'TRAJ', 'Species', 'MHC A',
       'MHC B', 'MHC class', 'Epitope', 'Epitope gene', 'Epitope species',
       'Reference', 'Method', 'Meta', 'CDR3fix', 'Score'],
      dtype='object')

In [8]:
vdjdb_paired_one_line_df = pd.merge(vdjdb_TRB_df, vdjdb_TRA_df, on=['TCR_name', "Species", "MHC A", "MHC B", "MHC class", "Epitope", "Epitope gene", "Epitope species"])
vdjdb_paired_one_line_df.rename(columns={
    'Gene_x': 'Gene_TRB',
    'Gene_y': 'Gene_TRA',
    'Reference_x': 'Reference_TRB',
    'Method_x': 'Method_TRB',
    'Meta_x': 'Meta_TRB',
    'CDR3fix_x': 'CDR3fix_TRB',
    'Score_x': 'Score_TRB',
    'Reference_y': 'Reference_TRA',
    'Method_y': 'Method_TRA',
    'Meta_y': 'Meta_TRA',
    'CDR3fix_y': 'CDR3fix_TRA',
    'Score_y': 'Score_TRA',
}, inplace=True)
# complete dataframe with all columns (Stitchr + native from VDJdb) => However, not needed so no save neccessary until now!
# vdjdb_paired_one_line_df.to_csv("../data/VDJdb/temp/vdjdb_paired_one_line.tsv", sep="\t", index=False)
vdjdb_paired_one_line_df

Unnamed: 0,TCR_name,Gene_TRB,TRB_CDR3,TRBV,TRBJ,Species,MHC A,MHC B,MHC class,Epitope,...,Score_TRB,Gene_TRA,TRA_CDR3,TRAV,TRAJ,Reference_TRA,Method_TRA,Meta_TRA,CDR3fix_TRA,Score_TRA
0,1,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,2,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CIVRAPGRADMRF"", ""cdr3_old"": ""CIVRAPG...",2
1,2,TRB,CASSFEPGQGFYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,2,TRA,CAVPSGAGSYQLTF,TRAV20*01,TRAJ28*01,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CAVPSGAGSYQLTF"", ""cdr3_old"": ""CAVPSG...",2
2,3,TRB,CASSYEPGQVSHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,2,TRA,CAVKASGSRLT,TRAV2*01,,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CAVKASGSRLT"", ""cdr3_old"": ""CAVKASGSR...",2
3,4,TRB,CASSALASLNEQFF,TRBV14*01,TRBJ2-1*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEKGGL,...,2,TRA,CAYRPPGTYKYIF,TRAV38-2/DV8*01,TRAJ40*01,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CAYRPPGTYKYIF"", ""cdr3_old"": ""CAYRPPG...",2
4,5,TRB,CASSYLPGQGDHYSNQPQHF,TRBV13*01,TRBJ1-5*01,HomoSapiens,HLA-B*08,B2M,MHCI,FLKEQGGL,...,2,TRA,CIVRAPGRADMRF,TRAV26-1*01,TRAJ43*01,PMID:15596521,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD8+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CIVRAPGRADMRF"", ""cdr3_old"": ""CIVRAPG...",2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28114,30590,TRB,CASSVRSTDTQYF,TRBV7-2*01,TRBJ2-3*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,PQPELPYPQPQL,...,0,TRA,CMDEGGSNYKLTF,TRAV26-1*01,TRAJ53*01,PMID:33927715,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD4+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CMDEGGSNYKLTF"", ""cdr3_old"": ""CMDEGGS...",0
28115,30591,TRB,CASSLRYTDTQYF,TRBV7-2*01,TRBJ2-3*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,PQPELPYPQPQL,...,0,TRA,CSLYNNNDMRF,TRAV26-1*01,TRAJ43*01,PMID:33927715,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD4+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CSLYNNNDMRF"", ""cdr3_old"": ""CSLYNNNDM...",0
28116,30592,TRB,CASSPGQGGDNEQFF,TRBV7-3*01,TRBJ2-1*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,PQQPFPQPEQPFP,...,0,TRA,CALSTDSWGKLQF,TRAV6*01,TRAJ24*01,PMID:33927715,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD4+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CALSTDSWGKLQF"", ""cdr3_old"": ""CALSTDS...",0
28117,30593,TRB,CASSLGAGGQETQYF,TRBV5-1*01,TRBJ2-5*01,HomoSapiens,HLA-DQA1*05:01,HLA-DQB1*02:01,MHCII,PQQPFPQPEQPFP,...,2,TRA,CAPQGATNKLIF,TRAV12-2*01,TRAJ32*01,PMID:33927715,"{""frequency"": """", ""identification"": ""tetramer-...","{""cell.subset"": ""CD4+"", ""clone.id"": """", ""donor...","{""cdr3"": ""CAPQGATNKLIF"", ""cdr3_old"": ""CAPQGATN...",2


In [9]:
vdjdb_paired_one_line_df['TRA_leader'] = vdjdb_paired_one_line_df['TRAV'] + "(L)"
vdjdb_paired_one_line_df['TRB_leader'] = vdjdb_paired_one_line_df['TRBV'] + "(L)"

for new_column in ['TRAC', 'TRBC', 'Linker', 'Link_order', 'TRA_5_prime_seq', 'TRA_3_prime_seq', 'TRB_5_prime_seq', 'TRB_3_prime_seq']:
    vdjdb_paired_one_line_df[new_column] = np.nan # TODO check this

template_columns_order = [
    'TCR_name', 'TRAV', 'TRAJ', 'TRA_CDR3', 'TRBV', 'TRBJ', 'TRB_CDR3',
    'TRAC', 'TRBC', 'TRA_leader', 'TRB_leader', 'Linker', 'Link_order',
    'TRA_5_prime_seq', 'TRA_3_prime_seq', 'TRB_5_prime_seq', 'TRB_3_prime_seq'
]

vdjdb_paired_only_Stitchr_schema_df = vdjdb_paired_one_line_df[template_columns_order]

vdjdb_paired_only_Stitchr_schema_df.to_csv("../data/VDJdb/temp/vdjdb_paired_only_Stitchr_schema.tsv", sep="\t", index=False)

vdjdb_paired_only_Stitchr_schema_df

Unnamed: 0,TCR_name,TRAV,TRAJ,TRA_CDR3,TRBV,TRBJ,TRB_CDR3,TRAC,TRBC,TRA_leader,TRB_leader,Linker,Link_order,TRA_5_prime_seq,TRA_3_prime_seq,TRB_5_prime_seq,TRB_3_prime_seq
0,1,TRAV26-1*01,TRAJ43*01,CIVRAPGRADMRF,TRBV13*01,TRBJ1-5*01,CASSYLPGQGDHYSNQPQHF,,,TRAV26-1*01(L),TRBV13*01(L),,,,,,
1,2,TRAV20*01,TRAJ28*01,CAVPSGAGSYQLTF,TRBV13*01,TRBJ1-5*01,CASSFEPGQGFYSNQPQHF,,,TRAV20*01(L),TRBV13*01(L),,,,,,
2,3,TRAV2*01,,CAVKASGSRLT,TRBV13*01,TRBJ1-5*01,CASSYEPGQVSHYSNQPQHF,,,TRAV2*01(L),TRBV13*01(L),,,,,,
3,4,TRAV38-2/DV8*01,TRAJ40*01,CAYRPPGTYKYIF,TRBV14*01,TRBJ2-1*01,CASSALASLNEQFF,,,TRAV38-2/DV8*01(L),TRBV14*01(L),,,,,,
4,5,TRAV26-1*01,TRAJ43*01,CIVRAPGRADMRF,TRBV13*01,TRBJ1-5*01,CASSYLPGQGDHYSNQPQHF,,,TRAV26-1*01(L),TRBV13*01(L),,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28114,30590,TRAV26-1*01,TRAJ53*01,CMDEGGSNYKLTF,TRBV7-2*01,TRBJ2-3*01,CASSVRSTDTQYF,,,TRAV26-1*01(L),TRBV7-2*01(L),,,,,,
28115,30591,TRAV26-1*01,TRAJ43*01,CSLYNNNDMRF,TRBV7-2*01,TRBJ2-3*01,CASSLRYTDTQYF,,,TRAV26-1*01(L),TRBV7-2*01(L),,,,,,
28116,30592,TRAV6*01,TRAJ24*01,CALSTDSWGKLQF,TRBV7-3*01,TRBJ2-1*01,CASSPGQGGDNEQFF,,,TRAV6*01(L),TRBV7-3*01(L),,,,,,
28117,30593,TRAV12-2*01,TRAJ32*01,CAPQGATNKLIF,TRBV5-1*01,TRBJ2-5*01,CASSLGAGGQETQYF,,,TRAV12-2*01(L),TRBV5-1*01(L),,,,,,


In [10]:
vdjdb_paired_only_Stitchr_schema_df.columns

Index(['TCR_name', 'TRAV', 'TRAJ', 'TRA_CDR3', 'TRBV', 'TRBJ', 'TRB_CDR3',
       'TRAC', 'TRBC', 'TRA_leader', 'TRB_leader', 'Linker', 'Link_order',
       'TRA_5_prime_seq', 'TRA_3_prime_seq', 'TRB_5_prime_seq',
       'TRB_3_prime_seq'],
      dtype='object')

Below is the desired schema, according to the documentation! We need to prepare the df to fit into this schema.

In [11]:
template_stitchr = pd.read_csv("../data/StitchrOutputs/templates/thimble_input_example_TRA-TRB.tsv", sep="\t")
template_stitchr.columns

Index(['TCR_name', 'TRAV', 'TRAJ', 'TRA_CDR3', 'TRBV', 'TRBJ', 'TRB_CDR3',
       'TRAC', 'TRBC', 'TRA_leader', 'TRB_leader', 'Linker', 'Link_order',
       'TRA_5_prime_seq', 'TRA_3_prime_seq', 'TRB_5_prime_seq',
       'TRB_3_prime_seq'],
      dtype='object')

In [12]:
str(template_stitchr.columns) == str(vdjdb_paired_only_Stitchr_schema_df.columns)

True

This command needs to be adjusted if not paired-chain data gets passed!

- AB -> paired (alpha & beta chains)
- B -> ONLY beta chains
- A -> ONLY alpha chains

In [13]:
from_paired_vdjdb_path = "../data/VDJdb/temp/vdjdb_paired_only_Stitchr_schema.tsv"
to_paired_vdjdb_path =  "../data/StitchrOutputs/VDJdb_Stitchr/VDJdb_paired_Stitchr_no_epitopes"
command = ['thimble', '-in', from_paired_vdjdb_path, '-r', 'AB', "-s", "HUMAN", "-o", to_paired_vdjdb_path]

In [14]:
result = subprocess.run(command, capture_output=True, text=True)

# Check if the command was executed successfully
if result.returncode == 0:
    print("Command executed successfully.")
    # If you want to print the output
    print("Output:", result.stdout)
else:
    print("Error in command execution.")
    # If you want to print the error
    print("Error:", result.stderr)

Command executed successfully.
Output: Took 24.82 seconds



In [15]:
df_vdjdb_stitchr = pd.read_csv(to_paired_vdjdb_path+".tsv", sep="\t")
df_vdjdb_stitchr

Unnamed: 0,TCR_name,TRA_nt,TRB_nt,TRA_aa,TRB_aa,TRAV,TRAJ,TRA_CDR3,TRBV,TRBJ,...,TRB_leader,Linker,Link_order,TRA_5_prime_seq,TRA_3_prime_seq,TRB_5_prime_seq,TRB_3_prime_seq,Linked_nt,Linked_aa,Warnings/Errors
0,1,ATGAGGCTGGTGGCAAGAGTAACTGTGTTTCTGACCTTTGGAACTA...,ATGCTTAGTCCTGACCTGCCTGACTCTGCCTGGAACACCAGGCTCC...,MRLVARVTVFLTFGTIIDAKTTQPPSMDCAEGRAANLPCNHSTISG...,MLSPDLPDSAWNTRLLCHVMLCLLGAVSVAAGVIQSPRHLIKEKRE...,TRAV26-1*01,TRAJ43*01,CIVRAPGRADMRF,TRBV13*01,TRBJ1-5*01,...,TRBV13*01(L),,,,,,,,,(TRA) Cannot find the sequence of the requeste...
1,2,ATGGAGAAAATGTTGGAGTGTGCATTCATAGTCTTGTGGCTTCAGC...,ATGCTTAGTCCTGACCTGCCTGACTCTGCCTGGAACACCAGGCTCC...,MEKMLECAFIVLWLQLGWLSGEDQVTQSPEALRLQEGESSSLNCSY...,MLSPDLPDSAWNTRLLCHVMLCLLGAVSVAAGVIQSPRHLIKEKRE...,TRAV20*01,TRAJ28*01,CAVPSGAGSYQLTF,TRBV13*01,TRBJ1-5*01,...,TRBV13*01(L),,,,,,,,,(TRA) Cannot find the sequence of the requeste...
2,3,,ATGCTTAGTCCTGACCTGCCTGACTCTGCCTGGAACACCAGGCTCC...,,MLSPDLPDSAWNTRLLCHVMLCLLGAVSVAAGVIQSPRHLIKEKRE...,TRAV2*01,,CAVKASGSRLT,TRBV13*01,TRBJ1-5*01,...,TRBV13*01(L),,,,,,,,,(TRA) Error: a JOINING sequence region has not...
3,4,ATGGCATGCCCTGGCTTCCTGTGGGCACTTGTGATCTCCACCTGTC...,ATGGTTTCCAGGCTTCTCAGTTTAGTGTCCCTTTGTCTCCTGGGAG...,MACPGFLWALVISTCLEFSMAQTVTQSQPEMSVQEAETVTLSCTYD...,MVSRLLSLVSLCLLGAKHIEAGVTQFPSHSVIEKGQTVTLRCDPIS...,TRAV38-2/DV8*01,TRAJ40*01,CAYRPPGTYKYIF,TRBV14*01,TRBJ2-1*01,...,TRBV14*01(L),,,,,,,,,(TRA) Cannot find the sequence of the requeste...
4,5,ATGAGGCTGGTGGCAAGAGTAACTGTGTTTCTGACCTTTGGAACTA...,ATGCTTAGTCCTGACCTGCCTGACTCTGCCTGGAACACCAGGCTCC...,MRLVARVTVFLTFGTIIDAKTTQPPSMDCAEGRAANLPCNHSTISG...,MLSPDLPDSAWNTRLLCHVMLCLLGAVSVAAGVIQSPRHLIKEKRE...,TRAV26-1*01,TRAJ43*01,CIVRAPGRADMRF,TRBV13*01,TRBJ1-5*01,...,TRBV13*01(L),,,,,,,,,(TRA) Cannot find the sequence of the requeste...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28114,30590,ATGAGGCTGGTGGCAAGAGTAACTGTGTTTCTGACCTTTGGAACTA...,ATGGGCACCAGGCTCCTCTTCTGGGTGGCCTTCTGTCTCCTGGGGG...,MRLVARVTVFLTFGTIIDAKTTQPPSMDCAEGRAANLPCNHSTISG...,MGTRLLFWVAFCLLGADHTGAGVSQSPSNKVTEKGKDVELRCDPIS...,TRAV26-1*01,TRAJ53*01,CMDEGGSNYKLTF,TRBV7-2*01,TRBJ2-3*01,...,TRBV7-2*01(L),,,,,,,,,(TRA) Cannot find the sequence of the requeste...
28115,30591,ATGAGGCTGGTGGCAAGAGTAACTGTGTTTCTGACCTTTGGAACTA...,ATGGGCACCAGGCTCCTCTTCTGGGTGGCCTTCTGTCTCCTGGGGG...,MRLVARVTVFLTFGTIIDAKTTQPPSMDCAEGRAANLPCNHSTISG...,MGTRLLFWVAFCLLGADHTGAGVSQSPSNKVTEKGKDVELRCDPIS...,TRAV26-1*01,TRAJ43*01,CSLYNNNDMRF,TRBV7-2*01,TRBJ2-3*01,...,TRBV7-2*01(L),,,,,,,,,(TRA) Cannot find the sequence of the requeste...
28116,30592,ATGGAGTCATTCCTGGGAGGTGTTTTGCTGATTTTGTGGCTTCAAG...,ATGGGCACCAGGCTCCTCTGCTGGGCAGCCCTGTGCCTCCTGGGGG...,MESFLGGVLLILWLQVDWVKSQKIEQNSEALNIQEGKTATLTCNYT...,MGTRLLCWAALCLLGADHTGAGVSQTPSNKVTEKGKYVELRCDPIS...,TRAV6*01,TRAJ24*01,CALSTDSWGKLQF,TRBV7-3*01,TRBJ2-1*01,...,TRBV7-3*01(L),,,,,,,,,(TRA) Cannot find the sequence of the requeste...
28117,30593,ATGAAATCCTTGAGAGTTTTACTAGTGATCCTGTGGCTTCAGTTGA...,ATGGGCTCCAGGCTGCTCTGTTGGGTGCTGCTTTGTCTCCTGGGAG...,MKSLRVLLVILWLQLSWVWSQQKEVEQNSGPLSVPEGAIASLNCTY...,MGSRLLCWVLLCLLGAGPVKAGVTQTPRYLIKTRGQQVTLSCSPIS...,TRAV12-2*01,TRAJ32*01,CAPQGATNKLIF,TRBV5-1*01,TRBJ2-5*01,...,TRBV5-1*01(L),,,,,,,,,(TRA) Cannot find the sequence of the requeste...


As we needed to remove the epitopes to pass it through the Stitchr package, we need to append them back to the dataset!

In [18]:
vdjdb_paired_epitopes_df = vdjdb_paired_one_line_df["Epitope"]
vdjdb_paired_epitopes_df

0             FLKEKGGL
1             FLKEKGGL
2             FLKEKGGL
3             FLKEKGGL
4             FLKEQGGL
             ...      
28114     PQPELPYPQPQL
28115     PQPELPYPQPQL
28116    PQQPFPQPEQPFP
28117    PQQPFPQPEQPFP
28118    PQQPFPQPEQPFP
Name: Epitope, Length: 28119, dtype: object

In [21]:
vdjdb_paired_only_Stitchr_schema_with_epitopes_df = pd.concat([df_vdjdb_stitchr, vdjdb_paired_epitopes_df], axis=1)
# vdjdb_paired_only_Stitchr_schema_df.to_csv("../data/VDJdb/temp/vdjdb_paired_only_Stitchr_schema.tsv", sep="\t", index=False)
vdjdb_paired_only_Stitchr_schema_with_epitopes_df.to_csv("../data/VDJdb/stitchr_and_epitopes/VDJdb_paired_Stitchr_with_epitopes.tsv", sep="\t", index=False)

-------------------------------------------------------------------------------------------------
After doing the paired dataset, we want to to the same pre-processing step (Stitchr) with the beta-only!

In [None]:
vdjdb_beta_artifact = wandb.use_artifact(f"{VDJDB_BETA}:latest")
vdjdb_beta_table = vdjdb_beta_artifact.get(f"{VDJDB_BETA}_table.table.json")
vdjdb_beta_df = pd.DataFrame(data=vdjdb_beta_table.data, columns=vdjdb_beta_table.columns)
vdjdb_beta_df

In [None]:
vdjdb_beta_only_df = vdjdb_beta_df

In [None]:
vdjdb_beta_only_df.rename(columns={
    "complex.id": "TCR_name", 
    "V": "TRBV", 
    "J": 'TRBJ', 
    'CDR3': 'TRB_CDR3'
}, inplace=True)
vdjdb_beta_only_df.columns

In [None]:
template_stitchr.columns

In [None]:
vdjdb_beta_only_df['TRB_leader'] = vdjdb_beta_only_df['TRBV'] + "(L)"
vdjdb_beta_only_df

In [None]:
vdjdb_beta_only_df.columns

In [None]:
template_stitchr.columns

In [None]:
# Assuming your DataFrame is named vdjdb_beta_only_df
# Define the new column order
new_columns = [
    'TCR_name', 'TRAV', 'TRAJ', 'TRA_CDR3', 'TRBV', 'TRBJ', 'TRB_CDR3',
    'TRAC', 'TRBC', 'TRA_leader', 'TRB_leader', 'Linker', 'Link_order',
    'TRA_5_prime_seq', 'TRA_3_prime_seq', 'TRB_5_prime_seq', 'TRB_3_prime_seq'
]

# Map your original columns to the new template, when applicable
column_mapping = {
    "TCR_name": "TCR_name",
    'TRB_CDR3': 'TRB_CDR3', # Example: Map 'TRB_CDR3' from the original DataFrame to 'TRB_CDR3' in the new DataFrame
    "TRBV": "TRBV", 
    "TRBJ": "TRBJ", 
    "TRB_leader": "TRB_leader"
}

# Use .rename() to change column names based on the mapping above
# This step is optional and depends on whether you need to rename any column to match the new template
vdjdb_beta_only_df = vdjdb_beta_only_df.rename(columns=column_mapping)

# Discard any columns not in the new column list and add missing ones with default values
vdjdb_beta_only_Stitchr_schema_df = vdjdb_beta_only_df.reindex(columns=new_columns, fill_value=np.nan)

# At this point, vdjdb_beta_only_df_rearranged contains your DataFrame rearranged with the new column order,
# discarding columns not in the template, and adding new columns with default values np.nan where necessary.
vdjdb_beta_only_Stitchr_schema_df.columns

In [None]:
str(template_stitchr.columns) == str(vdjdb_beta_only_Stitchr_schema_df.columns)

In [None]:
vdjdb_beta_only_Stitchr_schema_df.to_csv("../data/VDJdb/temp/vdjdb_beta_only_Stitchr_schema.tsv", sep="\t", index=False)

In [None]:
from_beta_vdjdb_path = "../data/VDJdb/temp/vdjdb_beta_only_Stitchr_schema.tsv"
to_beta_vdjdb_path =  "../data/StitchrOutputs/VDJdb_Stitchr/VDJdb_beta_Stitchr"
command = ['thimble', '-in', from_beta_vdjdb_path, '-r', 'B', "-s", "HUMAN", "-o", to_beta_vdjdb_path]

In [None]:
result = subprocess.run(command, capture_output=True, text=True)

# Check if the command was executed successfully
if result.returncode == 0:
    print("Command executed successfully.")
    # If you want to print the output
    print("Output:", result.stdout)
else:
    print("Error in command execution.")
    # If you want to print the error
    print("Error:", result.stderr)

-------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------

In the follwoing we want to create the same output for the McPAS-TCR dataset!

In [None]:
mcpastcr_artifact = wandb.use_artifact(f"{MCPAS_TCR}:latest")
mcpastcr_table = mcpastcr_artifact.get(f"{MCPAS_TCR}_table.table.json")
mcpastcr_artifact_dir = mcpastcr_artifact.download()
mcpastcr_df = pd.DataFrame(data=mcpastcr_table.data, columns=mcpastcr_table.columns)
mcpastcr_df

In [None]:
mcpastcr_df.columns

In [None]:
template_stitchr.columns

In [None]:
rename_mapping = {
    'CDR3.alpha.aa': 'TRA_CDR3',
    'CDR3.beta.aa': 'TRB_CDR3',
}
mcpastcr_df_renamed = mcpastcr_df.rename(columns=rename_mapping)
mcpastcr_df_renamed.columns

In [None]:
mcpastcr_df_renamed

In [None]:
mcpastcr_only_human = mcpastcr_df_renamed[mcpastcr_df_renamed["Species"] == "Human"]
mcpastcr_only_human

In [None]:
# Define the new column order and include new columns
new_columns = [
    'TCR_name', 'TRAV', 'TRAJ', 'TRA_CDR3', 'TRBV', 'TRBJ', 'TRB_CDR3',
    'TRAC', 'TRBC', 'TRA_leader', 'TRB_leader', 'Linker', 'Link_order',
    'TRA_5_prime_seq', 'TRA_3_prime_seq', 'TRB_5_prime_seq', 'TRB_3_prime_seq'
]

# Map existing columns to the new schema, filling non-existing columns with NaN
# Note: This template uses placeholders for TRA_CDR3 and TRB_CDR3, which you might need to adjust
# based on your actual column names or desired mappings.
mcpastcr_df_template = mcpastcr_df_renamed.reindex(columns=new_columns).assign(
    TRAC=np.nan,
    TRBC=np.nan,
    Linker=np.nan,
    Link_order=np.nan,
    TRA_5_prime_seq=np.nan,
    TRA_3_prime_seq=np.nan,
    TRB_5_prime_seq=np.nan,
    TRB_3_prime_seq=np.nan
)

# For the TRA_leader and TRB_leader, we add "(L)" to the TRAV and TRBV values, respectively
# Adjust these lines if TRAV and TRBV are not the exact column names or if additional logic is needed
mcpastcr_df_template['TRA_leader'] = mcpastcr_df['TRAV'] + "(L)"
mcpastcr_df_template['TRB_leader'] = mcpastcr_df['TRBV'] + "(L)"

# here replace the string 'nan' with np.nan as np.nan should be used with pandas dataframes
mcpastcr_df_template = mcpastcr_df_template.replace('nan', np.nan) 
mcpastcr_df_template['TCR_name'] = range(1, len(mcpastcr_df) + 1) # use index as TCR_name
mcpastcr_df_template

In [None]:
# mcpastcr_df_template.to_csv("./McPAS-TCR_complete_onlyHumans.tsv", sep="\t", index=False)
mcpastcr_df_template.columns

In [None]:
# Define columns related to alpha and beta chains for filtering
alpha_columns = ['TRA_CDR3']
beta_columns = ['TRB_CDR3']

# Check for presence of non-NaN values in alpha and beta columns
alpha_present = mcpastcr_df_template[alpha_columns].notna().any(axis=1)
beta_present = mcpastcr_df_template[beta_columns].notna().any(axis=1)

alpha_present

In [None]:
beta_present

The output above indicates that McPAS-TCR only contains paired or beta-only information!

In [None]:
alpha_only_df = mcpastcr_df_template[alpha_present & ~beta_present]
beta_only_df = mcpastcr_df_template[~alpha_present & beta_present]
paired_df = mcpastcr_df_template[alpha_present & beta_present]

In [None]:
beta_only_df

In [None]:
beta_only_df.to_csv("../data/McPAS-TCR/temp/McPAS-TCR_beta_only_Stitchr_schema.tsv", sep="\t", index=False)

**THIS IS OFF!!! => TRBV and TRBJ MUST HAVE SOME VALID VALUES!!**

=> after considerations i think now that it is possible that some V and/or J values are NaN even if the CDR3 region is not NaN. In VDJdb some have this as well. 

In [None]:
paired_df

In [None]:
paired_df.to_csv("../data/McPAS-TCR/temp/McPAS-TCR_paired_only_Stitchr_schema.tsv", sep="\t", index=False)

What confuses me a bit is this: 
paired + beta-only = 12739 + 24875 = 37'614

Total McPAS-TCR = 39985 
=> where are the "missing" rows? What do they contain that they nowhere got included? 

In [None]:
alpha_only_df

even if they would be alpha-only we do not match the toal number... 

paired + beta-only + alpha-only = 12739 + 24875 + 1970 = 39'584

-----------------------------------------------------------------------------------------------------------

In the following we pass the McPAS-TCR data through the Stitchr library!

In [None]:
from_paired_mcpas_tcr_path = "../data/McPAS-TCR/temp/McPAS-TCR_paired_only_Stitchr_schema.tsv"
to_paired_mcpas_tcr_path =  "../data/StitchrOutputs/McPAS-TCR_Stitchr/McPAS-TCR_paired_Stitchr"
command = ['thimble', '-in', from_paired_mcpas_tcr_path, '-r', 'AB', "-s", "HUMAN", "-o", to_paired_mcpas_tcr_path]

In [None]:
result = subprocess.run(command, capture_output=True, text=True)

# Check if the command was executed successfully
if result.returncode == 0:
    print("Command executed successfully.")
    # If you want to print the output
    print("Output:", result.stdout)
else:
    print("Error in command execution.")
    # If you want to print the error
    print("Error:", result.stderr)

In [None]:
from_beta_mcpas_tcr_path = "../data/McPAS-TCR/temp/McPAS-TCR_beta_only_Stitchr_schema.tsv"
to_beta_mcpas_tcr_path =  "../data/StitchrOutputs/McPAS-TCR_Stitchr/McPAS-TCR_beta_Stitchr"
command = ['thimble', '-in', from_beta_mcpas_tcr_path, '-r', 'B', "-s", "HUMAN", "-o", to_beta_mcpas_tcr_path]

In [None]:
result = subprocess.run(command, capture_output=True, text=True)

# Check if the command was executed successfully
if result.returncode == 0:
    print("Command executed successfully.")
    # If you want to print the output
    print("Output:", result.stdout)
else:
    print("Error in command execution.")
    # If you want to print the error
    print("Error:", result.stderr)