In [1]:
import os

# os.chdir("/scratch/project/tcr_neoantigen/misc/jaz/scripts")

from pathlib import Path
from pyfaidx import Fasta
from tqdm import tqdm

from _utils import (
    read_and_filter,
    extract_exon_info,
    create_minigenes,
    collapse_output,
    replace_bbsl_restriction_site,
)

# Root folders for the reference files and the nextNEOpi results, given
# relative to the current working directory.
# Absolute locations used previously, kept for reference:
#   HG38FOLDER  -> /scratch/project/tcr_neoantigen/resources/references/hg38
#   INPUTFOLDER -> /scratch/project/tcr_neoantigen/results/cSCC_BC_seq_data_10_patients/nextNEOpi
HG38FOLDER = Path("..") / "resources"
INPUTFOLDER = Path("..") / "data" / "nextNEOpi"

In [2]:
# Load the reference genome (FASTA) and the exon annotation (GTF).
# Both files are read from directly inside HG38FOLDER. The earlier layout kept
# them in sub-folders instead:
#   gdc/GRCh38.d1.vd1/fasta/GRCh38.d1.vd1.fa
#   annotation/gencode.v33.primary_assembly.annotation.gtf
fasta_file = HG38FOLDER.joinpath("GRCh38.d1.vd1.fa")
gtf_file = HG38FOLDER.joinpath("gencode.v33.primary_assembly.annotation.gtf")

# pyfaidx handle on the genome sequence
refgen = Fasta(filename=fasta_file)
# exon information derived from the GTF by the project helper
exon_info = extract_exon_info(gtf_file)

In [3]:
# Sanity check: display the type of the loaded reference genome object.
type(refgen)

pyfaidx.Fasta

In [4]:
# Sanity check: display the container type returned by extract_exon_info.
type(exon_info)

collections.defaultdict

In [5]:
# Sample identifiers to process, one per line. Each identifier is used as a
# folder name below INPUTFOLDER and as the prefix of its VEP table file name.
samples = """
    2020135
    2020239_WO1
    2020246_NO1
    2020260_WO1
    2020281_WO1
    2021111_MO1
    DES001
    DES002
    DES002_001
    DES002_002
    DES010
""".split()

In [6]:
# Read the high-confidence VEP table of every sample through read_and_filter,
# keyed by sample identifier.
results = {
    sample: read_and_filter(
        INPUTFOLDER
        / sample
        / "analyses"
        / sample
        / "05_vep"
        / "tables"
        / "high_confidence"
        / f"{sample}_hc_vep.txt"
    )
    for sample in samples
}

# First pass: size of each filtered table.
for sample in samples:
    filtered = results[sample]
    print(sample, filtered.shape)

# Second pass: variant classes present in each filtered table.
for sample in samples:
    variant_classes = results[sample].VARIANT_CLASS.unique()
    print(sample, variant_classes)

2020135 (107, 91)
2020239_WO1 (95, 91)
2020246_NO1 (70, 91)
2020260_WO1 (27, 91)
2020281_WO1 (29, 91)
2021111_MO1 (32, 91)
DES001 (4164, 91)
DES002 (1240, 91)
DES002_001 (5481, 91)
DES002_002 (0, 91)
DES010 (199, 91)
2020135 ['SNV' 'deletion']
2020239_WO1 ['SNV' 'deletion']
2020246_NO1 ['SNV' 'deletion']
2020260_WO1 ['SNV' 'deletion']
2020281_WO1 ['SNV' 'deletion' 'insertion']
2021111_MO1 ['SNV' 'insertion']
DES001 ['SNV' 'substitution' 'deletion' 'insertion']
DES002 ['SNV' 'substitution' 'deletion' 'insertion']
DES002_001 ['SNV' 'substitution' 'deletion' 'insertion']
DES002_002 []
DES010 ['SNV' 'substitution']


In [7]:
# Build the minigenes for every sample and collect them in one dictionary.
# The value returned by create_minigenes is fed back in as `out_dict` on the
# next iteration, so output from earlier samples is carried forward.
# NOTE(review): whether create_minigenes mutates `out_dict` in place or returns
# a new dict is not visible from this notebook — confirm in _utils.
# NOTE(review): DES002_002 has 0 rows after filtering (shape (0, 91) printed
# earlier); confirm create_minigenes handles an empty table as intended.
final_results = {}
for sample in samples:
    final_results = create_minigenes(
        df=results[sample],  # filtered VEP table of this sample
        sample=sample,
        exon_info=exon_info,  # exon annotation extracted from the GTF
        fasta=refgen,  # reference genome handle
        out_dict=final_results,  # accumulator from the previous iterations
    )

100%|██████████| 5/5 [00:00<00:00, 1760.54it/s]
100%|██████████| 102/102 [00:00<00:00, 3628.51it/s]
100%|██████████| 7/7 [00:00<00:00, 967.10it/s]
100%|██████████| 88/88 [00:00<00:00, 6361.03it/s]
100%|██████████| 3/3 [00:00<00:00, 1673.26it/s]
100%|██████████| 67/67 [00:00<00:00, 5600.21it/s]
100%|██████████| 2/2 [00:00<00:00, 1779.13it/s]
100%|██████████| 25/25 [00:00<00:00, 6203.49it/s]
100%|██████████| 1/1 [00:00<00:00, 1158.65it/s]
100%|██████████| 4/4 [00:00<00:00, 1341.86it/s]
100%|██████████| 24/24 [00:00<00:00, 5833.52it/s]
100%|██████████| 1/1 [00:00<00:00, 681.11it/s]
100%|██████████| 31/31 [00:00<00:00, 6755.87it/s]
100%|██████████| 4/4 [00:00<00:00, 3105.74it/s]
100%|██████████| 17/17 [00:00<00:00, 1798.36it/s]
100%|██████████| 3711/3711 [00:00<00:00, 6910.73it/s]
100%|██████████| 432/432 [00:00<00:00, 8486.08it/s]
100%|██████████| 1/1 [00:00<00:00, 2568.47it/s]
100%|██████████| 10/10 [00:00<00:00, 1327.69it/s]
100%|██████████| 1107/1107 [00:00<00:00, 9011.78it/s]
100%|███

In [8]:
# Collapse the accumulated minigene dictionary into two tables: one for the
# reference sequences (ref_df) and one for the variant sequences (var_df).
ref_df, var_df = collapse_output(final_results)

In [9]:
# Pass the reference table through replace_bbsl_restriction_site.
# NOTE(review): presumably this is what fills the `modified_minigene` column
# visible when ref_df is displayed — confirm in _utils.
# The result overwrites ref_df, so re-running this cell feeds already-processed
# data back into the helper; re-run the collapse_output cell first.
ref_df = replace_bbsl_restriction_site(ref_df)

In [10]:
# Pass the variant table through replace_bbsl_restriction_site.
# NOTE(review): presumably this is what fills the `modified_minigene` column
# visible when var_df is displayed — confirm in _utils.
# The result overwrites var_df, so re-running this cell feeds already-processed
# data back into the helper; re-run the collapse_output cell first.
var_df = replace_bbsl_restriction_site(var_df)

In [11]:
# Preview the reference (control) minigene table; the notebook display is
# truncated to the first and last rows.
ref_df

Unnamed: 0,reference,variant,amino_acid_ref,amino_acid_var,codon_ref,codon_var,chromosome,mutation_location,mutation_location2,variant_class,gene_id,protein_id,transcript_id,gene_symbol,strand,minigene,minigene_id,modified_minigene
0,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCACTGAGGATGAGGCCCTGGGGACCCCAGAGACTGGGGCTGCCAA...,2020135_14_ENSP00000311684.7_0_0_ref,CCACTGAGGATGAGGCCCTGGGGACCCCAGAGACTGGGGCTGCCAA...
1,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAGG...,2020135_14_ENSP00000311684.7_0_1_ref,CCCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAGG...
2,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GCTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCAG...,2020135_14_ENSP00000311684.7_0_2_ref,GCTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCAG...
3,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GGGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCTG...,2020135_14_ENSP00000311684.7_0_3_ref,GGGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCTG...
4,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,AGCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTGC...,2020135_14_ENSP00000311684.7_0_4_ref,AGCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTGC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39961,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,GCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGGG...,DES010_195_ENSP00000334733.7_1_0_ref,GCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGGG...
39962,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,GGCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGG...,DES010_195_ENSP00000334733.7_2_0_ref,GGCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGG...
39963,CC,AT,G,I,GGc,ATc,chrX,153804140,153804141,substitution,ENSG00000067840,ENSP00000164640,ENST00000164640,PDZD4,-1,GCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGGG...,DES010_197_ENSP00000164640.4_0_0_ref,GCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGGG...
39964,CC,AT,G,I,GGc,ATc,chrX,153804140,153804141,substitution,ENSG00000067840,ENSP00000164640,ENST00000164640,PDZD4,-1,GGCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGG...,DES010_197_ENSP00000164640.4_1_0_ref,GGCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGG...


In [12]:
# Preview the variant minigene table; the notebook display is truncated to the
# first and last rows.
# NOTE(review): the last index shown for var_df is 39959, while ref_df ends at
# 39964 — confirm the two tables are expected to differ in length.
var_df

Unnamed: 0,reference,variant,amino_acid_ref,amino_acid_var,codon_ref,codon_var,chromosome,mutation_location,mutation_location2,variant_class,gene_id,protein_id,transcript_id,gene_symbol,strand,minigene,minigene_id,modified_minigene
0,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCACTGAGGATGAGGCCCTGGGGACCCCAGAGACTGGGGCTGCCAA...,2020135_14_ENSP00000311684.7_0_0_var,CCACTGAGGATGAGGCCCTGGGGACCCCAGAGACTGGGGCTGCCAA...
1,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAGGA...,2020135_14_ENSP00000311684.7_0_1_var,CCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAGGA...
2,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCAGC...,2020135_14_ENSP00000311684.7_0_2_var,CTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCAGC...
3,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCTGA...,2020135_14_ENSP00000311684.7_0_3_var,GGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCTGA...
4,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTGCA...,2020135_14_ENSP00000311684.7_0_4_var,GCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTGCA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39956,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,GCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGGA...,DES010_195_ENSP00000334733.7_1_0_var,GCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGGA...
39957,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,GGCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGG...,DES010_195_ENSP00000334733.7_2_0_var,GGCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGG...
39958,CC,AT,G,I,GGc,ATc,chrX,153804140,153804141,substitution,ENSG00000067840,ENSP00000164640,ENST00000164640,PDZD4,-1,GCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGAT...,DES010_197_ENSP00000164640.4_0_0_var,GCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGAT...
39959,CC,AT,G,I,GGc,ATc,chrX,153804140,153804141,substitution,ENSG00000067840,ENSP00000164640,ENST00000164640,PDZD4,-1,GGCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGA...,DES010_197_ENSP00000164640.4_1_0_var,GGCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGA...


In [13]:
# Write both minigene tables to CSV in the current working directory,
# leaving out the DataFrame index.
for frame, csv_name in (
    (ref_df, "final_minigenes_control_v3.csv"),
    (var_df, "final_minigenes_variant_v3.csv"),
):
    frame.to_csv(csv_name, index=False)