In [1]:
import os

# os.chdir("/scratch/project/tcr_neoantigen/misc/jaz/scripts")

from pathlib import Path
from pyfaidx import Fasta
from tqdm import tqdm

from _utils import (
    find,
    read_and_filter,
    create_result_list,
    extract_result,
    extract_exon_info,
    filter_exon_pos,
    generate_windows,
    get_sequences_indel,
    get_sequences_substitution,
    complementary_sequence,
    reverse_complement,
    flanking_lower_positions,
    # print_windows,
)

# define head paths
# HG38FOLDER = Path("/scratch/project/tcr_neoantigen/resources/references/hg38")
# INPUTFOLDER = Path(
#     "/scratch/project/tcr_neoantigen/results/cSCC_BC_seq_data_10_patients/nextNEOpi"
# )
HG38FOLDER = Path("../resources")
INPUTFOLDER = Path("../data/nextNEOpi")

In [2]:
# import reference and data
# fasta_file = HG38FOLDER / "gdc" / "GRCh38.d1.vd1" / "fasta" / "GRCh38.d1.vd1.fa"
# gtf_file = HG38FOLDER / "annotation" / "gencode.v33.primary_assembly.annotation.gtf"
# refgen = Fasta(filename=fasta_file)
# exon_info = extract_exon_info(gtf_file)
fasta_file = HG38FOLDER / "GRCh38.d1.vd1.fa"
gtf_file = HG38FOLDER / "gencode.v33.primary_assembly.annotation.gtf"
refgen = Fasta(filename=fasta_file)
exon_info = extract_exon_info(gtf_file)

FastaNotFoundError: Cannot read FASTA from file ../resources/GRCh38.d1.vd1.fa

In [None]:
samples = [
    "2020135",
    "2020239_WO1",
    "2020246_NO1",
    "2020260_WO1",
    "2020281_WO1",
    "2021111_MO1",
    "DES001",
    "DES002",
    "DES002_001",
    "DES002_002",
    "DES010",
]

In [None]:
results = {}
for sample in samples:
    file_input_path = (
        INPUTFOLDER
        / sample
        / "analyses"
        / sample
        / "05_vep"
        / "tables"
        / "high_confidence"
        / f"{sample}_hc_vep.txt"
    )
    results[sample] = read_and_filter(file_input_path)

for sample in samples:
    print(sample, results[sample].shape)

for sample in samples:
    print(sample, results[sample].VARIANT_CLASS.unique())

FileNotFoundError: [Errno 2] No such file or directory: '../nextNEOpi/2020135/analyses/2020135/05_vep/tables/high_confidence/2020135_hc_vep.txt'

In [None]:
final_results = {}
for sample in samples:
    df = results[sample].copy()
    mut_dict = create_result_list(df)
    mutations = find(mut_dict["variant_class"], "insertion")
    final_results[sample] = {}
    if len(mutations) > 0:
        for mut in tqdm(mutations):
            final_results[sample][mut] = {}
            final_results[sample][mut]["ref"], final_results[sample][mut]["var"] = (
                get_sequences_indel(
                    mut_info=extract_result(mut_dict, mut),
                    exon_info=exon_info,
                    fasta=refgen,
                )
            )
            final_results[sample][mut]["mut_info"] = extract_result(mut_dict, mut)
    mutations = find(mut_dict["variant_class"], "deletion")
    if len(mutations) > 0:
        for mut in tqdm(mutations):
            final_results[sample][mut] = {}
            final_results[sample][mut]["ref"], final_results[sample][mut]["var"] = (
                get_sequences_indel(
                    mut_info=extract_result(mut_dict, mut),
                    exon_info=exon_info,
                    fasta=refgen,
                )
            )
            final_results[sample][mut]["mut_info"] = extract_result(mut_dict, mut)
    mutations = find(mut_dict["variant_class"], "SNV")
    if len(mutations) > 0:
        for mut in tqdm(mutations):
            final_results[sample][mut] = {}
            final_results[sample][mut]["ref"], final_results[sample][mut]["var"] = (
                get_sequences_substitution(
                    mut_info=extract_result(mut_dict, mut),
                    exon_info=exon_info,
                    fasta=refgen,
                )
            )
            final_results[sample][mut]["mut_info"] = extract_result(mut_dict, mut)
    mutations = find(mut_dict["variant_class"], "substitution")
    if len(mutations) > 0:
        for mut in tqdm(mutations):
            final_results[sample][mut] = {}
            final_results[sample][mut]["ref"], final_results[sample][mut]["var"] = (
                get_sequences_substitution(
                    mut_info=extract_result(mut_dict, mut),
                    exon_info=exon_info,
                    fasta=refgen,
                )
            )
            final_results[sample][mut]["mut_info"] = extract_result(mut_dict, mut)

100%|██████████| 5/5 [00:00<00:00, 1353.96it/s]
100%|██████████| 102/102 [00:00<00:00, 2781.95it/s]
100%|██████████| 7/7 [00:00<00:00, 1214.43it/s]
100%|██████████| 88/88 [00:00<00:00, 5212.60it/s]
100%|██████████| 3/3 [00:00<00:00, 1543.16it/s]
100%|██████████| 67/67 [00:00<00:00, 6856.28it/s]
100%|██████████| 2/2 [00:00<00:00, 1962.25it/s]
100%|██████████| 25/25 [00:00<00:00, 4711.64it/s]
100%|██████████| 1/1 [00:00<00:00, 1285.41it/s]
100%|██████████| 4/4 [00:00<00:00, 1334.70it/s]
100%|██████████| 24/24 [00:00<00:00, 4792.58it/s]
100%|██████████| 1/1 [00:00<00:00, 595.27it/s]
100%|██████████| 31/31 [00:00<00:00, 4649.84it/s]
100%|██████████| 4/4 [00:00<00:00, 2789.23it/s]
100%|██████████| 17/17 [00:00<00:00, 2017.80it/s]
100%|██████████| 3711/3711 [00:00<00:00, 5047.54it/s]
100%|██████████| 432/432 [00:00<00:00, 5877.14it/s]
100%|██████████| 1/1 [00:00<00:00, 1628.86it/s]
100%|██████████| 10/10 [00:00<00:00, 1320.83it/s]
100%|██████████| 1107/1107 [00:00<00:00, 5289.31it/s]
100%|██

In [None]:
final_results.keys()

dict_keys(['2020135', '2020239_WO1', '2020246_NO1', '2020260_WO1', '2020281_WO1', '2021111_MO1', 'DES001', 'DES002', 'DES002_001', 'DES002_002', 'DES010'])

In [None]:
final_results["2020239_WO1"].keys()

dict_keys([11, 18, 27, 37, 55, 74, 87, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 92, 93, 94])

In [None]:
import pandas as pd

In [None]:
ref_list = []
for sample in final_results:
    for mut in final_results[sample]:
        if final_results[sample][mut]["ref"] is not None:
            for protein in final_results[sample][mut]["ref"]:
                for frame in final_results[sample][mut]["ref"][protein]:
                    for i, seq in enumerate(
                        final_results[sample][mut]["ref"][protein][frame]
                    ):
                        mut_info = {}
                        for key, record in final_results[sample][mut][
                            "mut_info"
                        ].items():
                            mut_info[key] = record
                        mut_info.update(
                            {
                                "minigene": seq,
                                "minigene_id": str(sample)
                                + "_"
                                + str(mut)
                                + "_"
                                + protein
                                + "_"
                                + str(frame)
                                + "_"
                                + str(i)
                                + "_ref",
                            }
                        )
                        ref_list.append(mut_info)

In [None]:
ref_df = pd.DataFrame(ref_list)
ref_df

Unnamed: 0,reference,variant,amino_acid_ref,amino_acid_var,codon_ref,codon_var,chromosome,mutation_location,mutation_location2,variant_class,gene_id,protein_id,transcript_id,gene_symbol,strand,minigene,minigene_id
0,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCACTGAGGATGAGGCCCTGGGGACCCCAGAGACTGGGGCTGCCAA...,2020135_14_ENSP00000311684.7_0_0_ref
1,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAGG...,2020135_14_ENSP00000311684.7_0_1_ref
2,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GCTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCAG...,2020135_14_ENSP00000311684.7_0_2_ref
3,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GGGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCTG...,2020135_14_ENSP00000311684.7_0_3_ref
4,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,AGCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTGC...,2020135_14_ENSP00000311684.7_0_4_ref
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190649,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,GCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGGG...,DES010_195_ENSP00000334733.7_1_0_ref
190650,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,GGCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGG...,DES010_195_ENSP00000334733.7_2_0_ref
190651,CC,AT,G,I,GGc,ATc,chrX,153804140,153804141,substitution,ENSG00000067840,ENSP00000164640,ENST00000164640,PDZD4,-1,GCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGGG...,DES010_197_ENSP00000164640.4_0_0_ref
190652,CC,AT,G,I,GGc,ATc,chrX,153804140,153804141,substitution,ENSG00000067840,ENSP00000164640,ENST00000164640,PDZD4,-1,GGCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGG...,DES010_197_ENSP00000164640.4_1_0_ref


In [None]:
var_list = []
for sample in final_results:
    for mut in final_results[sample]:
        if final_results[sample][mut]["var"] is not None:
            for protein in final_results[sample][mut]["var"]:
                for frame in final_results[sample][mut]["var"][protein]:
                    for i, seq in enumerate(
                        final_results[sample][mut]["var"][protein][frame]
                    ):
                        mut_info = {}
                        for key, record in final_results[sample][mut][
                            "mut_info"
                        ].items():
                            mut_info[key] = record
                        mut_info.update(
                            {
                                "minigene": seq,
                                "minigene_id": str(sample)
                                + "_"
                                + str(mut)
                                + "_"
                                + protein
                                + "_"
                                + str(frame)
                                + "_"
                                + str(i)
                                + "_var",
                            }
                        )
                        var_list.append(mut_info)

In [None]:
var_df = pd.DataFrame(var_list)
var_df

Unnamed: 0,reference,variant,amino_acid_ref,amino_acid_var,codon_ref,codon_var,chromosome,mutation_location,mutation_location2,variant_class,gene_id,protein_id,transcript_id,gene_symbol,strand,minigene,minigene_id
0,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCACTGAGGATGAGGCCCTGGGGACCCCAGAGACTGGGGCTGCCAA...,2020135_14_ENSP00000311684.7_0_0_var
1,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAGGA...,2020135_14_ENSP00000311684.7_0_1_var
2,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCAGC...,2020135_14_ENSP00000311684.7_0_2_var
3,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCTGA...,2020135_14_ENSP00000311684.7_0_3_var
4,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTGCA...,2020135_14_ENSP00000311684.7_0_4_var
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190644,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,GCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGGA...,DES010_195_ENSP00000334733.7_1_0_var
190645,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,GGCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGG...,DES010_195_ENSP00000334733.7_2_0_var
190646,CC,AT,G,I,GGc,ATc,chrX,153804140,153804141,substitution,ENSG00000067840,ENSP00000164640,ENST00000164640,PDZD4,-1,GCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGAT...,DES010_197_ENSP00000164640.4_0_0_var
190647,CC,AT,G,I,GGc,ATc,chrX,153804140,153804141,substitution,ENSG00000067840,ENSP00000164640,ENST00000164640,PDZD4,-1,GGCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGA...,DES010_197_ENSP00000164640.4_1_0_var


In [None]:
ref_df.to_csv("final_minigenes_control.csv", index=False)
var_df.to_csv("final_minigenes_variant.csv", index=False)