In [1]:
import os

# os.chdir("/scratch/project/tcr_neoantigen/misc/jaz/scripts")

from pathlib import Path
from pyfaidx import Fasta
from tqdm import tqdm

from _utils import (
    find,
    read_and_filter,
    create_result_list,
    extract_result,
    extract_exon_info,
    filter_exon_pos,
    generate_windows,
    get_sequences_indel,
    get_sequences_substitution,
    complementary_sequence,
    reverse_complement,
    flanking_lower_positions,
    # print_windows,
)

# Root folders, given relative to the notebook's working directory.
# Absolute locations used previously (kept for reference):
#   HG38FOLDER  -> /scratch/project/tcr_neoantigen/resources/references/hg38
#   INPUTFOLDER -> /scratch/project/tcr_neoantigen/results/cSCC_BC_seq_data_10_patients/nextNEOpi
HG38FOLDER = Path("..") / "resources"
INPUTFOLDER = Path("..") / "data" / "nextNEOpi"

In [2]:
# Locate the reference genome and the gene annotation under HG38FOLDER:
#   gdc/GRCh38.d1.vd1/fasta/GRCh38.d1.vd1.fa
#   annotation/gencode.v33.primary_assembly.annotation.gtf
# (an alternative flat layout keeps both files directly inside HG38FOLDER)
fasta_file = HG38FOLDER.joinpath("gdc", "GRCh38.d1.vd1", "fasta", "GRCh38.d1.vd1.fa")
gtf_file = HG38FOLDER.joinpath(
    "annotation", "gencode.v33.primary_assembly.annotation.gtf"
)

# Open the FASTA through pyfaidx and extract exon information from the GTF.
refgen = Fasta(filename=fasta_file)
exon_info = extract_exon_info(gtf_file)

In [3]:
# Sample identifiers to process, in processing order.
samples = [
    # identifiers starting with a number
    "2020135",
    "2020239_WO1",
    "2020246_NO1",
    "2020260_WO1",
    "2020281_WO1",
    "2021111_MO1",
] + [
    # identifiers starting with "DES"
    "DES001",
    "DES002",
    "DES002_001",
    "DES002_002",
    "DES010",
]

In [4]:
# NOTE: this replaces the sample list so that only the "test" table is read.
samples = ["test"]

# One table per sample, read from ../test/<sample>_hc_vep.txt.
# For the real input the table would instead be located at
#   INPUTFOLDER / sample / "analyses" / sample / "05_vep" / "tables"
#       / "high_confidence" / f"{sample}_hc_vep.txt"
results = {
    sample_id: read_and_filter(f"../test/{sample_id}_hc_vep.txt")
    for sample_id in samples
}

# Summary 1: table dimensions for every sample.
for sample in samples:
    table = results[sample]
    print(sample, table.shape)

# Summary 2: the distinct variant classes present for every sample.
for sample in samples:
    table = results[sample]
    print(sample, table.VARIANT_CLASS.unique())

test (3, 91)
test ['substitution' 'SNV' 'insertion']


In [5]:
# Variant classes and the helper that builds their sequences, listed in the
# order they are processed. The order determines the insertion order of the
# per-sample result dict.
SEQUENCE_BUILDERS = (
    ("insertion", get_sequences_indel),
    ("deletion", get_sequences_indel),
    ("SNV", get_sequences_substitution),
    ("substitution", get_sequences_substitution),
)

# final_results[sample][mut] = {"ref": ..., "var": ..., "mut_info": ...}
# where `mut` is whatever `find` returns for a matching entry of
# mut_dict["variant_class"].
final_results = {}
for sample in samples:
    df = results[sample].copy()
    mut_dict = create_result_list(df)
    final_results[sample] = {}
    for variant_class, get_sequences in SEQUENCE_BUILDERS:
        mutations = find(mut_dict["variant_class"], variant_class)
        if len(mutations) == 0:
            # Skip empty classes so no zero-length progress bar is drawn.
            continue
        for mut in tqdm(mutations):
            ref_seqs, var_seqs = get_sequences(
                mut_info=extract_result(mut_dict, mut),
                exon_info=exon_info,
                fasta=refgen,
            )
            final_results[sample][mut] = {
                "ref": ref_seqs,
                "var": var_seqs,
                # extract_result is called again instead of reusing the object
                # handed to the builder, in case the builder modifies its
                # argument — TODO confirm and collapse into a single call.
                "mut_info": extract_result(mut_dict, mut),
            }

100%|██████████| 1/1 [00:00<00:00, 999.83it/s]
100%|██████████| 1/1 [00:00<00:00, 500.10it/s]
100%|██████████| 1/1 [00:00<00:00, 1001.51it/s]


In [6]:
# Display the top-level keys of final_results (one entry per processed sample).
final_results.keys()

dict_keys(['test'])

In [7]:
# Display the mutation keys stored for the "test" sample.
# NOTE(review): the sample name is hard-coded, so this raises KeyError when
# `samples` does not contain "test".
final_results["test"].keys()

dict_keys([2, 1, 0])

In [8]:
import pandas as pd

In [9]:
# Flatten the "ref" sequences into one record per sequence.
# Every record holds the mutation's "mut_info" fields plus:
#   minigene    -> the sequence itself
#   minigene_id -> <sample>_<mut>_<protein>_<frame>_<index>_ref
ref_list = []
for sample, mutations_by_key in final_results.items():
    for mut, entry in mutations_by_key.items():
        ref_by_protein = entry["ref"]
        if ref_by_protein is None:
            # No reference sequences stored for this mutation.
            continue
        for protein, ref_by_frame in ref_by_protein.items():
            for frame, sequences in ref_by_frame.items():
                for i, seq in enumerate(sequences):
                    # Start from a fresh copy so records do not share state.
                    row = dict(entry["mut_info"].items())
                    row["minigene"] = seq
                    row["minigene_id"] = "_".join(
                        [str(sample), str(mut), protein, str(frame), str(i), "ref"]
                    )
                    ref_list.append(row)

In [10]:
# Build a DataFrame from the reference records (one row per entry of ref_list)
# and display it as the cell output.
ref_df = pd.DataFrame(ref_list)
ref_df

Unnamed: 0,reference,variant,amino_acid_ref,amino_acid_var,codon_ref,codon_var,chromosome,mutation_location,mutation_location2,variant_class,gene_id,protein_id,transcript_id,gene_symbol,strand,minigene,minigene_id
0,-,G,S,SX,tca,tcGa,chr13,32379450,32379451,insertion,ENSG00000139618,ENSP00000369497,ENST00000380152,BRCA2,1,AATTAGGAAGGCCATGGAATCTGCTGAACAAAAGGAACAAGGTTTA...,test_2_ENSP00000369497.3_0_0_ref
1,-,G,S,SX,tca,tcGa,chr13,32379450,32379451,insertion,ENSG00000139618,ENSP00000369497,ENST00000380152,BRCA2,1,AAACCTTGTTCCTTTTGTTCAGCAGATTCCATGGCCTTCCTAATTT...,test_2_ENSP00000369497.3_0_1_ref
2,-,G,S,SX,tca,tcGa,chr13,32379450,32379451,insertion,ENSG00000139618,ENSP00000369497,ENST00000380152,BRCA2,1,AACTGGATCTGAGCTTGTTTCTTATCATTCAACATTTGCCTGTGAT...,test_2_ENSP00000369497.3_0_2_ref
3,-,G,S,SX,tca,tcGa,chr13,32379450,32379451,insertion,ENSG00000139618,ENSP00000369497,ENST00000380152,BRCA2,1,TTCAAGGCTCTTAACTGCTCTTCACTGAAATAACCCGGTAGTTGTT...,test_2_ENSP00000369497.3_0_3_ref
4,-,G,S,SX,tca,tcGa,chr13,32379450,32379451,insertion,ENSG00000139618,ENSP00000369497,ENST00000380152,BRCA2,1,TACTGAGTTTTTTTTGTCGCTGCTAACTGTATGTTAGCTCTTTCAG...,test_2_ENSP00000369497.3_0_4_ref
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,CC,TT,WD,*N,tgGGac,tgAAac,chr1,1355129,1355130,substitution,ENSG00000162576,ENSP00000307887,ENST00000309212,MXRA8,-1,TTGCCCTTTCTCTGCCCCATCCCTACCCTAGCCTTGCTCTCAGCCA...,test_0_ENSP00000307887.6_2_31_ref
301,CC,TT,WD,*N,tgGGac,tgAAac,chr1,1355129,1355130,substitution,ENSG00000162576,ENSP00000307887,ENST00000309212,MXRA8,-1,TTGATAGTCACTGGGCTCCCTGTGACTTCTGACCCTGACACCCCTC...,test_0_ENSP00000307887.6_2_32_ref
302,CC,TT,WD,*N,tgGGac,tgAAac,chr1,1355129,1355130,substitution,ENSG00000162576,ENSP00000307887,ENST00000309212,MXRA8,-1,TTGGACTCTGCCTGGGCTGGAGTCTAGGGCTGGGGCTACATTTGGC...,test_0_ENSP00000307887.6_2_33_ref
303,CC,TT,WD,*N,tgGGac,tgAAac,chr1,1355129,1355130,substitution,ENSG00000162576,ENSP00000307887,ENST00000309212,MXRA8,-1,CTGTACTGGCTGAGGACAGGGGAGGGAGTGAAGTTGGTTTGGGGTG...,test_0_ENSP00000307887.6_2_34_ref


In [11]:
# Flatten the "var" sequences into one record per sequence.
# Every record holds the mutation's "mut_info" fields plus:
#   minigene    -> the sequence itself
#   minigene_id -> <sample>_<mut>_<protein>_<frame>_<index>_var
var_list = []
for sample, mutations_by_key in final_results.items():
    for mut, entry in mutations_by_key.items():
        var_by_protein = entry["var"]
        if var_by_protein is None:
            # No variant sequences stored for this mutation.
            continue
        for protein, var_by_frame in var_by_protein.items():
            for frame, sequences in var_by_frame.items():
                for i, seq in enumerate(sequences):
                    # Start from a fresh copy so records do not share state.
                    row = dict(entry["mut_info"].items())
                    row["minigene"] = seq
                    row["minigene_id"] = "_".join(
                        [str(sample), str(mut), protein, str(frame), str(i), "var"]
                    )
                    var_list.append(row)

In [12]:
# Build a DataFrame from the variant records (one row per entry of var_list)
# and display it as the cell output.
var_df = pd.DataFrame(var_list)
var_df

Unnamed: 0,reference,variant,amino_acid_ref,amino_acid_var,codon_ref,codon_var,chromosome,mutation_location,mutation_location2,variant_class,gene_id,protein_id,transcript_id,gene_symbol,strand,minigene,minigene_id
0,-,G,S,SX,tca,tcGa,chr13,32379450,32379451,insertion,ENSG00000139618,ENSP00000369497,ENST00000380152,BRCA2,1,ATTAGGAAGGCCATGGAATCTGCTGAACAAAAGGAACAAGGTTTAT...,test_2_ENSP00000369497.3_0_0_var
1,-,G,S,SX,tca,tcGa,chr13,32379450,32379451,insertion,ENSG00000139618,ENSP00000369497,ENST00000380152,BRCA2,1,TAAACCTTGTTCCTTTTGTTCAGCAGATTCCATGGCCTTCCTAATT...,test_2_ENSP00000369497.3_0_1_var
2,-,G,S,SX,tca,tcGa,chr13,32379450,32379451,insertion,ENSG00000139618,ENSP00000369497,ENST00000380152,BRCA2,1,CAACTGGATCTGAGCTTGTTTCTTATCATTCAACATTTGCCTGTGA...,test_2_ENSP00000369497.3_0_2_var
3,-,G,S,SX,tca,tcGa,chr13,32379450,32379451,insertion,ENSG00000139618,ENSP00000369497,ENST00000380152,BRCA2,1,ATTCAAGGCTCTTAACTGCTCTTCACTGAAATAACCCGGTAGTTGT...,test_2_ENSP00000369497.3_0_3_var
4,-,G,S,SX,tca,tcGa,chr13,32379450,32379451,insertion,ENSG00000139618,ENSP00000369497,ENST00000380152,BRCA2,1,ATACTGAGTTTTTTTTGTCGCTGCTAACTGTATGTTAGCTCTTTCA...,test_2_ENSP00000369497.3_0_4_var
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300,CC,TT,WD,*N,tgGGac,tgAAac,chr1,1355129,1355130,substitution,ENSG00000162576,ENSP00000307887,ENST00000309212,MXRA8,-1,TTGCCCTTTCTCTGCCCCATCCCTACCCTAGCCTTGCTCTCAGCCA...,test_0_ENSP00000307887.6_2_31_var
301,CC,TT,WD,*N,tgGGac,tgAAac,chr1,1355129,1355130,substitution,ENSG00000162576,ENSP00000307887,ENST00000309212,MXRA8,-1,TTGATAGTCACTGGGCTCCCTGTGACTTCTGACCCTGACACCCCTC...,test_0_ENSP00000307887.6_2_32_var
302,CC,TT,WD,*N,tgGGac,tgAAac,chr1,1355129,1355130,substitution,ENSG00000162576,ENSP00000307887,ENST00000309212,MXRA8,-1,TTGGACTCTGCCTGGGCTGGAGTCTAGGGCTGGGGCTACATTTGGC...,test_0_ENSP00000307887.6_2_33_var
303,CC,TT,WD,*N,tgGGac,tgAAac,chr1,1355129,1355130,substitution,ENSG00000162576,ENSP00000307887,ENST00000309212,MXRA8,-1,CTGTACTGGCTGAGGACAGGGGAGGGAGTGAAGTTGGTTTGGGGTG...,test_0_ENSP00000307887.6_2_34_var


In [13]:
# Write both tables to CSV in the current working directory, omitting the
# DataFrame index: reference sequences -> "control", variant -> "variant".
for table, csv_name in (
    (ref_df, "final_minigenes_control.csv"),
    (var_df, "final_minigenes_variant.csv"),
):
    table.to_csv(csv_name, index=False)