In [1]:
import os

# os.chdir("/scratch/project/tcr_neoantigen/misc/jaz/scripts")

from pathlib import Path
from pyfaidx import Fasta
from tqdm import tqdm

from _utils import (
    find,
    read_and_filter,
    create_result_list,
    extract_result,
    extract_exon_info,
    filter_exon_pos,
    get_sequences_indel,
    get_sequences_substitution,
    complementary_sequence,
    reverse_complement,
    flanking_lower_positions,
    # print_windows,
)

# Base directories, given relative to the notebook's working directory.
# HG38FOLDER: location of the reference FASTA and GTF annotation files.
# INPUTFOLDER: root of the per-sample result folders (one sub-directory per sample ID).
HG38FOLDER = Path("../resources")
INPUTFOLDER = Path("../data/nextNEOpi")

In [2]:
# Reference inputs, both located under HG38FOLDER: the genome FASTA and the
# GENCODE v33 GTF annotation (as indicated by the file names).
fasta_file = HG38FOLDER / "GRCh38.d1.vd1.fa"
gtf_file = HG38FOLDER / "gencode.v33.primary_assembly.annotation.gtf"
# Open the reference genome with pyfaidx; `refgen` is handed to the sequence
# helpers as their `fasta` argument further down.
refgen = Fasta(filename=fasta_file)
# Exon annotation produced by the project helper from the GTF; handed to the
# sequence helpers as `exon_info`. NOTE(review): the structure of the returned
# object is defined in _utils and is not visible here.
exon_info = extract_exon_info(gtf_file)

In [3]:
# Sample IDs to process. Each ID is used twice as a directory name under
# INPUTFOLDER and as the prefix of the "<sample>_hc_vep.txt" table that is read.
# NOTE(review): in the recorded output "DES002_002" yields an empty table
# (0 rows) after filtering — confirm whether it should stay in this list.
samples = [
    "2020135",
    "2020239_WO1",
    "2020246_NO1",
    "2020260_WO1",
    "2020281_WO1",
    "2021111_MO1",
    "DES001",
    "DES002",
    "DES002_001",
    "DES002_002",
    "DES010",
]

In [4]:
# Read the high-confidence VEP table of every sample through the project's
# read_and_filter helper and keep the returned tables keyed by sample ID.
results = {}
for sample in samples:
    # <INPUTFOLDER>/<sample>/analyses/<sample>/05_vep/tables/high_confidence/<sample>_hc_vep.txt
    file_input_path = INPUTFOLDER.joinpath(
        sample,
        "analyses",
        sample,
        "05_vep",
        "tables",
        "high_confidence",
        f"{sample}_hc_vep.txt",
    )
    results[sample] = read_and_filter(file_input_path)

# Summary 1: (rows, columns) of each loaded table.
for sample in samples:
    sample_shape = results[sample].shape
    print(sample, sample_shape)

# Summary 2: the distinct VARIANT_CLASS values present in each table.
for sample in samples:
    variant_classes = results[sample].VARIANT_CLASS.unique()
    print(sample, variant_classes)

2020135 (107, 91)
2020239_WO1 (95, 91)
2020246_NO1 (70, 91)
2020260_WO1 (27, 91)
2020281_WO1 (29, 91)
2021111_MO1 (32, 91)
DES001 (4164, 91)
DES002 (1240, 91)
DES002_001 (5481, 91)
DES002_002 (0, 91)
DES010 (199, 91)
2020135 ['SNV' 'deletion']
2020239_WO1 ['SNV' 'deletion']
2020246_NO1 ['SNV' 'deletion']
2020260_WO1 ['SNV' 'deletion']
2020281_WO1 ['SNV' 'deletion' 'insertion']
2021111_MO1 ['SNV' 'insertion']
DES001 ['SNV' 'substitution' 'deletion' 'insertion']
DES002 ['SNV' 'substitution' 'deletion' 'insertion']
DES002_001 ['SNV' 'substitution' 'deletion' 'insertion']
DES002_002 []
DES010 ['SNV' 'substitution']


In [5]:
# Which helper builds the reference/variant sequences for each variant class.
# The tuple order is the processing order within a sample: it fixes the order
# of the progress bars and which result is left in ref_seqs / var_seqs at the end.
SEQUENCE_BUILDERS = (
    ("insertion", get_sequences_indel),
    ("deletion", get_sequences_indel),
    ("SNV", get_sequences_substitution),
    ("substitution", get_sequences_substitution),
)

# Run the matching sequence helper on every mutation of every sample.
# NOTE(review): ref_seqs / var_seqs are overwritten on each iteration and never
# stored, so this cell only exercises the helpers; after it finishes the two
# names hold the result of the last mutation processed.
for sample in samples:
    # Work on a copy so the table kept in `results` cannot be altered by the helper.
    df = results[sample].copy()
    mut_dict = create_result_list(df)
    for variant_class, build_sequences in SEQUENCE_BUILDERS:
        mutations = find(mut_dict["variant_class"], variant_class)
        if len(mutations) == 0:
            # Nothing of this class in the sample; skip so tqdm draws no empty bar.
            continue
        for mut in tqdm(mutations):
            ref_seqs, var_seqs = build_sequences(
                mut_info=extract_result(mut_dict, mut),
                exon_info=exon_info,
                fasta=refgen,
            )

100%|██████████| 5/5 [00:00<00:00, 1646.37it/s]
100%|██████████| 102/102 [00:00<00:00, 2337.83it/s]
100%|██████████| 7/7 [00:00<00:00, 970.45it/s]
100%|██████████| 88/88 [00:00<00:00, 3856.23it/s]
100%|██████████| 3/3 [00:00<00:00, 1848.53it/s]
100%|██████████| 67/67 [00:00<00:00, 4835.39it/s]
100%|██████████| 2/2 [00:00<00:00, 1430.53it/s]
100%|██████████| 25/25 [00:00<00:00, 3808.71it/s]
100%|██████████| 1/1 [00:00<00:00, 1326.47it/s]
100%|██████████| 4/4 [00:00<00:00, 1059.03it/s]
100%|██████████| 24/24 [00:00<00:00, 3078.48it/s]
100%|██████████| 1/1 [00:00<00:00, 604.19it/s]
100%|██████████| 31/31 [00:00<00:00, 3525.01it/s]
100%|██████████| 4/4 [00:00<00:00, 731.26it/s]
100%|██████████| 17/17 [00:00<00:00, 835.38it/s]
100%|██████████| 3711/3711 [00:00<00:00, 4249.83it/s]
100%|██████████| 432/432 [00:00<00:00, 5268.17it/s]
100%|██████████| 1/1 [00:00<00:00, 1392.99it/s]
100%|██████████| 10/10 [00:00<00:00, 1009.39it/s]
100%|██████████| 1107/1107 [00:00<00:00, 4044.25it/s]
100%|█████