In [None]:
from pathlib import Path
from pyfaidx import Fasta

from minigene_design._utils import (
    read_and_filter,
    extract_exon_info,
    create_minigenes,
    collapse_output,
    replace_bbsl_restriction_site,
)

# Base directories, given relative to the notebook's working directory.
# HG38FOLDER holds the reference files (FASTA and GTF) loaded below.
HG38FOLDER = Path("../resources")
# INPUTFOLDER is the root under which each sample's input table is looked up.
INPUTFOLDER = Path("../data/nextNEOpi")

In [None]:
# Load the reference genome and the exon annotation.
# Both files are expected inside HG38FOLDER.
fasta_file = HG38FOLDER / "GRCh38.d1.vd1.fa"
gtf_file = HG38FOLDER / "gencode.v33.primary_assembly.annotation.gtf"
# Open the FASTA through pyfaidx; `refgen` is later passed to create_minigenes as `fasta`.
refgen = Fasta(filename=fasta_file)
# Project helper that derives exon information from the GTF; later passed to
# create_minigenes as `exon_info`.
exon_info = extract_exon_info(gtf_file)

In [None]:
# Inspection only: show the type of the object returned by Fasta(...).
type(refgen)

In [None]:
# Inspection only: show the type of the object returned by extract_exon_info(...).
type(exon_info)

In [None]:
# Sample identifiers to process. Each identifier is used as a directory name
# under INPUTFOLDER, as part of the input file name, and as the key under
# which that sample's table is stored in `results`.
samples = [
    "2020135",
    "2020239_WO1",
    "2020246_NO1",
    "2020260_WO1",
    "2020281_WO1",
    "2021111_MO1",
    "DES001",
    "DES002",
    "DES002_001",
    "DES002_002",
    "DES010",
]

In [None]:
# Read the `<sample>_hc_vep.txt` table of every sample and keep it in a dict
# keyed by sample identifier.
results = {}
for sample in samples:
    file_input_path = INPUTFOLDER.joinpath(
        sample,
        "analyses",
        sample,
        "05_vep",
        "tables",
        "high_confidence",
        f"{sample}_hc_vep.txt",
    )
    # NOTE(review): 103 is forwarded unchanged as `keep_from_lines`; presumably
    # the line at which the actual table starts in these files -- confirm
    # against read_and_filter and the input format.
    results[sample] = read_and_filter(file_input_path, keep_from_lines=103)

# Print the shape of each loaded table, in the order given by `samples`.
for sample in samples:
    table = results[sample]
    print(sample, table.shape)

# Print the distinct VARIANT_CLASS values found in each table.
for sample in samples:
    table = results[sample]
    print(sample, table.VARIANT_CLASS.unique())

In [None]:
# Build the minigenes for every sample.
# The value returned by each create_minigenes call is handed back in as
# `out_dict` on the next iteration, so `final_results` accumulates the output
# across all samples (it is later indexed by sample identifier).
final_results = {}
for sample in samples:
    final_results = create_minigenes(
        df=results[sample],
        sample=sample,
        exon_info=exon_info,
        fasta=refgen,
        out_dict=final_results,
    )

In [None]:
# Inspection only: show the entry for one sample (the first one listed in `samples`).
final_results["2020135"]

In [None]:
# Collapse the per-sample results into two tables; `ref_df` is later written
# to the "control" CSV and `var_df` to the "variant" CSV.
ref_df, var_df = collapse_output(final_results)

In [None]:
# Rebind ref_df to the output of replace_bbsl_restriction_site.
# NOTE: this cell is self-referential; re-running it without re-running the
# collapse_output cell feeds the already-processed table back into the helper.
ref_df = replace_bbsl_restriction_site(ref_df)

In [None]:
# Rebind var_df to the output of replace_bbsl_restriction_site.
# NOTE: this cell is self-referential; re-running it without re-running the
# collapse_output cell feeds the already-processed table back into the helper.
var_df = replace_bbsl_restriction_site(var_df)

In [None]:
# Inspection only: display ref_df as it stands after replace_bbsl_restriction_site.
ref_df

In [None]:
# Inspection only: display var_df as it stands after replace_bbsl_restriction_site.
var_df

In [None]:
# Write both tables to CSV in the current working directory, omitting the index.
for frame, csv_name in (
    (ref_df, "final_minigenes_control_v3.csv"),
    (var_df, "final_minigenes_variant_v3.csv"),
):
    frame.to_csv(csv_name, index=False)