In [1]:
from pathlib import Path
from pyfaidx import Fasta

from minigene_design._utils import (
    read_and_filter,
    extract_exon_info,
    create_minigenes,
    collapse_output,
    replace_bbsl_restriction_site,
)

# Root directories, relative to the notebook's working directory:
# HG38FOLDER holds the reference files, INPUTFOLDER the per-sample input tables.
HG38FOLDER = Path("..") / "resources"
INPUTFOLDER = Path("..", "data", "nextNEOpi")

In [2]:
# Locate the reference genome FASTA and the annotation GTF inside HG38FOLDER.
fasta_file = HG38FOLDER.joinpath("GRCh38.d1.vd1.fa")
gtf_file = HG38FOLDER.joinpath("gencode.v33.primary_assembly.annotation.gtf")

# Open the FASTA through pyfaidx, then extract exon information from the GTF.
refgen = Fasta(filename=fasta_file)
exon_info = extract_exon_info(gtf_file)

In [3]:
# Sanity check: show the type of the reference handle created by Fasta(...).
type(refgen)

pyfaidx.Fasta

In [4]:
# Sanity check: show the container type returned by extract_exon_info.
type(exon_info)

collections.defaultdict

In [5]:
# Sample identifiers to process, in processing order (one per line).
# Each identifier is used as a folder name under INPUTFOLDER and as a result key.
samples = """
2020135
2020239_WO1
2020246_NO1
2020260_WO1
2020281_WO1
2021111_MO1
DES001
DES002
DES002_001
DES002_002
DES010
""".split()

In [None]:
# Read the high-confidence VEP table of each sample into `results`, keyed by sample name.
results = {}
for sample in samples:
    # <INPUTFOLDER>/<sample>/analyses/<sample>/05_vep/tables/high_confidence/<sample>_hc_vep.txt
    file_input_path = INPUTFOLDER.joinpath(
        sample,
        "analyses",
        sample,
        "05_vep",
        "tables",
        "high_confidence",
        f"{sample}_hc_vep.txt",
    )
    # NOTE(review): 103 is a hard-coded value for keep_from_lines; presumably the number
    # of leading lines to skip in the VEP file — confirm against read_and_filter.
    results[sample] = read_and_filter(file_input_path, keep_from_lines=103)

# Report the table dimensions of every sample.
for sample in samples:
    n_rows_cols = results[sample].shape
    print(sample, n_rows_cols)

# Report which variant classes occur in every sample.
for sample in samples:
    variant_classes = results[sample].VARIANT_CLASS.unique()
    print(sample, variant_classes)

2020135 (107, 91)
2020239_WO1 (95, 91)
2020246_NO1 (70, 91)
2020260_WO1 (27, 91)
2020281_WO1 (29, 91)
2021111_MO1 (32, 91)
DES001 (4164, 91)
DES002 (1240, 91)
DES002_001 (5481, 91)
DES002_002 (0, 91)
DES010 (199, 91)
2020135 ['SNV' 'deletion']
2020239_WO1 ['SNV' 'deletion']
2020246_NO1 ['SNV' 'deletion']
2020260_WO1 ['SNV' 'deletion']
2020281_WO1 ['SNV' 'deletion' 'insertion']
2021111_MO1 ['SNV' 'insertion']
DES001 ['SNV' 'substitution' 'deletion' 'insertion']
DES002 ['SNV' 'substitution' 'deletion' 'insertion']
DES002_001 ['SNV' 'substitution' 'deletion' 'insertion']
DES002_002 []
DES010 ['SNV' 'substitution']


In [7]:
# Build minigenes for every sample. The dict accumulated so far is handed to
# create_minigenes as out_dict and its return value is rebound to final_results,
# so results from all samples end up in one dict (indexed by sample name below).
# NOTE(review): whether create_minigenes mutates out_dict in place or returns a
# new dict is not visible from this notebook.
# NOTE(review): the recorded load output shows DES002_002 with 0 rows — confirm
# that create_minigenes handles an empty table as intended.
final_results = {}
for sample in samples:
    final_results = create_minigenes(
        df=results[sample],
        sample=sample,
        exon_info=exon_info,
        fasta=refgen,
        out_dict=final_results,
    )

100%|██████████| 5/5 [00:00<00:00, 1656.65it/s]
100%|██████████| 102/102 [00:00<00:00, 5173.96it/s]
100%|██████████| 7/7 [00:00<00:00, 761.28it/s]
100%|██████████| 88/88 [00:00<00:00, 5307.03it/s]
100%|██████████| 3/3 [00:00<00:00, 1818.34it/s]
100%|██████████| 67/67 [00:00<00:00, 5852.48it/s]
100%|██████████| 2/2 [00:00<00:00, 1180.66it/s]
100%|██████████| 25/25 [00:00<00:00, 5657.28it/s]
100%|██████████| 1/1 [00:00<00:00, 1135.13it/s]
100%|██████████| 4/4 [00:00<00:00, 1068.34it/s]
100%|██████████| 24/24 [00:00<00:00, 5812.64it/s]
100%|██████████| 1/1 [00:00<00:00, 734.68it/s]
100%|██████████| 31/31 [00:00<00:00, 6777.35it/s]
100%|██████████| 4/4 [00:00<00:00, 2383.80it/s]
100%|██████████| 17/17 [00:00<00:00, 1445.81it/s]
100%|██████████| 3711/3711 [00:00<00:00, 6178.12it/s]
100%|██████████| 432/432 [00:00<00:00, 7437.87it/s]
100%|██████████| 1/1 [00:00<00:00, 917.39it/s]
100%|██████████| 10/10 [00:00<00:00, 1105.71it/s]
100%|██████████| 1107/1107 [00:00<00:00, 7578.09it/s]
100%|████

In [8]:
# Inspect the nested result of a single sample.
# NOTE(review): this displays the whole nested dict of sequences; consider
# showing only its keys or a small slice to keep the notebook output readable.
final_results["2020135"]

{14: {'ref': {'ENSP00000311684.7': defaultdict(list,
               {0: ['CCACTGAGGATGAGGCCCTGGGGACCCCAGAGACTGGGGCTGCCAACCCCCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAG',
                 'CCCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAGGAGGCTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCA',
                 'GCTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCAGCTGGGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCT',
                 'GGGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCTGAGAGCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTG',
                 'AGCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTGCACAAGGCGGCGTCTGTGGAGCTGCCGCAGCGCCGGAGCCCCAGCCCG',
                 'AAGGCGGCGTCTGTGGAGCTGCCGCAGCGCCGGAGCCCCAGCCCGGGAGCCACCCGCCTGGCCCGGGGAGGCCTGGGTGAGGGCGAGTATGCC',
                 'GCCACCCGCCTGGCCCGGGGAGGCCTGGGTGAGGGCGAGTATGCCCAGAGGCTGCAGGCCCTGCGCCAGCGGCTGCTGCGGGGAGGCCCCGAG',
                 'AGGCTGCAGGCCCTGCGCCAGCGGCTGCTGCGGGGAGGCCCCGAGGATGGCAAGGTCAGCGGCCTCAGGGGTCCCCTGCTGGAGAGCCTGGGG',
                 'GGCAAGGTCAGCGG

In [9]:
# Collapse the nested per-sample results into two tables. Judging by the recorded
# outputs, ref_df holds minigenes whose minigene_id ends in "_ref" and var_df
# those ending in "_var".
ref_df, var_df = collapse_output(final_results)

In [10]:
# Run the restriction-site replacement on the reference table.
# presumably "bbsl" refers to the BbsI restriction enzyme — TODO confirm.
# NOTE(review): ref_df is rebound to the result, so re-running this cell feeds the
# already-processed frame back into the function.
ref_df = replace_bbsl_restriction_site(ref_df)

In [11]:
# Run the restriction-site replacement on the variant table.
# presumably "bbsl" refers to the BbsI restriction enzyme — TODO confirm.
# NOTE(review): var_df is rebound to the result, so re-running this cell feeds the
# already-processed frame back into the function.
var_df = replace_bbsl_restriction_site(var_df)

In [12]:
# Display the reference (control) minigene table.
ref_df

Unnamed: 0,reference,variant,amino_acid_ref,amino_acid_var,codon_ref,codon_var,chromosome,mutation_location,mutation_location2,variant_class,gene_id,protein_id,transcript_id,gene_symbol,strand,minigene,minigene_id,modified_minigene
0,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCACTGAGGATGAGGCCCTGGGGACCCCAGAGACTGGGGCTGCCAA...,2020135_14_ENSP00000311684.7_0_0_ref,CCACTGAGGATGAGGCCCTGGGGACCCCAGAGACTGGGGCTGCCAA...
1,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAGG...,2020135_14_ENSP00000311684.7_0_1_ref,CCCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAGG...
2,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GCTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCAG...,2020135_14_ENSP00000311684.7_0_2_ref,GCTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCAG...
3,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GGGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCTG...,2020135_14_ENSP00000311684.7_0_3_ref,GGGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCTG...
4,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,AGCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTGC...,2020135_14_ENSP00000311684.7_0_4_ref,AGCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTGC...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17737,GG,AA,S,F,tCC,tTT,chr20,13714961,13714962,substitution,ENSG00000089048,ENSP00000202816,ENST00000202816,ESF1,-1,AAAGAAAAAAGAGAGTGAGATTGAAAAGGAATCACAAAGGAAGTTC...,DES010_178_ENSP00000202816.1_0_0_ref,AAAGAAAAAAGAGAGTGAGATTGAAAAGGAATCACAAAGGAAGTTC...
17738,TG,CA,AS,AG,gcCAgc,gcTGgc,chr22,29260468,29260469,substitution,ENSG00000100263,ENSP00000216085,ENST00000216085,RHBDD3,-1,CACCTGGGAGGGCTCCTCAGAGGCAGGCCTGGACTGGGCTGGGGGC...,DES010_182_ENSP00000216085.7_0_0_ref,CACCTGGGAGGGCTCCTCAGAGGCAGGCCTGGACTGGGCTGGGGGC...
17739,CC,TT,LD,LN,ttGGac,ttAAac,chrX,312856,312857,substitution,ENSG00000178605,ENSP00000316598,ENST00000326153,GTPBP6,-1,GCAGCGTCTCCTGAGAGAGAAGGAGGCCAAGATCAGGAAGGCCTTT...,DES010_186_ENSP00000316598.5_0_0_ref,GCAGCGTCTCCTGAGAGAGAAGGAGGCCAAGATCAGGAAGGCCTTT...
17740,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,CCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGGGG...,DES010_195_ENSP00000334733.7_0_0_ref,CCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGGGG...


In [13]:
# Display the variant minigene table.
# NOTE(review): in the recorded outputs the reference table ends at index label
# 17740 while this table ends at 17735, so the two tables appear to differ in
# row count — confirm this is expected before pairing rows by position.
var_df

Unnamed: 0,reference,variant,amino_acid_ref,amino_acid_var,codon_ref,codon_var,chromosome,mutation_location,mutation_location2,variant_class,gene_id,protein_id,transcript_id,gene_symbol,strand,minigene,minigene_id,modified_minigene
0,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCACTGAGGATGAGGCCCTGGGGACCCCAGAGACTGGGGCTGCCAA...,2020135_14_ENSP00000311684.7_0_0_var,CCACTGAGGATGAGGCCCTGGGGACCCCAGAGACTGGGGCTGCCAA...
1,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAGGA...,2020135_14_ENSP00000311684.7_0_1_var,CCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAGGA...
2,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCAGC...,2020135_14_ENSP00000311684.7_0_2_var,CTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCAGC...
3,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCTGA...,2020135_14_ENSP00000311684.7_0_3_var,GGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCTGA...
4,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTGCA...,2020135_14_ENSP00000311684.7_0_4_var,GCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTGCA...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17732,GG,AA,S,F,tCC,tTT,chr20,13714961,13714962,substitution,ENSG00000089048,ENSP00000202816,ENST00000202816,ESF1,-1,AAAGAAAAAAGAGAGTGAGATTGAAAAGGAATCACAAAGGAAGTTT...,DES010_178_ENSP00000202816.1_0_0_var,AAAGAAAAAAGAGAGTGAGATTGAAAAGGAATCACAAAGGAAGTTT...
17733,TG,CA,AS,AG,gcCAgc,gcTGgc,chr22,29260468,29260469,substitution,ENSG00000100263,ENSP00000216085,ENST00000216085,RHBDD3,-1,CACCTGGGAGGGCTCCTCAGAGGCAGGCCTGGACTGGGCTGGGGGC...,DES010_182_ENSP00000216085.7_0_0_var,CACCTGGGAGGGCTCCTCAGAGGCAGGCCTGGACTGGGCTGGGGGC...
17734,CC,TT,LD,LN,ttGGac,ttAAac,chrX,312856,312857,substitution,ENSG00000178605,ENSP00000316598,ENST00000326153,GTPBP6,-1,GCAGCGTCTCCTGAGAGAGAAGGAGGCCAAGATCAGGAAGGCCTTT...,DES010_186_ENSP00000316598.5_0_0_var,GCAGCGTCTCCTGAGAGAGAAGGAGGCCAAGATCAGGAAGGCCTTT...
17735,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,CCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGGAA...,DES010_195_ENSP00000334733.7_0_0_var,CCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGGAA...


In [13]:
# Write both tables as CSV files into the current working directory,
# leaving out the DataFrame index.
for frame, csv_name in (
    (ref_df, "final_minigenes_control_v3.csv"),
    (var_df, "final_minigenes_variant_v3.csv"),
):
    frame.to_csv(csv_name, index=False)