In [1]:
# conda create -n sss python=3.8 -y && conda activate sss
# conda install -y -c bioconda gffutils jupyter tqdm cyvcf2 pathlib2 pandarallel pysam liftover pybedtools

import os
import re
import numpy as np
import pandas as pd
# from Bio.Seq import Seq
# from liftover import get_lifter
from pathlib2 import Path
from pandarallel import pandarallel
from tqdm import tqdm
import gffutils
import pysam
from cyvcf2 import VCF

### Logging setup
import yaml
from logging import config, getLogger

# NOTE(review): '__file__' is a quoted string literal here, so both dirname()
# calls operate on that literal and `parent_directory` ends up as ''. The YAML
# path below is therefore resolved against the current working directory.
parent_directory = os.path.dirname(os.path.dirname('__file__'))
config_path: str = os.path.join(parent_directory, '../../../config/logging.yaml')

# Parse the YAML document first, then hand the parsed result to logging.
with open(config_path, 'r') as f:
    logging_config = yaml.safe_load(f)
config.dictConfig(logging_config)

# Logger named after the current module/notebook namespace.
logger = getLogger(__name__)

########   Initialize and setup pandas methods   ########
# Set JOBLIB_TEMP_FOLDER for this process before pandarallel is initialised.
os.environ.update({'JOBLIB_TEMP_FOLDER': '/tmp'})

# pandarallel: 5 workers, progress bar on, memory file system off.
pandarallel.initialize(
    nb_workers=5,
    progress_bar=True,
    verbose=1,
    use_memory_fs=False,
)

# Register tqdm's pandas integration.
tqdm.pandas()

import sys

# Extend sys.path (presumably so the project-local `libs` package can be
# imported -- confirm against the repository layout).
try:
    __file__
except NameError:
    # `__file__` is undefined (e.g. interactive kernel): add the directory
    # three levels above the current working directory.
    sys.path.append(os.path.join(Path().resolve(), '../../../'))
else:
    # NOTE(review): '__file__' is a quoted literal, so dirname() returns ''
    # and an empty string is what gets appended here.
    sys.path.append(os.path.join(os.path.dirname('__file__')))

from libs import utils, preprocess, variantfilter, posparser, splaiparser
# from libs import predeffect, scoring
from libs import anno_spliceai, anno_clinvar
from libs.deco import print_filtering_count
# from libs import predeffect
from libs.scoring import Scoring
from libs import predeffect


# Sorted GENCODE v43 (lifted to GRCh37) annotation in GFF3 format.
gencode_gff = '../../../Resources/05_GENCODE_v43lift37/gencode.v43lift37.annotation.sort.gff3.gz'

# First choice: gffutils databases under the repository-relative Resources tree.
db_anno_gencode = '../../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.gtf.db'
db_anno_intron = '../../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.intron.gtf.db'
try:
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)
except ValueError:
    # Fallback: the same two databases under /resources/DBs.
    # NOTE(review): ValueError is presumably what gffutils raises when the
    # database file cannot be used -- confirm against the gffutils docs.
    db_anno_gencode = '/resources/DBs/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '/resources/DBs/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)

## Thresholds configuration
# Named threshold values; each min/max pair is kept together on one line.
thresholds_SpliceAI_parser: dict = dict(
    TH_min_sALDL=0.02, TH_max_sALDL=0.2,
    TH_min_sAGDG=0.01, TH_max_sAGDG=0.05,
    TH_min_GExon=25, TH_max_GExon=500,
    TH_sAG=0.2,
    TH_sDG=0.2,
)

In [2]:
# Load the allmut variants from pickle and normalise the position column:
#   1. POS_hg19 is renamed to POS
#   2. rows with a missing POS are dropped
#   3. POS is cast to int
allmut_dm_maf0_snv_hg19 = (
    pd.read_pickle('allmut_dm_maf0_snv_liftover.pkl')
    .rename(columns={'POS_hg19': 'POS'})
    .dropna(subset=['POS'])
    .astype({'POS': int})
)

# ID key in the form "<CHROM>-<POS>-<hgvs>"
allmut_dm_maf0_snv_hg19['ID'] = (
    allmut_dm_maf0_snv_hg19['CHROM'].astype(str)
    + '-' + allmut_dm_maf0_snv_hg19['POS'].astype(str)
    + '-' + allmut_dm_maf0_snv_hg19['hgvs']
)

# `allmut` is the shorter working name; keep only the useful columns
allmut = allmut_dm_maf0_snv_hg19[[
    'ID',
    'mutype',
    'clinvar_clnsig',
    'tag',
    'deletion',
    'insertion',
    'expected_inheritance',
    'gnomad_AF',
]]

# Load the variant table annotated by the analysis pipeline
df = pd.read_pickle('splai_vep_vcfs/hgmd_dm/allchr.DM.splai.vep.nondel.enst.prescore.hgnconly.pkl')

# Remove the literal "c." marker from HGVSc.
# `regex=False` is spelled out on purpose: the default of `regex` differs
# between pandas releases, and when treated as a regular expression "c." means
# "c followed by ANY character", which would strip more than intended.
df['HGVSc'] = df['HGVSc'].str.replace('c.', '', regex=False)

# ID key in the form "<CHROM>-<POS>-<HGVSc>", used to join against `allmut`
df['ID'] = df['CHROM'].astype(str) + '-' + df['POS'].astype(str) + '-' + df['HGVSc']

# Inner-join df and allmut on 'ID'; the row count is printed before and after
# so the number of rows lost by the join is visible in the cell output.
print(len(df))
df = pd.merge(df, allmut, on='ID', how='inner')
print(len(df))

# Splicing subset: rows whose `mutype` contains "splice"
# (splice, canonical-splice, exonic-splice)
is_splicing = df['mutype'].str.contains('splice')
df_spl = df.loc[is_splicing].copy()

# Non-splicing subset: rows whose `mutype` matches missense, nonsense or synonymous
is_non_splicing = df['mutype'].str.contains('missense|nonsense|synonymous')
df_non_spl = df.loc[is_non_splicing].copy()

# Report the size of each subset and their sum
print(f"Splicing: {len(df_spl)}, Non-splicing: {len(df_non_spl)}, total: {len(df_spl) + len(df_non_spl)}")

256603
126859
Splicing: 18441, Non-splicing: 107021, total: 125462


In [4]:
# Show the column labels available on df_spl
df_spl.columns

Index(['CHROM', 'POS', 'REF', 'ALT', 'GeneSymbol', 'SymbolSource', 'HGNC_ID',
       'ENST', 'HGVSc', 'Consequence', 'EXON', 'INTRON', 'Strand', 'DS_AG',
       'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL',
       'maxsplai', 'ENST_Full', 'is_Canonical', 'IntronDist', 'ex_up_dist',
       'ex_down_dist', 'exon_pos', 'prc_exon_loc', 'exon_splice_site',
       'SpliceType', 'clinvar_same_pos', 'clinvar_same_motif', 'ExInt_INFO',
       'Pseudoexon', 'Part_IntRet', 'Part_ExDel', 'Exon_skipping',
       'Int_Retention', 'multiexs', 'Size_Part_ExDel', 'Size_Part_IntRet',
       'Size_pseudoexon', 'Size_IntRet', 'Size_skipped_exon', 'variant_id',
       'CDS_Length', 'is_10%_truncation', 'is_eLoF', 'is_NMD_at_Canon',
       'is_Frameshift_Part_ExDel', 'is_Frameshift_Part_IntRet',
       'is_Frameshift_pseudoexon', 'is_Frameshift_IntRet',
       'is_Frameshift_skipped_exon', 'is_Frameshift', 'skipped_region',
       'deleted_region', 'skipped_ccrs', 'deleted_ccrs', 'ID', 'mut

In [10]:
# Count how many df_spl rows carry each distinct `Consequence` value
df_spl['Consequence'].value_counts()

Consequence
splice_donor_variant                                                        6753
splice_acceptor_variant                                                     6093
splice_donor_5th_base_variant&intron_variant                                1194
splice_donor_region_variant&intron_variant                                   999
splice_region_variant&splice_polypyrimidine_tract_variant&intron_variant     977
intron_variant                                                               764
splice_polypyrimidine_tract_variant&intron_variant                           487
missense_variant&splice_region_variant                                       376
splice_region_variant&synonymous_variant                                     363
synonymous_variant                                                           218
missense_variant                                                             126
splice_region_variant&intron_variant                                          46
stop_gained     

In [11]:
# Inspect the df_spl rows whose Consequence is '5_prime_UTR_variant', showing
# only the Consequence and mutype columns.
# The boolean mask is taken from df_spl itself (not from the parent frame `df`)
# so the selection never depends on index alignment between two frames.
df_spl.loc[df_spl['Consequence'] == '5_prime_UTR_variant', ['Consequence', 'mutype']]

Unnamed: 0,Consequence,mutype
307,5_prime_UTR_variant,exonic-splice
81264,5_prime_UTR_variant,exonic-splice
