##### Import package

In [2]:
import os
import re
import sys
from pathlib import Path

import gffutils
import numpy as np
import pandas as pd
from pybedtools import BedTool
import pysam
from Bio.Seq import Seq
# from liftover import get_lifter
from pandarallel import pandarallel
from tqdm import tqdm

### Import my modules in libs directory
# When this code runs as a script, __file__ is defined and the script's own
# directory is put on sys.path. In a notebook kernel __file__ is undefined, so
# the lookup raises NameError and the directory two levels above the working
# directory is added instead.
try:
    sys.path.append(os.path.dirname(__file__))
except NameError:
    sys.path.append(os.path.join(Path().resolve(), '../../'))

from libs import utils, preprocess, variantfilter, posparser, splaiparser
from libs import predeffect
from libs import anno_spliceai, anno_clinvar
# acmgamp supplies apply_BP4/BP7/PP3/PVS1s/PS1s and final_evaluation, which
# the ACMG/AMP annotation step calls; without this import that step raises
# NameError on a fresh kernel.
from libs import acmgamp
from libs.deco import print_filtering_count

########   Initialize and setup pandas methods   ########
# Registers DataFrame.parallel_apply (pandarallel) and
# DataFrame.progress_apply (tqdm), both used throughout the notebook.
pandarallel.initialize(progress_bar=False)
tqdm.pandas()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [4]:
########   Import GENCODE DBs (exon DB and intron DB)   ########
# The repository-relative Resources directory is tried first. The absolute
# /resources/DBs location is used only if gffutils.FeatureDB raises
# ValueError for the first pair.
# NOTE(review): presumably ValueError means the DB file is missing - confirm.
try:
    db_anno_gencode = '../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)
except ValueError:
    db_anno_gencode = '/resources/DBs/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '/resources/DBs/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)

########   Import TSVs, VCFs and GFFs   ########
### Paths to downloaded database files
# 1. ClinVar variants (BED format)
clinvar_file = '../../Resources/03_ClinVar/variant_summary.snv.grch37.germline.criteria.sort.bed.gz'
# 2. GENCODE file (GFF3 format)
gencode_gff = '../../Resources/05_GENCODE_v43lift37/gencode.v43lift37.annotation.sort.gff3.gz'
# 3. Canonical transcripts table
# canonlist = '../../CanonicalTranscripts_COMP/CanonicalTranscripts.exoncount.tsv'
canonlist = '../../Resources/01_CanonicalTranscripts/CanonicalTranscripts.exoncount.tsv'
# 4. CCRs (BED format)
autoccr = '../../Resources/04_CCRs/ccrs.autosomes.v2.20180420.sort.bed.gz'
xccr = '../../Resources/04_CCRs/ccrs.xchrom.v2.20180420.sort.bed.gz'

### Tabix handles for the indexed files above
tbx_clinvar = pysam.TabixFile(clinvar_file)
tbx_anno = pysam.TabixFile(gencode_gff)

## Thresholds configuration
# Passed as `thresholds=` to the splaiparser functions further down.
thresholds_SpliceAI_parser: dict = {
    'TH_min_sALDL': 0.02,
    'TH_max_sALDL': 0.2,
    'TH_min_sAGDG': 0.01,
    'TH_max_sAGDG': 0.05,
    'TH_min_GExon': 25,
    'TH_max_GExon': 500,
    'TH_sAG': 0.2,
    'TH_sDG': 0.2,
}

############ Functions for cleansing and adjusting HGMD data ############
def adjust_enst_for_hgmd(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with selected gene symbols and transcript IDs remapped.

    Replacement is exact-value and per column: values in ``gene``, ``ENST``
    and ``ENST_Full`` that match a key in the tables below are swapped for the
    mapped value. Everything else, including other columns, is left as is.
    The input frame is not modified.

    Args:
        df: Variant table holding the ``gene``, ``ENST`` and ``ENST_Full``
            columns.

    Returns:
        A new DataFrame with the replacements applied.
    """
    gene_symbol_map = {'PRKN': 'PARK2', 'MICOS13': 'C19orf70'}
    enst_map = {'ENST00000263201': 'ENST00000437685'}
    enst_full_map = {
        'ENST00000263201.7_4': 'ENST00000437685.6_1',
        'ENST00000361547.7_7': 'ENST00000361547.7_8',
        'ENST00000609375.1_7': 'ENST00000347364.7_5',
        'ENST00000649912.1_4': 'ENST00000347364.7_5',
    }
    per_column_replacements = {
        'gene': gene_symbol_map,
        'ENST': enst_map,
        'ENST_Full': enst_full_map,
    }
    return df.replace(per_column_replacements)

@print_filtering_count
def remove_unkown_refalt(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows whose ``REF`` value is missing.

    Args:
        df: Variant table holding a ``REF`` column.

    Returns:
        A new DataFrame without the rows where ``REF`` is NaN/None.
    """
    # A list is passed because a bare label for `subset` is only accepted by
    # pandas >= 1.5; the list form behaves the same on all versions.
    # NOTE(review): despite the function name, only REF is checked, not ALT -
    # confirm whether rows with a missing ALT should be dropped as well.
    return df.dropna(subset=['REF'], axis=0)

## loading original file and setting output base path
mydata = './original.snpeff.state.disease.identifiedgene.tsv'
# mydata = './original.snpeff.state.disease.filtered.tsv'
results_splai = './results_spliceai_mydata.vcf'

### Loading files as pandas DataFrame ###
df = pd.read_table(mydata, sep='\t', dtype=str)
output_base: str = utils.configure_output(mydata)
vcf_columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
splai = pd.read_table(results_splai, sep='\t', header=None, names=vcf_columns)
canon = pd.read_table(canonlist, sep='\t', header=0)


In [5]:

## Pre-processing
df = preprocess.rename_like_vcf_format(df)
df['variant_id'] = df.apply(preprocess.generate_variant_id_col, axis=1)
df = df.rename(columns={'GeneSymbol': 'gene'})

### Mapping ENST full ID like ENST000000000000.0_0 ###
# Merge the full ENST ID (and related transcript columns) onto each variant
canon_columns = ['GeneSymbol', 'ENST', 'ENST_Full', 'ENSG_Full', 'Strand', 'Tag']
canon = canon[canon_columns]
df = df.merge(canon, how='left', on='ENST')
print(f"Merged: {len(df)}")

# Keep only variants whose transcript was found in the canonical table, then
# remove rows that are identical in every column.
df = df.loc[df['ENST_Full'].notnull()]
df = df.drop_duplicates()
# df.drop_duplicates(subset=['variant_id', 'sample'], inplace=True)

print(f"Drop duplicated: {len(df)} \n")
# df = df.drop('GeneSymbol_y', axis=1)
# df.rename(columns={'GeneSymbol_x': 'GeneSymbol'}, inplace=True)

# Cleansing and adjusting DataFrames
df = remove_unkown_refalt(adjust_enst_for_hgmd(df))
df = df.rename(columns={'POS_hg19': 'POS', 'csq_snpeff': 'csq'})
# Variant IDs are regenerated on both tables after the renames above.
df['variant_id'] = df.parallel_apply(preprocess.generate_variant_id_col, axis=1)
splai['ID'] = splai.parallel_apply(preprocess.generate_variant_id_col, axis=1)

#4.   Annotate SpliceAI results
print('\nAnnotating SpliceAI results...')
#4-1. Annotate results
df = anno_spliceai.anno(df, splai, 'gene')
#4-2. Calculate max delta score
df = anno_spliceai.insert_maxsplai(df)
#4-3. Exclude variants without a SpliceAI prediction
df = df.loc[df['maxsplai'].notnull(), :]

Merged: 22057
Drop duplicated: 19926 

Start remove_unkown_refalt
Filtering : 19926 --> 19926


Annotating SpliceAI results...


In [6]:
## Extract coding SNVs
df = variantfilter.extract_snv(df, key='type')

# Consequence-based exclusions, all keyed on the 'csq' column and applied in
# this order.
csq_filters = (
    variantfilter.exclude_intergenic,
    variantfilter.exclude_utr,
    variantfilter.exclude_no_transcripts,
    variantfilter.exclude_up_down_stream,
    variantfilter.exclude_tf_binding,
)
for csq_filter in csq_filters:
    df = csq_filter(df, key='csq')

## DeNovo filter (extract estimated denovo variants)
df = variantfilter.extract_denovo(df)

Start extract_snv
Filtering : 18152 --> 18152

Start exclude_intergenic
Filtering : 18152 --> 18152

Start exclude_utr
Filtering : 18152 --> 17522

Start exclude_no_transcripts
Filtering : 17522 --> 17522

Start exclude_up_down_stream
Filtering : 17522 --> 16954

Start exclude_tf_binding
Filtering : 16954 --> 16954

Start extract_denovo
Filtering : 16954 --> 16353



In [7]:
# Checkpoint: write the filtered, SpliceAI-annotated table as a tab-separated
# file without the index column.
df.to_csv('./original.snpeff.state.disease.identifiedgene.filtered.splai.tsv', sep='\t', index=False)

In [24]:
#1.   Classify 'Canonical' splice site or 'Non-canonical' splice site
print('\nClassify "Canonical" splice site or "Non-canonical" splice site...')
df = posparser.classifying_canonical(df, cdot='c.HGVS')

#2.   Calculate exonic positions
print('\nCalculating exonic positions...')
#2-1. 'exon_loc' is a colon-separated string; its first two parts become the
#     upstream and downstream distance columns
df['exon_loc'] = df.progress_apply(
    posparser.calc_exon_loc, tabixfile=tbx_anno, enstcolname='ENST', axis=1)
exon_loc_parts = df['exon_loc'].str.split(':', expand=True)
df = pd.concat([df, exon_loc_parts], axis=1)
df = df.rename(columns={0: 'ex_up_dist', 1: 'ex_down_dist'})

#2-2. Select minimum distance from upstream distance and downstream distance
#2-3. Decide exonic splice sites (1 nt in acceptor site or 3 nts on donor site)
for column, rowfunc in (
        ('exon_pos', posparser.select_exon_pos),
        ('exon_splice_site', posparser.extract_splicing_region)):
    df[column] = df.parallel_apply(rowfunc, axis=1)

#3.   Additional splicing information
print('\nAnnotating splicing information...')
#3-1. Annotate splicing type ('Exonic Acceptor' etc.)
df['SpliceType'] = df.parallel_apply(posparser.select_donor_acceptor, axis=1)

#3-2. Annotate rank of exon or intron
df['Num_ExInt'] = df.progress_apply(
    posparser.calc_ex_int_num, db=db, db_intron=db_intron, axis=1)

#5.   Annotate ClinVar variant interpretations
print('\nAnnotating ClinVar varaints interpretations...')
for column, rowfunc in (
        ('clinvar_same_pos', anno_clinvar.anno_same_pos_vars),
        ('clinvar_same_motif', anno_clinvar.anno_same_motif_vars)):
    df[column] = df.progress_apply(rowfunc, tabixfile=tbx_clinvar, axis=1)



Classify "Canonical" splice site or "Non-canonical" splice site...
-2: 63
-1: 60
+1: 80
+2: 45
Total variants      : 16708
Canonical variants  : 248
non-Canon variants  : 16460


Calculating exonic positions...


  0%|          | 0/16708 [00:00<?, ?it/s]

100%|██████████| 16708/16708 [00:08<00:00, 2013.30it/s]



Annotating splicing information...


100%|██████████| 16708/16708 [00:15<00:00, 1107.58it/s]



Annotating SpliceAI results...


KeyError: 'is_Multi'

In [5]:
#6.   Parsing SpliceAI results (For PVS1 decision tree)
print('\nParsing SpliceAI results...')
## Thresholds configuration
# thresholds_SpliceAI_parser is the single source of truth for the SpliceAI
# thresholds. The individual TH_* names are kept only for backward
# compatibility and are derived from it so the two can never drift apart.
TH_min_sALDL = thresholds_SpliceAI_parser['TH_min_sALDL']
TH_max_sALDL = thresholds_SpliceAI_parser['TH_max_sALDL']
TH_min_sAGDG = thresholds_SpliceAI_parser['TH_min_sAGDG']
TH_max_sAGDG = thresholds_SpliceAI_parser['TH_max_sAGDG']
TH_min_GExon = thresholds_SpliceAI_parser['TH_min_GExon']
TH_max_GExon = thresholds_SpliceAI_parser['TH_max_GExon']
TH_sAG = thresholds_SpliceAI_parser['TH_sAG']
TH_sDG = thresholds_SpliceAI_parser['TH_sDG']

#6-1. Annotate Exon/Intron position information
df['ExInt_INFO'] = df.progress_apply(
    splaiparser.calc_exint_info, db=db, db_intron=db_intron, axis=1)
# Rows flagged 'Warning' by calc_exint_info are excluded.
df = df.loc[df['ExInt_INFO'] != 'Warning']
# Variants excluded by ID.
# NOTE(review): the reason these six are excluded is not recorded here.
unklist = [
    '10:127786968-C-T', '15:34145792-C-G', '22:24325115-C-T',
    '4:155305543-C-T', '4:2820079-C-T', '4:3432717-C-T']
# .copy() makes the filtered frame independent, so the many column
# assignments below do not operate on a slice of the previous frame.
df = df.loc[~df['variant_id'].isin(unklist)].copy()

#6-2. Relative exon location
df['prc_exon_loc'] = df.parallel_apply(posparser.calc_prc_exon_loc, axis=1)

#6-3. Predict splicing effects
df['Pseudoexon'] = df.progress_apply(
    splaiparser.pseudoexon_activation,
    thresholds=thresholds_SpliceAI_parser,
    db_intron=db_intron,
    axis=1)

# Output column -> row function; each is called with the shared thresholds.
splicing_effect_funcs = {
    'Part_IntRet': splaiparser.partial_intron_retention,
    'Part_ExDel': splaiparser.partial_exon_deletion,
    'Exon_skipping': splaiparser.exon_skipping,
    'Int_Retention': splaiparser.intron_retention,
    'multiexs': splaiparser.multi_exon_skipping,
}
for column, rowfunc in splicing_effect_funcs.items():
    df[column] = df.parallel_apply(
        rowfunc, thresholds=thresholds_SpliceAI_parser, axis=1)


#7.   Annotate aberrant splicing size (bp)
print('\nAnnotating aberrant splicing size (bp)...')
# Output column -> row function, in this order: partial exon deletion,
# partial intron retention, pseudoexon, intron retention, exon skipping.
aberrant_size_funcs = {
    'Size_Part_ExDel': splaiparser.anno_partial_exon_del_size,
    'Size_Part_IntRet': splaiparser.anno_partial_intron_retention_size,
    'Size_pseudoexon': splaiparser.anno_gained_exon_size,
    'Size_IntRet': splaiparser.anno_intron_retention_size,
    'Size_skipped_exon': splaiparser.anno_skipped_exon_size,
}
for column, rowfunc in aberrant_size_funcs.items():
    df[column] = df.parallel_apply(
        rowfunc, thresholds=thresholds_SpliceAI_parser, axis=1)


#8.   Evaluate splicing effects
print('\nPredicting CDS change...')
#8-1. Predict CDS change
df['CDS_Length'] = df.progress_apply(predeffect.calc_cds_len, db=db, axis=1)
df['is_10%_truncation'] = df.progress_apply(predeffect.calc_cds_len_shorten, axis=1)

#8-2. Determine if the gene is included in eLoFs genes
df['is_eLoF'] = df.parallel_apply(predeffect.elofs_judge, axis=1)

#8-3. Determine causing NMD or not
df['is_NMD_at_Canon'] = df.parallel_apply(predeffect.nmd_judge, axis=1)

#8-4. Frame check
# Each 'Size_<event>' column yields an 'is_Frameshift_<event>' column;
# 'is_Frameshift' is True when any of them is truthy.
frameshift_columns = []
for size_column in aberrant_size_funcs:
    frameshift_column = 'is_Frameshift_' + size_column[len('Size_'):]
    df[frameshift_column] = df[size_column].parallel_apply(predeffect.frame_check)
    frameshift_columns.append(frameshift_column)
df['is_Frameshift'] = df[frameshift_columns].any(axis=1)

#9.   CCRs
print('\nAnnotating CCRs info...')
#9-1. Annotate truncated regions
df['skipped_region'] = df.parallel_apply(
    splaiparser.anno_skipped_regions, axis=1)

df['deleted_region'] = df.parallel_apply(
    splaiparser.anno_deleted_regions,
    thresholds=thresholds_SpliceAI_parser, axis=1)

#9-2. Intersect with CCRs
df = predeffect.anno_ccr_score(df)

#10.   Annotate ACMG/AMP codes
print('\nAnnotating ACMG/AMP codes...')
#10-1. Annotate codes
# NOTE(review): `acmgamp` must be importable in this kernel; it is not among
# the modules imported at the top of the notebook.
acmg_code_funcs = {
    'BP4': acmgamp.apply_BP4,
    'BP7': acmgamp.apply_BP7,
    'PP3': acmgamp.apply_PP3,
    'PVS1s': acmgamp.apply_PVS1s,
    'PS1s': acmgamp.apply_PS1s,
}
for column, rowfunc in acmg_code_funcs.items():
    df[column] = df.parallel_apply(rowfunc, axis=1)

#10-2. Annotate final prediction
df['Final_Prediction'] = df.parallel_apply(acmgamp.final_evaluation, axis=1)



Parsing SpliceAI results...


  1%|          | 125/17531 [00:00<00:27, 624.85it/s]

100%|██████████| 17531/17531 [00:26<00:00, 673.68it/s]
100%|██████████| 17022/17022 [00:04<00:00, 3582.06it/s]



Annotating aberrant splicing size (bp)...

Predicting CDS change...


100%|██████████| 17022/17022 [00:18<00:00, 903.89it/s] 
100%|██████████| 17022/17022 [00:00<00:00, 67519.93it/s]



Annotating CCRs info...


  df['skipped_ccrs'] = df.parallel_apply(



Annotating ACMG/AMP codes...


  df['deleted_ccrs'] = df.parallel_apply(
  df['BP4'] = df.parallel_apply(acmgamp.apply_BP4, axis=1)
  df['BP7'] = df.parallel_apply(acmgamp.apply_BP7, axis=1)
  df['PP3'] = df.parallel_apply(acmgamp.apply_PP3, axis=1)
  df['PVS1s'] = df.parallel_apply(acmgamp.apply_PVS1s, axis=1)
  df['PS1s'] = df.parallel_apply(acmgamp.apply_PS1s, axis=1)
  df['Final_Prediction'] = df.parallel_apply(acmgamp.final_evaluation, axis=1)


In [5]:
# Checkpoint the fully annotated table so later cells can resume from it.
final_pickle_path = './final.pickle'
df.to_pickle(final_pickle_path)

In [10]:
# Resume from the checkpoint written by this notebook.
# Only unpickle files you created yourself; pickle can execute code on load.
final_pickle_path = './final.pickle'
df = pd.read_pickle(final_pickle_path)

### Note

In [17]:
df['Final_Prediction'].value_counts()

Final_Prediction
Very low priority     12418
Low priority           3543
High priority           228
Moderate priority        67
Very high priority        6
Name: count, dtype: int64

In [7]:
df['csq']

0                                intron_variant
1                                intron_variant
2                            synonymous_variant
3                                intron_variant
4        missense_variant&splice_region_variant
                          ...                  
20087                            intron_variant
20088                          missense_variant
20089                          missense_variant
20090                          missense_variant
20091                          missense_variant
Name: csq, Length: 17531, dtype: object

In [6]:
df['Final_Prediction'].value_counts()


Final_Prediction
Low priority          11658
Very low priority      5213
High priority            74
Moderate priority        70
Very high priority        7
Name: count, dtype: int64

In [7]:
# Keep one row per (variant, sample) pair, then export the final table as TSV.
df = df.drop_duplicates(subset=['variant_id', 'sample'])
df.to_csv('./20231026_final_raw.tsv', sep='\t', index=False)

In [21]:
len(df)

16262

In [18]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go


In [19]:
# Priority classes from highest to lowest; this is also the x-axis order.
labels = ['Very high priority', 'High priority', 'Moderate priority', 'Low priority', 'Very low priority']
# Count every class in a single pass; a class absent from the data gets 0.
value = df['Final_Prediction'].value_counts().reindex(labels, fill_value=0).tolist()
fig = px.bar(x=labels, y=value, height=600, width=800, title='HGMD variant interpretation', template='seaborn')
fig.update_layout(xaxis=dict(title='Classification'),
                  yaxis=dict(title='Number of variants'))

fig.show()

##### The next cell was used to merge the SpliceAI results (次のセルは，SpliceAIの結果をマージした時のもの).

In [None]:
def _read_headerless_splai_vcf(path: str) -> pd.DataFrame:
    """Read a header-less, tab-separated SpliceAI VCF body as strings.

    The eight standard VCF column names are assigned and a ``variant_id``
    column is added with ``preprocess.generate_variant_id_col``.
    """
    table = pd.read_table(path, sep='\t', dtype=str, header=None)
    table.columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
    table['variant_id'] = table.apply(preprocess.generate_variant_id_col, axis=1)
    return table


# First-pass SpliceAI results; records whose INFO is '.' are dropped.
df2 = _read_headerless_splai_vcf(
    '/work/Github/MyProjects/SplicingScreening/data/ResultsPredictionTools/results_spliceai_raw_maxdistance.nonheader.final.vcf')
df2 = df2[df2['INFO'] != '.']

# Re-analysed results ("NMfixed" file).
df3 = _read_headerless_splai_vcf(
    '/work/Github/MyProjects/SplicingScreening/data/ResultsPredictionTools/reanalysis.splai.NMfixed.parsed.vcf')

df4 = pd.concat([df2, df3], axis=0)
# NOTE(review): variant_ids present in both inputs are NOT de-duplicated here.
# If the re-analysed record should win, use
# df4.drop_duplicates(subset='variant_id', keep='last') - confirm the intent.
# variant_id takes the place of the ID column in the written file.
df4 = df4[['CHROM', 'POS', 'variant_id', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']]

# The file is read back with header=None, so no header row may be written;
# otherwise the column names are parsed as a data record.
df4.to_csv('results_spliceai_mydata.vcf', sep='\t', index=False, header=False)

##### Creating DBs


In [None]:
################################################################################
##                                Creating DBs                                ##
################################################################################

import gffutils
import gffutils.pybedtools_integration

## Function
def generate_intoron_gtf(db: gffutils.FeatureDB, output: str) -> None:
    """Derive intron features from the exons in ``db`` and save them to ``output``.

    Args:
        db: gffutils database whose 'exon' features are used to build the
            'intron' features.
        output: Path of the file the introns are written to.

    Returns:
        None. The result is the file written at ``output``.
    """
    intron_features = db.create_introns(
        exon_featuretype='exon',
        new_featuretype='intron',
        merge_attributes=True,
        numeric_sort=True,
    )
    # Convert the gffutils features to a BedTool, which handles the writing.
    gffutils.pybedtools_integration.to_bedtool(intron_features).saveas(output)


## Input GTF and the intron GTF generated from it
gencode_gtf = '/resources/GENCODE/gencode.v43lift37.annotation.gtf.gz'
intron_gtf = '/resources/GENCODE/gencode.v43lift37.annotation.intron.gtf.gz'

## Output database files
db_anno_gencode = '/work/resources/gencode.v43lift37.annotation.gtf.db'
db_anno_intron = '/work/resources/gencode.v43lift37.annotation.intron.gtf.db'

# Options shared by both database builds.
create_db_options = dict(
    disable_infer_genes=True,
    disable_infer_transcripts=True,
    keep_order=True,
)

#1. Create DB from GENCODE GTF, then reopen it from disk
db = gffutils.create_db(gencode_gtf, db_anno_gencode, **create_db_options)
db = gffutils.FeatureDB(db_anno_gencode)

#2. Create intron information file as GTF
generate_intoron_gtf(db, intron_gtf)

#3. Create intron DB from above GTF
# NOTE(review): id_spec_dict is defined but never passed to create_db
# (no id_spec= argument) - confirm whether that is intended.
id_spec_dict = {'gene': 'gene_id', 'transcript': 'transcript_id',
                'exon_number': 'intron_number'}
db_intron = gffutils.create_db(intron_gtf, db_anno_intron,
                               merge_strategy="merge",
                               **create_db_options)