##### Import packages

In [4]:
import os
import re
import sys
from pathlib import Path

import gffutils
import numpy as np
import pandas as pd
import pysam
from pandarallel import pandarallel
from tqdm import tqdm

### Make the project's `libs` package importable
try:
    # Script execution: __file__ exists, so add this file's own directory.
    sys.path.append(os.path.dirname(__file__))
except NameError:
    # No __file__ in this session (NameError): fall back to two levels above
    # the current working directory.
    _session_dir = Path().resolve()
    sys.path.append(os.path.join(_session_dir, '../../'))

from libs import utils, preprocess, variantfilter, posparser, splaiparser
from libs import predeffect, scoring
from libs import anno_spliceai, anno_clinvar
from libs.deco import print_filtering_count

### Logging setup
from logging import getLogger, config
import yaml
# NOTE: '__file__' below is a quoted string literal, not the module attribute,
# so both dirname() calls return '' and config_path ends up as the relative
# path '../../config/logging.yaml', which open() resolves against the current
# working directory.
parent_directory = os.path.dirname(os.path.dirname('__file__'))
config_path: str = os.path.join(parent_directory, '../../config/logging.yaml')
# Parse the YAML file and hand the resulting mapping to logging.config.dictConfig.
with open(config_path, 'r') as f:
    config.dictConfig(yaml.safe_load(f))
# Logger used by the cells below.
logger = getLogger(__name__)


########   Initialize and setup pandas methods   ########
# Leave one core free for the main process, but never request fewer than one
# worker: os.cpu_count() may return None, and on a single-core host
# `cpu_count() - 1` would be 0.
_n_workers = max(1, (os.cpu_count() or 1) - 1)
pandarallel.initialize(nb_workers=_n_workers, progress_bar=False,
                       verbose=2, use_memory_fs=False)
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
# Registers DataFrame/Series.progress_apply, used by the annotation cells.
tqdm.pandas()

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [5]:
########   Import GENCODE DBs (exon DB and intron DB)   ########
# Open the two gffutils databases from the repository-relative Resources
# directory first; if gffutils.FeatureDB raises ValueError, retry with the
# absolute /resources/DBs paths instead.
# NOTE(review): presumably ValueError signals a missing DB file — confirm
# against the gffutils version in use.
try:
    db_anno_gencode = '../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)
except ValueError:
    db_anno_gencode = '/resources/DBs/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '/resources/DBs/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)

########   Import TSVs, VCFs and GFFs   ########
### Downloaded database files
#1. Clinvar variants (BED format)
clinvar_file = '../../Resources/03_ClinVar/variant_summary.snv.grch37.germline.criteria.sort.bed.gz'
tbx_clinvar = pysam.TabixFile(clinvar_file)
#2. GENCODE file (GFF3 format)
gencode_gff = '../../Resources/05_GENCODE_v43lift37/gencode.v43lift37.annotation.sort.gff3.gz'
tbx_anno = pysam.TabixFile(gencode_gff)
#3. Canonical transcripts table
# canonlist = '../../CanonicalTranscripts_COMP/CanonicalTranscripts.exoncount.tsv'
canonlist = '../../Resources/01_CanonicalTranscripts/CanonicalTranscripts.exoncount.tsv'
#4. CCRs (BED format)
autoccr = '../../Resources/04_CCRs/ccrs.autosomes.v2.20180420.sort.bed.gz'
xccr = '../../Resources/04_CCRs/ccrs.xchrom.v2.20180420.sort.bed.gz'


## Thresholds configuration
thresholds_SpliceAI_parser: dict = {
    'TH_min_sALDL': 0.02, 'TH_max_sALDL': 0.2, 
    'TH_min_sAGDG': 0.01, 'TH_max_sAGDG': 0.05,
    'TH_min_GExon': 25, 'TH_max_GExon': 500,
    'TH_sAG': 0.2, 'TH_sDG': 0.2
    }

############ Functions for cleansing and adjusting HGMD data ############
def adjust_enst_for_hgmd(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with selected gene symbols and ENST IDs remapped.

    Replacements are applied per column ('gene', 'ENST', 'ENST_Full') and only
    to cells that exactly equal one of the listed keys; *df* itself is not
    modified.
    """
    gene_aliases = {'PRKN': 'PARK2', 'MICOS13': 'C19orf70'}
    enst_aliases = {'ENST00000263201': 'ENST00000437685'}
    enst_full_aliases = {
        'ENST00000263201.7_4': 'ENST00000437685.6_1',
        'ENST00000361547.7_7': 'ENST00000361547.7_8',
        'ENST00000609375.1_7': 'ENST00000347364.7_5',
        'ENST00000649912.1_4': 'ENST00000347364.7_5',
    }
    per_column_map = {
        'gene': gene_aliases,
        'ENST': enst_aliases,
        'ENST_Full': enst_full_aliases,
    }
    return df.replace(per_column_map)

@print_filtering_count
def remove_unkown_refalt(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of *df* whose 'REF' value is not missing."""
    return df.dropna(axis='index', subset=['REF'])

## loading original file and setting output base path
# mydata = './original.snpeff.state.disease.filtered.splai.tsv'
mydata = './original.snpeff.state.disease.identifiedgene.filtered.splai.tsv'
results_splai = './results_spliceai_mydata.vcf'

### Loading files as pandas DataFrame ###
# Every column of the variant table is read as str; numeric columns are cast
# explicitly later, where needed.
df = pd.read_table(mydata, sep='\t', dtype=str)
output_base: str = utils.configure_output(mydata)
# The SpliceAI result file is read as header-less; the eight VCF column names
# are supplied here.
vcf_columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
splai = pd.read_table(results_splai, sep='\t', header=None, names=vcf_columns)
# Canonical transcript table; its first row is used as the header.
canon = pd.read_table(canonlist, sep='\t', header=0)


In [3]:
print(len(df))
df = variantfilter.extract_denovo(df)
print(len(df))

16353
Start extract_denovo
Filtering : 16353 --> 9944

9944


In [4]:
################################################################################
#### Annotation step  ####

#1.   Classify 'Canonical' splice site or 'Non-canonical' splice site
logger.info('Classify "Canonical" splice site or "Non-canonical" splice site...')
df = posparser.classifying_canonical(df, cdot='c.HGVS')

#2.   Calculate exonic positions
logger.info('Calculating exonic positions...')
#2-1. Generate exonic upstream distance and exonic downstream distance
df['exon_loc'] = df.progress_apply(posparser.calc_exon_loc, tabixfile=tbx_anno, 
                          enstcolname='ENST', axis=1)
df = pd.concat([df, df['exon_loc'].str.split(':', expand=True)], axis=1)
df.rename(columns={0: 'ex_up_dist', 1: 'ex_down_dist'}, inplace=True)


#2-2. Select minimum distance from upstream distance and downstream distance
df['exon_pos'] = df.parallel_apply(posparser.select_exon_pos, axis=1)


#2-3. Determine exonic splice sites (1 nt at the acceptor site or 3 nts at the donor site)
df['exon_splice_site'] = df.parallel_apply(posparser.extract_splicing_region, axis=1)

#3.   Additional Splicing information
logger.info('Annotating splicing information...')
#3-1. Annotate splicing type ('Exonic Acceptor' etc.)
df['SpliceType'] = df.parallel_apply(posparser.select_donor_acceptor, axis=1)

#3-2. Annotate rank of exon or intron
df['Num_ExInt'] = df.progress_apply(posparser.calc_ex_int_num, 
                                    db=db, db_intron=db_intron, axis=1)


#5.   Annotate ClinVar variant interpretations
logger.info('Annotating ClinVar varaints interpretations...')
df['clinvar_same_pos'] = df.progress_apply(anno_clinvar.anno_same_pos_vars, 
                                           tabixfile=tbx_clinvar, axis=1)
df['clinvar_same_motif'] = df.progress_apply(anno_clinvar.anno_same_motif_vars, 
                                             tabixfile=tbx_clinvar, axis=1)

#6.   Parsing SpliceAI results (For PVS1 decision tree)
logger.info('Parsing SpliceAI results...')
## Thresholds configuration
# Scalar aliases of the entries in `thresholds_SpliceAI_parser`. They are read
# from the dict rather than repeated as literals so the two cannot drift apart;
# the parser calls in this cell receive the dict itself.
TH_min_sALDL = thresholds_SpliceAI_parser['TH_min_sALDL']
TH_max_sALDL = thresholds_SpliceAI_parser['TH_max_sALDL']
TH_min_sAGDG = thresholds_SpliceAI_parser['TH_min_sAGDG']
TH_max_sAGDG = thresholds_SpliceAI_parser['TH_max_sAGDG']
TH_min_GExon = thresholds_SpliceAI_parser['TH_min_GExon']
TH_max_GExon = thresholds_SpliceAI_parser['TH_max_GExon']
TH_sAG = thresholds_SpliceAI_parser['TH_sAG']
TH_sDG = thresholds_SpliceAI_parser['TH_sDG']


#6-1. Annotate Exon/Intron position information
df['ExInt_INFO'] = df.progress_apply(splaiparser.calc_exint_info, 
                                     db=db, 
                                     db_intron=db_intron, 
                                     axis=1)

#6-2. Relative exon location
df['prc_exon_loc'] = df.parallel_apply(posparser.calc_prc_exon_loc, axis=1)


#6-3. Predict splicing effects
df['Pseudoexon'] = df.progress_apply(
    splaiparser.pseudoexon_activation,
    thresholds=thresholds_SpliceAI_parser, 
    db_intron=db_intron,
    axis=1)

df['Part_IntRet'] = df.parallel_apply(
    splaiparser.partial_intron_retention,
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

df['Part_ExDel'] = df.parallel_apply(
    splaiparser.partial_exon_deletion,
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

df['Exon_skipping'] = df.parallel_apply(
    splaiparser.exon_skipping, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)
                                        
df['Int_Retention'] = df.parallel_apply(
    splaiparser.intron_retention, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

df['multiexs'] = df.parallel_apply(
    splaiparser.multi_exon_skipping, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)


#7.   Annotate aberrant splicing size (bp)
logger.info('Annotating aberrant splicing size (bp)...')
#7-1. Annotate size of partial exon deletion
df['Size_Part_ExDel'] = df.parallel_apply(
    splaiparser.anno_partial_exon_del_size, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

#7-3. Annotate size of partial intron retention
df['Size_Part_IntRet'] = df.parallel_apply(
    splaiparser.anno_partial_intron_retention_size, 
    thresholds=thresholds_SpliceAI_parser,
    axis=1)

#7-2. Annotate size of pseudoexon
df['Size_pseudoexon'] = df.parallel_apply(
    splaiparser.anno_gained_exon_size, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

#7-4. Annotate size of intron retention
df['Size_IntRet'] = df.parallel_apply(
    splaiparser.anno_intron_retention_size, 
    thresholds=thresholds_SpliceAI_parser,
    axis=1)

#7-5. Annotate size of exon skipping
df['Size_skipped_exon'] = df.parallel_apply(
    splaiparser.anno_skipped_exon_size, 
    thresholds=thresholds_SpliceAI_parser,
    axis=1)

#8.   Evaluate splicing effects
logger.info('Predicting CDS change...')
#8-1. Predict CDS change
df['CDS_Length'] = df.progress_apply(predeffect.calc_cds_len, db=db, axis=1)
df['is_10%_truncation'] = df.progress_apply(predeffect.calc_cds_len_shorten, axis=1)

#8-2. Determine if the gene is included in eLoFs genes
df['is_eLoF'] = df.parallel_apply(predeffect.elofs_judge, axis=1)

#8-3. Determine causing NMD or not
df['is_NMD_at_Canon'] = df.parallel_apply(predeffect.nmd_judge, axis=1)

#8-4. Frame check
# For each predicted event, the is_Frameshift_<event> column is produced by
# applying predeffect.frame_check to the matching Size_<event> column.
_frameshift_events = ('Part_ExDel', 'Part_IntRet', 'pseudoexon',
                      'IntRet', 'skipped_exon')
for _event in _frameshift_events:
    df[f'is_Frameshift_{_event}'] = df[f'Size_{_event}'].parallel_apply(
        predeffect.frame_check)

# Overall flag: true when any of the per-event columns is truthy in that row.
_frameshift_cols = [f'is_Frameshift_{_event}' for _event in _frameshift_events]
df['is_Frameshift'] = df[_frameshift_cols].any(axis=1)


#9.   CCRs
logger.info('Annotating CCRs info...')

#9-1. Annotate truncated regions 
df['skipped_region'] = df.parallel_apply(
    splaiparser.anno_skipped_regions, axis=1)

df['deleted_region'] = df.parallel_apply(
    splaiparser.anno_deleted_regions, 
    thresholds=thresholds_SpliceAI_parser, axis=1)

#9-2. Intersect with CCRs
logger.info('Annotate CCR score')
df = predeffect.anno_ccr_score(df)

df.to_pickle('mydata.tmp.pkl')
df = pd.read_pickle('mydata.tmp.pkl')

2024/05/11 05:49:12 [INFO   ] (__main__) - Classify "Canonical" splice site or "Non-canonical" splice site...
-2: 27
-1: 25
+1: 53
+2: 21
Total variants      : 9944
Canonical variants  : 126
non-Canon variants  : 9818

2024/05/11 05:49:12 [INFO   ] (__main__) - Calculating exonic positions...


100%|██████████| 9944/9944 [00:05<00:00, 1933.52it/s]


2024/05/11 05:49:18 [INFO   ] (__main__) - Annotating splicing information...


100%|██████████| 9944/9944 [00:08<00:00, 1119.35it/s]

2024/05/11 05:49:27 [INFO   ] (__main__) - Annotating ClinVar varaints interpretations...



100%|██████████| 9944/9944 [00:00<00:00, 35360.19it/s]
100%|██████████| 9944/9944 [00:00<00:00, 26602.37it/s]

2024/05/11 05:49:27 [INFO   ] (__main__) - Parsing SpliceAI results...



100%|██████████| 9944/9944 [00:08<00:00, 1154.75it/s]
100%|██████████| 9944/9944 [00:01<00:00, 5720.97it/s]


2024/05/11 05:49:45 [INFO   ] (__main__) - Annotating aberrant splicing size (bp)...
2024/05/11 05:49:46 [INFO   ] (__main__) - Predicting CDS change...


100%|██████████| 9944/9944 [00:05<00:00, 1861.72it/s]
100%|██████████| 9944/9944 [00:00<00:00, 111616.43it/s]






2024/05/11 05:49:53 [INFO   ] (__main__) - Annotating CCRs info...
2024/05/11 05:49:53 [INFO   ] (__main__) - Annotate CCR score


In [5]:
### Don't touch the original data ###
### Save the result as pickle file before scoring ###
df.to_pickle('./pickles/mydata_prescore_Fixed.pkl')

In [6]:
df = pd.read_pickle('./pickles/mydata_prescore_Fixed.pkl')

In [7]:
#10.   Scoring
# df = pd.read_pickle('./pickles/mydata_prescore.pkl')
from libs.scoring import Scoring
logger.info('Annotating Screening scores...')
df = df.astype({'maxsplai': 'float', 'vqslod': 'float', 
                'denovogear': float, 'triodenovo': float, 'dnmfilter': float})
print(len(df))


# Recalibrate the thresholds for De novo detection
# Keep rows with vqslod above -7.18. Rows whose vqslod is NaN are dropped by
# this step, because a comparison against NaN evaluates to False.
df = df[df['vqslod'] > -7.18]
# For the three score columns below a row passes when the score exceeds its
# cutoff OR the score is null; all three conditions must hold together.
df = df[((df['denovogear'] > 0.02) | (df['denovogear'].isnull()))
        & ((df['triodenovo'] > 5.72) | (df['triodenovo'].isnull()))
        & ((df['dnmfilter'] > 0.196) | (df['dnmfilter'].isnull()))]

print(len(df))
sccore_ths = {'clinvar_same_pos': 2,     
                 'clinvar_same_motif': 1,
                 'clinvar_else': 0,
                 'non_canon_splai_lte_0.1_outside': -3,
                 'non_canon_splai_lte_0.1_other': -2,
                 'non_canon_splai_bet_0.1_0.2': 1,
                 'non_canon_splai_gte_0.2': 2,
                 'canon_strong': 6, 
                 'canon_moderate': 5, 
                 'frameshift_nmd_eloF': 7, 
                 'frameshift_nmd_not_eloF': 3,
                 'canon_splai_lte_0.1': -3,
                 'canon_splai_bet_0.1_0.2': -1,
                 'canon_splai_gte_0.2': 0}


scoring = Scoring(ths=sccore_ths)

df['insilico_screening'] = df.parallel_apply(scoring.insilico_screening, axis=1)
df['clinvar_screening'] = df.parallel_apply(scoring.clinvar_screening, axis=1)
df = scoring.calc_priority_score(df)
df.to_pickle(f'./pickles/mydata.scored.pickle')

2024/05/11 08:13:46 [INFO   ] (__main__) - Annotating Screening scores...
9944
6406


In [8]:
# df = pd.read_pickle(f'./pickles/mydata.scored.pickle')
df.replace({'\u2161': 'II', '\u2163': 'IV', '\u2162': 'III'}, inplace=True, regex=True)
df.fillna('.', inplace=True)
df.rename(columns={'gene_x': 'gene', 'ID_x': 'ID'}, inplace=True)
path_to_hgmd = '../../Resources/07_HGMD_GeneBasedInfo/HGMD_GeneBasedInfo_2024.1.tsv.gz'

def anno_hgmd(df: pd.DataFrame, hgmd_path: 'str | None' = None) -> pd.DataFrame:
    """Left-join HGMD gene-level columns onto *df*.

    Args:
        df: Table with a 'Gene.refGene' column used as the join key.
        hgmd_path: Tab-separated HGMD table to read. When omitted, the
            module-level ``path_to_hgmd`` is used, as before.

    Returns:
        A new frame with the columns 'gene', 'altsymbol', 'refseq',
        'expected_inheritance', 'hgncID', 'omimid' and 'DM' (cast to float64)
        appended; rows without a matching 'gene' get missing values.
    """
    if hgmd_path is None:
        hgmd_path = path_to_hgmd

    hgmd_columns = ['gene', 'altsymbol', 'refseq',
                    'expected_inheritance', 'hgncID', 'omimid', 'DM']
    # Everything is read as str; only 'DM' is converted to a numeric dtype.
    hgmd = pd.read_table(hgmd_path, header=0, dtype=str)[hgmd_columns]
    hgmd = hgmd.astype({'DM': 'float64'})

    return pd.merge(
        df, hgmd, left_on='Gene.refGene', right_on='gene', how='left'
    )

def anno_sf(df: pd.DataFrame,
            sf_path: str = '../../Resources/ACMG_SFv3.2.txt') -> pd.DataFrame:
    """Left-join the ACMG SF gene list columns onto *df*.

    Args:
        df: Table with a 'Gene.refGene' column used as the join key.
        sf_path: Tab-separated gene list to read. Defaults to the path that
            was previously hard-coded.

    Returns:
        A new frame with the columns 'Gene', 'Disease/Phentyope',
        'Inheritance ' and 'Variants to report' appended; rows without a
        matching 'Gene' get missing values.
    """
    # Column names are used exactly as they appear in the file header
    # (including the trailing space in 'Inheritance ').
    sf_columns = ['Gene', 'Disease/Phentyope', 'Inheritance ', 'Variants to report']
    df_sf = pd.read_table(sf_path, header=0, dtype=str)[sf_columns]

    return pd.merge(df, df_sf, left_on='Gene.refGene', right_on='Gene', how='left')

df = anno_hgmd(df)
df = anno_sf(df)
# df = df.drop(columns=['gene_y', 'ID_y'])
print(df.duplicated(subset=['sample', 'ID']).sum())
df.drop_duplicates(subset=['sample', 'ID'], inplace=True)
print(len(df))


280
6139


  df.fillna('.', inplace=True)


In [9]:
# Replace '.' with np.nan in 'skipped_ccrs' and 'deleted_ccrs',
# then cast both columns to float.
df['skipped_ccrs'] = df['skipped_ccrs'].replace('.', np.nan).astype(float)
df['deleted_ccrs'] = df['deleted_ccrs'].replace('.', np.nan).astype(float)
# Flag rows where either value exceeds 95. Computed column-wise instead of
# with a row-by-row apply; NaN compares as False, so rows with no value in
# either column are flagged False, and an empty frame yields an empty column.
df['is_95%_CCRs'] = (df['skipped_ccrs'] > 95) | (df['deleted_ccrs'] > 95)

  df['skipped_ccrs'] = df['skipped_ccrs'].replace('.', np.nan).astype(float)
  df['deleted_ccrs'] = df['deleted_ccrs'].replace('.', np.nan).astype(float)


In [10]:
df = df[[
    'DM', 'PriorityScore', 'expected_inheritance', 'Gene.refGene', 'c.HGVS', 'p.HVGS', 
    'Disease', 'maxsplai', 'is_Canonical', 'is_Frameshift', 'is_NMD_at_Canon', 
    'is_eLoF', 'is_10%_truncation', 'is_95%_CCRs', 'variant_id', 
    'clinvar_same_pos', 'clinvar_same_motif', 'sample', 'fa', 'mo', 
    'Disease/Phentyope', 'Inheritance ', 'type', 'ID', 
    'vqslod', 'triodenovo', 'dnmfilter', 'denovogear', 'denovofilter', 'sanger', 
    'SIFT_score', 'SIFT_pred', 'Polyphen2_HVAR_score', 'Polyphen2_HVAR_pred', 
    'PROVEAN_score', 'PROVEAN_pred', 'CADD_phred', 'GERP++_RS', 'gerp++gt2',
    'phyloP7way_vertebrate', 'phastCons7way_vertebrate', 'ExAC_ALL', 'ExAC_AFR',
    'ExAC_AMR', 'ExAC_EAS', 'ExAC_FIN', 'ExAC_NFE', 'ExAC_OTH', 'ExAC_SAS',
    'snp138NonFlagged', 'snp20160620_tommo_exome', 'esp6500siv2_all', 'CHROM',
    'POS', 'REF', 'ALT', 'ENSG', 'ENST', 'Rank', 'csq',
    'Biotype', 'DNA_ID', 'Identified_Gene', 'State', 'GeneSymbol', 
    'ENST_Full', 'ENSG_Full', 'Strand', 'Tag', 'is_Multi', 'SpliceAI', 
    'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL',
    'Int_loc', 'exon_loc', 'ex_up_dist', 'ex_down_dist', 'exon_pos', 
    'exon_splice_site', 'SpliceType', 'Num_ExInt', 'ExInt_INFO', 'prc_exon_loc',
    'Pseudoexon', 'Part_IntRet', 'Part_ExDel', 'Exon_skipping', 'Int_Retention',
    'multiexs', 'Size_Part_ExDel', 'Size_Part_IntRet', 'Size_pseudoexon',
    'Size_IntRet', 'Size_skipped_exon', 'CDS_Length', 'is_Frameshift_Part_ExDel', 
    'is_Frameshift_Part_IntRet', 'is_Frameshift_pseudoexon',
    'is_Frameshift_IntRet', 'is_Frameshift_skipped_exon', 
    'skipped_region', 'deleted_region', 'skipped_ccrs', 'deleted_ccrs',
    'insilico_screening', 'clinvar_screening', 'altsymbol',
    'refseq', 'hgncID', 'omimid', 'Variants to report'
]]

In [11]:
df.loc[df['Identified_Gene'] != '.', 'State'] = 'Identified'
df_solved = df[df['State'] == 'Identified']
df_unsolved = df[df['State'] == 'Undetermined']

print(f"{len(df_solved)} + {len(df_unsolved)}")

1525 + 4614


In [12]:
df.to_pickle(f'./ExcelOutput/Fixed_FINAL_mydata.denovo.all.pkl')

In [12]:
# Save as tsv
df.to_csv(f'./ExcelOutput/Fixed_FINAL_mydata.denovo.all.tsv', sep='\t', index=False, encoding='shift-jis')

# Save as xlsx
df.to_excel(f'./ExcelOutput/Fixed_FINAL_mydata.denovo.all.xlsx', index=False)
df_unsolved.to_excel(f'./ExcelOutput/Fixed_FINAL_mydata.denovo.unsolved.xlsx', index=False)
df_solved.to_excel(f'./ExcelOutput/Fixed_FINAL_mydata.denovo.solved.xlsx', index=False)

In [9]:
len(df_unsolved[df_unsolved['PriorityScore'] >= 3])

67

In [27]:
# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [28]:
# Distribution of Priority scores (Solved cases vs. Unsolved cases) using Plotly

fig = go.Figure()
fig.add_trace(go.Histogram(x=df_unsolved['PriorityScore'], name='Unsolved'))
fig.add_trace(go.Histogram(x=df_solved['PriorityScore'], name='Solved'))
fig.update_layout(barmode='overlay', title='Distribution of Priority scores (Solved cases vs. Unsolved cases)')
fig.update_traces(opacity=0.5)
fig.show()

In [39]:
df_undetermined_patho = df_undetermined[df_undetermined['FinalScore'] >= 4]
df_undetermined_patho.to_excel(f'./undetermined_patho.xlsx', index=False)

In [14]:
df_undetermined_vus = df_undetermined[df_undetermined['FinalScore'] == 3]
df_undetermined_vus.to_excel(f'./undetermined_vus.xlsx', index=False)

In [26]:
# Solved cases whose PriorityScore is 4 or higher.
df_solved_patho = df_solved[df_solved['PriorityScore'] >= 4]

In [28]:
df_solved_patho.to_csv(f'./ExcelOutput/mydata.solved.patho.tsv', sep='\t', index=False, encoding='shift-jis')

In [30]:
df_solved_patho.to_excel(f'./ExcelOutput/mydata.solved.patho.xlsx', index=False)


In [19]:
df['FinalScore'].value_counts()

FinalScore
1    8835
0    4711
2     933
3     859
4      92
5      80
6      65
7      21
8       8
9       1
Name: count, dtype: int64

In [27]:
cars_df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/imports-85.csv')

categorical_dimensions = ['body-style', 'drive-wheels', 'fuel-type']
dimensions = [dict(values=cars_df[label], label=label) for label in categorical_dimensions]

# Build colorscale
color = np.zeros(len(cars_df), dtype='uint8')
colorscale = [[0, 'gray'], [0.33, 'gray'],
              [0.33, 'firebrick'], [0.66, 'firebrick'],
              [0.66, 'blue'], [1.0, 'blue']]
cmin = -0.5
cmax = 2.5

In [28]:
fig = go.FigureWidget(
    data=[go.Scatter(x=cars_df.horsepower, y=cars_df['highway-mpg'],
                marker={'color': color, 'cmin': cmin, 'cmax': cmax,
                        'colorscale': colorscale, 'showscale': True,
                        'colorbar': {'tickvals': [0, 1, 2], 'ticktext': ['None', 'Red', 'Blue']}},
                     mode='markers'),

      go.Parcats(domain={'y': [0, 0.4]}, dimensions=dimensions,
                   line={'colorscale': colorscale, 'cmin': cmin,
                   'cmax': cmax, 'color': color, 'shape': 'hspline'})]
)

fig.show()

In [23]:
cars_df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
201,-1,95.0,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
202,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
203,-1,95.0,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


In [18]:
# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import widgets

In [None]:
df 

In [33]:
# Plot the distribution of the final score using bar plot with log scale
# x-axis: Final score (all scores are shown)
# y-axis: Number of variants

fig = px.bar(tb, x=tb.index, y=tb.values,
                labels={'x': 'Final score', 'y': 'Number of variants'},
                title='Priority score distribution',
                log_y=True)
fig.update_layout(
    xaxis=dict(
        tickmode='linear',
        tick0=0,
        dtick=1
    )
)
# X-axis title
fig.update_xaxes(title_text='Priority score')
# Y-axis title
fig.update_yaxes(title_text='Number of variants (log scale)')

fig.show()

# Saving the html file
fig.write_html('final_score_distribution.html')


In [40]:
# Plot the ratio between identified and undetermined variants
print(len(df_solved), len(df_undetermined))

# Pie chart
# color as soft color palette
colors = sns.color_palette('pastel')[0:2]
fig = go.Figure(data=[go.Pie(labels=['Identified', 'Undetermined'], 
                             values=[len(df_solved), len(df_undetermined)])])
fig.update_traces(marker=dict(colors=colors))
fig.update_layout(title='Ratio between identified and undetermined variants')

# Add number of variants
fig.add_annotation(
    x=0.2, y=0.5, showarrow=False,
    text=f'Identified: {len(df_solved)}<br>Undetermined: {len(df_undetermined)}',
    font=dict(size=20)
)

# legend location
fig.update_layout(legend=dict(
    yanchor='top',
    y=0.1,
    xanchor='left',
    x=0.3
))

fig.show()

# Saving the html file
fig.write_html('identified_undetermined_ratio.html')

3892 11713


In [44]:
import codecs
with codecs.open("mydata.solved.patho.matched.txt", "r", "Shift-JIS", "ignore") as file:
    df = pd.read_table(file, delimiter="\t")

188


In [46]:
# plot the distribution of the final score in identified variants
tb2 = df_identified['FinalScore'].value_counts()
tb2 = tb2.sort_index()

# x-axis: Final score (all scores are shown) log scale
# y-axis: Number of variants
# color: Red
fig = px.bar(tb2, x=tb2.index, y=tb2.values,
                labels={'x': 'Final score', 'y': 'Number of variants'},
                title='Priority score distribution in identified variants',
                log_y=True)
fig.update_layout(
    xaxis=dict(
        tickmode='linear',
        tick0=0,
        dtick=1
    )
)

# X-axis title
fig.update_xaxes(title_text='Priority score')

# Y-axis title
fig.update_yaxes(title_text='Number of variants (log scale)')
fig.show()

# save html file
fig.write_html('final_score_distribution_identified.html')

In [26]:
df_patho = df[df['FinalScore'] >= 4]

In [27]:
df_patho.to_csv(f'./mydata.final.patho.tsv', sep='\t', index=False, encoding='shift-jis')

### Note

##### 次のセルは，SpliceAIの結果をマージした時のもの．

In [None]:
# Merge two header-less SpliceAI result VCFs into one file.
_vcf_cols = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']

df2 = pd.read_table(
    '/work/Github/MyProjects/SplicingScreening/data/ResultsPredictionTools/results_spliceai_raw_maxdistance.nonheader.final.vcf',
    sep='\t', dtype=str, header=None)
df2.columns = _vcf_cols
df2['variant_id'] = df2.apply(preprocess.generate_variant_id_col, axis=1)
# Drop records whose INFO field is '.'.
df2 = df2[df2['INFO'] != '.']

df3 = pd.read_table('/work/Github/MyProjects/SplicingScreening/data/ResultsPredictionTools/reanalysis.splai.NMfixed.parsed.vcf',
                    sep='\t', dtype=str, header=None)
df3.columns = _vcf_cols
df3['variant_id'] = df3.apply(preprocess.generate_variant_id_col, axis=1)

df4 = pd.concat([df2, df3], axis=0)
# Inspection only: the duplicated rows are computed but not assigned, so df4
# still contains them.
# NOTE(review): confirm whether duplicates were meant to be dropped here.
df4[df4.duplicated(subset='variant_id', keep='last')].sort_values(by='variant_id')
# The generated variant_id takes the place of the VCF ID column.
df4 = df4[['CHROM', 'POS', 'variant_id', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']]

# Written without a header row: the loading cell reads this file with
# header=None, so a header line would be parsed as a data record.
df4.to_csv('results_spliceai_mydata.vcf', sep='\t', index=False, header=False)

##### Creating DBs


In [None]:
################################################################################
##                                Creating DBs                                ##
################################################################################

import gffutils
import gffutils.pybedtools_integration

## Function
def generate_intoron_gtf(db: gffutils.FeatureDB, output: str) -> None:
    """Derive 'intron' features from the 'exon' features of *db* and save them to *output*."""
    intron_features = db.create_introns(
        exon_featuretype='exon',
        new_featuretype='intron',
        merge_attributes=True,
        numeric_sort=True,
    )
    # Convert the features to a pybedtools object and write it out.
    gffutils.pybedtools_integration.to_bedtool(intron_features).saveas(output)


## Set path to GTF files for creating DBs
gencode_gtf = '/resources/GENCODE/gencode.v43lift37.annotation.gtf.gz'
intron_gtf = '/resources/GENCODE/gencode.v43lift37.annotation.intron.gtf.gz'

## Set path to output DBs
db_anno_gencode = '/work/resources/gencode.v43lift37.annotation.gtf.db'
db_anno_intron = '/work/resources/gencode.v43lift37.annotation.intron.gtf.db'

#1. Create DB from GENCODE GTF
db = gffutils.create_db(gencode_gtf, db_anno_gencode,
                        disable_infer_genes=True, 
                        disable_infer_transcripts=True,
                        keep_order=True)

db = gffutils.FeatureDB(db_anno_gencode)

#2. Create intron information file as GTF
generate_intoron_gtf(db, intron_gtf)

#3. Create intron DB from above GTF
# NOTE(review): id_spec_dict is built here but is not passed to create_db
# below, so it has no effect — confirm whether it was meant to be supplied.
id_spec_dict = {'gene': 'gene_id', 'transcript': 'transcript_id',
                'exon_number': 'intron_number'}
# Build the intron DB from the intron GTF written in step 2, with gene and
# transcript inference disabled and merge_strategy="merge".
db_intron = gffutils.create_db(intron_gtf, db_anno_intron, 
                               disable_infer_genes=True, 
                               disable_infer_transcripts=True, 
                               keep_order=True,
                               merge_strategy="merge")