##### Import package

In [1]:
import os
import re
import sys
from pathlib import Path

import gffutils
import numpy as np
import pandas as pd
import pysam
from pandarallel import pandarallel
from tqdm import tqdm

import plotly.graph_objects as go
from ipywidgets import widgets

### Import my modules in libs directory
try: 
    __file__
    sys.path.append(os.path.join(os.path.dirname(__file__)))
except NameError:
    Path().resolve()
    sys.path.append(os.path.join(Path().resolve(), '../../'))

from libs import utils, preprocess, variantfilter, posparser, splaiparser
from libs import predeffect, scoring
from libs import anno_spliceai, anno_clinvar
from libs.deco import print_filtering_count

### Logging setup
from logging import getLogger, config
import yaml
parent_directory = os.path.dirname(os.path.dirname('__file__'))
config_path: str = os.path.join(parent_directory, '../../config/logging.yaml')
with open(config_path, 'r') as f:
    config.dictConfig(yaml.safe_load(f))
logger = getLogger(__name__)


########   Initialize and setup pandas methods   ########
pandarallel.initialize(nb_workers=os.cpu_count()-1, progress_bar=False, 
                       verbose=2, use_memory_fs=False) 
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp' 
tqdm.pandas()

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [None]:
########   Import genocode DBs (exon DB and intron DB)   ########
try:
    db_anno_gencode = '../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)
except ValueError:
    db_anno_gencode = '/resources/DBs/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '/resources/DBs/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)

########   Import TSVs, VCFs and GFFs   ########
### Downloaded database files
#1. Clinvar variants (BED format)
clinvar_file = '../../Resources/03_ClinVar/variant_summary.snv.grch37.germline.criteria.sort.bed.gz'
tbx_clinvar = pysam.TabixFile(clinvar_file)
#2. GENCODE file (GFF3 format)
gencode_gff = '../../Resources/05_GENCODE_v43lift37/gencode.v43lift37.annotation.sort.gff3.gz'
tbx_anno = pysam.TabixFile(gencode_gff)
#3. Canonical transcripts table
# canonlist = '../../CanonicalTranscripts_COMP/CanonicalTranscripts.exoncount.tsv'
canonlist = '../../Resources/01_CanonicalTranscripts/CanonicalTranscripts.exoncount.tsv'
#4. CCRs (BED format)
autoccr = '../../Resources/04_CCRs/ccrs.autosomes.v2.20180420.sort.bed.gz'
xccr = '../../Resources/04_CCRs/ccrs.xchrom.v2.20180420.sort.bed.gz'


## Thresholds configuration
thresholds_SpliceAI_parser: dict = {
    'TH_min_sALDL': 0.02, 'TH_max_sALDL': 0.2, 
    'TH_min_sAGDG': 0.01, 'TH_max_sAGDG': 0.05,
    'TH_min_GExon': 25, 'TH_max_GExon': 500,
    'TH_sAG': 0.2, 'TH_sDG': 0.2
    }

############ Functions for cleansing and adjusting HGMD data ############
def adjust_enst_for_hgmd(df: pd.DataFrame) -> pd.DataFrame:
    result = df.replace(
        {'gene': {'PRKN': 'PARK2', 'MICOS13': 'C19orf70'},
         'ENST': {'ENST00000263201': 'ENST00000437685'},
         'ENST_Full': {'ENST00000263201.7_4': 'ENST00000437685.6_1',
                       'ENST00000361547.7_7': 'ENST00000361547.7_8',
                       'ENST00000609375.1_7': 'ENST00000347364.7_5',
                       'ENST00000649912.1_4': 'ENST00000347364.7_5'}})
    return result

@print_filtering_count
def remove_unkown_refalt(df: pd.DataFrame) -> pd.DataFrame:
    result = df.dropna(subset='REF', axis=0)
    return result

## loading original file and setting output base path
# mydata = './original.snpeff.state.disease.filtered.splai.tsv'
mydata = './original.snpeff.state.disease.identifiedgene.filtered.splai.tsv'
results_splai = './results_spliceai_mydata.vcf'

### Loading files as pandas DataFrame ###
df = pd.read_table(mydata, sep='\t', dtype=str)
output_base: str = utils.configure_output(mydata)
vcf_columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
splai = pd.read_table(results_splai, sep='\t', header=None, names=vcf_columns)
canon = pd.read_table(canonlist, sep='\t', header=0)


In [4]:
print(len(df))
df = variantfilter.extract_denovo(df)
print(len(df))

16353
Start extract_denovo
Filtering : 16353 --> 9944

9944


In [9]:
df.rename(columns={'csq': 'Consequence'}, inplace=True)

In [20]:
df['Consequence'].value_counts()

Consequence
missense_variant                            5203
synonymous_variant                          2422
intron_variant                              1181
splice_region_variant&intron_variant         467
stop_gained                                  225
missense_variant&splice_region_variant       154
splice_donor_variant&intron_variant           74
splice_region_variant&synonymous_variant      65
non_coding_transcript_exon_variant            62
splice_acceptor_variant&intron_variant        53
start_lost                                    18
stop_gained&splice_region_variant             14
initiator_codon_variant                        3
stop_retained_variant                          1
splice_region_variant                          1
stop_lost                                      1
Name: count, dtype: int64

In [None]:
################################################################################
#### Annotation step  ####

#1.   Classify 'Canonical' splice site or 'Non-canonical' splice site
logger.info('Classify "Canonical" splice site or "Non-canonical" splice site...')
df = posparser.classifying_canonical(df)

#2.   Calculate exonic positions
logger.info('Calculating exonic positions...')
#2-1. Generate 'exonic upstream distance and exonic downstream distance
df['exon_loc'] = df.progress_apply(posparser.calc_exon_loc, tabixfile=tbx_anno, 
                          enstcolname='ENST', axis=1)
df = pd.concat([df, df['exon_loc'].str.split(':', expand=True)], axis=1)
df.rename(columns={0: 'ex_up_dist', 1: 'ex_down_dist'}, inplace=True)


#2-2. Select minimum distance from upstream distance and downstream distance
df['exon_pos'] = df.parallel_apply(posparser.select_exon_pos, axis=1)


#2-3. Decision exonic splice sites (1 nt in acceptor site or 3 nts on Donor site)
df['exon_splice_site'] = df.parallel_apply(posparser.extract_splicing_region, axis=1)

#3.   Additional Splicing information
logger.info('Annotating splicing information...')
#3-1. Annotate splicing type ('Exonic Acceptor' etc.)
df['SpliceType'] = df.parallel_apply(posparser.select_donor_acceptor, axis=1)

#3-2. Annotate rank of exon or intron
df['Num_ExInt'] = df.progress_apply(posparser.calc_ex_int_num, 
                                    db=db, db_intron=db_intron, axis=1)


#5.   Annotate ClinVar varaints interpretations
logger.info('Annotating ClinVar varaints interpretations...')
df['clinvar_same_pos'] = df.progress_apply(anno_clinvar.anno_same_pos_vars, 
                                           tabixfile=tbx_clinvar, axis=1)
df['clinvar_same_motif'] = df.progress_apply(anno_clinvar.anno_same_motif_vars, 
                                             tabixfile=tbx_clinvar, axis=1)

#6.   Parising SpliceAI results (For PVS1 decision tree)
logger.info('Parsing SpliceAI results...')
## Thresholds configuration
TH_min_sALDL, TH_max_sALDL = 0.02, 0.2
TH_min_sAGDG, TH_max_sAGDG = 0.01, 0.05
TH_min_GExon, TH_max_GExon = 25, 500
TH_sAG, TH_sDG = 0.2, 0.2


#6-1. Annotate Exon/Intron position information
df['ExInt_INFO'] = df.progress_apply(splaiparser.calc_exint_info, 
                                     db=db, 
                                     db_intron=db_intron, 
                                     axis=1)

#6-2. Relative exon location
df['prc_exon_loc'] = df.parallel_apply(posparser.calc_prc_exon_loc, axis=1)


#6-3. Predict splicing effects
df['Pseudoexon'] = df.progress_apply(
    splaiparser.pseudoexon_activation,
    thresholds=thresholds_SpliceAI_parser, 
    db_intron=db_intron,
    axis=1)

df['Part_IntRet'] = df.parallel_apply(
    splaiparser.partial_intron_retention,
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

df['Part_ExDel'] = df.parallel_apply(
    splaiparser.partial_exon_deletion,
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

df['Exon_skipping'] = df.parallel_apply(
    splaiparser.exon_skipping, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)
                                        
df['Int_Retention'] = df.parallel_apply(
    splaiparser.intron_retention, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

df['multiexs'] = df.parallel_apply(
    splaiparser.multi_exon_skipping, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)


#7.   Annotate aberrant splicing size (bp)
logger.info('Annotating aberrant splicing size (bp)...')
#7-1. Annotate size of 
df['Size_Part_ExDel'] = df.parallel_apply(
    splaiparser.anno_partial_exon_del_size, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

#7-3. Annotate size of partial intron retention
df['Size_Part_IntRet'] = df.parallel_apply(
    splaiparser.anno_partial_intron_retention_size, 
    thresholds=thresholds_SpliceAI_parser,
    axis=1)

#7-2. Annotate size of pseudoexon
df['Size_pseudoexon'] = df.parallel_apply(
    splaiparser.anno_gained_exon_size, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

#7-4. Annotate size of intron retention
df['Size_IntRet'] = df.parallel_apply(
    splaiparser.anno_intron_retention_size, 
    thresholds=thresholds_SpliceAI_parser,
    axis=1)

#7-5. Annotate size of exon skipping
df['Size_skipped_exon'] = df.parallel_apply(
    splaiparser.anno_skipped_exon_size, 
    thresholds=thresholds_SpliceAI_parser,
    axis=1)

#8.   Evaluate splicing effects
logger.info('Predicting CDS change...')
#8-1. Predict CDS change
df['CDS_Length'] = df.progress_apply(predeffect.calc_cds_len, db=db, axis=1)
df['is_10%_truncation'] = df.progress_apply(predeffect.calc_cds_len_shorten, axis=1)

#8-2. Determine if the gene is included in eLoFs genes
df['is_eLoF'] = df.parallel_apply(predeffect.elofs_judge, axis=1)

#8-3. Determine causing NMD or not
df['is_NMD_at_Canon'] = df.parallel_apply(predeffect.nmd_judge, axis=1)

#8-4. Frame check
df['is_Frameshift_Part_ExDel'] = df['Size_Part_ExDel'].parallel_apply(
    predeffect.frame_check)
df['is_Frameshift_Part_IntRet'] = df['Size_Part_IntRet'].parallel_apply(
    predeffect.frame_check)
df['is_Frameshift_pseudoexon'] = df['Size_pseudoexon'].parallel_apply(
    predeffect.frame_check)
df['is_Frameshift_IntRet'] = df['Size_IntRet'].parallel_apply(
    predeffect.frame_check)
df['is_Frameshift_skipped_exon'] = df['Size_skipped_exon'].parallel_apply(
    predeffect.frame_check)
df['is_Frameshift'] = df[['is_Frameshift_Part_ExDel', 
                          'is_Frameshift_Part_IntRet', 
                          'is_Frameshift_pseudoexon', 
                          'is_Frameshift_IntRet', 
                          'is_Frameshift_skipped_exon'
                          ]].any(axis=1)


#9.   CCRs
logger.info('Annotating CCRs info...')

#9-1. Annotate truncated regions 
df['skipped_region'] = df.parallel_apply(
    splaiparser.anno_skipped_regions, axis=1)

df['deleted_region'] = df.parallel_apply(
    splaiparser.anno_deleted_regions, 
    thresholds=thresholds_SpliceAI_parser, axis=1)

#9-2. Intersect with CCRs
logger.info('Annotate CCR score')
df = predeffect.anno_ccr_score(df)

df.to_pickle('mydata.tmp_20241023.pkl')
df = pd.read_pickle('mydata.tmp_20241023.pkl')

In [7]:
### Don't touch the original data ###
### Save the result as pickle file before scoring ###
df.to_pickle('./pickles/mydata_prescore_Gene_HGNC.pkl')

## ここから下で解析する！！

In [83]:
df = pd.read_pickle('./pickles/mydata_prescore_Gene_HGNC.pkl')

In [54]:
#10.   Scoring
# df = pd.read_pickle('./pickles/mydata_prescore.pkl')

df = pd.read_pickle(f'./pickles/mydata.scored.pickle')
df['is_Canonical'] = df['is_Canonical'].replace({'True': "Yes", 'False': "No"}).copy()

from libs.scoring import Scoring
logger.info('Annotating Screening scores...')
df = df.astype({'maxsplai': 'float', 'vqslod': 'float', 
                'denovogear': float, 'triodenovo': float, 'dnmfilter': float})

# Recalibrate the thresholds for De novo detection
df = df[df['vqslod'] > -7.18]
df = df[((df['denovogear'] > 0.02) | (df['denovogear'].isnull()))
        & ((df['triodenovo'] > 5.72) | (df['triodenovo'].isnull()))
        & ((df['dnmfilter'] > 0.196) | (df['dnmfilter'].isnull()))]

# df = df[df['vqslod'] > -7.18]
# df = df[((df['denovogear'] > 0.02) 
#         & (df['triodenovo'] > 5.72)
#         & (df['dnmfilter'] > 0.196))]

sccore_ths = {'clinvar_same_pos': 2,
             'clinvar_same_motif': 1,
             'clinvar_else': 0,
             'non_canon_splai_lte_0.1_outside': -1,
             'non_canon_splai_lte_0.1_other': 0,
             'non_canon_splai_bet_0.1_0.2': 2,
             'non_canon_splai_gte_0.2': 3,
             'canon_strong': 4, 
             'canon_moderate': 3, 
             'frameshift_nmd_eloF': 5, 
             'frameshift_nmd_not_eloF': 1,
             'canon_splai_lte_0.1': -1,
             'canon_splai_bet_0.1_0.2': 0,
             'canon_splai_gte_0.2': 2}

thsdict = {
    'opti': 
            {'clinvar_same_pos': 3,
             'clinvar_same_motif': 2,
             'clinvar_else': 0,
             'non_canon_splai_lte_0.1_outside': -4,
             'non_canon_splai_lte_0.1_other': -3,
             'non_canon_splai_bet_0.1_0.2': 3,
             'non_canon_splai_gte_0.2': 4,
             'canon_strong': 5, 
             'canon_moderate': 4, 
             'frameshift_nmd_eloF': 6, 
             'frameshift_nmd_not_eloF': 4,
             'canon_splai_lte_0.1': -4,
             'canon_splai_bet_0.1_0.2': -1,
             'canon_splai_gte_0.2': 0},
}

# original
# sccore_ths = {'clinvar_same_pos': 2,     
#                  'clinvar_same_motif': 1,
#                  'clinvar_else': 0,
#                  'non_canon_splai_lte_0.1_outside': -3,
#                  'non_canon_splai_lte_0.1_other': -2,
#                  'non_canon_splai_bet_0.1_0.2': 1,
#                  'non_canon_splai_gte_0.2': 2,
#                  'canon_strong': 6, 
#                  'canon_moderate': 5, 
#                  'frameshift_nmd_eloF': 7, 
#                  'frameshift_nmd_not_eloF': 3,
#                  'canon_splai_lte_0.1': -3,
#                  'canon_splai_bet_0.1_0.2': -1,
#                  'canon_splai_gte_0.2': 0}

scoring = Scoring(ths=sccore_ths)

df['insilico_screening'] = df.parallel_apply(scoring.insilico_screening, axis=1)
df['clinvar_screening'] = df.parallel_apply(scoring.clinvar_screening, axis=1)
# df['PriorityScore'] = df.parallel_apply(scoring.calc_priority_score, axis=1)
# df.to_pickle(f'./pickles/mydata.scored_gene_fixed_HGNC.pickle')
# df['PriorityScore'] = df.parallel_apply(scoring.calc_priority_score, axis=1)
df = scoring.calc_priority_score2(df)

print(df['PriorityScore'].value_counts())

2024/10/24 09:27:14 [INFO   ] (__main__) - Annotating Screening scores...
PriorityScore
0    5348
2     427
3     339
1     208
5      44
4      21
7      13
8       3
6       2
9       1
Name: count, dtype: int64


In [55]:
df.replace({'\u2161': 'II', '\u2163': 'IV', '\u2162': 'III'}, inplace=True, regex=True)
df.fillna('.', inplace=True)
df = df.rename(columns={'gene_x': 'gene', 'ID_x': 'ID'}).copy()
path_to_hgmd = '../../Resources/07_HGMD_GeneBasedInfo/HGMD_GeneBasedInfo_2024.1.tsv.gz'
# path_to_hgmd = '../../Resources/07_HGMD_GeneBasedInfo/HGMD_GeneBasedInfo_2023.4.tsv.gz'

def anno_hgmd(df: pd.DataFrame) -> pd.DataFrame:
    hgmd = pd.read_table(path_to_hgmd, header=0, dtype=str)
    hgmd = hgmd[['gene', 'altsymbol', 'refseq', 
                 'expected_inheritance', 'hgncID', 'omimid', 'DM']]
    hgmd = hgmd.astype({'DM': 'float64'})
    df = pd.merge(
        df, hgmd, left_on='Gene.refGene', right_on='gene', how='left'
        )

    return df

def anno_sf(df: pd.DataFrame) -> pd.DataFrame:
    df_sf = pd.read_table('../../Resources/ACMG_SFv3.2.txt', header=0, dtype=str)
    df_sf = df_sf[['Gene', 'Disease/Phentyope', 'Inheritance ', 'Variants to report']]
    df = pd.merge(df, df_sf, left_on='Gene.refGene', right_on='Gene', how='left'
        )

    return df

df = anno_hgmd(df)
df = anno_sf(df)
# df = df.drop(columns=['gene_y', 'ID_y'])
print(df.duplicated(subset=['sample', 'ID']).sum())
df = df.drop_duplicates(subset=['sample', 'ID']).copy()
print(len(df))
# Replace '.' to np.nan in 'skipped_ccrs' and 'deleted_ccrs'
# Change type to float in 'skipped_ccrs' and 'deleted_ccrs'
df['skipped_ccrs'] = df['skipped_ccrs'].replace('.', np.nan).astype(float).copy()
df['deleted_ccrs'] = df['deleted_ccrs'].replace('.', np.nan).astype(float)
df['is_95%_CCRs'] = df.apply(lambda row: True if (row['skipped_ccrs'] > 95 or row['deleted_ccrs'] > 95) else False, axis=1)


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '.' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.



280
6139



Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [56]:
df = df[[
    'DM', 'PriorityScore', 'expected_inheritance', 'Gene.refGene', 'c.HGVS', 'p.HVGS', 
    'Disease', 'maxsplai', 'is_Canonical', 'is_Frameshift', 'is_NMD_at_Canon', 
    'is_eLoF', 'is_10%_truncation', 'is_95%_CCRs', 'variant_id', 
    'clinvar_same_pos', 'clinvar_same_motif', 'sample', 'fa', 'mo', 
    'Disease/Phentyope', 'Inheritance ', 'type', 'ID', 
    'vqslod', 'triodenovo', 'dnmfilter', 'denovogear', 'denovofilter', 'sanger', 
    'SIFT_score', 'SIFT_pred', 'Polyphen2_HVAR_score', 'Polyphen2_HVAR_pred', 
    'PROVEAN_score', 'PROVEAN_pred', 'CADD_phred', 'GERP++_RS', 'gerp++gt2',
    'phyloP7way_vertebrate', 'phastCons7way_vertebrate', 'ExAC_ALL', 'ExAC_AFR',
    'ExAC_AMR', 'ExAC_EAS', 'ExAC_FIN', 'ExAC_NFE', 'ExAC_OTH', 'ExAC_SAS',
    'snp138NonFlagged', 'snp20160620_tommo_exome', 'esp6500siv2_all', 'CHROM',
    'POS', 'REF', 'ALT', 'ENSG', 'ENST', 'Rank', 
    # 'Consequence',
    'csq',
    'Biotype', 'DNA_ID', 'Identified_Gene', 'State', 'GeneSymbol', 
    'ENST_Full', 'ENSG_Full', 'Strand', 'Tag', 'is_Multi', 'SpliceAI', 
    'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL',
    'Int_loc', 'exon_loc', 'ex_up_dist', 'ex_down_dist', 'exon_pos', 
    'exon_splice_site', 'SpliceType', 'Num_ExInt', 'ExInt_INFO', 'prc_exon_loc',
    'Pseudoexon', 'Part_IntRet', 'Part_ExDel', 'Exon_skipping', 'Int_Retention',
    'multiexs', 'Size_Part_ExDel', 'Size_Part_IntRet', 'Size_pseudoexon',
    'Size_IntRet', 'Size_skipped_exon', 'CDS_Length', 'is_Frameshift_Part_ExDel', 
    'is_Frameshift_Part_IntRet', 'is_Frameshift_pseudoexon',
    'is_Frameshift_IntRet', 'is_Frameshift_skipped_exon', 
    'skipped_region', 'deleted_region', 'skipped_ccrs', 'deleted_ccrs',
    'insilico_screening', 'clinvar_screening', 'altsymbol',
    'refseq', 'hgncID', 'omimid', 'Variants to report'
]]

In [57]:
df.loc[df['Identified_Gene'] != '.', 'State'] = 'Identified'
df_solved = df[df['State'] == 'Identified']
solved_case_ids = [
    'Sample_4143', 'Sample_8803', 'Sample_17110', 'Sample_9768', 'Sample_16992',
    'Sample_16970', 'Sample_17367', 'Sample_4938', 'Sample_11555']
df.loc[df['sample'].isin(solved_case_ids), 'State'] = 'Identified'
df_unsolved = df[df['State'] == 'Undetermined']

print(f"{len(df_solved)} + {len(df_unsolved)}")
print(len(df_unsolved['sample'].unique().tolist()))

1525 + 4250
1330


In [58]:
# For parallel coordinates plot
df_bk = df.copy()

In [54]:
df = df_bk.copy()

In [59]:
df = df.loc[df['State'] == 'Undetermined']
print(len(df))
df.rename(columns={'is_eLoF': 'eLoF', 'PriorityScore': 'Priority Score', 
                   'is_Canonical': 'Canonical splice cite'}, inplace=True)

4250


In [60]:
cutoff = 4
n_pos: int = len(df.loc[df['Priority Score'] >= cutoff])
n_neg: int = len(df.loc[df['Priority Score'] < cutoff])
print(f"Screen Positive: {n_pos}, Screen Negative: {n_neg}")

def add_screening_result_col(x) -> str:
    if x >= cutoff:
        return f"Positive (n = {n_pos})"
    else:
        return f"Negative (n = {n_neg})"
    
def change_boolen_to_str(x) -> str:
    if x == 'true':
        return 'eLoF gene'
    else:
        return 'Non-eLoF gene'
    
df['Screening Result'] = df['Priority Score'].apply(add_screening_result_col)
df['eLoF'] = df['eLoF'].replace({True: 'eLoF gene', False: 'Non-eLoF gene'})
df['Canonical splice cite'] = df['Canonical splice cite'].replace({'True': 'Canonical', 'False': 'Non-canonical'})

Screen Positive: 37, Screen Negative: 4213


In [61]:
### Add Sample_IDs that have been solved to the true_list below 
true_list = ['Sample_20526', 'Sample_17367', 'Sample_5100', 'Sample_21599',
            'Sample_11750', 'Sample_7528', 'Sample_2288', 'Sample_17367',
            'Sample_11873', 'Sample_6024', 'Sample_21407', 'Sample_7605',
            'Sample_11467', 'Sample_17483', 'Sample_8021', 'Sample_5037',
            'Sample_12927', 'Sample_20526', 'Sample_17367', 'Sample_22460',
            'Sample_13765', 'Sample_52', 'Sample_5766', 'Sample_16260',
            'Sample_5766', 'Sample_3986', 'Sample_13920', 'Sample_22025',
            'Sample_13635', 'Sample_7723', 'Sample_19560', 'Sample_8928',
            'Sample_17579', 'Sample_20287', 'Sample_12988', 'Sample_9869', 
            'Sample_20078', 'Sample_21789', 'Sample_21156', 'Sample_19227', 
            'Sample_17367', 'Sample_14452', 'Sample_11444', 'Sample_10713', 
            'Sample_9091', 'Sample_8436', 'Sample_4752', 'Sample_372', 
            'Sample_20468', 'Sample_9043', 'Sample_6024', 'Sample_21206',
            'Sample_19880', 'Sample_13387', 'Sample_12988', 'Sample_12291', 
            'Sample_11555', 'Sample_4938', 'Sample_4413', 'Sample_2325']

len(true_list)

# Remove above samples from dataframe
df = df.loc[df['sample'].isin(true_list)]

for s in list(df.loc[df['Screening Result'] == f"Positive (n = {n_pos})", 'sample']):
    if s not in set(true_list):
        print(s)

In [72]:
df[['sample']]

Unnamed: 0,sample
200,Sample_21156
811,Sample_8021
888,Sample_6024
929,Sample_14452
1302,Sample_4752
1449,Sample_21407
1494,Sample_7528
1764,Sample_20526
1923,Sample_22460
2027,Sample_5100


In [62]:
df.to_excel('20241023_screening_result.xlsx', index=False)

In [63]:
# dimensions = [dict(values=cars_df[label], label=label) for label in categorical_dimensions]
categorical_dimensions = ["Canonical splice cite", "eLoF", "Priority Score", "Screening Result"]
dimensions = [dict(values=df[label], label=label) for label in categorical_dimensions]

# Build colorscale
color = np.zeros(len(df), dtype='uint8')
colorscale = [[0, 'lightgray'], [1, 'red']]

In [64]:
# total number of unsoloved cases
len(df['sample'].unique().tolist())

50

In [65]:
# Build figure as FigureWidget
fig = go.FigureWidget(
    data=[
        go.Scatter(
            x=df['Priority Score'], y=df['maxsplai'],
            marker={'color': 'gray'}, mode='markers', selected={'marker': {'color': 'firebrick'}},
            unselected={'marker': {'opacity': 0.3}}), 
        go.Parcats(
            domain={'y': [0, 0.5]}, 
            dimensions=dimensions,
            line={'colorscale': colorscale, 'cmin': 0,'cmax': 1, 'color': color, 'shape': 'hspline'})
    ])

fig.update_layout(
        height=800, xaxis={'title': 'Priority Score', 'tickvals': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]},
        yaxis={'title': 'Maximum SpliceAI ∆score', 'domain': [0.6, 1]},
        dragmode='lasso', hovermode='closest')


def update_color(trace, points, state):
    # Update scatter selection
    fig.data[0].selectedpoints = points.point_inds

    # Update parcats colors
    new_color = np.zeros(len(df), dtype='uint8')
    new_color[points.point_inds] = 1
    fig.data[1].line.color = new_color

# Register callback on scatter selection...
fig.data[0].on_selection(update_color)
# and parcats click
fig.data[1].on_click(update_color)

# Update fig size
fig.update_layout(width=1200, height=1000)

# Save as html
# fig.write_html('FigureS3.html')

FigureWidget({
    'data': [{'marker': {'color': 'gray'},
              'mode': 'markers',
              'selected': {'marker': {'color': 'firebrick'}},
              'type': 'scatter',
              'uid': 'bb07bf32-322d-48ad-926f-6fa035f542cb',
              'unselected': {'marker': {'opacity': 0.3}},
              'x': array([0, 0, 3, ..., 5, 0, 1]),
              'y': array([0.01, 0.04, 0.99, ..., 0.75, 0.02, 0.  ])},
             {'dimensions': [{'label': 'Canonical splice cite',
                              'values': array(['No', 'No', 'Yes', ..., 'No', 'No', 'No'], dtype=object)},
                             {'label': 'eLoF',
                              'values': array(['Non-eLoF gene', 'eLoF gene', 'Non-eLoF gene', ..., 'eLoF gene',
                                               'eLoF gene', 'eLoF gene'], dtype=object)},
                             {'label': 'Priority Score', 'values': array([0, 0, 3, ..., 5, 0, 1])},
                             {'label': 'Screening Resul

In [108]:
fig.show()

In [19]:
df2 = pd.read_csv("screenpositives.txt", sep='\t', header=0)


In [78]:
df2.head()

Unnamed: 0,DM,Priority Score,expected_inheritance,Gene.refGene,c.HGVS,p.HVGS,Disease,maxsplai,Canonical splice cite,is_Frameshift,...,skipped_ccrs,deleted_ccrs,insilico_screening,clinvar_screening,altsymbol,refseq,hgncID,omimid,Variants to report,Screening Result
200,,5,,SFT2D2,c.444-2A>C,.,"Epilepsy ,Epilepsy",0.99,Yes,False,...,,56.218302,5,0,,,,,,Positive (n = 37)
811,443.0,4,AR,CDH23,c.2304+6T>G,.,Axial_SMD,0.58,No,True,...,,,3,1,CDHR23|PITA5|USH1D,NM_022124.6,13733.0,605516.0,,Positive (n = 37)
888,317.0,5,AR,DYNC2H1,c.1078C>T,p.Arg360*,ASD,0.24,No,False,...,,,3,2,ATD3|DHC1b|DHC2|DNCH2|DYH1B|hdhc11|SRPS2B|SRTD3,NM_001080463.2,2962.0,603297.0,,Positive (n = 37)
929,0.0,5,UNK,USP2,c.826-2A>G,.,"Lennox-Gastaut症候群,Lennox-Gastaut症候群",1.0,Yes,False,...,,81.947918,5,0,UBP41|USP9,NM_004205.5,12618.0,604725.0,,Positive (n = 37)
1302,0.0,5,UNK,CCDC63,c.489+1G>A,.,Adrenal_failure,0.95,Yes,False,...,80.385564,,5,0,ODA5,NM_152591.3,26669.0,617969.0,,Positive (n = 37)


In [81]:

df = df.loc[df['Priority Score'] >=cutoff]
df2 = df
df2 = df2.rename(columns={'sample': 'Case'})


In [79]:

df2['Removed'] = df2['Case'].apply(
    lambda x: 'Removed' if f"Sample_{x}" not in set(df['sample']) else 'Remain'
    )
df2 = df2.loc[df2['Removed'] == 'Remain']

# Add reported case interpretation
# df2.loc[df2['Case'] == 13920, 'Final interpretation'] = 'P (Reported)'


In [None]:

# counting
n_knwon = len(df2.loc[df2['Known disease genes'] == 'Known'])
n_unknown = len(df2.loc[df2['Known disease genes'] == 'Unknown'])

n_novel = len(df2.loc[df2['Final interpretation'] == 'Novel candidate gene'])
n_vus = len(df2.loc[df2['Final interpretation'] == 'VUS'])
n_phenomismatch = len(df2.loc[df2['Final interpretation'] == 'Phenotype mismatch'])
n_lp = len(df2.loc[df2['Final interpretation'] == 'LP (Novel)'])
n_p_reported = len(df2.loc[df2['Final interpretation'] == 'P (Reported)'])
n_p_novel = len(df2.loc[df2['Final interpretation'] == 'P (Novel)'])
n_inhmismatch = len(df2.loc[df2['Final interpretation'] == 'Inheritance mismatch'])

def add_num_to_gene_class(x) -> str:
    if x == 'Known':
        return f"Known <br>(n = {n_knwon})"
    else:
        return f"Unknown <br>(n = {n_unknown})"

def add_num_to_interpretation(x) -> str:
    if x == 'Novel candidate gene':
        return f"Novel candidate gene (n = {n_novel})"
    elif x == 'VUS':
        return f"VUS (n = {n_vus})"
    elif x == 'Phenotype mismatch':
        return f"Mismatched phenotype (n = {n_phenomismatch})"
    elif x == 'LP (Novel)':
        return f"LP (Novel) (n = {n_lp})"
    elif x == 'P (Reported)':
        return f"P (Reported) (n = {n_p_reported})"
    elif x == 'P (Novel)':
        return f"P (Novel) (n = {n_p_novel})"
    elif x == 'Inheritance mismatch':
        return f"Mismatched inheritance (n = {n_inhmismatch})"
    else:
        return x

df2['Known disease genes'] = df2['Known disease genes'].apply(add_num_to_gene_class)
df2['Final interpretation'] = df2['Final interpretation'].apply(add_num_to_interpretation)    

categorical_dimensions = ["Known disease genes", "Reported inheritance", "eLoF", "Priority Score", "Final interpretation"]
dimensions = [dict(values=df2[label], label=label) for label in categorical_dimensions]

# Build colorscale
color = np.zeros(len(df), dtype='uint8')
colorscale = [[0, 'lightgray'], [1, 'red']]

In [17]:
# Build colorscale
color = np.zeros(len(df2), dtype='uint8')
colorscale = [[0, 'gray'], [0.14, 'gray'],                  # Gray
              [0.14, 'firebrick'], [0.28, 'firebrick'],     # Red
              [0.28, '#82C6EF'], [0.42, '#82C6EF'],          # Blue
              [0.42, '#EAA63A'], [0.56, '#EAA63A'],          # Orange
              [0.56, '#4FAC7E'], [0.70, '#4FAC7E'],          # Green
              [0.70, '#E3948D'], [0.84, '#E3948D'],         # Pink
              [0.84, '#804031'], [1, '#804031']]            # Brown
cmin, cmax = 0, 7
# cmin, cmax = 0, 1

In [18]:
fig = go.FigureWidget(
    data=[
        go.Scatter(
            x=df2['Final interpretation'], 
            y=df2['Priority Score'],
            marker={
                'color': color, 'cmin': cmin, 'cmax': cmax,
                'colorscale': colorscale, 'showscale': False,
                'colorbar': {
                    'tickvals': [0, 1, 2, 3, 4, 5, 6], 
                    'ticktext': ['None', 'Red', 'Blue', 'Pink', 'Orange', 'Green', 'Yellow']
                    }
                },
            mode='markers'),
        go.Parcats(
            domain={'y': [0, 0.4]}, dimensions=dimensions,
            line={'colorscale': colorscale, 'cmin': cmin,
                   'cmax': cmax, 'color': color, 'shape': 'hspline'})]
)

x_labels = ['Novel<br>candidate gene', 'VUS', 'P<br>(Reported)', 'P<br>(Novel)', 
            'LP<br>(Novel)', 'Mismatched<br>phenotype', 'Mismatched<br>inhritance']

fig.update_layout(height=800, width=1000,
                  xaxis={
                      'title': 'Final Interpretation', 
                      'tickvals': [0, 1, 2, 3, 4, 5, 6],
                      'ticktext': x_labels
                      },
                  yaxis={'title': 'Priority Score', 'domain': [0.6, 1]},
                  dragmode='lasso', hovermode='closest')

# fig.update_xaxes(categoryorder='array', categoryarray=x_labels)

# Build color selection widget
color_toggle = widgets.ToggleButtons(
    options=['None', 'Red', 'Blue', 'Orange', 'Green', 'Pink', 'Brown'],
    style={'button_width': '96px', 'font_size': '14px'},
    index=1, description='Brush Color:', disabled=False)

# Update color callback
def update_color(trace, points, state):
    # Compute new color array
    new_color = np.array(fig.data[0].marker.color)
    new_color[points.point_inds] = color_toggle.index

    with fig.batch_update():
        # Update scatter color
        fig.data[0].marker.color = new_color

        # Update parcats colors
        fig.data[1].line.color = new_color

# Register callback on scatter selection...
fig.data[0].on_selection(update_color)
# and parcats click
fig.data[1].on_click(update_color)
fig.update_layout(margin=dict(t=20, b=20, l=10, r=120))
# Update fig size
fig.update_layout(width=800, height=1200)

# Display figure
widgets.VBox([color_toggle, fig])



# Save as html
# fig.write_html('FigureS4.html')

VBox(children=(ToggleButtons(description='Brush Color:', index=1, options=('None', 'Red', 'Blue', 'Orange', 'G…

In [125]:
widgets

<module 'ipywidgets.widgets' from '/Users/utsu/miniforge3/envs/wesanno/lib/python3.9/site-packages/ipywidgets/widgets/__init__.py'>

Output(layout=Layout(border_bottom='1px solid black', border_left='1px solid black', border_right='1px solid b…

In [23]:
df.to_pickle(f'./ExcelOutput/Fixed_FINAL_mydata.denovo.all.pkl')

In [14]:
# Save as tsv
df.to_csv(f'./ExcelOutput/Fixed_FINAL_mydata.denovo.all_gene_fix.tsv', sep='\t', index=False, encoding='shift-jis')

# Save as xlsx
df.to_excel(f'./ExcelOutput/Fixed_FINAL_mydata.denovo.all_gene_fix.xlsx', index=False)
df_unsolved.to_excel(f'./ExcelOutput/Fixed_FINAL_mydata.denovo.unsolved_gene_fix.xlsx', index=False)
df_solved.to_excel(f'./ExcelOutput/Fixed_FINAL_mydata.denovo.solved_gene_fix.xlsx', index=False)

In [5]:
elofs = pd.read_table('../../Resources/02_EstimatedLoFGenes/lof_genes.txt', 
                      header=None, names=['gene'], sep='\t')
elofs_genes = elofs['gene'].unique().tolist()


def elofs_judge(row):
    if row['gene'] in elofs_genes:
        return True
    else:
        return False

In [15]:
len(df_unsolved[df_unsolved['PriorityScore'] >= 3])

67

In [27]:
# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [19]:
df['FinalScore'].value_counts()

FinalScore
1    8835
0    4711
2     933
3     859
4      92
5      80
6      65
7      21
8       8
9       1
Name: count, dtype: int64

In [27]:
cars_df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/imports-85.csv')

categorical_dimensions = ['body-style', 'drive-wheels', 'fuel-type']
dimensions = [dict(values=cars_df[label], label=label) for label in categorical_dimensions]

# Build colorscale
color = np.zeros(len(cars_df), dtype='uint8')
colorscale = [[0, 'gray'], [0.33, 'gray'],
              [0.33, 'firebrick'], [0.66, 'firebrick'],
              [0.66, 'blue'], [1.0, 'blue']]
cmin = -0.5
cmax = 2.5

In [28]:
fig = go.FigureWidget(
    data=[go.Scatter(x=cars_df.horsepower, y=cars_df['highway-mpg'],
                marker={'color': color, 'cmin': cmin, 'cmax': cmax,
                        'colorscale': colorscale, 'showscale': True,
                        'colorbar': {'tickvals': [0, 1, 2], 'ticktext': ['None', 'Red', 'Blue']}},
                     mode='markers'),

      go.Parcats(domain={'y': [0, 0.4]}, dimensions=dimensions,
                   line={'colorscale': colorscale, 'cmin': cmin,
                   'cmax': cmax, 'color': color, 'shape': 'hspline'})]
)

fig.show()

In [23]:
cars_df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845.0
201,-1,95.0,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045.0
202,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485.0
203,-1,95.0,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470.0


In [18]:
# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import widgets

In [40]:
# Plot the ratio between identified and undetermined variants
print(len(df_solved), len(df_undetermined))

# Pie chart
# color as soft color palette
colors = sns.color_palette('pastel')[0:2]
fig = go.Figure(data=[go.Pie(labels=['Identified', 'Undetermined'], 
                             values=[len(df_solved), len(df_undetermined)])])
fig.update_traces(marker=dict(colors=colors))
fig.update_layout(title='Ratio between identified and undetermined variants')

# Add number of variants
fig.add_annotation(
    x=0.2, y=0.5, showarrow=False,
    text=f'Identified: {len(df_solved)}<br>Undetermined: {len(df_undetermined)}',
    font=dict(size=20)
)

# legend location
fig.update_layout(legend=dict(
    yanchor='top',
    y=0.1,
    xanchor='left',
    x=0.3
))

fig.show()

# Saving the html file
fig.write_html('identified_undetermined_ratio.html')

3892 11713


In [44]:
import codecs
with codecs.open("mydata.solved.patho.matched.txt", "r", "Shift-JIS", "ignore") as file:
    df = pd.read_table(file, delimiter="\t")