##### Import package

In [1]:
# conda create -n sss python=3.8 -y && conda activate sss
# conda install -y -c bioconda gffutils jupyter tqdm cyvcf2 pathlib2 pandarallel pysam liftover pybedtools

import os
import re
import numpy as np
import pandas as pd
# from Bio.Seq import Seq
# from liftover import get_lifter
from pathlib2 import Path
from pandarallel import pandarallel
from tqdm import tqdm
import gffutils
import pysam
from cyvcf2 import VCF

### Logging setup
from logging import getLogger, config
import yaml
parent_directory = os.path.dirname(os.path.dirname('__file__'))
config_path: str = os.path.join(parent_directory, '../../../config/logging.yaml')
with open(config_path, 'r') as f:
    config.dictConfig(yaml.safe_load(f))
logger = getLogger(__name__)

########   Initialize and setup pandas methods   ########
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp' 
pandarallel.initialize(nb_workers=5, progress_bar=True, verbose=1, use_memory_fs=False) 
tqdm.pandas()

import sys
try: 
    __file__
    sys.path.append(os.path.join(os.path.dirname('__file__')))
except NameError:
    Path().resolve()
    sys.path.append(os.path.join(Path().resolve(), '../../../'))

from libs import utils, preprocess, variantfilter, posparser, splaiparser
# from libs import predeffect, scoring
from libs import anno_spliceai, anno_clinvar
from libs.deco import print_filtering_count
# from libs import predeffect
from libs.scoring import Scoring
from libs import predeffect


gencode_gff = '../../../Resources/05_GENCODE_v43lift37/gencode.v43lift37.annotation.sort.gff3.gz'

try:
    db_anno_gencode = '../../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '../../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)
except ValueError:
    db_anno_gencode = '/resources/DBs/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '/resources/DBs/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)

## Thresholds configuration
thresholds_SpliceAI_parser: dict = {
    'TH_min_sALDL': 0.02, 'TH_max_sALDL': 0.2, 
    'TH_min_sAGDG': 0.01, 'TH_max_sAGDG': 0.05,
    'TH_min_GExon': 25, 'TH_max_GExon': 500,
    'TH_sAG': 0.2, 'TH_sDG': 0.2
    }

解析前の raw vcf ファイルのパスは後でまた使う

In [2]:
raw_vcf: str = "/Users/utsu/work/Github/nar/mydata/mydata.splai.pangolin.vep.maxentscan.loftee.vcf"
fp = Path(raw_vcf)
fp_stem, fp_dir = fp.stem, fp.parent

## Parse VCF and annotate ENST

In [None]:

## Parse VCF to simple input table
vcf = VCF(raw_vcf)
header = vcf.header_iter()
for h in header:
    try:
        h['ID']
    except KeyError:
        continue
    else:
        if h['ID'] == 'CSQ':
            vep_cols_list = h['Description'].split('Format: ')[1].rstrip('"').split('|')
        elif h['ID'] == 'SpliceAI':
            splai_cols_list = h['Description'].split('Format: ')[1].rstrip('"').split('|')
        elif h['ID'] == 'Pangolin':
            pang_cols_list = h['Description'].split('Format: ')[1].rstrip('"').split('|')
        else:
            pass

vepidx: dict = {col: i for i, col in enumerate(vep_cols_list)}
splaidx: dict = {col: i for i, col in enumerate(splai_cols_list)}

cols = [
    'CHROM', 'POS', 'REF', 'ALT', 'GeneSymbol', 'SymbolSource', 'HGNC_ID', 
    'ENST', 'HGVSc', 'Consequence', 'EXON', 'INTRON', 'Strand',
    'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 
    'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL', 'maxsplai',
    'loftee', 'maxentscan_alt', 'maxentscan_diff', 'maxentscan_ref', 
    'pang_gene', 'pang_pos_socre_gain', 'pang_pos_score_loss', 'pang_warning', 
    'maxpangolin'
]

df: pd.DataFrame = pd.DataFrame(columns=cols)
for v in VCF(raw_vcf):
    vep: list = v.INFO.get('CSQ').split('|')

    # Get HGVSc from VEP
    try:
        hgvsc = re.search('(?<=:).*',vep[vepidx['HGVSc']])[0]
    except TypeError:
        hgvsc = "NA"

    # Get SpliceAI scores
    if v.INFO.get('SpliceAI'):
        splai: list = v.INFO.get('SpliceAI').split(',')[0].split('|')
    else:
        splai = ['NA'] * len(splai_cols_list)

    # Get Pangolin scores
    if v.INFO.get('Pangolin'):
        pangolin: list = v.INFO.get('Pangolin').split('|')
    else:
        pangolin = ['NA'] * len(pang_cols_list)
        
    # Get Squirls scores
    if v.INFO.get('SQUIRLS_SCORE'):
        squirls: float = v.INFO.get('SQUIRLS_SCORE')
    else:
        squirls = "NA"

    # Convert strand to +/- 
    strand = lambda s: '+' if s == '1' else '-'

    # Get max SpliceAI scores
    ds_ag: float = splai[splaidx['DS_AG']]
    ds_al: float = splai[splaidx['DS_AL']]
    ds_dg: float = splai[splaidx['DS_DG']]
    ds_dl: float = splai[splaidx['DS_DL']]
    if splai[splaidx['DP_AG']] == 'NA':
        maxsplai: str = "NA"
    maxsplai: float = max(ds_ag, ds_al, ds_dg, ds_dl)

    # Get Pangplin scores
    pang_gene: str = pangolin[0]
    pang_pos_score_gain: str = pangolin[1]
    pang_pos_score_loss: str = pangolin[2]
    pang_warning: str = pangolin[3]
    if pang_gene == 'NA':
        maxpangolin = "NA"
    else:
        maxpangolin = max(
            np.abs(float(pang_pos_score_gain.split(':')[1])), 
            np.abs(float(pang_pos_score_loss.split(':')[1]))
            )

    # Add df row
    df = pd.concat([df, pd.DataFrame([[
        v.CHROM, v.POS, v.REF, v.ALT[0], 
        vep[vepidx['SYMBOL']], vep[vepidx['SYMBOL_SOURCE']], 
        vep[vepidx['HGNC_ID']], vep[vepidx['Feature']], hgvsc, 
        vep[vepidx['Consequence']], 
        vep[vepidx['EXON']], vep[vepidx['INTRON']],
        strand(vep[vepidx['STRAND']]), 
        ds_ag, ds_al, ds_dg, ds_dl,
        splai[splaidx['DP_AG']], splai[splaidx['DP_AL']], 
        splai[splaidx['DP_DG']], splai[splaidx['DP_DL']],
        maxsplai, vep[vepidx['LoF']], 
        vep[vepidx['MaxEntScan_alt']], 
        vep[vepidx['MaxEntScan_diff']], 
        vep[vepidx['MaxEntScan_ref']],
        pang_gene, pang_pos_score_gain, pang_pos_score_loss, pang_warning,
        maxpangolin]],
        columns=cols)], ignore_index=True)
        

df['ENST_Full'] = df.apply(posparser.fetch_enst_full, db=db, axis=1)


In [15]:
df.to_pickle(f"{fp_dir}/{fp_stem}.enst.pkl")

## ここから解析

In [3]:
df = pd.read_pickle(f"{fp_dir}/{fp_stem}.enst.pkl")

df = df.fillna(
    {'loftee': '.', 'maxentscan_alt': '.', 'maxentscan_diff': '.', 
        'maxentscan_ref': '.', 'pang_gene': '.', 'pang_pos_socre_gain': '.',
        'pang_pos_score_loss': '.', 'pang_warning': '.'}
        )

logger.info('Calculate the distance to the nearest splice site in intron variant...')
df['IntronDist'] = df.progress_apply(
    posparser.signed_distance_to_exon_boundary, 
    db=db, db_intron=db_intron, axis=1)

logger.info('Classify "Canonical" splice site or "Non-canonical" splice site...')
df = posparser.classifying_canonical(df)

df['Ex_or_Int'] = np.where(
    df['IntronDist'] == "[Warning] Invalid ENST ID", "[Warning] Invalid ENST ID",
    np.where(df['IntronDist'].isnull(), 'Exonic', 'Intronic'))

tbx_anno = pysam.TabixFile(gencode_gff)
df['exon_loc'] = df.progress_apply(
    posparser.calc_exon_loc, tabixfile=tbx_anno, enstcolname='ENST', axis=1)
df = pd.concat([df, df['exon_loc'].str.split(':', expand=True)], axis=1)
df.rename(columns={0: 'ex_up_dist', 1: 'ex_down_dist'}, inplace=True)
df.drop(columns=['exon_loc'], inplace=True)

#2-2. Select minimum distance from upstream distance and downstream distance
df['exon_pos'] = df.parallel_apply(posparser.select_exon_pos, axis=1)
#2-3. Relative exon location
df['prc_exon_loc'] = df.parallel_apply(posparser.calc_prc_exon_loc, axis=1)

#2-4. Decision exonic splice sites (1 nt in acceptor site or 3 nts on Donor site)
df['exon_splice_site'] = df.parallel_apply(posparser.extract_splicing_region, axis=1)

#3.   Additional Splicing information
logger.info('Annotating splicing information...')
#3-1. Annotate splicing type ('Exonic Acceptor' etc.)
df['SpliceType'] = df.parallel_apply(posparser.select_donor_acceptor, axis=1)

#5.   Annotate ClinVar varaints interpretations
logger.info('Annotating ClinVar varaints interpretations...')
clinvar_file = '../../../clinvar/Filtered_BCF_GRCh37_20241211-044124/clinvar_GRCh37.germline.nocoflicted.bcf.gz'
cln_bcf = pysam.VariantFile(clinvar_file)
df['clinvar_same_pos'] = df.progress_apply(
    anno_clinvar.anno_same_pos_vars, cln_bcf=cln_bcf, axis=1)
df['clinvar_same_motif'] = df.progress_apply(
    anno_clinvar.anno_same_motif_vars, cln_bcf=cln_bcf, axis=1)
df['same_motif_clinsigs'] = df['clinvar_same_motif'].parallel_apply(
    anno_clinvar.extract_same_motif_clinsigs)

logger.info('Parsing SpliceAI results...')
logger.info('Annotating Exon/Intron position information...')
df['ExInt_INFO'] = df.progress_apply(
    splaiparser.calc_exint_info, db=db, db_intron=db_intron, axis=1)

#6-3. Predict splicing effects
df['Pseudoexon'] = df.progress_apply(
    splaiparser.pseudoexon_activation,
    thresholds=thresholds_SpliceAI_parser, 
    db_intron=db_intron,
    axis=1)

df['Part_IntRet'] = df.parallel_apply(
    splaiparser.partial_intron_retention,
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

df['Part_ExDel'] = df.parallel_apply(
    splaiparser.partial_exon_deletion,
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

df['Exon_skipping'] = df.parallel_apply(
    splaiparser.exon_skipping, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)
                                        
df['Int_Retention'] = df.parallel_apply(
    splaiparser.intron_retention, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

df['multiexs'] = df.parallel_apply(
    splaiparser.multi_exon_skipping, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

#7.   Annotate aberrant splicing size (bp)
logger.info('Annotating aberrant splicing size (bp)...')
#7-1. Annotate size of 
df['Size_Part_ExDel'] = df.parallel_apply(
    splaiparser.anno_partial_exon_del_size, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

#7-3. Annotate size of partial intron retention
df['Size_Part_IntRet'] = df.parallel_apply(
    splaiparser.anno_partial_intron_retention_size, 
    thresholds=thresholds_SpliceAI_parser,
    axis=1)

#7-2. Annotate size of pseudoexon
df['Size_pseudoexon'] = df.parallel_apply(
    splaiparser.anno_gained_exon_size, 
    thresholds=thresholds_SpliceAI_parser, 
    axis=1)

#7-4. Annotate size of intron retention
df['Size_IntRet'] = df.parallel_apply(
    splaiparser.anno_intron_retention_size, 
    thresholds=thresholds_SpliceAI_parser,
    axis=1)

#7-5. Annotate size of exon skipping
df['Size_skipped_exon'] = df.parallel_apply(
    splaiparser.anno_skipped_exon_size, 
    thresholds=thresholds_SpliceAI_parser,
    axis=1)

df['variant_id'] = df['CHROM'].astype(str) + '-' \
    + df['POS'].astype(str) + '-' + df['REF'] + '-' + df['ALT']

#8.   Evaluate splicing effects
logger.info('Predicting CDS change...')
#8-1. Predict CDS change
df['CDS_Length'] = df.progress_apply(predeffect.calc_cds_len, db=db, axis=1)
df['is_10%_truncation'] = df.progress_apply(predeffect.calc_cds_len_shorten, axis=1)

#8-2. Determine if the gene is included in eLoFs genes
df['is_eLoF'] = df.parallel_apply(predeffect.elofs_judge, axis=1)

#8-3. Determine causing NMD or not
df['is_NMD_at_Canon'] = df.parallel_apply(predeffect.nmd_judge, axis=1)

#8-4. Frame check
# Covert to str (Cannot predict splicing event) to np.nan
cannot_predict: str = 'Cannot predict splicing event'
df['Size_Part_ExDel'] = df['Size_Part_ExDel'].replace(cannot_predict, np.nan)
df['Size_Part_IntRet'] = df['Size_Part_IntRet'].replace(cannot_predict, np.nan)
df['Size_pseudoexon'] = df['Size_pseudoexon'].replace(cannot_predict, np.nan)
df['Size_IntRet'] = df['Size_IntRet'].replace(cannot_predict, np.nan)
df['Size_skipped_exon'] = df['Size_skipped_exon'].replace(cannot_predict, np.nan)

df['is_Frameshift_Part_ExDel'] = df['Size_Part_ExDel'].parallel_apply(
    predeffect.frame_check)
df['is_Frameshift_Part_IntRet'] = df['Size_Part_IntRet'].parallel_apply(
    predeffect.frame_check)
df['is_Frameshift_pseudoexon'] = df['Size_pseudoexon'].parallel_apply(
    predeffect.frame_check)
df['is_Frameshift_IntRet'] = df['Size_IntRet'].parallel_apply(
    predeffect.frame_check)
df['is_Frameshift_skipped_exon'] = df['Size_skipped_exon'].parallel_apply(
    predeffect.frame_check)
df['is_Frameshift'] = df[['is_Frameshift_Part_ExDel', 
                        'is_Frameshift_Part_IntRet', 
                        'is_Frameshift_pseudoexon', 
                        'is_Frameshift_IntRet', 
                        'is_Frameshift_skipped_exon'
                        ]].any(axis=1)

#9.   CCRs
logger.info('Annotating CCRs info...')
#9-1. Annotate truncated regions 
df['skipped_region'] = df.parallel_apply(
    splaiparser.anno_skipped_regions, axis=1)
df['deleted_region'] = df.parallel_apply(
    splaiparser.anno_deleted_regions, 
    thresholds=thresholds_SpliceAI_parser, axis=1)

#9-2. Intersect with CCRs
logger.info('Annotate CCR score')
df = predeffect.anno_ccr_score(df)

# Extract data with SymbolSource == 'HGNC'
# df = df[df['SymbolSource'] == 'HGNC']

2025/03/14 02:12:18 [INFO   ] (__main__) - Calculate the distance to the nearest splice site in intron variant...


100%|██████████| 16353/16353 [00:08<00:00, 1842.75it/s]

2025/03/14 02:12:26 [INFO   ] (__main__) - Classify "Canonical" splice site or "Non-canonical" splice site...



100%|██████████| 16353/16353 [00:06<00:00, 2709.18it/s]


2025/03/14 02:12:33 [INFO   ] (__main__) - Annotating splicing information...
2025/03/14 02:12:33 [INFO   ] (__main__) - Annotating ClinVar varaints interpretations...


100%|██████████| 16353/16353 [00:00<00:00, 20679.45it/s]
100%|██████████| 16353/16353 [00:00<00:00, 19091.06it/s]


2025/03/14 02:12:35 [INFO   ] (__main__) - Parsing SpliceAI results...
2025/03/14 02:12:35 [INFO   ] (__main__) - Annotating Exon/Intron position information...


100%|██████████| 16353/16353 [00:06<00:00, 2496.31it/s]
100%|██████████| 16353/16353 [00:00<00:00, 18824.74it/s]


2025/03/14 02:12:45 [INFO   ] (__main__) - Annotating aberrant splicing size (bp)...
2025/03/14 02:12:45 [INFO   ] (__main__) - Predicting CDS change...


100%|██████████| 16353/16353 [00:04<00:00, 3516.30it/s]
100%|██████████| 16353/16353 [00:00<00:00, 168212.83it/s]


2025/03/14 02:12:51 [INFO   ] (__main__) - Annotating CCRs info...
2025/03/14 02:12:51 [INFO   ] (__main__) - Annotate CCR score


In [4]:
df.to_pickle(f"{fp_dir}/{fp_stem}.enst.prescore.pkl")

### グラフ書くときはここから

In [3]:
df = pd.read_pickle(f"{fp_dir}/{fp_stem}.enst.prescore.pkl")

scoring = Scoring()
df['insilico_screening'] = df.parallel_apply(scoring.insilico_screening, axis=1)
df['clinvar_screening'] = df.parallel_apply(scoring.clinvar_screening, axis=1)
df['recalibrated_splai'] = df.parallel_apply(scoring.recal_scores_in_canon, axis=1)

def map_and_calc_score(row, score_map: dict) -> int:
    """
    Map the score to the solution
    s1, s2, s3, and s15 are clinvar_screening
    s4, s5, s6, s7, s8, s9, s10 and s11 are insilico_screening
    s12, s13 and s14 are recalibrated_splai
    PriortiyScore is the sum of the "clinvar_screening", "insilico_screening", and "recalibrated_splai"
    """
    if row['insilico_screening'] == "Not available":
        return np.nan

    return int(score_map[row['recalibrated_splai']]) + int(score_map[row['insilico_screening']]) + int(score_map[row['clinvar_screening']])

solution = {'s1': 9.0, 's2': 6.0, 's3': 0.0, 's4': -5.0, 
            's5': -3.0, 's6': 0.0, 's7': 2.0, 's8': 3.0, 's9': 2.0,
            's10': 4.0, 's11': 2.0, 's12': -1.0, 's13': 0.0, 's14': 1.0, 
            's15': -5.0, 's0': 0.0}

df['PriorityScore'] = df.parallel_apply(map_and_calc_score, args=(solution,), axis=1)
df.to_pickle(f"{fp_dir}/{fp_stem}.enst.scored.pkl")

In [4]:
# Extract screen positive variants only (PriorityScore >= 1)
df = pd.read_pickle(f"{fp_dir}/{fp_stem}.enst.scored.pkl")
df = df[df['PriorityScore'] >= 1]

#### De novo filtering

In [5]:
def anno_hgmd(df: pd.DataFrame) -> pd.DataFrame:
    path_to_hgmd = '../../../Resources/07_HGMD_GeneBasedInfo/HGMD_GeneBasedInfo_2024.1.tsv.gz'
    hgmd = pd.read_table(path_to_hgmd, header=0, dtype=str)
    hgmd = hgmd[['gene', 'altsymbol', 'refseq', 
                 'expected_inheritance', 'hgncID', 'omimid', 'DM']]
    hgmd = hgmd.astype({'DM': 'float64'})
    df = pd.merge(df, hgmd, left_on='HGNC_ID', right_on='hgncID', how='left')
    return df

def anno_sf(df: pd.DataFrame) -> pd.DataFrame:
    df_sf = pd.read_table('../../../Resources/ACMG_SFv3.2.txt', header=0, dtype=str)
    df_sf = df_sf[['Gene', 'Disease/Phentyope', 'Inheritance ', 'Variants to report']]
    df = pd.merge(df, df_sf, left_on='GeneSymbol', right_on='Gene', how='left')
    return df

mydata = './original.snpeff.state.disease.identifiedgene.filtered.splai.tsv'
df = pd.read_table(mydata, sep='\t', dtype=str)
print(f"Total number of variants: {len(df)}")
df = variantfilter.extract_denovo(df)
df.loc[:,'is_denovo'] = True
df = df[df['vqslod'] > -7.18]
df = df[((df['denovogear'] > 0.02) | (df['denovogear'].isnull()))
        & ((df['triodenovo'] > 5.72) | (df['triodenovo'].isnull()))
        & ((df['dnmfilter'] > 0.196) | (df['dnmfilter'].isnull()))]
df.drop(columns=['variant_id', 'ID_y'], inplace=True)
df['variant_id'] = df['CHROM'] + '-' + df['POS'] + '-' + df['REF'] + '-' + df['ALT']

# merge with mydata
df_variant = pd.read_pickle(f"{fp_dir}/{fp_stem}.enst.scored.pkl")
df_variant.drop_duplicates(subset=['variant_id'], inplace=True)


Total number of variants: 16353
Start extract_denovo
Filtering : 16353 --> 9944



In [6]:
df = df[['sample', 'fa', 'mo', 'Disease', 'is_denovo', 'variant_id', 'State', 'Identified_Gene']]
df = pd.merge(df, df_variant, on='variant_id')
print(f"Number of variants after filtering: {len(df)}")

df = anno_hgmd(df)
df = anno_sf(df)
df['skipped_ccrs'] = df['skipped_ccrs'].replace('.', np.nan).astype(float).copy()
df['deleted_ccrs'] = df['deleted_ccrs'].replace('.', np.nan).astype(float)
df['is_95%_CCRs'] = df.apply(lambda row: True if (row['skipped_ccrs'] > 95 or row['deleted_ccrs'] > 95) else False, axis=1)

# Exclude error calling
df = df.loc[df['variant_id'] != "8-145138872-T-G"]

solved_case_ids = [
    'Sample_4143', 'Sample_8803', 'Sample_17110', 'Sample_9768', 'Sample_16992',
    'Sample_16970', 'Sample_4938', 'Sample_11555', 'Sample_10713']

non_trios = [
	'Sample_17367', 'Sample_19880', 'Sample_7118', 'Sample_22831',
	'Sample_13784', 'Sample_8021', 'Sample_23636', 'Sample_5766', 
	'Sample_12102', 'Sample_52', 'Sample_7700', 'Sample_3986', 'Sample_20591', 
	'Sample_18910', 'Sample_11219', 'Sample_11895', 'Sample_10507', 
	'Sample_14446', 'Sample_13089', 'Sample_2325', 'Sample_20287', 
	'Sample_6024', 'Sample_16152', 'Sample_6584', 'Sample_10875', 'Sample_8436',
	'Sample_11750', 'Sample_13765', 'Sample_16783', 'Sample_15778']

df = df.loc[~df['sample'].isin(non_trios)]
df.loc[df['sample'].isin(solved_case_ids), 'State'] = 'Identified'
df.loc[~df['Identified_Gene'].isnull(), 'State'] = 'Identified'

# remove false_variants by variant_id key
df = df.loc[~df['variant_id'].isin(false_variants)]

df_solved = df[df['State'] == 'Identified']
df_unsolved = df[df['State'] == 'Undetermined']

print(f"{len(df_solved)} + {len(df_unsolved)}")
print(len(df_unsolved['sample'].unique().tolist()))

df = df.loc[df['State'] == 'Undetermined']
df.rename(columns={'is_eLoF': 'eLoF', 'PriorityScore': 'Priority Score', 
                   'is_Canonical': 'Canonical splice cite'}, inplace=True)

cutoff = 1
n_pos: int = len(df.loc[df['Priority Score'] >= cutoff])
n_neg: int = len(df.loc[df['Priority Score'] < cutoff])
print(f"Screen Positive: {n_pos}, Screen Negative: {n_neg}")

n_pos = 40
n_neg = 2289


def add_screening_result_col(x) -> str:
    if x >= cutoff:
        return f"Positive (n = {n_pos})"
    else:
        return f"Negative (n = {n_neg})"
    
# def change_boolen_to_str(x) -> str:
#     if x == 'true':
#         return 'eLoF gene'
#     else:
#         return 'Non-eLoF gene'
    
df['Screening Result'] = df['Priority Score'].apply(add_screening_result_col)
# df['eLoF'] = df['eLoF'].replace({True: 'eLoF gene', False: 'Non-eLoF gene'})
df['Canonical splice cite'] = df['Canonical splice cite'].replace({'True': 'Canonical', 'False': 'Non-canonical'})

Number of variants after filtering: 6406


NameError: name 'false_variants' is not defined

In [7]:
### Add Sample_IDs that have been solved to the true_list below 
true_list = ['Sample_20526', 'Sample_17367', 'Sample_5100', 'Sample_21599',
            'Sample_11750', 'Sample_7528', 'Sample_2288', 'Sample_17367',
            'Sample_11873', 'Sample_6024', 'Sample_21407', 'Sample_7605',
            'Sample_11467', 'Sample_17483', 'Sample_8021', 'Sample_5037',
            'Sample_12927', 'Sample_20526', 'Sample_17367', 'Sample_22460',
            'Sample_13765', 'Sample_52', 'Sample_5766', 'Sample_16260',
            'Sample_5766', 'Sample_3986', 'Sample_13920', 'Sample_22025',
            'Sample_13635', 'Sample_7723', 'Sample_19560', 'Sample_8928',
            'Sample_17579', 'Sample_20287', 'Sample_12988', 'Sample_9869', 
            'Sample_20078', 'Sample_21789', 'Sample_21156', 'Sample_19227', 
            'Sample_17367', 'Sample_14452', 'Sample_11444', 'Sample_10713', 
            'Sample_9091', 'Sample_8436', 'Sample_4752', 'Sample_372', 
            'Sample_20468', 'Sample_9043', 'Sample_6024', 'Sample_21206',
            'Sample_19880', 'Sample_13387', 'Sample_12988', 'Sample_12291', 
            'Sample_11555', 'Sample_4938', 'Sample_4413', 'Sample_2325']

df = df.loc[df['maxsplai'] != 'NA']
df = df.astype({'maxsplai': float})

# Extract false samples from dataframe
df_false = df.loc[~df['sample'].isin(true_list)]

# Extract above samples from dataframe
# print(len(df))
# df = df.loc[df['sample'].isin(true_list)]
# print(len(df))

for s in list(df.loc[df['Screening Result'] == f"Positive (n = {n_pos})", 'sample']):
    if s not in set(true_list):
        print(s)

# total number of unsoloved cases
print(f"Total number of unsolved cases: {len(df['sample'].unique().tolist())}")

df.fillna({'DM': 0}, inplace=True)
df.loc[df['DM'].isnull(), 'is_known_disease_gene'] = False
df.loc[df['DM'] == 0, 'is_known_disease_gene'] = False
df.loc[df['DM'] >= 1, 'is_known_disease_gene'] = True
num_known_disease_genes: int = len(df.loc[df['is_known_disease_gene'] == True])
num_unknown_disease_genes: int = len(df.loc[df['is_known_disease_gene'] == False])
print(f"Number of known disease genes: {num_known_disease_genes}")
print(f"Number of unknown disease genes: {num_unknown_disease_genes}")

df.loc[df['is_known_disease_gene'] == True, 'Known disease gene'] = "Known"
df.loc[df['is_known_disease_gene'] == False, 'Known disease gene'] = "Unknown"

# Drop duplicates in the dataframe by 'sample' and 'variant_id'
df = df.drop_duplicates(subset=['sample', 'variant_id'])

# Sort by maxsplai
df = df.loc[df['maxsplai'] != 'NA']
df = df.loc[df['Priority Score'] != None]
df = df.sort_values(by='maxsplai', ascending=True)
print(f"Total number of variants: {len(df)}")

KeyError: 'Screening Result'

In [50]:
false_variants = df_false.loc[df_false['Screening Result'] == f"Positive (n = {n_pos})", 'variant_id'].unique().tolist()
print(len(df.loc[df['Screening Result'] == f"Positive (n = {n_pos})", 'variant_id']))

40


In [51]:
print(f"All revisited cases: {len(df['sample'].unique().tolist())} cases")

All revisited cases: 1257 cases


In [52]:
import plotly.graph_objects as go
from ipywidgets import widgets

# dimensions = [dict(values=cars_df[label], label=label) for label in categorical_dimensions]
categorical_dimensions = ["Canonical splice cite", "eLoF", "Priority Score", "Screening Result"]
dimensions = [dict(values=df[label], label=label) for label in categorical_dimensions]

# Build colorscale
color = np.zeros(len(df), dtype='uint8')
colorscale = [[0, 'lightgray'], [1, 'red']]

# df = df.sort_values(by='Priority Score', ascending=True)

# Default color
# If Priority Score is greater than 1, the color is red in scatter plot
color = np.zeros(len(df), dtype='uint8')
# color[df['Priority Score'] >= 1] = 1

# Build figure as FigureWidget
fig = go.FigureWidget(
    data=[
        go.Scatter(
            x=df['Priority Score'], y=df['maxsplai'],
            marker={'color': 'gray'}, mode='markers', selected={'marker': {'color': 'firebrick'}},
            unselected={'marker': {'opacity': 0.6}}), 
        go.Parcats(
            domain={'y': [0, 0.5]}, 
            dimensions=dimensions,
            line={'colorscale': colorscale, 'cmin': 0,'cmax': 1, 'color': color, 'shape': 'hspline'})
    ])

# fig.data[0].marker.color = color

fig.update_layout(
        height=800, 
        xaxis={'title': 'Priority Score', 
               'tickvals': list(range(-12, 15, 1)),
               'range': [-12.5, 14.5]},
        yaxis={'title': 'Maximum SpliceAI ∆score', 
               'domain': [0.6, 1], 
               'range': [-0.05, 1.05], 
               'tickmode': 'array',
               'tickvals': [0, 0.2, 0.5, 0.8, 1], 
               'ticktext': ['0.0', '0.20', '0.50', '0.80', '1.0']},
        dragmode='lasso', hovermode='closest')

def update_color(trace, points, state):
    # Update scatter selection
    fig.data[0].selectedpoints = points.point_inds

    # Update parcats colors
    new_color = np.zeros(len(df), dtype='uint8')
    new_color[points.point_inds] = 1
    fig.data[1].line.color = new_color

# Register callback on scatter selection...
fig.data[0].on_selection(update_color)
# and parcats click
fig.data[1].on_click(update_color)

# Update fig size
fig.update_layout(width=1000, height=800)

# Save as html
# fig.write_html('FigureS3.html')

FigureWidget({
    'data': [{'marker': {'color': 'gray'},
              'mode': 'markers',
              'selected': {'marker': {'color': 'firebrick'}},
              'type': 'scatter',
              'uid': '759d13cc-7bfd-4867-9c73-99cb6b9c11eb',
              'unselected': {'marker': {'opacity': 0.6}},
              'x': array([-5., -3., -3., ...,  3.,  5.,  3.]),
              'y': array([0., 0., 0., ..., 1., 1., 1.])},
             {'dimensions': [{'label': 'Canonical splice cite',
                              'values': array(['No', 'No', 'No', ..., 'Yes', 'Yes', 'Yes'], dtype=object)},
                             {'label': 'eLoF',
                              'values': array([False, True, True, ..., False, True, False], dtype=object)},
                             {'label': 'Priority Score', 'values': array([-5., -3., -3., ...,  3.,  5.,  3.])},
                             {'label': 'Screening Result',
                              'values': array(['Negative (n = 2289)', 'Negat

In [117]:
df.to_pickle(f"mydata_plotly.pkl")

In [69]:
df = df.loc[df['Priority Score'] >=cutoff]
df2 = df
df2 = df2.rename(columns={'sample': 'Case'})

df2['Removed'] = df2['Case'].apply(lambda x: 'Removed' if x not in set(df['sample']) else 'Remain')
df2 = df2.loc[df2['Removed'] == 'Remain']
df2_known = df2.loc[df2['is_known_disease_gene'] == True].copy()
df2_unknown = df2.loc[df2['is_known_disease_gene'] == False].copy()
print(f"Number of variants in known disease genes: {len(df2_known)}")
print(f"Number of variants in unknown disease genes: {len(df2_unknown)}")

Number of variants in known disease genes: 20
Number of variants in unknown disease genes: 20


In [70]:
# Add interpriation
interpretations: dict = {
	"6-51920384-C-T": "Mismatched phenotype",
	"6-107050772-C-T": 	"Mismatched phenotype",
	"7-107414366-G-A": "Mismatched phenotype",
	"9-111681091-T-C": 	"Mismatched phenotype",
	"5-13717643-T-C": "Mismatched phenotype",
	"12-48387611-G-A": "P (Reported variant)",
	"X-153363075-G-A": "P (Reported variant)",
	"15-64047525-C-T": 	"Mismatched phenotype",
	"4-6086572-C-G": "LP (Novel variant)",
	"8-145138104-C-T": "Mismatched inheritance",
	"7-17854455-A-G": "Mismatched inheritance",
	"10-95262868-A-G": 	"Mismatched phenotype",
	"11-113196245-G-A": "Mismatched phenotype",
	"12-53204538-A-G": 	"Mismatched phenotype",
	"6-35277588-G-C": 	"Mismatched phenotype",
	"X-153283572-C-A": "Mismatched phenotype",
	"7-70246662-G-T": 	"Mismatched phenotype",
	"X-19373601-C-T": "P (Novel variant)",
	"8-8873831-G-A": "Mismatched phenotype",
	"10-35360126-C-T":  "VUS",
	"6-84624133-A-C":   "VUS",
	"17-45891202-T-G":  "VUS",
	"20-55206743-G-A":  "VUS",
	"2-95539264-G-A":   "VUS",
	"12-111311766-G-A": "VUS",
	"16-4918905-G-A":   "Novel candidate gene",
	"20-35826897-T-G":  "VUS",
	"8-89198703-A-T":   "VUS",
	"1-11129615-C-G":   "VUS",
	"17-46134392-A-C":  "Novel candidate gene",
	"1-155012960-G-A":  "VUS",
	"1-168211737-A-C":  "VUS",
	"5-141005760-C-A":  "VUS",
	"3-128853674-C-T":  "VUS",
	"14-69988999-G-T":  "VUS",
	"9-34618759-T-C":   "VUS",
	"2-39156972-G-C":   "VUS",
	"8-99028891-G-A":   "VUS",
	"11-119230372-T-C": "VUS"
}

# Add inheritanace
inheritance_list: dict = {
"8-145138104-C-T": "AR",
"7-107414366-G-A": "AR",
"7-70246662-G-T": "AD",
"15-100794364-T-C": "AR",
"6-107050772-C-T": "AR",
"9-111681091-T-C": "AD/AR",
"X-19373601-C-T": "XLD",
"7-17854455-A-G": "AR",
"12-48387611-G-A": "AD",
"5-13717643-T-C": "AR",
"10-95262868-A-G": "AR",
"12-53204538-A-G": "AD",
"11-113196245-G-A": "AR",
"8-8873831-G-A": "AR",
"6-51920384-C-T": "AR",
"X-153363075-G-A": "XLD",
"6-35277588-G-C": "AR",
"4-6086572-C-G": "AD",
"X-153283572-C-A": "AD",
"15-64047525-C-T": "AR",}


In [71]:
df2['Final interpretation'] = df2['variant_id'].apply(
	lambda x: interpretations[x] if x in interpretations.keys() else "VUS")
df2['Reported inheritance'] = df2['variant_id'].apply(
	lambda x: inheritance_list[x] if x in inheritance_list.keys() else "Unknown")

df2_unkown = df2.loc[df2['is_known_disease_gene'] == False].copy()

In [118]:
df2_known.to_excel('excel/positive_known_fixed.xlsx', index=False)
df2_unkown.to_excel('excel/positive_unknown_fixed.xlsx', index=False)

In [72]:
# counting
# known_txt = f"Known (n = {num_known_disease_genes})"
# unknown_txt = f"Unknown (n = {num_unknown_disease_genes})"
n_knwon = len(df2.loc[df2['Known disease gene'] == "known"])
n_unknown = len(df2.loc[df2['Known disease gene'] == "Unknown"])

n_novel = len(df2.loc[df2['Final interpretation'] == 'Novel candidate gene'])
n_vus = len(df2.loc[df2['Final interpretation'] == 'VUS'])
n_phenomismatch = len(df2.loc[df2['Final interpretation'] == 'Mismatched phenotype'])
n_lp = len(df2.loc[df2['Final interpretation'] == 'LP (Novel variant)'])
n_p_reported = len(df2.loc[df2['Final interpretation'] == 'P (Reported variant)'])
n_p_novel = len(df2.loc[df2['Final interpretation'] == 'P (Novel variant)'])
n_inhmismatch = len(df2.loc[df2['Final interpretation'] == 'Mismatched inheritance'])

print(f"Number of known disease genes: {n_knwon}")
print(f"Number of novel candidate genes: {n_novel}")
print(f"Number of VUS: {n_vus}")
print(f"Number of unknown disease genes: {n_unknown}")
print(f"Number of phenotype mismatch: {n_phenomismatch}")
print(f"Number of LP (Novel): {n_lp}")
print(f"Number of P (Reported): {n_p_reported}")
print(f"Number of P (Novel): {n_p_novel}")
print(f"Number of inheritance mismatch: {n_inhmismatch}")


Number of known disease genes: 0
Number of novel candidate genes: 2
Number of VUS: 19
Number of unknown disease genes: 20
Number of phenotype mismatch: 13
Number of LP (Novel): 1
Number of P (Reported): 2
Number of P (Novel): 1
Number of inheritance mismatch: 2


In [73]:

def add_num_to_gene_class(x) -> str:
    if x == 'Known':
        return f"Known <br>(n = {n_knwon})"
    else:
        return f"Unknown <br>(n = {n_unknown})"

def add_num_to_interpretation(x) -> str:
    if x == 'Novel candidate gene':
        return f"Novel candidate gene (n = {n_novel})"
    elif x == 'VUS':
        return f"VUS (n = {n_vus})"
    elif x == 'Phenotype mismatch':
        return f"Mismatched phenotype (n = {n_phenomismatch})"
    elif x == 'LP (Novel)':
        return f"LP (Novel) (n = {n_lp})"
    elif x == 'P (Reported)':
        return f"P (Reported) (n = {n_p_reported})"
    elif x == 'P (Novel)':
        return f"P (Novel) (n = {n_p_novel})"
    elif x == 'Inheritance mismatch':
        return f"Mismatched inheritance (n = {n_inhmismatch})"
    else:
        return x

# df2['Known disease gene'] = df2['Known disease gene'].apply(add_num_to_gene_class)
df2['Final interpretation'] = df2['Final interpretation'].apply(add_num_to_interpretation)    

categorical_dimensions = ["Known disease gene", "Reported inheritance", "eLoF", "Priority Score", "Final interpretation"]
dimensions = [dict(values=df2[label], label=label) for label in categorical_dimensions]

# Build colorscale
color = np.zeros(len(df), dtype='uint8')
colorscale = [[0, 'lightgray'], [1, 'red']]

In [None]:
# Build colorscale
color = np.zeros(len(df2), dtype='uint8')
colorscale = [[0, 'gray'], [0.14, 'gray'],                  # Gray
              [0.14, 'firebrick'], [0.28, 'firebrick'],     # Red
              [0.28, '#82C6EF'], [0.42, '#82C6EF'],          # Blue
              [0.42, '#EAA63A'], [0.56, '#EAA63A'],          # Orange
              [0.56, '#4FAC7E'], [0.70, '#4FAC7E'],          # Green
              [0.70, '#E3948D'], [0.84, '#E3948D'],         # Pink
              [0.84, '#804031'], [1, '#804031']]            # Brown
cmin, cmax = 0, 7

# cmin, cmax = 0, 1
fig = go.FigureWidget(
    data=[
        go.Scatter(
            x=df2['Final interpretation'], 
            y=df2['Priority Score'],
            marker={
                'color': color, 'cmin': cmin, 'cmax': cmax,
                'colorscale': colorscale, 'showscale': False,
                'colorbar': {
                    'tickvals': [0, 1, 2, 3, 4, 5, 6], 
                    'ticktext': ['None', 'Red', 'Blue', 'Pink', 'Orange', 'Green', 'Yellow']
                    }
                },
            # Edit size and line
            
            mode='markers'),
        go.Parcats(
            domain={'y': [0, 0.4]}, dimensions=dimensions,
            line={'colorscale': colorscale, 'cmin': cmin,
                   'cmax': cmax, 'color': color, 'shape': 'hspline'})]
)

x_labels = ['Novel<br>candidate gene', 'VUS', 'P<br>(Reported)', 'P<br>(Novel)', 
            'LP<br>(Novel)', 'Mismatched<br>phenotype', 'Mismatched<br>inhritance']

fig.update_layout(height=600, width=800,
                  xaxis={
                      'title': 'Final Interpretation', 
                      'tickvals': [0, 1, 2, 3, 4, 5, 6],
                      'ticktext': x_labels
                      },
                  yaxis={'title': 'Priority Score', 'domain': [0.6, 1]},
                  dragmode='lasso', hovermode='closest')

# fig.update_xaxes(categoryorder='array', categoryarray=x_labels)

# Build color selection widget
color_toggle = widgets.ToggleButtons(
    options=['None', 'Red', 'Blue', 'Orange', 'Green', 'Pink', 'Brown'],
    style={'button_width': '96px', 'font_size': '14px'},
    index=1, description='Brush Color:', disabled=False)

# Update color callback
def update_color(trace, points, state):
    # Compute new color array
    new_color = np.array(fig.data[0].marker.color)
    new_color[points.point_inds] = color_toggle.index

    with fig.batch_update():
        # Update scatter color
        fig.data[0].marker.color = new_color

        # Update parcats colors
        fig.data[1].line.color = new_color

# Register callback on scatter selection...
fig.data[0].on_selection(update_color)
# and parcats click
fig.data[1].on_click(update_color)
fig.update_layout(margin=dict(t=20, b=20, l=10, r=120))
# Update fig size
fig.update_layout(width=800, height=1000)

# Display figure
widgets.VBox([color_toggle, fig])


# Save as html
# fig.write_html('FigureS4.html')

VBox(children=(ToggleButtons(description='Brush Color:', index=1, options=('None', 'Red', 'Blue', 'Orange', 'G…