In [53]:
# conda create -n sss python=3.8 -y && conda activate sss
# pip install gffutils jupyter tqdm cyvcf2 pathlib2 pandarallel pysam liftover 
# conda install -y -c bioconda pybedtools

import os
import numpy as np
import pandas as pd
# from Bio.Seq import Seq
# from liftover import get_lifter
from pathlib2 import Path
from pandarallel import pandarallel
from tqdm import tqdm
import gffutils
import pysam
from cyvcf2 import VCF

### Logging setup
from logging import getLogger, config
import yaml
parent_directory = os.path.dirname(os.path.dirname('__file__'))

config_path: str = os.path.join(parent_directory, '../../../config/logging.yaml')
with open(config_path, 'r') as f:
    config.dictConfig(yaml.safe_load(f))
logger = getLogger(__name__)

########   Initialize and setup pandas methods   ########
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp' 
# pandarallel.initialize(nb_workers=os.cpu_count()-8, progress_bar=False, verbose=2, use_memory_fs=False) 
pandarallel.initialize(nb_workers=16, progress_bar=False, verbose=2, use_memory_fs=False) 

tqdm.pandas()


import sys
try: 
    __file__
    sys.path.append(os.path.join(os.path.dirname('__file__')))
except NameError:
    Path().resolve()
    sys.path.append(os.path.join(Path().resolve(), '../../../'))

from libs import utils, preprocess, variantfilter, posparser, splaiparser
# from libs import predeffect, scoring
from libs import anno_spliceai, anno_clinvar
from libs.deco import print_filtering_count
# from libs import predeffect
from libs.scoring import Scoring
from libs import predeffect


gencode_gff = '../../../Resources/05_GENCODE_v43lift37/gencode.v43lift37.annotation.sort.gff3.gz'

try:
    db_anno_gencode = '../../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '../../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)
except ValueError:
    db_anno_gencode = '/resources/DBs/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '/resources/DBs/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)

## Thresholds configuration
thresholds_SpliceAI_parser: dict = {
    'TH_min_sALDL': 0.02, 'TH_max_sALDL': 0.2, 
    'TH_min_sAGDG': 0.01, 'TH_max_sAGDG': 0.05,
    'TH_min_GExon': 25, 'TH_max_GExon': 500,
    'TH_sAG': 0.2, 'TH_sDG': 0.2
    }

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

try: 
    __file__
    sys.path.append(os.path.join(os.path.dirname('__file__')))
except NameError:
    Path().resolve()
    sys.path.append(os.path.join(Path().resolve(), '../../'))

from libs.scoring import Scoring

import warnings
warnings.simplefilter('ignore')

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [54]:
# 関数の設定
def specificity_sensitivity_plotly(data):
    thresholds = np.arange(0, 11, 1)
    results = []

    for threshold in thresholds:
        tp = data[(data['PriorityScore'] >= threshold) & (data['LABEL'] == 1)].shape[0]
        fn = data[(data['PriorityScore'] < threshold) & (data['LABEL'] == 1)].shape[0]
        tn = data[(data['PriorityScore'] < threshold) & (data['LABEL'] == 0)].shape[0]
        fp = data[(data['PriorityScore'] >= threshold) & (data['LABEL'] == 0)].shape[0]
        specificity = tn / (tn + fp) if (tn + fp) else 0
        sensitivity = tp / (tp + fn) if (tp + fn) else 0
        # print(f"Threshold: {threshold}, TP: {tp}, FN: {fn}, TN: {tn}, FP: {fp}")
        # print(f"Threshold: {threshold}, Specificity: {specificity:.6f}, Sensitivity: {sensitivity:.6f}")
        results.append({'Threshold': threshold, 'Metric': 'Specificity', 'Value': specificity})
        results.append({'Threshold': threshold, 'Metric': 'Sensitivity', 'Value': sensitivity})

    results_df = pd.DataFrame(results)
    return results_df

def plot_sensitivity_specificity_plotly(
        results_df: pd.DataFrame, w: int, h: int):
    # Separate the dataframes for specificity and sensitivity
    specificity_df = results_df[results_df['Metric'] == 'Specificity']
    sensitivity_df = results_df[results_df['Metric'] == 'Sensitivity']

    # Plotly Graph Objectsを使用してプロット
    fig = go.Figure()

    # 特異性
    fig.add_trace(go.Scatter(
        x=specificity_df['Threshold'],
        y=specificity_df['Value'],
        marker=dict(color='#665990'),
        mode='lines+markers',
        name='Specificity',
        text=[f'Threshold: {th}, Specificity: {val:.3f}' for th, val in zip(specificity_df['Threshold'], specificity_df['Value'])],
        hoverinfo='text'
    ))
    
    # 感度
    fig.add_trace(go.Scatter(
        x=sensitivity_df['Threshold'],
        y=sensitivity_df['Value'],
        marker=dict(color='#F8ACAC'),
        mode='lines+markers',
        name='Sensitivity',
        text=[f'Threshold: {th}, Sensitivity: {val:.3f}' for th, val in zip(sensitivity_df['Threshold'], sensitivity_df['Value'])],
        hoverinfo='text'
    ))

    # Y軸のフォーマット設定
    fig.update_yaxes(tickformat=".1f")

    # グラフのレイアウト設定
    fig.update_layout(title='Sensitivity and Specificity for each threshold',
                      xaxis_title='Threshold',
                      yaxis_title='Sensitivity/Specificity',
                      plot_bgcolor='rgba(243, 243, 243, 1)',
                      paper_bgcolor='rgba(243, 243, 243, 0)',
                      font=dict(family="Arial, sans-serif", size=12, color="black"),
                      legend=dict(y=0.075, x=0.75, xanchor='right', yanchor='bottom', 
                              bgcolor='rgba(243, 243, 243, 1)',
                              font=dict(family="Arial, sans-serif", size=12, color="black"))
                              )

    # グラフサイズの調整
    fig.update_layout(width=w, height=h)
    fig.write_html("sensitivity_specificity_plot.html")

    # fig.show()
    return fig

def plot_sensitivity_specificity_plotly_without_legened(results_df):
    # Separate the dataframes for specificity and sensitivity
    specificity_df = results_df[results_df['Metric'] == 'Specificity']
    sensitivity_df = results_df[results_df['Metric'] == 'Sensitivity']

    # Plotly Graph Objectsを使用してプロット
    fig = go.Figure()

    # 特異性
    fig.add_trace(go.Scatter(
        x=specificity_df['Threshold'],
        y=specificity_df['Value'],
        marker=dict(color='green'),
        mode='lines+markers',
        name='Specificity',
        text=[f'Threshold: {th}, Specificity: {val:.8f}' for th, val in zip(specificity_df['Threshold'], specificity_df['Value'])],
        hoverinfo='text',
        showlegend=False 
    ))
    
    # 感度
    fig.add_trace(go.Scatter(
        x=sensitivity_df['Threshold'],
        y=sensitivity_df['Value'],
        marker=dict(color='orange'),
        mode='lines+markers',
        name='Sensitivity',
        text=[f'Threshold: {th}, Sensitivity: {val:.8f}' for th, val in zip(sensitivity_df['Threshold'], sensitivity_df['Value'])],
        hoverinfo='text',
        showlegend=False
    ))

    # Y軸のフォーマット設定
    fig.update_yaxes(tickformat=".1f")

    # グラフのレイアウト設定
    fig.update_layout(title='Sensitivity and Specificity for each threshold',
                      xaxis_title='Threshold',
                      yaxis_title='Sensitivity/Specificity',
                      plot_bgcolor='rgba(243, 243, 243, 1)',
                      paper_bgcolor='rgba(243, 243, 243, 0)',
                      font=dict(family="Arial, sans-serif", size=12, color="black"),
                      legend=dict(y=0.075, x=0.75, xanchor='right', yanchor='bottom', 
                              bgcolor='rgba(243, 243, 243, 1)',
                              font=dict(family="Arial, sans-serif", size=12, color="black"))
                              )

    # グラフサイズの調整
    fig.update_layout(width=600, height=600)
    fig.write_html("sensitivity_specificity_plot.html")

    # fig.show()
    return fig


# Code below is adapted from Netflix's VMAF and BesenbacherLab's ROC-utils
# https://github.com/Netflix/vmaf/
# https://github.com/BesenbacherLab/ROC-utils
# Modifications: np.float -> np.float64

def compute_midrank(x):
    """Computes midranks.
    Args:
       x - a 1D numpy array
    Returns:
       array of midranks
    """
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    T = np.zeros(N, dtype=np.float64)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5*(i + j - 1)
        i = j
    T2 = np.empty(N, dtype=np.float64)
    # Note(kazeevn) +1 is due to Python using 0-based indexing
    # instead of 1-based in the AUC formula in the paper
    T2[J] = T + 1
    return T2


def fastDeLong(predictions_sorted_transposed, label_1_count):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    tx = np.empty([k, m], dtype=np.float64)
    ty = np.empty([k, n], dtype=np.float64)
    tz = np.empty([k, m + n], dtype=np.float64)
    for r in range(k):
        tx[r, :] = compute_midrank(positive_examples[r, :])
        ty[r, :] = compute_midrank(negative_examples[r, :])
        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    v01 = (tz[:, :m] - tx[:, :]) / n
    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def calc_pvalue(aucs, sigma):
    """Computes log(10) of p-values.
    Args:
       aucs: 1D array of AUCs
       sigma: AUC DeLong covariances
    Returns:
       log10(pvalue)
    """
    l = np.array([[1, -1]])
    z = np.abs(np.diff(aucs)) / np.sqrt(np.dot(np.dot(l, sigma), l.T))
    return np.log10(2) + scipy.stats.norm.logsf(z, loc=0, scale=1) / np.log(10)


def compute_ground_truth_statistics(ground_truth):
    assert np.array_equal(np.unique(ground_truth), [0, 1])
    order = (-ground_truth).argsort()
    label_1_count = int(ground_truth.sum())
    return order, label_1_count


def delong_roc_variance(ground_truth, predictions):
    """
    Computes ROC AUC variance for a single set of predictions
    Args:
       ground_truth: np.array of 0 and 1
       predictions: np.array of floats of the probability of being class 1
    """
    order, label_1_count = compute_ground_truth_statistics(ground_truth)
    predictions_sorted_transposed = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov


def delong_roc_test(ground_truth, predictions_one, predictions_two):
    """
    Computes log(p-value) for hypothesis that two ROC AUCs are different
    Args:
       ground_truth: np.array of 0 and 1
       predictions_one: predictions of the first model,
          np.array of floats of the probability of being class 1
       predictions_two: predictions of the second model,
          np.array of floats of the probability of being class 1
    """
    order, label_1_count = compute_ground_truth_statistics(ground_truth)
    predictions_sorted_transposed = np.vstack((predictions_one, predictions_two))[:, order]
    aucs, delongcov = fastDeLong(predictions_sorted_transposed, label_1_count)
    return calc_pvalue(aucs, delongcov)

# Calculate AUC confidence interval (95%)
def compute_auc_confidence_interval(auc, var, confidence_level=0.95):
    alpha = 1 - confidence_level
    z_score = scipy.stats.norm.ppf(1 - alpha/2)  # 2-tailed z score
    se = np.sqrt(var)  # Calculate SE from variance
    lower_bound = auc - z_score * se
    upper_bound = auc + z_score * se
    return lower_bound, upper_bound



#### 前処理．SQLから抽出したallmut.csvを編集する．(ここは処理済み)

In [None]:
all_mut_default_colnames: list = [
    "disase", "gene", "chrom", "genename", "gdbid", "omimid", "amino", 
    "deletion", "insertion", "codon", "codonAff", "descr", "refseq", "hgvs", 
    "hgvsAll", "dbsnp", "chromosome", "startCoord", "endCoord", 
    "expected_inheritance", "gnomad_AC", "gnomad_AF", "gnomad_AN", "tag", 
    "dmsupport", "rankscore", "mutype", "author", "title", "fullname", 
    "allname", "vol", "page", "year", "pmid", "pmidAll", "reftag", "comments", 
    "acc_num", "new_date", "base", "clinvarID", "clinvar_clnsig"
]

allmut: pd.DataFrame = pd.read_csv(
    'allmut.csv', sep=';', encoding='cp1252', names=all_mut_default_colnames, 
    skiprows=1,low_memory=False)

allmut = allmut[
    ["gene", "genename", "mutype", "clinvar_clnsig", "tag",
     "refseq", "hgvs", "hgvsAll", "chromosome", "startCoord", "endCoord", 
     "amino", "deletion", "insertion", "expected_inheritance", "gnomad_AF"]]

# Drop non-numeric values in 'startCoord'
allmut = allmut.dropna(subset=['startCoord'])

# Drop duplicates in 'chrom', 'startCoord', and 'endCoord'
allmut = allmut.drop_duplicates(subset=['chromosome', 'startCoord', 'endCoord'])

# Extract tag == "DM" from allmut
allmut_dm = allmut[allmut.tag == "DM"].copy()
print(f"A total of {len(allmut_dm)} DM mutations are found in allmut.")

allmut_dm['startCoord'] = allmut_dm['startCoord'].astype(int)
allmut_dm = allmut_dm.rename(columns={'chromosome': 'CHROM', 'startCoord': 'POS_hg38'})

# Fillna with empty string in "gnomad_AF" colmun in allmut_dm
# Extratct MAF 0 from allmut_dm
allmut_dm['gnomad_AF'].fillna(0, inplace=True)
allmut_dm_maf0 = allmut_dm[allmut_dm['gnomad_AF'] == 0].copy()
print(f"A total of {len(allmut_dm_maf0)} DM mutations are found in allmut with MAF 0.")

# Extract non-deletion or non-insertion from allmut_dm
allmut_dm_maf0_snv = allmut_dm_maf0[(allmut_dm_maf0['deletion'].isnull()) & (allmut_dm_maf0['insertion'].isnull())]
print(f"A total of {len(allmut_dm_maf0_snv)} DM mutations are found in allmut with MAF 0 and non-deletion or non-insertion.")

# Extract the mutation type from the mutype column
splice_mutations = allmut_dm_maf0_snv[allmut_dm_maf0_snv["mutype"].str.contains("splice")].copy()
non_splice_mutations = allmut_dm_maf0_snv[~allmut_dm_maf0_snv["mutype"].str.contains("splice")]
print(f"Splicing_DM: {len(splice_mutations)}, Non-splicing_DM: {len(non_splice_mutations)}")

# Convert startCoord to hg19
from liftover import get_lifter

def _liftover_to_hg19(chrom, pos):
    converter = get_lifter('hg38', 'hg19')
    result = converter.query(chrom, pos)
    if result:
        return result[0]
    else:
        return None
    
def anno_hg19_pos(row):
    converted = _liftover_to_hg19(row['CHROM'], row['POS_hg38'])
    if converted:
        return converted[0]
    else:
        return None

allmut_dm_maf0_snv['POS_hg19'] = allmut_dm_maf0_snv.parallel_apply(anno_hg19_pos, axis=1)
# allmut_dm_maf0_snv.to_pickle('allmut_dm_maf0_snv_liftover.pkl', mode='x')

#### 計算済み - 重みづけのパターンの探索

In [None]:
from ortools.sat.python import cp_model

class SolutionCollector(cp_model.CpSolverSolutionCallback):
    def __init__(self, variables):
        cp_model.CpSolverSolutionCallback.__init__(self)
        self.__variables = variables
        self.__solutions = []

    def OnSolutionCallback(self):
        solution = {v.Name(): self.Value(v) for v in self.__variables}
        self.__solutions.append(solution)

    def GetAllSolutions(self):
        return self.__solutions

def find_all_solutions(lowerlimit: int):
    # Initialize the model
    model = cp_model.CpModel()

    # Define the variables (s1, s2, ..., s15)
    s = {i: model.NewIntVar(lowerlimit, 9, f's{i}') for i in range(1, 16)}

    """
    Rule 1	Score 1 > Score 2 > Score 3 = 0 > Score 15
    Rule 2	Score 9 ≥ Score 7 > Score 6 ≥ 0 ≥ Score 5 > Score 4
    Rule 3	Score 10 > Score 8 > Score 9 > Score 11 ≥ 0
    Rule 4	Score 14 > 0 ≥ Score 13 ≥ Score 4 ≥ Score12
    Rule 5	Score 1 + Score 10 + Score 14 = 9
    Rule 6	Score 3 + Score 11 + Score 12 = 0
            Score 15 + Score 4 
    """
    # Add constraints
    # Rule 1: 
    # Most higheset score 
    # model.Add(s[1] + s[10] + s[14] == 9)
    # model.Add(s[15] + s[10] + s[14] == 0)

    # Baseline constraints
    # model.Add(s[15] + s[4] <= 0)
    # model.Add(s[3] + s[11] + s[12] == 0)    #
    # model.Add(s[15] + s[11] + s[12] == 0)    #
    # model.Add(s[1] + s[7] < 9)
    # model.Add(s[1] == 3)

    
    """
    Maximum: s[1] + s[10] + s[14]
    Minimum: s[15] + s[4]
    Max - min > 9
    """
    # model.Add(s[1] + s[10] + s[14] - s[15] - s[4] > 9)

    # Knowledge-based
    # s[15] < 0 = s[3] < s[2] < s[1]
    model.Add(s[15] < 0)
    model.Add(0 == s[3])
    model.Add(s[3] < s[2])
    model.Add(s[2] < s[1])  

    # s[4] < s[5] < 0 <= s[6] < s[7] <= s[9] < s[8] < s[10]
    model.Add(s[4] < s[5])
    model.Add(s[5] < 0)
    model.Add(0 <= s[6])
    model.Add(s[6] < s[7])  
    model.Add(s[7] <= s[9])  
    model.Add(s[9] < s[8])
    model.Add(s[8] < s[10])  
    
    # 0 < s[11] && s[6] <= s[11] <= s[9]
    model.Add(0 < s[11])
    model.Add(s[6] <= s[11])
    model.Add(s[11] <= s[9]) 

    """
    ClinVarでは, Benignだけど, CaonicalでSpliceAIも高く病原性が疑われそうな
    バリアントをPriortiyScore 0 とした.
    s[15] + s[10] + s[14] = 0
    """
    # model.Add(s[15] + s[10] + s[14] == 0)

    """
    canonical splice siteのバリアントは, ClinGenのガイドラインを参考にすると, 
    insilico予測ツールの結果をあまり重視していないようだ. 
    そのため, s[14] < s[11] という制約を追加した．
    s[12] < 0 && 0 < s[14] && s[12] < s[13] < s[14] < s[11]
    """
    model.Add(s[12] < 0)
    model.Add(s[12] < s[13])
    model.Add(s[13] < s[14])
    model.Add(0 < s[14])
    model.Add(s[14] < s[11])

    # model.Add(s[1] + s[10] + s[14] - s[15] - s[4] > 9)

    # Create a solver and solve the model
    solver = cp_model.CpSolver()
    solution_collector = SolutionCollector([s[i] for i in range(1, 16)])
    solver.SearchForAllSolutions(model, solution_collector)
    
    # Return all solutions
    return solution_collector.GetAllSolutions()

# Find all solutions

for lowerlimit in range(-5, -4):
    all_solutions = find_all_solutions(lowerlimit=lowerlimit)
    print(f'Total solutions found (lowerlimit {lowerlimit}): {len(all_solutions)}')


In [16]:
all_solutions_pkl = 'all_solutions_-5-9_withoutbaseline.pkl'

In [17]:
# To pickle the results of ortools
import pickle
with open(all_solutions_pkl, 'wb') as f:
    pickle.dump(all_solutions, f)

#### Liftoverなどの処理後（ここから解析すればOK）

In [None]:
# Loading allmut variants from pickle
allmut_dm_maf0_snv_hg19 = pd.read_pickle('allmut_dm_maf0_snv_liftover.pkl')

# Rename POS_hg19 to POS
allmut_dm_maf0_snv_hg19.rename(columns={'POS_hg19': 'POS'}, inplace=True)

# Drop unknown positions in 'POS' column and assign integer type
allmut_dm_maf0_snv_hg19.dropna(subset=['POS'], inplace=True)
allmut_dm_maf0_snv_hg19 = allmut_dm_maf0_snv_hg19.astype({'POS': int})

# Change object name to allmut
allmut = allmut_dm_maf0_snv_hg19

# Generate ID column
allmut['ID'] = allmut['CHROM'].astype(str) + '-' + allmut['POS'].astype(str) + '-' + allmut['hgvs']

# Extract useful columns
allmut = allmut[['ID', 'mutype', 'clinvar_clnsig', 'tag', 'deletion', 'insertion', 'expected_inheritance', 'gnomad_AF']]

# Load VCF file annoteted by analysis pipeline

# df = pd.read_pickle('splai_vep_vcfs/hgmd_dm/allchr.DM.splai.vep.nondel.enst.prescore.hgnconly.v2.pkl')
df = pd.read_pickle("variant_data_set_vcfs/hgmd_all.prescore.onlyhgnc.pkl")

df['HGVSc'] = df['HGVSc'].str.replace('c.', '')
df['ID'] = df['CHROM'].astype(str) + '-' + df['POS'].astype(str) + '-' + df['HGVSc']

# merge df and allmut on 'ID' column with inner join
print(len(df))
df = pd.merge(df, allmut, on='ID', how='inner')
print(len(df))

exclude_csq = {
    '3_prime_UTR_variant', '5_prime_UTR_variant', 'mature_miRNA_variant',
    'mature_miRNA_variant', 'downstream_gene_variant', 'upstream_gene_variant',
    'non_coding_transcript_exon_variant'
}

exclude_non_spl_dm: set = {'splice_region_variant'}

def is_orf_variants(row):
    csqs: list = row['Consequence'].split('&')
    if set(csqs).isdisjoint(exclude_csq):
        return True
    else:
        return False
    
def is_non_spl_tn(row):
    csqs: list = row['Consequence'].split('&')
    if set(csqs).isdisjoint(exclude_non_spl_dm):
        return True
    else:
        return False

def is_gnomad_tn(row):
    if row['gnomad_AF'] == 0:
        return True
    else:
        return False


df = df[df.apply(is_orf_variants, axis=1)]

# df_spl contains splicing mutations (splice, canonical-splice, exonic-splice)
df_spl = df[df['mutype'].str.contains('splice')].copy()

# df_non_spl contains non-splicing mutations
df_non_spl = df[df['mutype'].str.contains('missense|nonsense|synonymous')].copy()
# df_non_spl = df[df['mutype'].str.contains('missense|synonymous')].copy()
# df_non_spl = df[df['mutype'].str.contains('missense|nonsense')].copy()
# df_non_spl = df[df['mutype'].str.contains('missense')].copy()
# df_non_spl = df[df['mutype'].str.contains('synonymous')].copy()
# df_non_spl = df[df['mutype'].str.contains('nonsense')].copy()

df_non_spl = df_non_spl[df_non_spl.apply(is_non_spl_tn, axis=1)]

print(f"Splicing: {len(df_spl)}, Non-splicing: {len(df_non_spl)}, total: {len(df_spl) + len(df_non_spl)}")

179423
126859
Splicing: 18432, Non-splicing: 102095, total: 120527


In [31]:
### Annotating the label and variant_id (CHROM-POS-REF-ALT)
### When mutype is splice, the label is 1, otherwise 0
df_gnomad = pd.read_pickle('variant_data_set_vcfs/gnomad_all.prescore.onlyhgnc.pkl')
df_gnomad = df_gnomad[df_gnomad.apply(is_orf_variants, axis=1)]

df_spl['LABEL'] = 1
df_non_spl['LABEL'] = 0
df_gnomad['LABEL'] = 0

df_spl['variant_id'] = df_spl['CHROM'].astype(str) + '-' + df_spl['POS'].astype(str) + '-' + df_spl['REF'] + '-' + df_spl['ALT']
df_non_spl['variant_id'] = df_non_spl['CHROM'].astype(str) + '-' + df_non_spl['POS'].astype(str) + '-' + df_non_spl['REF'] + '-' + df_non_spl['ALT']
df_gnomad['variant_id'] = df_gnomad['CHROM'].astype(str) + '-' + df_gnomad['POS'].astype(str) + '-' + df_gnomad['REF'] + '-' + df_gnomad['ALT']

### Create a dataframe tp (True Positive)
tp = df_spl.copy()

# Concatenate df_gnomad and df_non_spl
n = int(len(tp) / 2)
# df_non_spl = df_non_spl.sample(n=int(n/10), random_state=42)
# df_gnomad = df_gnomad.sample(n=n, random_state=42)
tn = pd.concat([df_gnomad, df_non_spl], ignore_index=True)
tn = tn.drop_duplicates(subset=['variant_id'], keep='first')

# tn = df_non_spl.copy()
tn = df_gnomad.copy()

### Exclude non-ORF variants
tn['is_ORF'] = tn.apply(is_orf_variants, axis=1)
tn = tn[tn['is_ORF']]

## Summary of the dataset
print(f"TP: {len(tp)}, TN: {len(tn)}")

TP: 18432, TN: 78048


### trainとtestのデータセットの分割

In [32]:
frac = 0.8
random_state = 12

print(f"TP: {len(tp)}, TN: {len(tn)}")
tp = tp[tp['ENST_Full'] != "[Warning] ENST_with_Ver_not_available"]
tn = tn[tn['ENST_Full'] != "[Warning] ENST_with_Ver_not_available"]
tp = tp[tp['maxsplai'] != "NA"]
tn = tn[tn['maxsplai'] != "NA"]
print(f"Filtered out [Warning] ENST_with_Ver_not_available, TP: {len(tp)}, TN: {len(tn)}")

# Split the data into training and test sets
tp_train = tp.sample(frac=frac, random_state=random_state)
tp_test = tp.drop(tp_train.index)
tn_train = tn.sample(frac=frac, random_state=random_state)
tn_test = tn.drop(tn_train.index)

# Save the dataframes as pickle files
tp_train.to_pickle(f'train_test_pkls/tp_prescore_train_{random_state}.pkl')
tp_test.to_pickle(f'train_test_pkls/tp_prescore_test_{random_state}.pkl')
tn_train.to_pickle(f'train_test_pkls/tn_prescore_train_{random_state}.pkl')
tn_test.to_pickle(f'train_test_pkls/tn_prescore_test_{random_state}.pkl')

TP: 18432, TN: 78048


### 変数によるスコアリング（具体的なスコアはまだ入れない）

In [14]:
## Scoring with "s*"
scoring = Scoring()

# Load the dataframes
tp_train = pd.read_pickle(f'train_test_pkls/tp_prescore_train_{random_state}.pkl')
tn_train = pd.read_pickle(f'train_test_pkls/tn_prescore_train_{random_state}.pkl')

# Scoring
tp_train['insilico_screening'] = tp_train.parallel_apply(scoring.insilico_screening, axis=1)
tp_train['clinvar_screening'] = tp_train.parallel_apply(scoring.clinvar_screening, axis=1)
tp_train['recalibrated_splai'] = tp_train.parallel_apply(scoring.recal_scores_in_canon, axis=1)
tn_train['insilico_screening'] = tn_train.parallel_apply(scoring.insilico_screening, axis=1)
tn_train['clinvar_screening'] = tn_train.parallel_apply(scoring.clinvar_screening, axis=1)
tn_train['recalibrated_splai'] = tn_train.parallel_apply(scoring.recal_scores_in_canon, axis=1)

tp_train = tp_train[tp_train['insilico_screening'] != 'Not available']
tn_train = tn_train[tn_train['insilico_screening'] != 'Not available']
tp_train = tp_train[['variant_id', 'LABEL', 'maxsplai', 'recalibrated_splai', 'insilico_screening', 'clinvar_screening']]
tn_train = tn_train[['variant_id', 'LABEL', 'maxsplai', 'recalibrated_splai', 'insilico_screening', 'clinvar_screening']]

# Save the dataframes as pickle files
tp_train.to_pickle(f'train_test_pkls/tp_prescore_train_{random_state}_scored.pkl')
tn_train.to_pickle(f'train_test_pkls/tn_prescore_train_{random_state}_scored.pkl')

### 具体的なスコアをマッピングしてPSを算出する関数
def map_and_calc_score(row, score_map: dict) -> int:
    """
    Map the score to the solution
    s1, s2, s3, and s15 are clinvar_screening
    s4, s5, s6, s7, s8, s9, s10 and s11 are insilico_screening
    s12, s13 and s14 are recalibrated_splai
    PriortiyScore is the sum of the "clinvar_screening", "insilico_screening", and "recalibrated_splai"
    """
    # print(row)
    return int(score_map[row['recalibrated_splai']]) + int(score_map[row['insilico_screening']]) + int(score_map[row['clinvar_screening']])

data = pd.concat([tp_train, tn_train], ignore_index=True)
data.drop_duplicates(subset='variant_id', keep=False, inplace=True)
# data = data[data['PriorityScore'] != 'Not available']
data['LABEL'] = data['LABEL'].astype(int)
# data['PriorityScore'] = data['PriorityScore'].astype(float)
data['maxsplai'] = data['maxsplai'].astype(float)
ground_truth = np.array(data['LABEL'])
predictions_sp = np.array(data['maxsplai'])
auc2, var2 = delong_roc_variance(ground_truth, predictions_sp)
cilower2, ciupper2 = compute_auc_confidence_interval(auc2, var2)

print(f"AUC - SpliceAI (95%CI) : {auc2:.3f} [{cilower2:.4f}-{ciupper2:.4f}]")

AUC - SpliceAI (95%CI) : 0.983 [0.9818-0.9849]


### 重みづけのパターンの呼び出し 

In [11]:
# Load the results of ortools
import pickle
all_solutions = pickle.load(open(all_solutions_pkl, 'rb'))

### 最適な重み付けの検索 - search_optimal.pyで4つのチャンクに分けて実行

In [None]:
start = 0

def search_optimal(all_solutions, start, tp_train, tn_train, auc2=auc2):
    results = []
    buf: float = auc2
    max_auc: float = 0

    for i, solution in enumerate(all_solutions):
        if i <= start:
            continue

        solution.update({"s0": 0})

        tp_train['PriorityScore'] = tp_train.parallel_apply(
            map_and_calc_score, args=(solution,), axis=1)
        tn_train['PriorityScore'] = tn_train.parallel_apply(
            map_and_calc_score, args=(solution,), axis=1)

        data = pd.concat([tp_train, tn_train], ignore_index=True)
        data.drop_duplicates(subset='variant_id', keep=False, inplace=True)
        data = data[data['PriorityScore'] != 'Not available']
        data['LABEL'] = data['LABEL'].astype(int)
        data['PriorityScore'] = data['PriorityScore'].astype(float)
        data['maxsplai'] = data['maxsplai'].astype(float)
        ground_truth = np.array(data['LABEL'])

        predictions_fw = np.array(data['PriorityScore'])
        auc1, var1 = delong_roc_variance(ground_truth, predictions_fw)
        cilower1, ciupper1 = compute_auc_confidence_interval(auc1, var1)

        results.append({'index': i+1, 'solution': solution, 
                        'AUC': auc1, 'CI_lower': cilower1, 'CI_upper': ciupper1})

        max_auc = max(max_auc, auc1)
        
        if auc1 > buf:
            buf = auc1
            print(f"\n===== New best AUC: {auc1:.10f} with solution {i+1} =======")
            print(f"New best solution {i}: {solution} \n")
            predictions_sp = np.array(data['maxsplai'])
            auc2, var2 = delong_roc_variance(ground_truth, predictions_sp)
            cilower2, ciupper2 = compute_auc_confidence_interval(auc2, var2)
            p_value_log = delong_roc_test(ground_truth, predictions_fw, predictions_sp)
            print(f"AUC - Framework (95%CI): {auc1:.3f} [{cilower1:.4f}-{ciupper1:.4f}]")
            print(f"AUC - SpliceAI (95%CI) : {auc2:.3f} [{cilower2:.4f}-{ciupper2:.4f}]")
            print(f"p-value (DeLong Test)  : {10**p_value_log[0][0]:.2e}\n")

        if i % 10 == 0:
            print(f"###  Processed {i+1} solutions. Max AUC: {max_auc:.8f}  ###")
    
    return results


# Load train data from pickle
tp_train = pd.read_pickle(f'train_test_pkls/tp_prescore_train_{random_state}_scored.pkl')
tn_train = pd.read_pickle(f'train_test_pkls/tn_prescore_train_{random_state}_scored.pkl')

results = search_optimal(all_solutions, start=0, tp_train=tp_train, tn_train=tn_train)
# Save the results with start index
results_pkl = f'results/results_{start}_to_end.pkl'
with open(results_pkl, 'wb') as f:
    pickle.dump(results, f)


# for i, solution in enumerate(all_solutions):
#     if i < start:
#         continue
    
#     solution.update({"s0": 0})
    
#     tp_train['PriorityScore'] = tp_train.parallel_apply(
#         map_and_calc_score, args=(solution,), axis=1)
#     tn_train['PriorityScore'] = tn_train.parallel_apply(
#         map_and_calc_score, args=(solution,), axis=1)
    
#     data = pd.concat([tp_train, tn_train], ignore_index=True)
#     data.drop_duplicates(subset='variant_id', keep=False, inplace=True)
#     data = data[data['PriorityScore'] != 'Not available']
#     data['LABEL'] = data['LABEL'].astype(int)
#     data['PriorityScore'] = data['PriorityScore'].astype(float)
#     data['maxsplai'] = data['maxsplai'].astype(float)
#     ground_truth = np.array(data['LABEL'])

#     predictions_fw = np.array(data['PriorityScore'])
#     auc1, var1 = delong_roc_variance(ground_truth, predictions_fw)
#     cilower1, ciupper1 = compute_auc_confidence_interval(auc1, var1)

#     results.append({'index': i+1, 'solution': solution, 
#                     'AUC': auc1, 'CI_lower': cilower1, 'CI_upper': ciupper1})

#     if auc1 > buf:
#         buf = auc1
#         print(f"\n===== New best AUC: {auc1:.10f} with solution {i+1} =======")
#         print(f"New best solution {i}: {solution} \n")
#         predictions_sp = np.array(data['maxsplai'])
#         auc2, var2 = delong_roc_variance(ground_truth, predictions_sp)
#         cilower2, ciupper2 = compute_auc_confidence_interval(auc2, var2)
#         p_value_log = delong_roc_test(ground_truth, predictions_fw, predictions_sp)
#         print(f"AUC - Framework (95%CI): {auc1:.3f} [{cilower1:.4f}-{ciupper1:.4f}]")
#         print(f"AUC - SpliceAI (95%CI) : {auc2:.3f} [{cilower2:.4f}-{ciupper2:.4f}]")
#         print(f"p-value (DeLong Test)  : {10**p_value_log[0][0]:.2e}\n")
#         print("===========================================================")

#     if i % 10 == 0:
#         print(f"###  Processed {i+1} solutions  ###")
#         print(f"Solution {i+1}: {solution}, AUC: {auc1:.10f}")

#     if i > 1000:
#         break

###  Processed 11 solutions. Max AUC: 0.96722315  ###
###  Processed 21 solutions. Max AUC: 0.96757415  ###
###  Processed 31 solutions. Max AUC: 0.96757415  ###
###  Processed 41 solutions. Max AUC: 0.96757415  ###
###  Processed 51 solutions. Max AUC: 0.96757415  ###


Exception ignored in: <function _releaseLock at 0x7086c127a940>
Traceback (most recent call last):
  File "/home/utsu/miniforge3/envs/nar/lib/python3.8/logging/__init__.py", line 227, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 


###  Processed 61 solutions. Max AUC: 0.96757415  ###


## Scoring System Test

In [44]:
results = []
buf: float = 0.984

scoring = Scoring()

tp_train = pd.read_pickle(f'train_test_pkls/tp_prescore_train_{random_state}.pkl')
tn_train = pd.read_pickle(f'train_test_pkls/tn_prescore_train_{random_state}.pkl')

tp_train['recalibrated_splai'] = tp_train.parallel_apply(scoring.recal_scores_in_canon, axis=1)
tp_train['insilico_screening'] = tp_train.parallel_apply(scoring.insilico_screening, axis=1)
tp_train['clinvar_screening'] = tp_train.parallel_apply(scoring.clinvar_screening, axis=1)
tp_train = tp_train[tp_train['insilico_screening'] != 'Not available']

tn_train['recalibrated_splai'] = tn_train.parallel_apply(scoring.recal_scores_in_canon, axis=1)
tn_train['insilico_screening'] = tn_train.parallel_apply(scoring.insilico_screening, axis=1)
tn_train['clinvar_screening'] = tn_train.parallel_apply(scoring.clinvar_screening, axis=1)
tn_train = tn_train[tn_train['insilico_screening'] != 'Not available']

tp_train = tp_train[['variant_id', 'LABEL', 'maxsplai', 'recalibrated_splai', 'insilico_screening', 'clinvar_screening']]
tn_train = tn_train[['variant_id', 'LABEL', 'maxsplai', 'recalibrated_splai', 'insilico_screening', 'clinvar_screening']]

def map_and_calc_score(row, score_map: dict):
    """
    Mapping the scores.
    s1, s2, s3, and s15: clinvar_screening
    s4, s5, s6, s7, s8, s9, s10 and s11: insilico_screening
    s12, s13 and s14: recalibrated_splai
    PriortiyScore is the sum of the "clinvar_screening", "insilico_screening", and "recalibrated_splai"
    """
    if row['recalibrated_splai'] == "s0":
        recal_score: int = 0
    else:
        recal_score: int = int(score_map[row['recalibrated_splai']])
        
    priority_score = recal_score + int(score_map[row['insilico_screening']]) + int(score_map[row['clinvar_screening']])
    return priority_score

In [45]:
solution = all_solutions[4]

In [56]:
tp_train['PriorityScore'] = tp_train.parallel_apply(map_and_calc_score, args=(solution,), axis=1)
tn_train['PriorityScore'] = tn_train.parallel_apply(map_and_calc_score, args=(solution,), axis=1)
data = pd.concat([tp_train, tn_train], ignore_index=True)
data.drop_duplicates(subset='variant_id', keep=False, inplace=True)

data['LABEL'] = data['LABEL'].astype(int)
# Extract rows with PriorityScore not 'Not available'
data = data[data['PriorityScore'] != 'Not available']
data['PriorityScore'] = data['PriorityScore'].astype(float)
data['maxsplai'] = data['maxsplai'].astype(float)

ground_truth = np.array(data['LABEL'])
predictions_fw = np.array(data['PriorityScore'])

auc1, var1 = delong_roc_variance(ground_truth, predictions_fw)
cilower1, ciupper1 = compute_auc_confidence_interval(auc1, var1)

i = 1
if auc1 > buf:
    buf = auc1
    print(f"\n===== New best AUC: {auc1:.10f} with solution {i+1} =======")
    print(f"New best solution {i}: {solution} \n")
    predictions_sp = np.array(data['maxsplai'])
    auc2, var2 = delong_roc_variance(ground_truth, predictions_sp)
    cilower2, ciupper2 = compute_auc_confidence_interval(auc2, var2)
    p_value_log = delong_roc_test(ground_truth, predictions_fw, predictions_sp)
    print(f"AUC - Framework (95%CI): {auc1:.3f} [{cilower1:.4f}-{ciupper1:.4f}]")
    print(f"AUC - SpliceAI (95%CI) : {auc2:.3f} [{cilower2:.4f}-{ciupper2:.4f}]")
    print(f"p-value (DeLong Test)  : {10**p_value_log[0][0]:.2e}\n")
    print("===========================================================")


New best solution 1: {'s1': 3, 's2': 2, 's3': 1, 's4': 0, 's5': 1, 's6': 2, 's7': 3, 's8': 4, 's9': 3, 's10': 9, 's11': 2, 's12': 0, 's13': 0, 's14': 1, 's15': 0} 

AUC - Framework (95%CI): 0.987 [0.9851-0.9888]
AUC - SpliceAI (95%CI) : 0.985 [0.9823-0.9880]
p-value (DeLong Test)  : 1.97e-01



In [54]:
print(auc1, cilower1, ciupper1)

0.9869208764921695 0.9850764378574689 0.9887653151268702


In [None]:
# tp_train['PriorityScore'] = tp_train.parallel_apply(scoring.calc_priority_score, axis=1)
# tp_train = scoring.calc_priority_score(tp_train)

# tp_train['PriorityScore'] = tp_train['insilico_screening'] + tp_train['clinvar_screening']
# If PriorityScore under 0, set it to 0
tp_train['PriorityScore'] = tp_train['PriorityScore'].apply(lambda x: 0 if x < 0 else x)

tn_train['insilico_screening'] = tn_train.parallel_apply(scoring.insilico_screening, axis=1)
tn_train['clinvar_screening'] = tn_train.parallel_apply(scoring.clinvar_screening, axis=1)
# tn_train['PriorityScore'] = tn_train.parallel_apply(scoring.calc_priority_score, axis=1)
# tn_train = scoring.calc_priority_score(tn_train)
tn_train = tn_train[tn_train['insilico_screening'] != 'Not available']
tn_train['PriorityScore'] = tn_train['insilico_screening'] + tn_train['clinvar_screening']
# If PriorityScore under 0, set it to 0
tn_train['PriorityScore'] = tn_train['PriorityScore'].apply(lambda x: 0 if x < 0 else x)



In [None]:


for i, solution in enumerate(all_solutions):
    # if i < 2000:
    #     continue

    ths_scores = {'clinvar_same_pos': solution['s1'],
            'clinvar_same_motif': solution['s2'],
            'clinvar_else': solution['s3'],
            'non_canon_splai_lte_0.1_outside': solution['s4'],    
            'non_canon_splai_lte_0.1_other': solution['s5'],
            'non_canon_splai_bet_0.1_0.2': solution['s6'],
            'non_canon_splai_gte_0.2': solution['s7'],
            'canon_strong': solution['s8'], 
            'canon_moderate': solution['s9'], 
            'frameshift_nmd_eloF': solution['s10'], 
            'frameshift_nmd_not_eloF': solution['s11'],
            'canon_splai_lte_0.1': solution['s12'],
            'canon_splai_bet_0.1_0.2': solution['s13'],
            'canon_splai_gte_0.2': solution['s14'],
            'clinvar_blb': solution['s15']
            }
    

    # Extract the columns needed
    tp_train = tp_train[['variant_id', 'LABEL', 'PriorityScore', 'maxsplai']]
    tn_train = tn_train[['variant_id', 'LABEL', 'PriorityScore', 'maxsplai']]

    ### ========================================================== ##
    data = pd.concat([tp_train, tn_train], ignore_index=True)
    data.drop_duplicates(subset='variant_id', keep=False, inplace=True)

    # Cast the columns to float type
    data['LABEL'] = data['LABEL'].astype(int)
    # Extract rows with PriorityScore not 'Not available'
    data = data[data['PriorityScore'] != 'Not available']
    data['PriorityScore'] = data['PriorityScore'].astype(float)
    data['maxsplai'] = data['maxsplai'].astype(float)

    ## DeLong test and AUC confidence interval
    ground_truth = np.array(data['LABEL'])
    predictions_fw = np.array(data['PriorityScore'])

    auc1, var1 = delong_roc_variance(ground_truth, predictions_fw)
    cilower1, ciupper1 = compute_auc_confidence_interval(auc1, var1)

    results.append(
        {'index': i+1, 's1': solution['s1'], 's2': solution['s2'], 
         's3': solution['s3'], 's4': solution['s4'], 's5': solution['s5'], 
         's6': solution['s6'], 's7': solution['s7'], 's8': solution['s8'], 
         's9': solution['s9'], 's10': solution['s10'], 's11': solution['s11'], 
         's12': solution['s12'], 's13': solution['s13'], 's14': solution['s14'], 's15': solution['s15'],
         'auROC': f"{auc1:.10f}, '95% Confidence Interval': {cilower1:.12f}-{ciupper1:.12f}"
        }
    )
    # logger.info(f"Processed solution {i+1}: AUC: {auc1:.10f}")
    if i % 50 == 0:
        print(f"###  Processed {i} solutions  ###")
    
    if auc1 > buf:
        buf = auc1
        print(f"\n===== New best AUC: {auc1:.10f} with solution {i+1} =======")
        print(f"New best solution {i}: {solution} \n")
        predictions_sp = np.array(data['maxsplai'])
        auc2, var2 = delong_roc_variance(ground_truth, predictions_sp)
        cilower2, ciupper2 = compute_auc_confidence_interval(auc2, var2)
        p_value_log = delong_roc_test(ground_truth, predictions_fw, predictions_sp)
        print(f"AUC - Framework (95%CI): {auc1:.3f} [{cilower1:.4f}-{ciupper1:.4f}]")
        print(f"AUC - SpliceAI (95%CI) : {auc2:.3f} [{cilower2:.4f}-{ciupper2:.4f}]")
        print(f"p-value (DeLong Test)  : {10**p_value_log[0][0]:.2e}\n")
        print("===========================================================")


#### 昔の計算テスト

In [None]:
for i in range(len(tn_train)):
	col_num = tn_train.columns.get_loc('clinvar_same_pos')
	same_pos = tn_train.iat[i, tn_train.columns.get_loc('clinvar_same_pos')].replace("'", "")
	same_motif_clinsigs:list = tn_train.iat[i, tn_train.columns.get_loc('same_motif_clinsigs')]
	print(f"Pos: {same_pos}  Motifs: {same_motif_clinsigs}")
	if same_pos in ['Benign', 'Likely_benign', 'Benign/Likely_benign']:
		print("B/LB")
	else:
		if same_pos in ['Pathogenic', 'Likely_pathogenic', 'Pathogenic/Likely_pathogenic']:
			print("P/LP")
		else:
			if 'Pathogenic' in same_motif_clinsigs:
				print("Same motif pathogenic")
			elif 'pathogenic' in same_motif_clinsigs:
				print("Same motif pathogenic")
			else:
				print("VUS")

In [10]:
solution = all_solutions[4]
ths_scores = {'clinvar_same_pos': solution['s1'],
        'clinvar_same_motif': solution['s2'],
        'clinvar_else': solution['s3'],
        'non_canon_splai_lte_0.1_outside': solution['s4'],    
        'non_canon_splai_lte_0.1_other': solution['s5'],
        'non_canon_splai_bet_0.1_0.2': solution['s6'],
        'non_canon_splai_gte_0.2': solution['s7'],
        'canon_strong': solution['s8'], 
        'canon_moderate': solution['s9'], 
        'frameshift_nmd_eloF': solution['s10'], 
        'frameshift_nmd_not_eloF': solution['s11'],
        'canon_splai_lte_0.1': solution['s12'],
        'canon_splai_bet_0.1_0.2': solution['s13'],
        'canon_splai_gte_0.2': solution['s14'],
        'clinvar_blb': solution['s15']
        }
scoring = Scoring(ths=ths_scores)

In [None]:
tp_train = pd.read_pickle(f'train_test_pkls/tp_prescore_train_{random_state}.pkl')
tn_train = pd.read_pickle(f'train_test_pkls/tn_prescore_train_{random_state}.pkl')

tp_train['insilico_screening'] = tp_train.parallel_apply(scoring.insilico_screening, axis=1)
tp_train['clinvar_screening'] = tp_train.parallel_apply(scoring.clinvar_screening, axis=1)
# tp_train['PriorityScore'] = tp_train.parallel_apply(scoring.calc_priority_score, axis=1)
# tp_train = scoring.calc_priority_score(tp_train)
tp_train = tp_train[tp_train['insilico_screening'] != 'Not available']
tp_train['PriorityScore'] = tp_train['insilico_screening'] + tp_train['clinvar_screening']
# If PriorityScore under 0, set it to 0
tp_train['PriorityScore'] = tp_train['PriorityScore'].apply(lambda x: 0 if x < 0 else x)

In [13]:
tp_train[['clinvar_same_pos', 'same_motif_clinsigs', 'clinvar_screening', 'insilico_screening', 'PriorityScore']].head(10)

Unnamed: 0,clinvar_same_pos,same_motif_clinsigs,clinvar_screening,insilico_screening,PriorityScore
25929,'Pathogenic',[Pathogenic],2,5,7
93761,'Pathogenic',"[Pathogenic, Pathogenic/Likely_pathogenic, Pat...",2,1,3
121362,No_ClinVar_info_found,"[Likely_benign, Likely_benign, Pathogenic]",1,7,8
41523,'Pathogenic/Likely_pathogenic',"[Likely_pathogenic, Likely_pathogenic, Likely_...",2,7,9
87754,No_ClinVar_info_found,[No_ClinVar_info_found],0,5,5
40326,No_ClinVar_info_found,"[Likely_benign, Likely_benign, Likely_benign, ...",1,7,8
2221,No_ClinVar_info_found,[No_ClinVar_info_found],0,5,5
70327,No_ClinVar_info_found,[Likely_pathogenic],0,5,5
51058,No_ClinVar_info_found,[No_ClinVar_info_found],0,1,1
113035,No_ClinVar_info_found,"[Likely_benign, Likely_benign]",0,5,5


テストここまで -------------------

### Find optimal weights

In [None]:
results = []
buf: float = 0.984

for i, solution in enumerate(all_solutions):
    # if i < 2000:
    #     continue

    ths_scores = {'clinvar_same_pos': solution['s1'],
            'clinvar_same_motif': solution['s2'],
            'clinvar_else': solution['s3'],
            'non_canon_splai_lte_0.1_outside': solution['s4'],    
            'non_canon_splai_lte_0.1_other': solution['s5'],
            'non_canon_splai_bet_0.1_0.2': solution['s6'],
            'non_canon_splai_gte_0.2': solution['s7'],
            'canon_strong': solution['s8'], 
            'canon_moderate': solution['s9'], 
            'frameshift_nmd_eloF': solution['s10'], 
            'frameshift_nmd_not_eloF': solution['s11'],
            'canon_splai_lte_0.1': solution['s12'],
            'canon_splai_bet_0.1_0.2': solution['s13'],
            'canon_splai_gte_0.2': solution['s14'],
            'clinvar_blb': solution['s15']
            }
    
    scoring = Scoring(ths=ths_scores)

    tp_train = pd.read_pickle(f'train_test_pkls/tp_prescore_train_{random_state}.pkl')
    tn_train = pd.read_pickle(f'train_test_pkls/tn_prescore_train_{random_state}.pkl')

    tp_train['insilico_screening'] = tp_train.parallel_apply(scoring.insilico_screening, axis=1)
    tp_train['clinvar_screening'] = tp_train.parallel_apply(scoring.clinvar_screening, axis=1)
    # tp_train['PriorityScore'] = tp_train.parallel_apply(scoring.calc_priority_score, axis=1)
    # tp_train = scoring.calc_priority_score(tp_train)
    tp_train = tp_train[tp_train['insilico_screening'] != 'Not available']
    tp_train['PriorityScore'] = tp_train['insilico_screening'] + tp_train['clinvar_screening']
    # If PriorityScore under 0, set it to 0
    tp_train['PriorityScore'] = tp_train['PriorityScore'].apply(lambda x: 0 if x < 0 else x)

    tn_train['insilico_screening'] = tn_train.parallel_apply(scoring.insilico_screening, axis=1)
    tn_train['clinvar_screening'] = tn_train.parallel_apply(scoring.clinvar_screening, axis=1)
    # tn_train['PriorityScore'] = tn_train.parallel_apply(scoring.calc_priority_score, axis=1)
    # tn_train = scoring.calc_priority_score(tn_train)
    tn_train = tn_train[tn_train['insilico_screening'] != 'Not available']
    tn_train['PriorityScore'] = tn_train['insilico_screening'] + tn_train['clinvar_screening']
    # If PriorityScore under 0, set it to 0
    tn_train['PriorityScore'] = tn_train['PriorityScore'].apply(lambda x: 0 if x < 0 else x)

    # Extract the columns needed
    tp_train = tp_train[['variant_id', 'LABEL', 'PriorityScore', 'maxsplai']]
    tn_train = tn_train[['variant_id', 'LABEL', 'PriorityScore', 'maxsplai']]

    ### ========================================================== ##
    data = pd.concat([tp_train, tn_train], ignore_index=True)
    data.drop_duplicates(subset='variant_id', keep=False, inplace=True)

    # Cast the columns to float type
    data['LABEL'] = data['LABEL'].astype(int)
    # Extract rows with PriorityScore not 'Not available'
    data = data[data['PriorityScore'] != 'Not available']
    data['PriorityScore'] = data['PriorityScore'].astype(float)
    data['maxsplai'] = data['maxsplai'].astype(float)

    ## DeLong test and AUC confidence interval
    ground_truth = np.array(data['LABEL'])
    predictions_fw = np.array(data['PriorityScore'])

    auc1, var1 = delong_roc_variance(ground_truth, predictions_fw)
    cilower1, ciupper1 = compute_auc_confidence_interval(auc1, var1)

    results.append(
        {'index': i+1, 's1': solution['s1'], 's2': solution['s2'], 
         's3': solution['s3'], 's4': solution['s4'], 's5': solution['s5'], 
         's6': solution['s6'], 's7': solution['s7'], 's8': solution['s8'], 
         's9': solution['s9'], 's10': solution['s10'], 's11': solution['s11'], 
         's12': solution['s12'], 's13': solution['s13'], 's14': solution['s14'], 's15': solution['s15'],
         'auROC': f"{auc1:.10f}, '95% Confidence Interval': {cilower1:.12f}-{ciupper1:.12f}"
        }
    )
    # logger.info(f"Processed solution {i+1}: AUC: {auc1:.10f}")
    if i % 50 == 0:
        print(f"###  Processed {i} solutions  ###")
    
    if auc1 > buf:
        buf = auc1
        print(f"\n===== New best AUC: {auc1:.10f} with solution {i+1} =======")
        print(f"New best solution {i}: {solution} \n")
        predictions_sp = np.array(data['maxsplai'])
        auc2, var2 = delong_roc_variance(ground_truth, predictions_sp)
        cilower2, ciupper2 = compute_auc_confidence_interval(auc2, var2)
        p_value_log = delong_roc_test(ground_truth, predictions_fw, predictions_sp)
        print(f"AUC - Framework (95%CI): {auc1:.3f} [{cilower1:.4f}-{ciupper1:.4f}]")
        print(f"AUC - SpliceAI (95%CI) : {auc2:.3f} [{cilower2:.4f}-{ciupper2:.4f}]")
        print(f"p-value (DeLong Test)  : {10**p_value_log[0][0]:.2e}\n")
        print("===========================================================")

# 1800から続き

### パフォーマンスの確認 - テストバリアントセットを使う

In [33]:
start = [0, 4050, 8100, 12151]

results_1 = pd.read_pickle(f'results/results_{start[0]}_to_end.pkl')
df1 = pd.DataFrame(results_1)
results_2 = pd.read_pickle(f'results/results_{start[1]}_to_end.pkl')
df2 = pd.DataFrame(results_2)
results_3 = pd.read_pickle(f'results/results_{start[2]}_to_end.pkl')
df3 = pd.DataFrame(results_3)
results_4 = pd.read_pickle(f'results/results_{start[3]}_to_end.pkl')
df4 = pd.DataFrame(results_4)

# Concatenate the dataframes
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
# Separate the solution into columns
df = pd.concat([df.drop(['solution'], axis=1), df['solution'].apply(pd.Series)], axis=1)
def calc_sample_var(row) -> float:
    score_arr = np.array([row['s1'], row['s2'], row['s3'], row['s4'], row['s5'],
                          row['s6'], row['s7'], row['s8'], row['s9'], row['s10'], 
                          row['s11'], row['s12'], row['s13'], row['s14'], row['s15']])
    return np.var(score_arr)

df['sample_var'] = df.apply(calc_sample_var, axis=1)
max_auc = df['AUC'].max()
df_max_auc = df.loc[df['AUC'] == df['AUC'].max(), :]
solution = df_max_auc.iloc[0, 1:].to_dict()

s = solution.pop('AUC')
s = solution.pop('CI_lower')
s = solution.pop('CI_upper')
s = solution.pop('sample_var')
# solution.update({"s0": 0})

In [34]:
random_state = 12

## Scoring with "s*"
scoring = Scoring()

tp_test = pd.read_pickle(f'train_test_pkls/tp_prescore_test_{random_state}.pkl')
tn_test = pd.read_pickle(f'train_test_pkls/tn_prescore_test_{random_state}.pkl')
tp_test['insilico_screening'] = tp_test.parallel_apply(scoring.insilico_screening, axis=1)
tp_test['clinvar_screening'] = tp_test.parallel_apply(scoring.clinvar_screening, axis=1)
tp_test['recalibrated_splai'] = tp_test.parallel_apply(scoring.recal_scores_in_canon, axis=1)
tn_test['insilico_screening'] = tn_test.parallel_apply(scoring.insilico_screening, axis=1)
tn_test['clinvar_screening'] = tn_test.parallel_apply(scoring.clinvar_screening, axis=1)
tn_test['recalibrated_splai'] = tn_test.parallel_apply(scoring.recal_scores_in_canon, axis=1)
# Save the dataframes as pickle files
tp_test.to_pickle(f'train_test_pkls/tp_prescore_test_{random_state}_scored.pkl')
tn_test.to_pickle(f'train_test_pkls/tn_prescore_test_{random_state}_scored.pkl')

In [35]:
def map_and_calc_score(row, score_map: dict) -> int:
    return int(score_map[row['recalibrated_splai']]) + int(score_map[row['insilico_screening']]) + int(score_map[row['clinvar_screening']])

# Load test data
tp_test = pd.read_pickle(f'train_test_pkls/tp_prescore_test_{random_state}_scored.pkl')
tn_test = pd.read_pickle(f'train_test_pkls/tn_prescore_test_{random_state}_scored.pkl')

tp_test['PriorityScore'] = tp_test.parallel_apply(
    map_and_calc_score, args=(solution,), axis=1)
tn_test['PriorityScore'] = tn_test.parallel_apply(
    map_and_calc_score, args=(solution,), axis=1)

# Extract the columns needed
tp_test = tp_test[['variant_id', 'LABEL', 'PriorityScore', 'maxsplai', 
				   'maxentscan_diff', 'maxpangolin', 'Squirls']]
tn_test = tn_test[['variant_id', 'LABEL', 'PriorityScore', 'maxsplai', 
				   'maxentscan_diff', 'maxpangolin', 'Squirls']]


In [36]:

data = pd.concat([tp_test, tn_test], ignore_index=True)
data.drop_duplicates(subset='variant_id', keep=False, inplace=True)
data = data[data['PriorityScore'] != 'Not available']
data['LABEL'] = data['LABEL'].astype(int)
data['PriorityScore'] = data['PriorityScore'].astype(float)
data['maxsplai'] = data['maxsplai'].astype(float)

# data['maxentscan_diff'] = data['maxentscan_diff'].replace('', np.nan)
# data['maxentscan_diff'] = data['maxentscan_diff'].fillna(np.nan)
data.loc[data['maxentscan_diff'] == '', 'maxentscan_diff'] = np.nan
data2 = data.copy()
data2= data2.dropna(subset=['maxentscan_diff'])
data2['maxentscan_diff'] = data2['maxentscan_diff'].astype(float)
# data['maxentscan_diff'] = data['maxentscan_diff'].astype(float)
data['maxpangolin'] = data['maxpangolin'].astype(float)
data.loc[data['Squirls'] == "NA", 'Squirls'] = np.nan
data3 = data.copy()
data3 = data3.dropna(subset=['Squirls'])
data3['Squirls'] = data3['Squirls'].astype(float)


In [37]:
## DeLong test and AUC confidence interval
ground_truth = np.array(data['LABEL'])
ground_truth2 = np.array(data2['LABEL'])
ground_truth3 = np.array(data3['LABEL'])
predictions_fw = np.array(data['PriorityScore'])
predictions_sp = np.array(data['maxsplai'])
predictions_en = np.array(data2['maxentscan_diff'])
predictions_pa = np.array(data['maxpangolin'])
predictions_sq = np.array(data3['Squirls'])


In [38]:
auc1, var1 = delong_roc_variance(ground_truth, predictions_fw)
cilower1, ciupper1 = compute_auc_confidence_interval(auc1, var1)
auc2, var2 = delong_roc_variance(ground_truth, predictions_sp)
cilower2, ciupper2 = compute_auc_confidence_interval(auc2, var2)
auc3, var3 = delong_roc_variance(ground_truth2, predictions_en)
cilower3, ciupper3 = compute_auc_confidence_interval(auc3, var3)
auc4, var4 = delong_roc_variance(ground_truth, predictions_pa)
cilower4, ciupper4 = compute_auc_confidence_interval(auc4, var4)
auc5, var5 = delong_roc_variance(ground_truth3, predictions_sq)
cilower5, ciupper5 = compute_auc_confidence_interval(auc5, var5)

In [40]:
p_value_log = delong_roc_test(ground_truth, predictions_fw, predictions_sp)

print(f"AUC - Framework (95%CI): {auc1:.3f} [{cilower1:.4f}-{ciupper1:.4f}]")
print(f"AUC - SpliceAI (95%CI) : {auc2:.3f} [{cilower2:.4f}-{ciupper2:.4f}]")
print(f"p-value (DeLong Test)  : {10**p_value_log[0][0]:.2e}")

print(f"AUC - MaxEntScan (95%CI): {auc3:.3f} [{cilower3:.4f}-{ciupper3:.4f}]")
print(f"AUC - MaxPangolin (95%CI): {auc4:.3f} [{cilower4:.4f}-{ciupper4:.4f}]")
print(f"AUC - Squirls (95%CI): {auc5:.3f} [{cilower5:.4f}-{ciupper5:.4f}]")

AUC - Framework (95%CI): 0.993 [0.9907-0.9948]
AUC - SpliceAI (95%CI) : 0.984 [0.9804-0.9879]
p-value (DeLong Test)  : 3.73e-07
AUC - MaxEntScan (95%CI): 0.989 [0.9850-0.9928]
AUC - MaxPangolin (95%CI): 0.950 [0.9441-0.9566]
AUC - Squirls (95%CI): 0.984 [0.9810-0.9872]


In [41]:
import plotly.graph_objects as go
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score

# ROC curve (LABEL, PriorityScore)
fpr1, tpr1, thresholds1 = roc_curve(data['LABEL'], data['PriorityScore'])
auc1 = roc_auc_score(data['LABEL'], data['PriorityScore'])

fpr2, tpr2, thresholds2 = roc_curve(data['LABEL'], data['maxsplai'])
auc2 = roc_auc_score(data['LABEL'], data['maxsplai'])

fpr3, tpr3, thresholds3 = roc_curve(data2['LABEL'], data2['maxentscan_diff'])
auc3 = roc_auc_score(data2['LABEL'], data2['maxentscan_diff'])

fpr4, tpr4, thresholds4 = roc_curve(data['LABEL'], data['maxpangolin'])
auc4 = roc_auc_score(data['LABEL'], data['maxpangolin'])

fpr5, tpr5, thresholds5 = roc_curve(data3['LABEL'], data3['Squirls'])
auc5 = roc_auc_score(data3['LABEL'], data3['Squirls'])

# Calculate optimal threshold from ROC curve by Youden's J statistic
Youden_index = np.argmax(tpr1 - fpr1)
optimal_threshold = thresholds1[Youden_index]
print('Optimal threshold (using Youden index):', optimal_threshold)

youden_maxent = np.argmax(tpr3 - fpr3)
optimal_threshold_maxent = thresholds3[youden_maxent]
print('Optimal threshold (using Youden index) - MaxEntScan:', optimal_threshold_maxent)
youden_pangolin = np.argmax(tpr4 - fpr4)
optimal_threshold_pangolin = thresholds4[youden_pangolin]
print('Optimal threshold (using Youden index) - Pangolin:', optimal_threshold_pangolin)
youden_squirls = np.argmax(tpr5 - fpr5)
optimal_threshold_squirls = thresholds5[youden_squirls]
print('Optimal threshold (using Youden index) - Squirls:', optimal_threshold_squirls)


# plot ROC curve using Plotly
fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(
    x=fpr1, y=tpr1, mode='lines', 
    name=f"Framework      ({auc1:.3f} [{cilower1:.4f}-{ciupper1:.4f}])", 
    line=dict(color='#E41A1C', width=2))
    )
fig.add_trace(go.Scatter(
    x=fpr2, y=tpr2, mode='lines', 
    name=f"SpliceAI Alone ({auc2:.3f} [{cilower2:.4f}-{ciupper2:.4f}])", 
    line=dict(color='#377EB8', width=2))
    )
fig.add_trace(go.Scatter(
    x=fpr3, y=tpr3, mode='lines',
    name=f"MaxEntScan   ({auc3:.3f} [{cilower3:.4f}-{ciupper3:.4f}])",
    line=dict(color='#4DAF4A', width=2))
    )
fig.add_trace(go.Scatter(
    x=fpr4, y=tpr4, mode='lines',
    name=f"Pangolin         ({auc4:.3f} [{cilower4:.4f}-{ciupper4:.4f}])",
    line=dict(color='#984EA3', width=2))
    )

fig.add_trace(go.Scatter(
    x=fpr5, y=tpr5, mode='lines',
    name=f"Squirls            ({auc5:.3f} [{cilower5:.4f}-{ciupper5:.4f}])",
    line=dict(color='#FF7F00', width=2))
    )

fig.add_trace(go.Scatter(
    x=[0, 1], y=[0, 1], mode='lines', name='Chance', 
    line=dict(color='gray', width=2, dash='dash'), showlegend=False)
    )

# Add an annotation
# fig.add_annotation(x=0.6, y=0.05, xref="paper", yref="paper",
#                    text=f"DeLong's test p-value = {10**p_value_log[0][0]:.2e}",
#                    showarrow=False,
#                    font=dict(family="Arial, sans-serif", size=12, color="black"),
#                    bgcolor='rgba(243, 243, 243, 1)',
#                 #    bordercolor="black",
#                    borderwidth=2)

# Add titles and labels
fig.update_layout(title='ROC Curve Comparison',
                  xaxis_title='False Positive Rate',
                  yaxis_title='True Positive Rate',
                  plot_bgcolor='rgba(243, 243, 243, 1)',
                  paper_bgcolor='rgba(243, 243, 243, 0)',
                  legend_title='Prediction methods (AUC [95%CI])',
                  legend=dict(y=0.09, x=0.925, xanchor='right', yanchor='bottom', 
                              bgcolor='rgba(243, 243, 243, 1)',
                              font=dict(family="Arial, sans-serif", size=12, color="black")),
                  margin=dict(l=40, r=40, t=40, b=40))

fig.update_xaxes(range=[-0.05, 1.05])
fig.update_yaxes(range=[-0.05, 1.05])
fig.update_layout(width=480, height=480)
fig.write_html("figs/roc-auc.html")

# Change the font size of axix labels
fig.update_xaxes(tickfont=dict(size=16))
fig.update_yaxes(tickfont=dict(size=16))

# Add zoomed-in ROC curve in the top left corner
fig.add_trace(go.Scatter(
    x=fpr1, y=tpr1, mode='lines', 
    name=f"Framework      ({auc1:.3f} [{cilower1:.4f}-{ciupper1:.4f}])", 
    line=dict(color='#E41A1C', width=2), showlegend=False)
    )

# fig.update_layout(showlegend=False)

# Show figure
fig.show()


Optimal threshold (using Youden index): 1.0
Optimal threshold (using Youden index) - MaxEntScan: 1.798
Optimal threshold (using Youden index) - Pangolin: 0.15000000596046448
Optimal threshold (using Youden index) - Squirls: 0.03


In [49]:
np.round(optimal_threshold_pangolin, 3)


0.15

In [110]:
data['PriorityScore'].min()

-9.0

In [25]:
# Plot Sensitivity-Specificity of PriorityScore curve
# x-axis: PriortyScore, y-axis: Sensitivity and Specificity

# Calculate Sensitivity and Specificity
def calc_sensitivity_specificity(df: pd.DataFrame, threshold: float) -> tuple:
	tp = df[(df['PriorityScore'] >= threshold) & (df['LABEL'] == 1)].shape[0]
	tn = df[(df['PriorityScore'] < threshold) & (df['LABEL'] == 0)].shape[0]
	fp = df[(df['PriorityScore'] >= threshold) & (df['LABEL'] == 0)].shape[0]
	fn = df[(df['PriorityScore'] < threshold) & (df['LABEL'] == 1)].shape[0]
	sensitivity = tp / (tp + fn)
	specificity = tn / (tn + fp)
	return sensitivity, specificity

thresholds = np.linspace(-9,12,22)
sensitivity = []
specificity = []

for threshold in thresholds:
	sens, spec = calc_sensitivity_specificity(data, threshold)
	sensitivity.append(sens)
	specificity.append(spec)

fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(
	x=thresholds, y=sensitivity, mode='lines', 
	name='Sensitivity', line=dict(color='#1F77B4', width=2))
	)
fig.add_trace(go.Scatter(
	x=thresholds, y=specificity, mode='lines', 
	name='Specificity', line=dict(color='#FF7F0E', width=2))
	)

# Add an annotation
fig.add_annotation(x=0.6, y=0.05, xref="paper", yref="paper",
				   text=f"Optimal threshold (Youden index) = {optimal_threshold:.2f}",
				   showarrow=False,
				   font=dict(family="Arial, sans-serif", size=12, color="black"),
				   bgcolor='rgba(243, 243, 243, 1)',
				#    bordercolor="black",
				   borderwidth=2)

# Add titles and labels
fig.update_layout(title='Sensitivity-Specificity Curve',
				  xaxis_title='Threshold',
				  yaxis_title='Rate',
				  plot_bgcolor='rgba(243, 243, 243, 1)',
				  paper_bgcolor='rgba(243, 243, 243, 0)',
				  legend_title='Rate',
				  legend=dict(y=0.09, x=0.925, xanchor='right', yanchor='bottom', 
							  bgcolor='rgba(243, 243, 243, 1)',
							  font=dict(family="Arial, sans-serif", size=12, color="black")),
				  margin=dict(l=40, r=40, t=40, b=40))

fig.update_xaxes(range=[-10, 13.5])
fig.update_yaxes(range=[-0.05, 1.05])
fig.update_layout(width=480, height=480)
fig.write_html("figs/sens-spec.html")

# Change the font size of axix labels
fig.update_xaxes(tickfont=dict(size=16))
fig.update_yaxes(tickfont=dict(size=16))

# Legened off
fig.update_layout(showlegend=False)


# Show figure
fig.show()

In [30]:
thsdict = {
    'opti': 
            {'clinvar_same_pos': 3,
             'clinvar_same_motif': 1,
             'clinvar_else': 0,
             'non_canon_splai_lte_0.1_outside': -2,
             'non_canon_splai_lte_0.1_other': 0,
             'non_canon_splai_bet_0.1_0.2': 1,
             'non_canon_splai_gte_0.2': 2,
             'canon_strong': 4, 
             'canon_moderate': 3, 
             'frameshift_nmd_eloF': 6, 
             'frameshift_nmd_not_eloF': 1,
             'canon_splai_lte_0.1': -1,
             'canon_splai_bet_0.1_0.2': 0,
             'canon_splai_gte_0.2': 2,
             'clinvar_blb': -6
			 },
}

# Laod the dataframes from the pickle files as the test set
tp_test = pd.read_pickle(f'train_test_pkls/tp_prescore_test_{random_state}.pkl')
tn_test = pd.read_pickle(f'train_test_pkls/tn_prescore_test_{random_state}.pkl')

ths = thsdict['opti']
scoring = Scoring(ths=ths)

tp_test['insilico_screening'] = tp_test.parallel_apply(scoring.insilico_screening, axis=1)
tp_test['clinvar_screening'] = tp_test.parallel_apply(scoring.clinvar_screening, axis=1)
# tp_test['PriorityScore'] = tp_test.parallel_apply(scoring.calc_priority_score, axis=1)
# tp_test = scoring.calc_priority_score(tp_test)
tp_test = tp_test[tp_test['insilico_screening'] != 'Not available']
tp_test['PriorityScore'] = tp_test['insilico_screening'] + tp_test['clinvar_screening']
# If PriorityScore under 0, set it to 0
tp_test['PriorityScore'] = tp_test['PriorityScore'].apply(lambda x: 0 if x < 0 else x)

tn_test['insilico_screening'] = tn_test.parallel_apply(scoring.insilico_screening, axis=1)
tn_test['clinvar_screening'] = tn_test.parallel_apply(scoring.clinvar_screening, axis=1)
# tn_test['PriorityScore'] = tn_test.parallel_apply(scoring.calc_priority_score, axis=1)
# tn_test = scoring.calc_priority_score(tn_test)
tn_test = tn_test[tn_test['insilico_screening'] != 'Not available']
tn_test['PriorityScore'] = tn_test['insilico_screening'] + tn_test['clinvar_screening']
# If PriorityScore under 0, set it to 0
tn_test['PriorityScore'] = tn_test['PriorityScore'].apply(lambda x: 0 if x < 0 else x)

# Extract the columns needed
tp_test = tp_test[['variant_id', 'LABEL', 'PriorityScore', 'maxsplai', 
				   'maxentscan_diff', 'maxpangolin', 'Squirls']]
tn_test = tn_test[['variant_id', 'LABEL', 'PriorityScore', 'maxsplai', 
				   'maxentscan_diff', 'maxpangolin', 'Squirls']]

### ========================================================== ##
data = pd.concat([tp_test, tn_test], ignore_index=True)
data.drop_duplicates(subset='variant_id', keep=False, inplace=True)

# Cast the columns to float type
data['LABEL'] = data['LABEL'].astype(int)
data = data[data['PriorityScore'] != 'Not available']   # Exclude rows with PriorityScore not 'Not available'
data['PriorityScore'] = data['PriorityScore'].astype(float)
data['maxsplai'] = data['maxsplai'].astype(float)
# data['maxentscan_diff'] = data['maxentscan_diff'].astype(float) # ToDo: remove string
data['maxpangolin'] = data['maxpangolin'].astype(float)
# data['Squirls'] = data['Squirls'].astype(float) # ToDo: remove NA

# Plot the sensitivity and specificity for each threshold
results_df = specificity_sensitivity_plotly(data)
fig_opti = plot_sensitivity_specificity_plotly(results_df, 800, 800)
# fig2 = plot_sensitivity_specificity_plotly_without_legened(results_df)
# print(tp_test['PriorityScore'].isnull().sum(), tn_test['PriorityScore'].isnull().sum())

## DeLong test and AUC confidence interval
ground_truth = np.array(data['LABEL'])
predictions_fw = np.array(data['PriorityScore'])
predictions_sp = np.array(data['maxsplai'])
predictions_en = np.array(data['maxentscan_diff'])
predictions_pa = np.array(data['maxpangolin'])
predictions_sq = np.array(data['Squirls'])

auc1, var1 = delong_roc_variance(ground_truth, predictions_fw)
cilower1, ciupper1 = compute_auc_confidence_interval(auc1, var1)
auc2, var2 = delong_roc_variance(ground_truth, predictions_sp)
cilower2, ciupper2 = compute_auc_confidence_interval(auc2, var2)

p_value_log = delong_roc_test(ground_truth, predictions_fw, predictions_sp)

print(f"AUC - Framework (95%CI): {auc1:.3f} [{cilower1:.4f}-{ciupper1:.4f}]")
print(f"AUC - SpliceAI (95%CI) : {auc2:.3f} [{cilower2:.4f}-{ciupper2:.4f}]")
print(f"p-value (DeLong Test)  : {10**p_value_log[0][0]:.2e}")


AUC - Framework (95%CI): 0.985 [0.9835-0.9871]
AUC - SpliceAI (95%CI) : 0.983 [0.9813-0.9856]
p-value (DeLong Test)  : 3.00e-02


<!-- ## auROCの比較とperformance metricsの比較 -->

In [140]:
def specificity_sensitivity_plotly(data):
    thresholds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    results = []

    for threshold in thresholds:
        tp = data[(data['PriorityScore'] >= threshold) & (data['LABEL'] == 1)].shape[0]
        fn = data[(data['PriorityScore'] < threshold) & (data['LABEL'] == 1)].shape[0]
        tn = data[(data['PriorityScore'] < threshold) & (data['LABEL'] == 0)].shape[0]
        fp = data[(data['PriorityScore'] >= threshold) & (data['LABEL'] == 0)].shape[0]
        specificity = tn / (tn + fp) if (tn + fp) else 0
        sensitivity = tp / (tp + fn) if (tp + fn) else 0
        results.append({'Threshold': threshold, 'Metric': 'Specificity', 'Value': specificity})
        results.append({'Threshold': threshold, 'Metric': 'Sensitivity', 'Value': sensitivity})

    return results_df

import plotly.graph_objects as go
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score

# ROC curve (LABEL, PriorityScore)
fpr1, tpr1, thresholds1 = roc_curve(data['LABEL'], data['PriorityScore'])
auc1 = roc_auc_score(data['LABEL'], data['PriorityScore'])
fpr2, tpr2, thresholds2 = roc_curve(data['LABEL'], data['maxsplai'])
auc2 = roc_auc_score(data['LABEL'], data['maxsplai'])

# Calculate optimal threshold from ROC curve by Youden's J statistic
Youden_index = np.argmax(tpr1 - fpr1)
optimal_threshold = thresholds1[Youden_index]
print('Optimal threshold (using Youden index):', optimal_threshold)

# Calculate Youden's J statistic for the optimal threshold


# plot ROC curve using Plotly
fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(
    x=fpr1, y=tpr1, mode='lines', 
    name=f"Framework      ({auc1:.3f} [{cilower1:.4f}-{ciupper1:.4f}])", 
    line=dict(color='#E41A1C', width=2))
    )
fig.add_trace(go.Scatter(
    x=fpr2, y=tpr2, mode='lines', 
    name=f"SpliceAI Alone ({auc2:.3f} [{cilower2:.4f}-{ciupper2:.4f}])", 
    line=dict(color='#377EB8', width=2))
    )
fig.add_trace(go.Scatter(
    x=[0, 1], y=[0, 1], mode='lines', name='Chance', 
    line=dict(color='gray', width=2, dash='dash'), showlegend=False)
    )

# Add an annotation
fig.add_annotation(x=0.6, y=0.05, xref="paper", yref="paper",
                   text=f"DeLong's test p-value = {10**p_value_log[0][0]:.2e}",
                   showarrow=False,
                   font=dict(family="Arial, sans-serif", size=12, color="black"),
                   bgcolor='rgba(243, 243, 243, 1)',
                #    bordercolor="black",
                   borderwidth=2)

# Add titles and labels
fig.update_layout(title='ROC Curve Comparison',
                  xaxis_title='False Positive Rate',
                  yaxis_title='True Positive Rate',
                  legend_title='Prediction methods (AUC [95%CI])',
                  plot_bgcolor='rgba(243, 243, 243, 1)',
                  paper_bgcolor='rgba(243, 243, 243, 0)',
                  legend=dict(y=0.09, x=0.925, xanchor='right', yanchor='bottom', 
                              bgcolor='rgba(243, 243, 243, 1)',
                              font=dict(family="Arial, sans-serif", size=12, color="black")),
                  margin=dict(l=40, r=40, t=40, b=40))

fig.update_xaxes(range=[-0.05, 1.05])
fig.update_yaxes(range=[-0.05, 1.05])
fig.update_layout(width=500, height=500)
fig.write_html("roc-auc.html")

# Add zoomed-in ROC curve in the top left corner
fig.add_trace(go.Scatter(
    x=fpr1, y=tpr1, mode='lines', 
    name=f"Framework      ({auc1:.3f} [{cilower1:.4f}-{ciupper1:.4f}])", 
    line=dict(color='#E41A1C', width=2), showlegend=False)
    )


# Show figure
fig.show()


Optimal threshold (using Youden index): 2.0


In [43]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, matthews_corrcoef

# Compare the performance of the two models
# Confusion matrix
th_4, th_5 = 1, 2
th_02, th_05, th_08 = 0.2, 0.5, 0.8
thresholds_fw = [th_4, th_5]
thresholds_spl = [th_02, th_05, th_08]

def calculate_performance_metrics_fw(data, threshold):
    data['Prediction'] = data['PriorityScore'] >= threshold
    tn, fp, fn, tp = confusion_matrix(data['LABEL'], data['Prediction']).ravel()
    specificity = tn / (tn + fp) if (tn + fp) else 0
    sensitivity = tp / (tp + fn) if (tp + fn) else 0
    accuracy = accuracy_score(data['LABEL'], data['Prediction'])
    precision = precision_score(data['LABEL'], data['Prediction'])
    f1 = f1_score(data['LABEL'], data['Prediction'])
    mcc = matthews_corrcoef(data['LABEL'], data['Prediction'])
    return specificity, sensitivity, accuracy, precision, f1, mcc

def calculate_performance_metrics_spl(data, threshold):
    data['Prediction'] = data['maxsplai'] >= threshold
    tn, fp, fn, tp = confusion_matrix(data['LABEL'], data['Prediction']).ravel()
    specificity = tn / (tn + fp) if (tn + fp) else 0
    sensitivity = tp / (tp + fn) if (tp + fn) else 0
    accuracy = accuracy_score(data['LABEL'], data['Prediction'])
    precision = precision_score(data['LABEL'], data['Prediction'])
    f1 = f1_score(data['LABEL'], data['Prediction'])
    mcc = matthews_corrcoef(data['LABEL'], data['Prediction'])

    return specificity, sensitivity, accuracy, precision, f1, mcc

def calculate_performance_metrics(data, target, threshold):
    data['Prediction'] = data[target] >= threshold
    tn, fp, fn, tp = confusion_matrix(data['LABEL'], data['Prediction']).ravel()
    specificity = tn / (tn + fp) if (tn + fp) else 0
    sensitivity = tp / (tp + fn) if (tp + fn) else 0
    accuracy = accuracy_score(data['LABEL'], data['Prediction'])
    precision = precision_score(data['LABEL'], data['Prediction'])
    f1 = f1_score(data['LABEL'], data['Prediction'])
    mcc = matthews_corrcoef(data['LABEL'], data['Prediction'])
    return specificity, sensitivity, accuracy, precision, f1, mcc

threshold_maxent = np.round(optimal_threshold_maxent, 2)
threshold_pangolin = np.round(optimal_threshold_pangolin, 3)
threshold_sqirls = np.round(optimal_threshold_squirls, 3)

performance_metrics = []
for threshold in thresholds_fw:
    specificity, sensitivity, accuracy, precision, f1, mcc = calculate_performance_metrics_fw(data, threshold)
    performance_metrics.append({'Threshold': threshold, 'Specificity': specificity, 'Sensitivity': sensitivity, 'Accuracy': accuracy, 'Precision': precision, 'F1': f1, 'MCC': mcc})

for threshold in thresholds_spl:
    specificity, sensitivity, accuracy, precision, f1, mcc = calculate_performance_metrics(data, 'maxsplai', threshold)
    performance_metrics.append({'Threshold': threshold, 'Specificity': specificity, 'Sensitivity': sensitivity, 'Accuracy': accuracy, 'Precision': precision, 'F1': f1, 'MCC': mcc})

sensitivity, specificity, accuracy, precision, f1, mcc = calculate_performance_metrics(data2, 'maxentscan_diff', 1.8)
performance_metrics.append({'Threshold': 'MaxEntScan', 'Specificity': specificity, 'Sensitivity': sensitivity, 'Accuracy': accuracy, 'Precision': precision, 'F1': f1, 'MCC': mcc})
sensitivity, specificity, accuracy, precision, f1, mcc = calculate_performance_metrics(data, 'maxpangolin', 0.15)
performance_metrics.append({'Threshold': 'Pangolin', 'Specificity': specificity, 'Sensitivity': sensitivity, 'Accuracy': accuracy, 'Precision': precision, 'F1': f1, 'MCC': mcc})
sensitivity, specificity, accuracy, precision, f1, mcc = calculate_performance_metrics(data3, 'Squirls', 0.03)
performance_metrics.append({'Threshold': 'Squirls', 'Specificity': specificity, 'Sensitivity': sensitivity, 'Accuracy': accuracy, 'Precision': precision, 'F1': f1, 'MCC': mcc}) 

performance_metrics_df = pd.DataFrame(performance_metrics)
columns = ['Category', 'Specificity', 'Sensitivity', 'Accuracy', 'Precision', 'F1', 'MCC']
# columns = ['Category', 'Specificity', 'Sensitivity', 'Accuracy', 'MCC', 'F1']
performance_metrics_df.columns = columns
performance_metrics_df.replace(
    {'Category': {
        1.0: 'Framework (1.0)<br> High sensitivity', 
        2.0: 'Framework (2.0)<br> High specificity', 
        0.2: 'SpliceAI (0.2)<br> High sensitivity', 
        0.5: 'SpliceAI (0.5)<br> Recommended', 
        0.8: 'SpliceAI (0.8)<br> High precision',
        'MaxEntScan': 'MaxEntScan ({threshold_maxent})',
        'Pangolin': 'Pangolin ({thrshold_pangolin})',
        'Squirls': 'Squirls ({threshold_sqirls})'
        }}, inplace=True)


In [52]:
print(np.round(optimal_threshold_maxent, 2))
print(np.round(optimal_threshold_pangolin, 3))
print(np.round(optimal_threshold_squirls, 3))

1.8
0.15
0.03


Optimal threshold (using Youden index): 1.0
Optimal threshold (using Youden index) - MaxEntScan: 2.073
Optimal threshold (using Youden index) - Pangolin: 0.20000000298023224
Optimal threshold (using Youden index) - Squirls: 0.041

In [44]:
performance_metrics_df

Unnamed: 0,Category,Specificity,Sensitivity,Accuracy,Precision,F1,MCC
0,Framework (1.0)<br> High sensitivity,0.983666,0.975666,0.976767,0.997335,0.986382,0.909971
1,Framework (2.0)<br> High specificity,0.983666,0.969003,0.971022,0.997317,0.982956,0.890532
2,SpliceAI (0.2)<br> High sensitivity,0.981851,0.950753,0.955034,0.996962,0.97331,0.840706
3,SpliceAI (0.5)<br> Recommended,0.99637,0.886153,0.901324,0.999347,0.939352,0.716771
4,SpliceAI (0.8)<br> High precision,0.998185,0.763036,0.795403,0.99962,0.865451,0.552954
5,MaxEntScan (2.1)<br> Youden index,0.969501,0.978261,0.969623,0.999682,0.98436,0.543943
6,Pangolin (0.2)<br> Youden index,0.89803,0.980036,0.909318,0.996464,0.94469,0.726953
7,Squirls (0.04)<br> Youden index,0.946987,0.989111,0.952785,0.998168,0.971904,0.836106


In [27]:
performance_metrics_df

Unnamed: 0,Category,Specificity,Sensitivity,Accuracy,Precision,F1,MCC
0,Framework (1.0)<br> High sensitivity,0.976669,0.972284,0.973133,0.994272,0.983155,0.918262
1,Framework (2.0)<br> High specificity,0.978278,0.966972,0.969161,0.994636,0.980609,0.907548
2,SpliceAI (0.2)<br> High sensitivity,0.973451,0.94872,0.953508,0.993327,0.970511,0.866073
3,SpliceAI (0.5)<br> Recommended,0.996782,0.890005,0.910677,0.999133,0.941417,0.778952
4,SpliceAI (0.8)<br> High precision,0.998793,0.763979,0.809439,0.999621,0.866057,0.619771
5,MaxEntScan (2.1)<br> Youden index,0.962575,0.956938,0.962456,0.99904,0.980468,0.57025
6,Pangolin (0.2)<br> Youden index,0.889812,0.981496,0.907562,0.995032,0.939485,0.767697
7,Squirls (0.04)<br> Youden index,0.942443,0.985087,0.950686,0.996223,0.968587,0.8615


In [45]:
df = performance_metrics_df

metric = "MCC"
metric = "F1" 
fig = go.Figure()

for i, row in df.iterrows():
    fig.add_shape(
        type="line",
        xref="x", yref="y",
        x0=0,      # 線の始点 (横軸 0)
        x1=row[metric],  # 線の終点 (行のAccuracy値)
        y0=i,      # 線の縦位置 (カテゴリ index)
        y1=i,
        line=dict(color="skyblue", width=3)
    )

fig.add_trace(
    go.Scatter(
        x=df[metric],              # 丸の x 座標
        y=list(range(len(df))),    # 丸の y 座標（0,1,2,...）
        mode="markers+text",
        marker=dict(color="steelblue", size=10),
        text=df[metric].apply(lambda x: f"{x:.3f}"),  # 値をラベル表示（小数第3位まで）
        textposition="middle right",
        textfont=dict(size=10),
        name=metric
    )
)

fig.update_layout(
    title=f"Lollipop Chart of {metric}",
    xaxis_title=metric,
    yaxis=dict(
        tickmode="array",
        tickvals=list(range(len(df))),
        ticktext=df["Category"],  # カスタムの軸ラベルにカテゴリ名をセット
        autorange="reversed"      # y=0 が上に来るのを防ぎたいときは削除
    ),
    width=800,
    height=500
)

fig.show()

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# df = performance_metrics_df
# metrics = ["Specificity", "Sensitivity", "Accuracy", "Precision", "F1", "MCC"]
metrics = ["Specificity", "Sensitivity", "F1", "MCC"]

categories = [
    'Framework (1.0)<br> High sensitivity', 
    'Framework (2.0)<br> High specificity', 
    'SpliceAI (0.2)<br> High sensitivity', 
    'SpliceAI (0.5)<br> Recommended', 
    'SpliceAI (0.8)<br> High precision',
    'MaxEntScan (2.1)<br> Youden index',
    'Pangolin (0.2)<br> Youden index',
    'Squirls (0.04)<br> Youden index']

desired_order = [
    'Framework (1.0)<br> High sensitivity', 
    'Framework (2.0)<br> High specificity', 
    'SpliceAI (0.2)<br> High sensitivity', 
    'SpliceAI (0.5)<br> Recommended', 
    'SpliceAI (0.8)<br> High precision',
    'MaxEntScan (2.1)<br> Youden index',
    'Pangolin (0.2)<br> Youden index',
    'Squirls (0.04)<br> Youden index',
]

# 1) まず 'Category' 列をインデックスにセット
df_reindexed = performance_metrics_df.set_index("Category")
# 2) desired_order の順に並べ替え (loc[リスト] で並べ替え)
df_reindexed = df_reindexed.loc[desired_order]
# 3) Plotly で使いやすいように連番インデックスへ戻す
df_reindexed = df_reindexed.reset_index()
df = df_reindexed.copy()

# categories の要素を desired_order のインデックス順に並べ替えたリストを作る
# ここでは単純なサンプル用に x 軸の値も用意
x_values = list(range(len(categories)))  # 適当な x 値

# desired_order の各カテゴリが、元の categories のどのインデックスに該当するかを探す
idx_map = {cat: i for i, cat in enumerate(categories)}
ordered_indices = [idx_map[cat] for cat in desired_order]
# ordered_indices = [idx_map[cat] for cat in categories]

# サブプロットの作成（横に6つ並べる例）
fig = make_subplots(rows=1, cols=len(metrics), shared_yaxes=True, horizontal_spacing=0.03)

for col_idx, metric in enumerate(metrics, start=1):
    fig.add_trace(
        go.Scatter(
            x=df[metric],
            y=list(range(len(df))),
            mode="markers+text",
            marker=dict(color="steelblue", size=10),
            text=df[metric].apply(lambda x: f"{x:.3f}"),
            textposition="bottom center",
            showlegend=False
        ),
        row=1,
        col=col_idx
    )
    for i, row in df.iterrows():
        # 線を追加
        fig.add_shape(
            type="line",
            xref=f"x{col_idx}",
            yref=f"y{col_idx}",
            x0=0,
            x1=row[metric],
            y0=i,
            y1=i,
            line=dict(color="steelblue", width=2.5),
            row=1,
            col=col_idx,
            # layer="below"
        )
    # 丸（marker）を追加
    # 各サブプロットごとに x 軸ラベルを設定
    fig.update_xaxes(title_text=metric, row=1, col=col_idx)

# 左端のサブプロットだけにカテゴリ名を表示
fig.update_yaxes(
    tickmode="array",
    tickvals=list(range(len(df))),
    ticktext=df["Category"],
    row=1,
    col=1
)
# y 軸を desired_order の並びで表示
fig.update_yaxes(
    tickmode="array",
    tickvals=list(range(len(desired_order))),  # 0,1,2,3,4,5,6,7
    ticktext=desired_order                     # 表示ラベル
)

fig.update_layout(
    width=300 * len(metrics),
    height=600,
    title="Lollipop Charts for Each Metric"
)

fig.show()

In [99]:
df

Unnamed: 0,Category,Specificity,Sensitivity,Accuracy,Precision,F1,MCC
0,Framework (1.0)<br> High sensitivity,0.976669,0.972284,0.973133,0.994272,0.983155,0.918262
1,Framework (2.0)<br> High specificity,0.978278,0.966972,0.969161,0.994636,0.980609,0.907548
2,SpliceAI (0.2)<br> High sensitivity,0.973451,0.94872,0.953508,0.993327,0.970511,0.866073
3,SpliceAI (0.5)<br> Recommended,0.996782,0.890005,0.910677,0.999133,0.941417,0.778952
4,SpliceAI (0.8)<br> High precision,0.998793,0.763979,0.809439,0.999621,0.866057,0.619771
5,MaxEntScan (2.1)<br> Youden index,0.962575,0.956938,0.962456,0.99904,0.980468,0.57025
6,Pangolin (0.2)<br> Youden index,0.889812,0.981496,0.907562,0.995032,0.939485,0.767697
7,Squirls (0.04)<br> Youden index,0.942443,0.985087,0.950686,0.996223,0.968587,0.8615


In [125]:
fig = make_subplots(rows=1, cols=len(metrics), shared_yaxes=True, horizontal_spacing=0.03)

desired_order = [
    'Framework (1.0)<br> High sensitivity', 
    'Framework (2.0)<br> High specificity', 
    'SpliceAI (0.2)<br> High sensitivity', 
    'SpliceAI (0.5)<br> Recommended', 
    'SpliceAI (0.8)<br> High precision',
    'MaxEntScan (2.1)<br> Youden index',
    'Pangolin (0.2)<br> Youden index',
    'Squirls (0.04)<br> Youden index',
]

# 1) dfの行を並べ替える
df_reordered = df.set_index("Category").loc[desired_order].reset_index()

for col_idx, metric in enumerate(metrics, start=1):
    # 丸マーカー
    fig.add_trace(
        go.Scatter(
            x=df_reindexed[metric],
            y=list(range(len(df_reindexed))),     # ← 並べ替え後の行順をそのまま 0,1,2,... に
            mode="markers+text",
            marker=dict(color="steelblue", size=10),
            text=df_reindexed[metric].apply(lambda x: f"{x:.3f}"),
            textposition="bottom center",
            showlegend=False
        ),
        row=1,
        col=col_idx
    )
    # 棒の部分 (Shapes)
    for i, row_ in df_reindexed.iterrows():
        fig.add_shape(
            type="line",
            xref=f"x{col_idx}",
            yref=f"y{col_idx}",
            x0=0,
            x1=row_[metric],
            y0=i,
            y1=i,
            line=dict(color="steelblue", width=2.5),
            row=1,
            col=col_idx
        )
    # X 軸ラベル
    fig.update_xaxes(title_text=metric, row=1, col=col_idx)

# 左端サブプロットの Y 軸だけカテゴリ名を表示
fig.update_yaxes(
    tickmode="array",
    tickvals=list(range(len(df_reindexed))),
    ticktext=df_reindexed["Category"],  # reindexed後の Category
    autorange="reversed",
    row=1,
    col=1
)



fig.update_layout(
    width=300 * len(metrics),
    height=600,
    plot_bgcolor='rgba(243, 243, 243, 1)',
    paper_bgcolor='rgba(243, 243, 243, 0)',
    title="Lollipop Charts for Each Metric (Ordered by desired_order)"
)
fig.show()


In [86]:
desired_order = [
    "Framework (1.0)<br> High sensitivity",
    "Squirls (0.03)<br> Youden index",
    "Pangolin (0.15)<br> Youden index",
    "MaxEntScan (1.8)<br> Youden index",
    "SpliceAI (0.8)<br> High precision",
    "SpliceAI (0.5)<br> Recommended",
    "SpliceAI (0.2)<br> High sensitivity",
    "Framework (2.0)<br> High specificity",
]

# 1) dfの行を並べ替える
df_reordered = df.set_index("Category").loc[desired_order].reset_index()

# 2) あとは df_reordered を使ってLollipopチャートを作る
fig = make_subplots(rows=1, cols=len(metrics), shared_yaxes=True, horizontal_spacing=0.03)
for col_idx, metric in enumerate(metrics, start=1):
    # 丸の Scatter
    fig.add_trace(
        go.Scatter(
            x=df_reordered[metric],
            y=list(range(len(df_reordered))),
            mode="markers+text",
            ...
        ),
        row=1,
        col=col_idx
    )
    # 線 (Shapes)
    for i, row_ in df_reordered.iterrows():
        fig.add_shape(
            type="line",
            xref=f"x{col_idx}",
            yref=f"y{col_idx}",
            x0=0,
            x1=row_[metric],
            y0=i,
            y1=i,
            ...
        )
    ...
# 左端のサブプロットだけにカテゴリ名を設定
fig.update_yaxes(
    tickmode="array",
    tickvals=list(range(len(df_reordered))),            # [0,1,2,...,7]
    ticktext=df_reordered["Category"],                  # 並べ替え後のカテゴリ順
    row=1, 
    col=1
)
fig.update_layout(...)
fig.show()


SyntaxError: positional argument follows keyword argument (1000424662.py, line 24)