In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import os, sys
from pathlib import Path
from pandarallel import pandarallel

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

########   Initialize and setup pandas methods   ########
# Use all CPUs but one, and never fewer than one worker: os.cpu_count() may
# return None when the count cannot be determined, and is 1 on single-core
# hosts (which previously requested 0 workers).
n_workers = max(1, (os.cpu_count() or 2) - 1)
pandarallel.initialize(nb_workers=n_workers, progress_bar=False,
                       verbose=2, use_memory_fs=False)
# Set the JOBLIB_TEMP_FOLDER environment variable for this process.
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'

# Extend sys.path before the project-local `libs` import that follows.
try:
    # Script execution: __file__ is defined, so add the directory that holds
    # this file.  (The name must not be quoted: os.path.dirname('__file__')
    # is always the empty string.)
    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
except NameError:
    # Interactive / notebook execution: __file__ is undefined, so fall back to
    # the directory two levels above the current working directory.
    sys.path.append(os.path.join(Path().resolve(), '../../'))

from libs.scoring import Scoring
import pickle

import warnings
warnings.simplefilter('ignore')

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Create a DataFrame of the candidate solutions (4,850 expected)
fp_solution_pkl = "bootstrap_results/pkls/all_solution_main.pkl"
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open(fp_solution_pkl, 'rb') as f:
    all_solutions = pickle.load(f)

solutions = pd.DataFrame(all_solutions)
# Label the rows "Solution 1" ... "Solution N".  N is taken from the data
# instead of being hard-coded, and the labels share the frame's index, so the
# column-wise concat below can never pad the table with NaN rows.
solutions_list = [f"Solution {i}" for i in range(1, len(solutions) + 1)]
patterns_sr = pd.Series(solutions_list, index=solutions.index, name='Solution')
# Put the label first, followed by the original solution columns.
solutions = pd.concat([patterns_sr, solutions], axis=1)

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(review): unpickling can execute arbitrary code; only use this on
    files produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Paths of the bootstrap result pickles, one file per block of ten datasets
# (all_results_1_10.pkl, all_results_11_20.pkl, ..., all_results_91_100.pkl)
results_1_10 = "bootstrap_results/all_results_1_10.pkl"
results_11_20 = "bootstrap_results/all_results_11_20.pkl"
results_21_30 = "bootstrap_results/all_results_21_30.pkl"
results_31_40 = "bootstrap_results/all_results_31_40.pkl"
results_41_50 = "bootstrap_results/all_results_41_50.pkl"
results_51_60 = "bootstrap_results/all_results_51_60.pkl"
results_61_70 = "bootstrap_results/all_results_61_70.pkl"
results_71_80 = "bootstrap_results/all_results_71_80.pkl"
results_81_90 = "bootstrap_results/all_results_81_90.pkl"
results_91_100 = "bootstrap_results/all_results_91_100.pkl"

# Load pickles of calculated AUCs
dict_1_10 = _load_pickle(results_1_10)
dict_11_20 = _load_pickle(results_11_20)
dict_21_30 = _load_pickle(results_21_30)
dict_31_40 = _load_pickle(results_31_40)
dict_41_50 = _load_pickle(results_41_50)
dict_51_60 = _load_pickle(results_51_60)
dict_61_70 = _load_pickle(results_61_70)
dict_71_80 = _load_pickle(results_71_80)
dict_81_90 = _load_pickle(results_81_90)
dict_91_100 = _load_pickle(results_91_100)

def _results_to_frame(results):
    """Flatten ``{dataset: {candidate: score}}`` into a long DataFrame.

    Parameters
    ----------
    results : mapping
        Outer keys become the 'Dataset' column; each inner mapping contributes
        one row per (key, value) pair as 'Candidate' and 'Score'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Dataset', 'Candidate', 'Score', in the iteration order of the
        mappings.  An empty mapping yields an empty frame with those columns.

    Notes
    -----
    All rows are collected first and the frame is built once, instead of
    growing it with ``pd.concat`` inside a loop.  The 'Dataset' column keeps
    the dtype of the keys rather than being forced to ``object``.
    """
    records = [
        (dataset, candidate, score)
        for dataset, scores in results.items()
        for candidate, score in scores.items()
    ]
    return pd.DataFrame(records, columns=['Dataset', 'Candidate', 'Score'])


# To dataframes (no loop variable named `set`, so the builtin stays usable)
df_1_10 = _results_to_frame(dict_1_10)
df_11_20 = _results_to_frame(dict_11_20)
df_21_30 = _results_to_frame(dict_21_30)
df_31_40 = _results_to_frame(dict_31_40)
df_41_50 = _results_to_frame(dict_41_50)
df_51_60 = _results_to_frame(dict_51_60)
df_61_70 = _results_to_frame(dict_61_70)
df_71_80 = _results_to_frame(dict_71_80)
df_81_90 = _results_to_frame(dict_81_90)
df_91_100 = _results_to_frame(dict_91_100)

# Concatenate all dataframes
df_all = pd.concat(
    [df_1_10, df_11_20, df_21_30, df_31_40, df_41_50,
     df_51_60, df_61_70, df_71_80, df_81_90, df_91_100],
    ignore_index=True,
)

# Break the textual 'Score' into numeric columns.  The text is split on the
# first-level spaces: piece 0 is the auROC, piece 1 is the CI text.
# NOTE(review): the patterns below imply a layout like "<auROC> [<lo>-<hi>]";
# confirm against the code that wrote the pickles.
score_parts = df_all['Score'].str.split(' ', expand=True)
ci_text = score_parts[1]

df_all['auROC'] = score_parts[0].astype(float)
# Lower bound: the first "digit, dot, digits" run in the CI text.
df_all['CI_lower'] = ci_text.str.extract(r'(\d\.\d*)').astype('float')
# Upper bound: the number sitting between '-' and the closing ']'.
df_all['CI_upper'] = ci_text.str.extract(r'((?<=-)\d\.\d*(?=]))').astype(float)

# Only the parsed numeric columns are kept.
df_all = df_all.drop(columns=['Score'])

# Join every scored candidate with its row in `solutions` (inner join on the
# candidate label), then discard the duplicated key column.
df = (
    df_all
    .merge(solutions, left_on='Candidate', right_on='Solution')
    .drop(columns='Solution')
)

# Row-wise variance across the columns at positions 5..19.
# NOTE(review): the slice is positional, so it silently changes meaning if the
# column layout of `df_all` or `solutions` changes.
# ddof=0 divides by N (population variance) even though the column is named
# 'SampleVariance'.
score_block = df.iloc[:, 5:20]
df['SampleVariance'] = score_block.var(axis=1, ddof=0)

# For each dataset keep the row holding the highest auROC (idxmax returns the
# first such row when several tie).
ddf = df.groupby('Dataset')
top_row_labels = ddf['auROC'].idxmax()
maxdf = df.loc[top_row_labels, :]

# Candidates are reported as "Pattern N" instead of "Solution N"
df['Candidate'] = df['Candidate'].str.replace('Solution', 'Pattern')

# Human-readable column labels for the exported tables
column_labels = {
    'Dataset': 'No. of Bootstrap',
    'CI_lower': '95%_CI_lower',
    'CI_upper': '95%_CI_upper',
    'SampleVariance': 'Sample Variance',
}
# s1..s14 -> 'Score 1'..'Score 14'
column_labels.update({f's{k}': f'Score {k}' for k in range(1, 15)})

# Renamed in place on purpose: `ddf` still refers to this very frame.
df.rename(columns=column_labels, inplace=True)

bootstrap = 100

# For each bootstrap set 1..bootstrap, collect every row of `df` whose auROC
# equals that set's maximum (several candidates can tie).
set_max = []
for set_no in range(1, bootstrap + 1):
    top_auroc = maxdf.loc[maxdf['Dataset'] == set_no, 'auROC'].values[0]
    in_this_set = df['No. of Bootstrap'] == set_no
    at_maximum = df['auROC'] == top_auroc
    set_max.append(df.loc[in_this_set & at_maximum, :])

# Among the tied rows of each set, pick the largest 'Sample Variance'.
best = {}
for set_no, tied_rows in enumerate(set_max, start=1):
    variances = tied_rows['Sample Variance']
    highest_variance = tied_rows.loc[variances.idxmax(), 'Sample Variance']
    # If other candidates share that same variance, keep all of them.
    best[f'set {set_no}'] = tied_rows.loc[variances == highest_variance, 'Candidate'].values


In [54]:
# Stack the per-set tables from `set_max` and put the columns in report order
score_columns = [f'Score {k}' for k in range(1, 15)]
export_columns = [
    'No. of Bootstrap', 'Candidate',
    *score_columns,
    'auROC', '95%_CI_lower', '95%_CI_upper',
    'Sample Variance',
]
set_max_df = pd.concat(set_max)[export_columns]

# Write the table as xlsx, without the index column.
# NOTE(review): absolute, machine-specific path; the input pickles are read
# from the relative "bootstrap_results/" directory instead.
set_max_df.to_excel('/Volumes/vol/work/Github/dev/ValidationData/AUC/bootstrap_results/set_max.xlsx', index=False)

In [55]:
# Write the `solutions` table as xlsx, without the index column.
# NOTE(review): absolute, machine-specific output path.
solutions_xlsx = '/Volumes/vol/work/Github/dev/ValidationData/AUC/bootstrap_results/solutions.xlsx'
solutions.to_excel(solutions_xlsx, index=False)

In [8]:
# Plot the maximum auROC of every bootstrap set together with its 95% CI.
# Everything is driven by the rows of `maxdf` (one row per set), so no set
# count is hard-coded here.

fig = go.Figure()

# One vertical red line per set, spanning CI lower .. CI upper
for set_no, ci_low, ci_high in zip(maxdf['Dataset'], maxdf['CI_lower'], maxdf['CI_upper']):
    fig.add_shape(
        type='line',
        x0=set_no, y0=ci_high,
        x1=set_no, y1=ci_low,
        line=dict(color='red', width=1.5),
    )

# Small markers on both end points of each CI
for column, label in (('CI_upper', 'CI upper'), ('CI_lower', 'CI lower')):
    fig.add_trace(go.Scatter(x=maxdf['Dataset'], y=maxdf[column],
                             mode='markers',
                             name=label,
                             marker=dict(color='#E0806E', size=4)))

# Larger markers for the maximum auROC of each set, drawn as a single trace
# (same look as one trace per point, since the legend is hidden)
fig.add_trace(go.Scatter(x=maxdf['Dataset'], y=maxdf['auROC'],
                         mode='markers',
                         name='Best candidate',
                         marker=dict(color='#E0806E', size=8)))

# Legend off, titles, font size and figure size
fig.update_layout(
    showlegend=False,
    title='Maximum auROC and 95% CI using calibration data set',
    xaxis_title='Number of bootstrap',
    yaxis_title='auROC',
    font=dict(size=21),
    width=1200,
    height=600,
)

# y-axis range: 0.99 to 1.00 (anything below 0.99 falls outside the view)
fig.update_yaxes(range=[0.99, 1.00])

fig.show()


In [37]:
## Comparison analysis Framework vs SpliceAI alone
# Test data pkls
