In [1]:
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = "7" 

import numpy as np
import random
import pandas as pd
import pickle
import sys
import timeit
from tqdm import tqdm
import glob

from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc,confusion_matrix
import matplotlib.pyplot as plt


## Hard gate classifier

In [2]:
def HardGateClassifier(Confs, Probs, Labels, Costs, C, cutoff_min_bound = 0.5, cutoff_max_bound = 0.95, nbins = 20):
    '''
    Confs: list
    Probs: list
    Labels: list
    C: cost constraint
    return cutoff
    '''

    cutoffs = []
    
    n_samples = len(Labels)
    n_models = len(Costs)

    # start with the smallest model  
    model_assignment = np.array([0 for i in range(n_samples)])
    cost = Costs[0] * n_samples
    if cost > C:
        cutoffs.append(max(cutoff_min_bound, min(Confs[0])))
        auc = roc_auc_score(Labels, [Probs[model_assignment[i]][i] for i in range(n_samples)])
        avg_conf = np.mean([Confs[model_assignment[i]][i] for i in range(n_samples)])
        return cutoffs, model_assignment, auc, cost, avg_conf
    
    for m in range(n_models - 1):
        cur_model_idx = np.where(model_assignment == m)[0]
        if len(cur_model_idx) == 0:
            break
            
        cur_cutoff = max(cutoff_min_bound, min(Confs[m][cur_model_idx]))
        #min(max(Confs[m][cur_model_idx]),cutoff_max_bound)
        cutoff_searchspace = np.linspace(cur_cutoff, cutoff_max_bound, nbins)
        for try_cutoff in cutoff_searchspace[1:]:
            next_model_idx = np.where((Confs[m][cur_model_idx] >= cur_cutoff) & (Confs[m][cur_model_idx] < try_cutoff))[0]
            if len(next_model_idx) > 0:
                try_cost = cost + (Costs[m+1] - Costs[m]) * len(next_model_idx)
                if try_cost > C:
                    cutoffs.append(cur_cutoff)
                    auc = roc_auc_score(Labels, [Probs[model_assignment[i]][i] for i in range(n_samples)])
                    avg_conf = np.mean([Confs[model_assignment[i]][i] for i in range(n_samples)])
                    return cutoffs, model_assignment, auc, cost, avg_conf
                
                cost = try_cost
                cur_cutoff = try_cutoff
                model_assignment[cur_model_idx[next_model_idx]] = m + 1
        cutoffs.append(cur_cutoff)
        
#     cutoffs.append(cur_cutoff)
    auc = roc_auc_score(Labels, [Probs[model_assignment[i]][i] for i in range(n_samples)])
    avg_conf = np.mean([Confs[model_assignment[i]][i] for i in range(n_samples)])
    
    asert_cost = [Costs[m] for m in model_assignment]
#     print('xxxx',np.sum(asert_cost), cost)
    return cutoffs, model_assignment, auc, cost, avg_conf
    
    

## Real data

In [3]:
def fit_cascade(validation_df, model_zoo, model_costs, cost_constraint, confidence='probability', cutoff_min_bound = 0, cutoff_max_bound = 0.97, nbins=20):
    '''
    confidence: probability, entropy
    '''    
    
    labels = np.array(validation_df['y_true'])
    N = len(labels)
    confs = []
    probs =[]

    for model,c in zip(model_zoo,model_costs):
        ps = validation_df[f'y_pred_{model}']
        probs.append(ps)
        if confidence == 'probability':
            cs = [max(p, 1-p) for p in ps]
        else:
            cs = [1 + p*np.log(p) + (1-p)*np.log(1-p) for p in ps]
            
        confs.append(cs)
#         print(roc_auc_score(labels, ps), c, np.mean(cs), f'[{np.min(cs)}, {np.quantile(cs,0.95)}, {np.max(cs)}]')
    confs = np.array(confs)
    probs = np.array(probs)
    
    cutoffs, model_assignment, auc, cost, avg_conf = HardGateClassifier(confs, probs, labels, model_costs, C = cost_constraint*N, cutoff_min_bound=cutoff_min_bound, cutoff_max_bound = cutoff_max_bound, nbins=nbins)
    return cutoffs, model_assignment, auc, cost*1./N, avg_conf

In [4]:
def eval_cascade(test_df, model_zoo, model_costs, cutoffs, confidence='probability'):
    
    labels = np.array(test_df['y_true'])
    N = len(test_df)
    
    model_assignment = np.zeros(N)
    for i in range(len(cutoffs)):
        cur_model_indices = np.where(model_assignment == i)[0]
        cur_pred = np.array(test_df[f'y_pred_{model_zoo[i]}'])[cur_model_indices]
        next_model_indices = np.where(cur_pred < cutoffs[i])[0]
        model_assignment[cur_model_indices[next_model_indices]] = i+1
    
    ps = [test_df.loc[i, f'y_pred_{model_zoo[int(model_assignment[i])]}'] for i in range(N)]
    if confidence == 'probability':
        cs = [max(p, 1-p) for p in ps]
    else:
        cs = [1 + p*np.log(p) + (1-p)*np.log(1-p) for p in ps]
    costs = [model_costs[int(model_assignment[i])] for i in range(N)]
    return model_assignment, roc_auc_score(labels, ps), np.mean(costs), np.mean(cs)

## Load full data

In [5]:
full_validation_df = pd.read_csv('./prediction_on_validation_results/single_models.csv')
full_test_df = pd.read_csv('./prediction_full_test_results/single_models.csv')
full_test_df.head(5)

Unnamed: 0,id,t_since_intime,y_sepsis,y_pred_sepsis_rnn_vitals_1hr,y_pred_sepsis_rnn_vitals_2hr,y_pred_sepsis_rnn_vitals_6hr,y_pred_sepsis_rnn_vitals_12hr,y_pred_sepsis_rnn_vitals_labs_1hr,y_pred_sepsis_rnn_vitals_labs_2hr,y_pred_sepsis_rnn_vitals_labs_6hr,...,y_pred_shock_rnn_vitals_labs_6hr,y_pred_shock_rnn_vitals_labs_12hr,y_pred_shock_rnn_vitals_labs_csu_1hr,y_pred_shock_rnn_vitals_labs_csu_2hr,y_pred_shock_rnn_vitals_labs_csu_6hr,y_pred_shock_rnn_vitals_labs_csu_12hr,y_pred_shock_rnn_vitals_labs_csu_med_1hr,y_pred_shock_rnn_vitals_labs_csu_med_2hr,y_pred_shock_rnn_vitals_labs_csu_med_6hr,y_pred_shock_rnn_vitals_labs_csu_med_12hr
0,200028,12,0,0.532764,0.561392,0.225359,0.006933,0.465791,0.683791,0.360447,...,0.713949,0.005605,0.897003,0.928212,0.585692,0.000411,0.96615,0.897094,0.713246,0.005291
1,200028,13,0,0.588746,0.457782,0.2308,0.039564,0.403893,0.738181,0.352118,...,0.721657,0.024613,0.867715,0.931123,0.563915,0.002775,0.96135,0.885748,0.753013,0.02744
2,200028,14,0,0.596804,0.405117,0.211669,0.074488,0.37222,0.516046,0.337053,...,0.715716,0.014446,0.732507,0.861759,0.537593,0.002519,0.939469,0.822398,0.766485,0.042255
3,200028,15,0,0.519611,0.367189,0.198843,0.10684,0.307604,0.394215,0.302898,...,0.686792,0.008925,0.548437,0.641073,0.48743,0.001834,0.624751,0.61027,0.75149,0.040753
4,200028,16,0,0.445124,0.324467,0.215854,0.131807,0.315257,0.300545,0.266929,...,0.656907,0.007048,0.751968,0.666474,0.494684,0.001416,0.891648,0.61855,0.745262,0.038273


In [31]:
# split train_validation test_validation 
shock_iids = np.unique(full_validation_df[full_validation_df['y_shock'] == 1]['id'])
sepsis_iids = np.unique(np.array(full_validation_df[full_validation_df['y_sepsis'] == 1]['id']))

test_shock_iids = set(np.random.choice(shock_iids, int(len(shock_iids) * 0.2)))
test_sepis_iids = set(np.random.choice(sepsis_iids, int(len(sepsis_iids) * 0.2)))
test_iids = set(list(test_shock_iids) + list(test_sepis_iids))
print(len(test_shock_iids), len(test_sepis_iids), len(test_iids))

5 26 30


## Fit Stage-1

In [32]:
validation_ml_results_dir = '../ml/prediction_on_validation_results/bk/'
model_ml_zoo = ['DecisionTree_vitals_labs']
model_ml_cost = [1]

model = model_ml_zoo[0]
validation_df = pd.read_csv(f'{validation_ml_results_dir}{model}.csv')
validation_df = validation_df[['id','t_since_intime','y_pred','y_true']]
validation_df = validation_df.rename(columns={'y_pred': f'y_pred_{model}'})
validation_df = validation_df.set_index(['id','t_since_intime','y_true'])

for model in model_ml_zoo[1:]:
    model_df = pd.read_csv(f'{validation_ml_results_dir}{model}.csv')
    model_df = model_df[['id','t_since_intime','y_pred','y_true']]
    model_df = model_df.rename(columns={'y_pred': f'y_pred_{model}'})
    model_df = model_df.set_index(['id','t_since_intime','y_true'])
    validation_df = validation_df.join(model_df, how='inner')


validation_results_dir = '../rnn/sepsis/prediction_on_validation_results/bk/'
model_rnn_zoo = ['rnn_vitals_labs_csu_1hr','rnn_vitals_labs_csu_2hr','rnn_vitals_labs_csu_6hr']
model_rnn_cost = [80,90,120]

for model in model_rnn_zoo:
    model_df = pd.read_csv(f'{validation_results_dir}{model}.split_validation.csv')
    model_df = model_df[['id','t_since_intime','y_pred','y_true']]
    model_df = model_df.rename(columns={'y_pred': f'y_pred_{model}'})
    model_df = model_df.set_index(['id','t_since_intime','y_true'])
    validation_df = validation_df.join(model_df, how='inner')

validation_df = validation_df.reset_index(drop=False)

mask = validation_df['id'].isin(test_iids)

train_validation_df = validation_df.loc[~mask].reset_index(drop=True)
test_validation_df = validation_df.loc[mask].reset_index(drop=True)

model_zoo = model_ml_zoo + model_rnn_zoo
model_costs = model_ml_cost + model_rnn_cost
model_aucs = []
model_aucs_train = []
model_aucs_test = []
for m in model_zoo:
    auc = roc_auc_score(validation_df['y_true'], validation_df[f'y_pred_{m}'])
    auc_train = roc_auc_score(train_validation_df['y_true'], train_validation_df[f'y_pred_{m}'])
    auc_test = roc_auc_score(test_validation_df['y_true'], test_validation_df[f'y_pred_{m}'])
    model_aucs.append(auc)
    model_aucs_train.append(auc_train)
    model_aucs_test.append(auc_test)
    
model_profiles = pd.DataFrame(list(zip(model_zoo, model_aucs, model_aucs_train, model_aucs_test, model_costs)), 
                              columns=['Model','Validation_AUC','Train_validation_AUC','Test_validation_AUC','Cost'])
model_profiles = model_profiles.sort_values(by=['Validation_AUC'])
model_profiles

Unnamed: 0,Model,Validation_AUC,Train_validation_AUC,Test_validation_AUC,Cost
0,DecisionTree_vitals_labs,0.814762,0.82703,0.500821,1
1,rnn_vitals_labs_csu_1hr,0.887411,0.898747,0.536494,80
2,rnn_vitals_labs_csu_2hr,0.891644,0.902944,0.568318,90
3,rnn_vitals_labs_csu_6hr,0.899808,0.910419,0.602585,120


In [35]:
model_zoo = model_profiles['Model'].values
model_costs = model_profiles['Cost'].values
cost_constraint = 120
cutoff_max_bound = 0.985 # tunable

print('Confidence: probability')
print('-------------------------')
cutoffs, train_model_assignment, auc, cost, avg_conf = fit_cascade(train_validation_df, model_zoo, model_costs, cost_constraint=cost_constraint, 
                                                             confidence='probability', cutoff_max_bound = cutoff_max_bound, nbins=20)
print(auc, cost, avg_conf)
print(cutoffs)

print('-------')
test_results_dir = '../rnn/prediction_on_test_results/'
test_model_assignment, auc, cost, avg_conf = eval_cascade(test_validation_df, model_zoo, model_costs, cutoffs, confidence='probability')
print(auc, cost, avg_conf)
print([sum(test_model_assignment==i) for i in range(len(model_zoo))])


Confidence: probability
-------------------------
0.9108510670556272 119.27474773578922 0.8186892100218615
[0.985, 0.985, 0.985]
-------
0.5950060836623116 115.93561368209255 0.8240489849682066
[0, 39, 150, 1302]


In [36]:
print('Confidence: probability')
print('-------------------------')
results = []
for c in np.linspace(1,120,20):
    cutoffs, model_assignment, auc, cost, avg_conf = fit_cascade(train_validation_df, model_zoo, model_costs, cost_constraint=c, confidence='probability', cutoff_max_bound = cutoff_max_bound)
    model_assignment, auc, cost, avg_conf = eval_cascade(test_validation_df, model_zoo, model_costs, cutoffs, confidence='probability')
    n_model = len(np.unique(model_assignment))
    
    cutoffs_full = [np.nan for i in range(len(model_zoo)-1)]
    cutoffs_full[:len(cutoffs)] = cutoffs
    results.append([c, auc, cost, avg_conf,n_model] + cutoffs_full)
results_df = pd.DataFrame(results, columns=['C','AUC','Cost','Avg Conf','Num Models'] + [f'cutoff_{i}' for i in range(len(cutoffs_full))])
results_df

Confidence: probability
-------------------------


Unnamed: 0,C,AUC,Cost,Avg Conf,Num Models,cutoff_0,cutoff_1,cutoff_2
0,1.0,0.509633,55.468142,0.789879,2,0.523165,,
1,7.263158,0.509633,55.468142,0.789879,2,0.523165,,
2,13.526316,0.517977,58.488263,0.797553,2,0.620393,,
3,19.789474,0.517977,58.488263,0.797553,2,0.620393,,
4,26.052632,0.517977,58.488263,0.797553,2,0.693315,,
5,32.315789,0.517977,58.488263,0.797553,2,0.693315,,
6,38.578947,0.517977,58.488263,0.797553,2,0.693315,,
7,44.842105,0.517977,58.488263,0.797553,2,0.693315,,
8,51.105263,0.513967,58.80617,0.797961,2,0.766236,,
9,57.368421,0.513967,58.80617,0.797961,2,0.839157,,


## Fit Stage-2

In [37]:
validation_results_dir = '../rnn/septic_shock/prediction_on_validation_results/bk/'
shock_model_rnn_zoo = ['rnn_vitals_labs_1hr','rnn_vitals_labs_csu_2hr','rnn_vitals_labs_csu_med_6hr']
shock_model_rnn_cost = [80,90,120]

model = shock_model_rnn_zoo[0]
shock_validation_df = pd.read_csv(f'{validation_results_dir}{model}.split_validation.csv')
shock_validation_df = shock_validation_df[['id','t_since_intime','y_pred','y_true']]
shock_validation_df = shock_validation_df.rename(columns={'y_pred': f'y_pred_{model}'})
shock_validation_df = shock_validation_df.set_index(['id','t_since_intime','y_true'])

for model in shock_model_rnn_zoo[1:]:
    model_df = pd.read_csv(f'{validation_results_dir}{model}.split_validation.csv')
    model_df = model_df[['id','t_since_intime','y_pred','y_true']]
    model_df = model_df.rename(columns={'y_pred': f'y_pred_{model}'})
    model_df = model_df.set_index(['id','t_since_intime','y_true'])
    shock_validation_df = shock_validation_df.join(model_df, how='inner')

shock_validation_df = shock_validation_df.reset_index(drop=False)
mask = shock_validation_df['id'].isin(test_iids)

train_shock_validation_df = shock_validation_df.loc[~mask].reset_index(drop=True)
test_shock_validation_df = shock_validation_df.loc[mask].reset_index(drop=True)

model_aucs = []
model_aucs_train = []
model_aucs_test = []
for m in shock_model_rnn_zoo:
    auc = roc_auc_score(shock_validation_df['y_true'], shock_validation_df[f'y_pred_{m}'])
    auc_train = roc_auc_score(train_shock_validation_df['y_true'], train_shock_validation_df[f'y_pred_{m}'])
    auc_test = roc_auc_score(test_shock_validation_df['y_true'], test_shock_validation_df[f'y_pred_{m}'])
    model_aucs.append(auc)
    model_aucs_train.append(auc_train)
    model_aucs_test.append(auc_test)
    
shock_model_profiles = pd.DataFrame(list(zip(shock_model_rnn_zoo, model_aucs, model_aucs_train, model_aucs_test, shock_model_rnn_cost)), 
                              columns=['Model','Validation_AUC','Train_validation_AUC','Test_validation_AUC','Cost'])
shock_model_profiles = shock_model_profiles.sort_values(by=['Validation_AUC'])
shock_model_profiles

Unnamed: 0,Model,Validation_AUC,Train_validation_AUC,Test_validation_AUC,Cost
0,rnn_vitals_labs_1hr,0.885872,0.914356,0.833961,80
1,rnn_vitals_labs_csu_2hr,0.934098,0.940919,0.913,90
2,rnn_vitals_labs_csu_med_6hr,0.961944,0.967156,0.95977,120


In [42]:
shock_model_zoo = shock_model_profiles['Model'].values
shock_model_costs = shock_model_profiles['Cost'].values
cost_constraint = 90
shock_cutoff_max_bound = 0.8

print('Confidence: probability')
print('-------------------------')
cutoffs, train_model_assignment, auc, cost, avg_conf = fit_cascade(train_shock_validation_df, shock_model_zoo, shock_model_costs, cost_constraint=cost_constraint, 
                                                             confidence='Probability', cutoff_max_bound = shock_cutoff_max_bound, nbins=20)
print(auc, cost, avg_conf)
print(cutoffs)

print('-------')
test_results_dir = '../rnn/prediction_on_test_results/'
test_model_assignment, auc, cost, avg_conf = eval_cascade(test_shock_validation_df, shock_model_zoo, shock_model_costs, cutoffs, confidence='Probability')
print(auc, cost, avg_conf)
print([sum(test_model_assignment==i) for i in range(len(shock_model_zoo))])


Confidence: probability
-------------------------
0.9376948173348175 86.82389274935075 0.9051109940912614
[0.8, 0.8]
-------
0.9592958513990569 119.23018292682927 0.9150446122533639
[14, 15, 1283]


In [43]:
print('Confidence: probability')
print('-------------------------')
shock_results = []
for c in np.linspace(1,120,20):
    cutoffs, model_assignment, auc, cost, avg_conf = fit_cascade(train_shock_validation_df, shock_model_zoo, shock_model_costs, cost_constraint=c, confidence='Probability', 
                                                                 cutoff_max_bound = shock_cutoff_max_bound, nbins=20)
    model_assignment, auc, cost, avg_conf = eval_cascade(test_shock_validation_df, shock_model_zoo, shock_model_costs, cutoffs, confidence='Probability')
    n_model = len(np.unique(model_assignment))
    
    cutoffs_full = [np.nan for i in range(len(shock_model_zoo)-1)]
    cutoffs_full[:len(cutoffs)] = cutoffs
    shock_results.append([c, auc, cost, avg_conf,n_model] + cutoffs_full)
shock_results_df = pd.DataFrame(shock_results, columns=['C','AUC','Cost','Avg Conf','Num Models'] + [f'cutoff_{i}' for i in range(len(cutoffs_full))])
shock_results_df

Confidence: probability
-------------------------


Unnamed: 0,C,AUC,Cost,Avg Conf,Num Models,cutoff_0,cutoff_1
0,1.0,0.912905,89.481707,0.8934,2,0.306853,
1,7.263158,0.912905,89.481707,0.8934,2,0.306853,
2,13.526316,0.912905,89.481707,0.8934,2,0.306853,
3,19.789474,0.912905,89.481707,0.8934,2,0.306853,
4,26.052632,0.912905,89.481707,0.8934,2,0.306853,
5,32.315789,0.912905,89.481707,0.8934,2,0.306853,
6,38.578947,0.912905,89.481707,0.8934,2,0.306853,
7,44.842105,0.912905,89.481707,0.8934,2,0.306853,
8,51.105263,0.912905,89.481707,0.8934,2,0.306853,
9,57.368421,0.912905,89.481707,0.8934,2,0.306853,


## Inference on full 2 stages test data

In [44]:
model_column_names = [f'y_pred_sepsis_{m}' for m in model_zoo] + [f'y_pred_shock_{m}' for m in shock_model_zoo]
ss_test_df = full_test_df[['id','t_since_intime','y_sepsis','y_shock'] + model_column_names]
ss_test_df = ss_test_df.set_index(['id'])
ss_test_df.head(5)

Unnamed: 0_level_0,t_since_intime,y_sepsis,y_shock,y_pred_sepsis_DecisionTree_vitals_labs,y_pred_sepsis_rnn_vitals_labs_csu_1hr,y_pred_sepsis_rnn_vitals_labs_csu_2hr,y_pred_sepsis_rnn_vitals_labs_csu_6hr,y_pred_shock_rnn_vitals_labs_1hr,y_pred_shock_rnn_vitals_labs_csu_2hr,y_pred_shock_rnn_vitals_labs_csu_med_6hr
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
200028,12,0,0,0.157858,0.529562,0.337834,0.323482,0.962371,0.928212,0.713246
200028,13,0,0,0.157858,0.237115,0.256148,0.320425,0.958531,0.931123,0.753013
200028,14,0,0,0.157858,0.358455,0.174439,0.362054,0.921778,0.861759,0.766485
200028,15,0,0,0.157858,0.134756,0.134621,0.349392,0.839972,0.641073,0.75149
200028,16,0,0,0.157858,0.213762,0.124174,0.373701,0.934275,0.666474,0.745262


In [45]:
predcut_result_df = pd.read_csv('../evaluation/sepsis_thresholds_prob2label.csv')
predcut_shock_result_df = pd.read_csv('../evaluation/shock_thresholds_prob2label.csv')
predcut_result_df = predcut_result_df.set_index('model')
predcut_shock_result_df = predcut_shock_result_df.set_index('model')
model_profiles

Unnamed: 0,Model,Validation_AUC,Train_validation_AUC,Test_validation_AUC,Cost
0,DecisionTree_vitals_labs,0.814762,0.82703,0.500821,1
1,rnn_vitals_labs_csu_1hr,0.887411,0.898747,0.536494,80
2,rnn_vitals_labs_csu_2hr,0.891644,0.902944,0.568318,90
3,rnn_vitals_labs_csu_6hr,0.899808,0.910419,0.602585,120


In [46]:
shock_model_profiles

Unnamed: 0,Model,Validation_AUC,Train_validation_AUC,Test_validation_AUC,Cost
0,rnn_vitals_labs_1hr,0.885872,0.914356,0.833961,80
1,rnn_vitals_labs_csu_2hr,0.934098,0.940919,0.913,90
2,rnn_vitals_labs_csu_med_6hr,0.961944,0.967156,0.95977,120


In [51]:
# stateful
def cascade_stateful_prediction(data_df, sepsis_cutoffs, shock_cutoffs):
    results = []

    for iid in tqdm(np.unique(data_df.index)):
        iid_data_df = data_df.loc[iid].sort_values(by=['t_since_intime'])
        iid_data_df = iid_data_df.reset_index(drop=True)
        sepsis_model_level = 0
        shock_model_level = 0
        sepsis = False
        shock = False
        for i in range(len(iid_data_df)):
            y_shock_pred = np.nan
            y_shock_pred_label = 0
            shock_pred_cut2label = np.nan

            y_sepsis_pred = np.nan
            y_sepsis_pred_label = 0
            sepsis_pred_cut2label = np.nan

            if not sepsis:
                # still using sepsis model
                model = f'sepsis_{model_zoo[sepsis_model_level]}'
                cost = model_costs[sepsis_model_level]
                y_sepsis_pred = iid_data_df.loc[i, f'y_pred_{model}']
                y_sepsis_pred_label = 0
                sepsis_pred_cut2label = predcut_result_df.loc[model_zoo[sepsis_model_level],'cutoff']
                if y_sepsis_pred > sepsis_pred_cut2label:
                    sepsis = True
                    y_sepsis_pred_label = 1

                if sepsis_model_level < len(sepsis_cutoffs):
                    if max(y_sepsis_pred, 1-y_sepsis_pred) < sepsis_cutoffs[sepsis_model_level]:
                        sepsis_model_level += 1

            else:
                y_sepsis_pred_label = 1
                # Now use shock model
                model = f'shock_{shock_model_zoo[shock_model_level]}'
                cost = shock_model_costs[shock_model_level]
                y_shock_pred = iid_data_df.loc[i, f'y_pred_{model}']
                y_shock_pred_label = 0
                shock_pred_cut2label = predcut_shock_result_df.loc[shock_model_zoo[shock_model_level],'cutoff']
                if y_shock_pred > shock_pred_cut2label:
                    shock = True
                    y_shock_pred_label = 1

                if shock_model_level < len(shock_cutoffs):
                    if max(y_shock_pred, 1-y_shock_pred) < shock_cutoffs[shock_model_level]:
                        shock_model_level += 1

            results.append([iid, iid_data_df.loc[i, 't_since_intime'], iid_data_df.loc[i, 'y_sepsis'],
                            iid_data_df.loc[i, 'y_shock'], y_sepsis_pred_label, y_shock_pred_label,
                           model, cost, y_sepsis_pred, sepsis_pred_cut2label,
                           y_shock_pred, shock_pred_cut2label])


    pred_data_df = pd.DataFrame(results, columns=['id','t_since_intime','y_sepsis', 'y_shock','y_sepsis_pred', 'y_shock_pred',
                                                          'model_assign','cost','y_sepsis_prob','sepsis_prob_cut2label', 'y_shock_prob', 'shock_prob_cut2label'])
    return pred_data_df

In [56]:
# stateful
def cascade_stateless_prediction(data_df, sepsis_cutoffs, shock_cutoffs):
    results = []

    for iid in tqdm(np.unique(data_df.index)):
        iid_data_df = data_df.loc[iid].sort_values(by=['t_since_intime'])
        iid_data_df = iid_data_df.reset_index(drop=True)
        
        for i in range(len(iid_data_df)):
            sepsis_model_level = 0
            shock_model_level = 0
        
            y_shock_pred = np.nan
            y_shock_pred_label = 0
            shock_pred_cut2label = np.nan

            y_sepsis_pred = np.nan
            y_sepsis_pred_label = 0
            sepsis_pred_cut2label = np.nan

            while sepsis_model_level <  len(sepsis_cutoffs):
                # Try sepsis model first
                if max(y_sepsis_pred, 1-y_sepsis_pred) < sepsis_cutoffs[sepsis_model_level]:
                        sepsis_model_level += 1
                else:
                    break
                    
            model = f'sepsis_{model_zoo[sepsis_model_level]}'
            y_sepsis_pred = iid_data_df.loc[i, f'y_pred_{model}']
            sepsis_pred_cut2label = predcut_result_df.loc[model_zoo[sepsis_model_level],'cutoff']
            if y_sepsis_pred > sepsis_pred_cut2label:
                y_sepsis_pred_label = 1
                # Now use shock model
                while shock_model_level < len(shock_cutoffs):
                    if max(y_shock_pred, 1-y_shock_pred) < shock_cutoffs[shock_model_level]:
                        shock_model_level += 1
                    else:
                        break
                    
                model = f'shock_{shock_model_zoo[shock_model_level]}'
                cost = shock_model_costs[shock_model_level]
                y_shock_pred = iid_data_df.loc[i, f'y_pred_{model}']
                y_shock_pred_label = 0
                shock_pred_cut2label = predcut_shock_result_df.loc[shock_model_zoo[shock_model_level],'cutoff']
                if y_shock_pred > shock_pred_cut2label:
                    y_shock_pred_label = 1
            else:
                y_sepsis_pred_label = 0
                cost = model_costs[sepsis_model_level] 

            results.append([iid, iid_data_df.loc[i, 't_since_intime'], iid_data_df.loc[i, 'y_sepsis'],
                            iid_data_df.loc[i, 'y_shock'], y_sepsis_pred_label, y_shock_pred_label,
                           model, cost, y_sepsis_pred, sepsis_pred_cut2label,
                           y_shock_pred, shock_pred_cut2label])


    pred_data_df = pd.DataFrame(results, columns=['id','t_since_intime','y_sepsis', 'y_shock','y_sepsis_pred', 'y_shock_pred',
                                                          'model_assign','cost','y_sepsis_prob','sepsis_prob_cut2label', 'y_shock_prob', 'shock_prob_cut2label'])
    return pred_data_df

In [53]:
print('Confidence: probability')
print('-------------------------')
results = []
for c in np.linspace(1,120,10):
    c = round(c, 2)
    sepsis_cutoffs, model_assignment, auc, cost, avg_conf = fit_cascade(validation_df, model_zoo, model_costs, 
                                                                        cost_constraint=c, confidence='probability', 
                                                                        cutoff_max_bound = cutoff_max_bound)
    
    shock_cutoffs, model_assignment, auc, cost, avg_conf = fit_cascade(shock_validation_df, shock_model_zoo, shock_model_costs, 
                                                                       cost_constraint=c, confidence='Probability', 
                                                                       cutoff_max_bound = shock_cutoff_max_bound, nbins=20)
    
    pred_data_df = cascade_stateful_prediction(ss_test_df, sepsis_cutoffs, shock_cutoffs)
    pred_data_df.to_csv(f'./prediction_full_test_results/hard_gate_cascade.{c}.stateful.csv', index=False)
    results.append([c, sepsis_cutoffs, shock_cutoffs])
    
print(results)
pickle.dump(results, open('./prediction_full_test_results/c_cutoffs_mapping.stateful.pkl','wb'))

Confidence: probability
-------------------------


100%|██████████| 1909/1909 [00:04<00:00, 393.36it/s]
100%|██████████| 1909/1909 [00:04<00:00, 392.25it/s]
100%|██████████| 1909/1909 [00:04<00:00, 389.61it/s]
100%|██████████| 1909/1909 [00:04<00:00, 400.02it/s]
100%|██████████| 1909/1909 [00:04<00:00, 407.18it/s]
100%|██████████| 1909/1909 [00:04<00:00, 397.67it/s]
100%|██████████| 1909/1909 [00:04<00:00, 390.65it/s]
100%|██████████| 1909/1909 [00:04<00:00, 411.76it/s]
100%|██████████| 1909/1909 [00:04<00:00, 408.20it/s]
100%|██████████| 1909/1909 [00:04<00:00, 422.43it/s]


[[1.0, [0.5231650182196773], [0.3068528411010638]], [14.22, [0.6203934354365873], [0.3068528411010638]], [27.44, [0.6933147483492699], [0.3068528411010638]], [40.67, [0.6933147483492699], [0.3068528411010638]], [53.89, [0.839157374174635], [0.3068528411010638]], [67.11, [0.839157374174635], [0.3068528411010638]], [80.33, [0.985, 0.5255384845482676], [0.3068528411010638]], [93.56, [0.985, 0.985, 0.7042170655727387], [0.8, 0.8]], [106.78, [0.985, 0.985, 0.8573713934421539], [0.8, 0.8]], [120.0, [0.985, 0.985, 0.985], [0.8, 0.8]]]


In [57]:
print('Confidence: probability')
print('-------------------------')
results = []
for c in np.linspace(1,120,10):
    c = round(c,2)
    sepsis_cutoffs, model_assignment, auc, cost, avg_conf = fit_cascade(validation_df, model_zoo, model_costs,
                                                                        cost_constraint=c, confidence='probability', 
                                                                        cutoff_max_bound = cutoff_max_bound)
    
    shock_cutoffs, model_assignment, auc, cost, avg_conf = fit_cascade(shock_validation_df, shock_model_zoo,
                                                                       shock_model_costs, cost_constraint=c, 
                                                                       confidence='Probability', 
                                                                       cutoff_max_bound = shock_cutoff_max_bound, nbins=20)
    
    pred_data_df = cascade_stateless_prediction(ss_test_df, sepsis_cutoffs, shock_cutoffs)
    pred_data_df.to_csv(f'./prediction_full_test_results/hard_gate_cascade.{c}.stateless.csv', index=False)
    results.append([c, sepsis_cutoffs, shock_cutoffs])
    
print(results)
pickle.dump(results, open('./prediction_full_test_results/c_cutoffs_mapping.stateless.pkl','wb'))

Confidence: probability
-------------------------




  0%|          | 0/1909 [00:00<?, ?it/s][A[A

  2%|▏         | 37/1909 [00:00<00:05, 362.31it/s][A[A

  4%|▎         | 71/1909 [00:00<00:05, 353.01it/s][A[A

  6%|▌         | 114/1909 [00:00<00:04, 371.85it/s][A[A

  8%|▊         | 156/1909 [00:00<00:04, 383.25it/s][A[A

 10%|█         | 200/1909 [00:00<00:04, 397.92it/s][A[A

 13%|█▎        | 241/1909 [00:00<00:04, 399.60it/s][A[A

 15%|█▍        | 277/1909 [00:00<00:04, 377.69it/s][A[A

 17%|█▋        | 316/1909 [00:00<00:04, 380.37it/s][A[A

 19%|█▉        | 359/1909 [00:00<00:03, 392.36it/s][A[A

 21%|██        | 398/1909 [00:01<00:03, 388.17it/s][A[A

 23%|██▎       | 437/1909 [00:01<00:04, 359.71it/s][A[A

 25%|██▍       | 475/1909 [00:01<00:03, 364.25it/s][A[A

 27%|██▋       | 521/1909 [00:01<00:03, 387.82it/s][A[A

 29%|██▉       | 561/1909 [00:01<00:04, 333.25it/s][A[A

 31%|███▏      | 597/1909 [00:01<00:03, 340.25it/s][A[A

 33%|███▎      | 636/1909 [00:01<00:03, 352.63it/s][A[A

 35%|███

 88%|████████▊ | 1681/1909 [00:04<00:00, 374.40it/s][A[A

 90%|█████████ | 1721/1909 [00:04<00:00, 380.60it/s][A[A

 92%|█████████▏| 1763/1909 [00:04<00:00, 388.75it/s][A[A

 94%|█████████▍| 1802/1909 [00:04<00:00, 307.12it/s][A[A

 97%|█████████▋| 1846/1909 [00:04<00:00, 335.75it/s][A[A

100%|██████████| 1909/1909 [00:05<00:00, 381.02it/s][A[A


  0%|          | 0/1909 [00:00<?, ?it/s][A[A

  2%|▏         | 37/1909 [00:00<00:05, 366.54it/s][A[A

  4%|▍         | 78/1909 [00:00<00:04, 376.52it/s][A[A

  6%|▌         | 117/1909 [00:00<00:04, 379.20it/s][A[A

  8%|▊         | 159/1909 [00:00<00:04, 386.07it/s][A[A

 11%|█         | 202/1909 [00:00<00:04, 397.92it/s][A[A

 12%|█▏        | 236/1909 [00:00<00:04, 371.27it/s][A[A

 14%|█▍        | 270/1909 [00:00<00:04, 336.64it/s][A[A

 16%|█▌        | 302/1909 [00:00<00:04, 326.52it/s][A[A

 18%|█▊        | 343/1909 [00:00<00:04, 346.27it/s][A[A

 20%|█▉        | 377/1909 [00:01<00:04, 344.28it/s][A[A

 22

 71%|███████   | 1351/1909 [00:03<00:01, 284.60it/s][A[A

 73%|███████▎  | 1395/1909 [00:04<00:01, 315.50it/s][A[A

 75%|███████▌  | 1439/1909 [00:04<00:01, 344.36it/s][A[A

 78%|███████▊  | 1482/1909 [00:04<00:01, 363.96it/s][A[A

 80%|███████▉  | 1520/1909 [00:04<00:01, 348.08it/s][A[A

 82%|████████▏ | 1564/1909 [00:04<00:00, 368.09it/s][A[A

 84%|████████▍ | 1602/1909 [00:04<00:00, 363.42it/s][A[A

 86%|████████▌ | 1641/1909 [00:04<00:00, 370.44it/s][A[A

 88%|████████▊ | 1679/1909 [00:04<00:00, 369.85it/s][A[A

 90%|████████▉ | 1717/1909 [00:04<00:00, 354.04it/s][A[A

 92%|█████████▏| 1753/1909 [00:05<00:00, 344.38it/s][A[A

 94%|█████████▎| 1788/1909 [00:05<00:00, 335.44it/s][A[A

 96%|█████████▌| 1825/1909 [00:05<00:00, 343.47it/s][A[A

 98%|█████████▊| 1867/1909 [00:05<00:00, 363.26it/s][A[A

100%|██████████| 1909/1909 [00:05<00:00, 347.67it/s][A[A


  0%|          | 0/1909 [00:00<?, ?it/s][A[A

  2%|▏         | 40/1909 [00:00<00:04, 394.83it/s]

 49%|████▉     | 938/1909 [00:02<00:02, 410.37it/s][A[A

 51%|█████▏    | 980/1909 [00:02<00:02, 386.34it/s][A[A

 54%|█████▎    | 1024/1909 [00:02<00:02, 400.36it/s][A[A

 56%|█████▌    | 1065/1909 [00:03<00:02, 401.66it/s][A[A

 58%|█████▊    | 1113/1909 [00:03<00:01, 420.46it/s][A[A

 61%|██████    | 1156/1909 [00:03<00:01, 408.98it/s][A[A

 63%|██████▎   | 1198/1909 [00:03<00:01, 404.65it/s][A[A

 65%|██████▍   | 1239/1909 [00:03<00:01, 347.13it/s][A[A

 67%|██████▋   | 1276/1909 [00:03<00:02, 293.71it/s][A[A

 69%|██████▊   | 1308/1909 [00:03<00:02, 283.72it/s][A[A

 70%|███████   | 1342/1909 [00:03<00:01, 296.04it/s][A[A

 73%|███████▎  | 1387/1909 [00:04<00:01, 329.56it/s][A[A

 75%|███████▍  | 1430/1909 [00:04<00:01, 353.28it/s][A[A

 77%|███████▋  | 1471/1909 [00:04<00:01, 365.64it/s][A[A

 79%|███████▉  | 1510/1909 [00:04<00:01, 361.25it/s][A[A

 81%|████████  | 1548/1909 [00:04<00:00, 365.78it/s][A[A

 83%|████████▎ | 1586/1909 [00:04<00:00, 3

[[1.0, [0.5231650182196773], [0.3068528411010638]], [14.22, [0.6203934354365873], [0.3068528411010638]], [27.44, [0.6933147483492699], [0.3068528411010638]], [40.67, [0.6933147483492699], [0.3068528411010638]], [53.89, [0.839157374174635], [0.3068528411010638]], [67.11, [0.839157374174635], [0.3068528411010638]], [80.33, [0.985, 0.5255384845482676], [0.3068528411010638]], [93.56, [0.985, 0.985, 0.7042170655727387], [0.8, 0.8]], [106.78, [0.985, 0.985, 0.8573713934421539], [0.8, 0.8]], [120.0, [0.985, 0.985, 0.985], [0.8, 0.8]]]
