## Author Response Period results and experiments

In [2]:
import os
import json
import glob
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

import metrics as mt
import compute_KL as ckl
import scipy.stats as stats
from scipy.stats import pearsonr

%load_ext autoreload
%autoreload 2

### Section 1: Original experiment values

In [6]:
# Read the hospital ID list (single column, no header row) and keep the
# first `n` IDs.
hospital_file = 'YAIB-cohorts/data/mortality24/eicu/above2000.txt'
df = pd.read_csv(hospital_file, header=None)
n = 12
hospital_ids = df[0].values[:n]
# Map each hospital ID to its position in `hospital_ids`.
hospital_dict = {hospital_id: idx for idx, hospital_id in enumerate(hospital_ids)}

In [5]:
def _load_npz(path):
    """Open a saved ``.npz`` results archive.

    ``allow_pickle=True`` is needed because the archives are later read with
    ``['AUC'].item()``, i.e. they hold pickled Python objects. Only use this
    on result files you produced yourself.
    """
    return np.load(path, allow_pickle=True)


hos = 443
# hospital
incl_ground_truth = True
# Map each hospital ID to its position in `hospital_ids` (which must already
# be defined by the cell that reads the hospital list).
hospital_dict = {hospital_id: idx for idx, hospital_id in enumerate(hospital_ids)}

# Raises KeyError if `hos` is not among the loaded hospital IDs.
hos_ind = hospital_dict[hos]
model = "LogisticRegression"

# Per-sample-size results for the single hospital `hos`.
n_avg = _load_npz(f"results/sequential-n/{hos}/{hos}-{model}-avg.npz")
n_std = _load_npz(f"results/sequential-n/{hos}/{hos}-{model}-std.npz")
# Distribution-shift results (file name fixes n1500).
dist_shift_avg = _load_npz(f"results/dist_shift/{model}-n1500-avg.npz")
dist_shift_std = _load_npz(f"results/dist_shift/{model}-n1500-std.npz")
# Results for the "best hospitals", "worst hospitals" and "mixture" runs.
sequential_best_avg = _load_npz(f"results/sequential-n/{hos}/{hos}-{model}-best-hospitals-avg.npz")
sequential_best_std = _load_npz(f"results/sequential-n/{hos}/{hos}-{model}-best-hospitals-std.npz")
sequential_worst_avg = _load_npz(f"results/sequential-n/{hos}/{hos}-{model}-worst-hospitals-avg.npz")
sequential_worst_std = _load_npz(f"results/sequential-n/{hos}/{hos}-{model}-worst-hospitals-std.npz")
mixture_avg = _load_npz(f"results/sequential-n/{hos}/{hos}-mixture-{model}-avg.npz")
mixture_std = _load_npz(f"results/sequential-n/{hos}/{hos}-mixture-{model}-std.npz")

In [6]:
def _auc_column(npz_file, column):
    """Build a one-column DataFrame from the dict stored under 'AUC'.

    The dict keys become the index; the values go into *column*.
    """
    return pd.DataFrame.from_dict(npz_file['AUC'].item(), orient='index', columns=[column])


# Single-hospital curve: mean AUC and its std, side by side.
avg_df = _auc_column(n_avg, 'AUC')
std_df = _auc_column(n_std, 'std')
plot_df = pd.concat([avg_df, std_df], axis=1)

# Move the index into an explicit 'n' column and tag rows with the hospital.
new_df = plot_df.reset_index(names='n')
new_df['train_hospital'] = hos

# One frame per run type: means first, then the matching std frames.
best_df = _auc_column(sequential_best_avg, 'Best3-AUC')
worst_df = _auc_column(sequential_worst_avg, 'Worst3-AUC')
mixture_df = _auc_column(mixture_avg, 'Mixture-AUC')
best_df_std = _auc_column(sequential_best_std, 'Best3-std')
worst_df_std = _auc_column(sequential_worst_std, 'Worst3-std')
mixture_df_std = _auc_column(mixture_std, 'Mixture-std')

In [7]:
# Put the three AUC columns and their three std columns into one table.
auc_and_std_frames = [
    best_df,
    worst_df,
    mixture_df,
    best_df_std,
    worst_df_std,
    mixture_df_std,
]
plot_df = pd.concat(auc_and_std_frames, axis=1)
# Same table with the index exposed as an 'n' column.
addition_df = plot_df.reset_index(names='n')

In [8]:
# Display the combined Best3 / Worst3 / Mixture table as the cell output.
plot_df

Unnamed: 0,Best3-AUC,Worst3-AUC,Mixture-AUC,Best3-std,Worst3-std,Mixture-std
3000,0.754913,0.732681,0.743667,0.008487,0.010288,0.010278
4500,0.755354,0.719812,0.747519,0.008113,0.009501,0.009649
6000,0.764739,0.738016,0.752954,0.007506,0.00988,0.009281


In [9]:
# NOTE(review): presumably sqrt(number of repeated runs), used to turn a
# standard deviation into a standard error of the mean — confirm that the
# results were averaged over 5 runs.
sem_ratio = np.sqrt(5)

### Section 2: Extract new data

In [20]:
def _load_npz(path):
    """Open a saved ``.npz`` results archive (pickled object arrays allowed)."""
    return np.load(path, allow_pickle=True)


def _auc_column(npz_file, column):
    """Build a one-column DataFrame from the dict stored under 'AUC'.

    The dict keys become the index; the values go into *column*.
    """
    return pd.DataFrame.from_dict(npz_file['AUC'].item(), orient='index', columns=[column])


file_n = 1000
model = "LogisticRegression"

# The single-hospital curve is read from `sequential-n/` (no `file_n` suffix);
# only the best/worst/mixture runs come from the `sequential-n{file_n}/` tree.
n_avg = _load_npz(f"results/sequential-n/{hos}/{hos}-{model}-avg.npz")
n_std = _load_npz(f"results/sequential-n/{hos}/{hos}-{model}-std.npz")
sequential_best_avg = _load_npz(f"results/sequential-n{file_n}/{hos}/{hos}-{model}-best-hospitals-avg.npz")
sequential_best_std = _load_npz(f"results/sequential-n{file_n}/{hos}/{hos}-{model}-best-hospitals-std.npz")
sequential_worst_avg = _load_npz(f"results/sequential-n{file_n}/{hos}/{hos}-{model}-worst-hospitals-avg.npz")
sequential_worst_std = _load_npz(f"results/sequential-n{file_n}/{hos}/{hos}-{model}-worst-hospitals-std.npz")
mixture_avg = _load_npz(f"results/sequential-n{file_n}/{hos}/{hos}-mixture-{model}-avg.npz")
mixture_std = _load_npz(f"results/sequential-n{file_n}/{hos}/{hos}-mixture-{model}-std.npz")

# Single-hospital curve: mean AUC and its std, then an explicit 'n' column.
avg_df = _auc_column(n_avg, 'AUC')
std_df = _auc_column(n_std, 'std')
plot_df = pd.concat([avg_df, std_df], axis=1)
new_df = plot_df.reset_index(names='n')
new_df['train_hospital'] = hos

# Best / worst / mixture runs: means first, then the matching std frames.
best_df = _auc_column(sequential_best_avg, 'Best3-AUC')
worst_df = _auc_column(sequential_worst_avg, 'Worst3-AUC')
mixture_df = _auc_column(mixture_avg, 'Mixture-AUC')
best_df_std = _auc_column(sequential_best_std, 'Best3-std')
worst_df_std = _auc_column(sequential_worst_std, 'Worst3-std')
mixture_df_std = _auc_column(mixture_std, 'Mixture-std')

# `plot_df` is rebound here to the combined table that the next cell shows.
combined_frames = [best_df, worst_df, mixture_df, best_df_std, worst_df_std, mixture_df_std]
plot_df = pd.concat(combined_frames, axis=1)
addition_df = plot_df.reset_index(names='n')

In [21]:
# Display the combined Best3 / Worst3 / Mixture table as the cell output.
plot_df

Unnamed: 0,Best3-AUC,Worst3-AUC,Mixture-AUC,Best3-std,Worst3-std,Mixture-std
2000,0.735563,0.709273,0.726154,0.007521,0.011729,0.0091
3000,0.745956,0.700069,0.737105,0.007628,0.010716,0.010062
4000,0.753337,0.735349,0.741432,0.007533,0.009567,0.009712


In [23]:
def _auc_column(npz_file, column):
    """Build a one-column DataFrame from the dict stored under 'AUC'.

    The dict keys become the index; the values go into *column*.
    """
    return pd.DataFrame.from_dict(npz_file['AUC'].item(), orient='index', columns=[column])


# Reload the single-hospital results for `hos` (mean and std archives).
n_avg = np.load(f"results/sequential-n/{hos}/{hos}-{model}-avg.npz", allow_pickle=True)
n_std = np.load(f"results/sequential-n/{hos}/{hos}-{model}-std.npz", allow_pickle=True)

# AUC and std side by side, indexed by the keys of the stored dicts.
avg_df = _auc_column(n_avg, 'AUC')
std_df = _auc_column(n_std, 'std')
plot_df = pd.concat([avg_df, std_df], axis=1)

# Expose the index as column 'n' and record which hospital this curve is for.
new_df = plot_df.reset_index(names='n')
new_df['train_hospital'] = hos

In [24]:
# Display the single-hospital AUC / std table as the cell output.
new_df

Unnamed: 0,n,AUC,std,train_hospital
0,400,0.662183,0.012482,443
1,800,0.71176,0.008971,443
2,1000,0.717573,0.008739,443
3,1200,0.725728,0.010076,443
4,1500,0.731079,0.010628,443
5,2000,0.737726,0.009233,443


## Score Function Robustness

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Score-function robustness experiment.
# For every ordered pair of distinct hospitals (h, test_h) a logistic-regression
# classifier is trained to tell samples of h (label 1) from samples of test_h
# (label 0), using at most `max_size` training samples per hospital. The mean
# predicted probability of label 1 on h's test split is stored at
# results_xy[index of h, index of test_h]. One matrix is produced per
# `max_size` and saved in `all_sizes[max_size]`.

n = 12
hospital_file = '/home/ubuntu/projects/more-data-more-problems/YAIB-cohorts/data/mortality24/eicu/above2000.txt'
df = pd.read_csv(hospital_file, header=None)
hospital_ids = df[0].values[:n]

# Map each hospital ID to its position in `hospital_ids`.
hospital_dict = {hospital_id: idx for idx, hospital_id in enumerate(hospital_ids)}

# Keep matrices already computed in this session (other cells read keys such
# as 2000 or 1500), but create the container when it does not exist yet so the
# cell also runs on a fresh kernel instead of failing with NameError.
try:
    all_sizes
except NameError:
    all_sizes = {}

n_hospitals = len(hospital_ids)

for max_size in [200, 100, 50]:
    print(f"calculating max size {max_size}")
    # results_x is only filled by the disabled x-only variant below; it stays
    # all zeros while that code is commented out.
    results_x = np.zeros((n_hospitals, n_hospitals))
    results_xy = np.zeros((n_hospitals, n_hospitals))
    for test_i, test_h in enumerate(hospital_ids):
        for i, h in enumerate(hospital_ids):
            # A hospital is never compared with itself, so the diagonal of
            # the result matrix stays 0. `test_h` is used directly (rather
            # than assigning it to `hos`) so the notebook-level `hos` used
            # for result-file paths in other cells is not overwritten.
            if h == test_h:
                continue

            # NOTE(review): presumably (features, labels, features+labels) —
            # confirm against compute_KL.get_hospital.
            x, y, xy = ckl.get_hospital(h, 'train', max_samples=max_size)
            x2, y2, xy2 = ckl.get_hospital(test_h, 'train', max_samples=max_size)
            print(len(x))
            scaler = StandardScaler()
            logistic = LogisticRegression(max_iter=10000, tol=0.1)
            pipe = Pipeline(steps=[("scaler", scaler), ("logistic", logistic)])

            # Disabled variant that scores on `x` only:
            # X_train = np.concatenate((x, x2), axis=0)
            # Y_train = np.concatenate((np.ones(len(x)), np.zeros(len(x2))), axis=0)

            # pipe.fit(X_train, Y_train)

            # x_val, _, _ = ckl.get_hospital(h, 'test')
            # results_x[i, test_i] = pipe.predict_proba(x_val)[:, 1].mean()

            # Label 1 = rows from h, label 0 = rows from test_h.
            X_train = np.concatenate((xy, xy2), axis=0)
            Y_train = np.concatenate((np.ones(len(xy)), np.zeros(len(xy2))), axis=0)

            pipe.fit(X_train, Y_train)

            # Column 1 of predict_proba is the probability of label 1.
            _, _, xy_val = ckl.get_hospital(h, 'test')
            results_xy[i, test_i] = pipe.predict_proba(xy_val)[:, 1].mean()
    all_sizes[max_size] = results_xy

In [26]:
# Pearson correlation between the score matrix stored under key 2000 and the
# matrices stored for each smaller key, compared element-wise after flattening.
# NOTE(review): if every matrix in `all_sizes` was built by the loop that
# fills `all_sizes[max_size]`, the diagonal entries are 0 in all of them and
# are part of this correlation, which would inflate r — confirm, and consider
# masking the diagonal.
ref = all_sizes[2000].flatten()
for comp in (1500, 1000, 800, 500, 200, 100, 50):
    candidate = all_sizes[comp].flatten()
    correlation = pearsonr(ref, candidate)
    print(comp, correlation)

1500 PearsonRResult(statistic=0.9940310253717729, pvalue=1.5755847404540598e-138)
1000 PearsonRResult(statistic=0.9941845917355212, pvalue=2.4892243746403424e-139)
800 PearsonRResult(statistic=0.9931834088285018, pvalue=1.9014749200001079e-134)
500 PearsonRResult(statistic=0.994541062675264, pvalue=2.824247565532734e-141)
200 PearsonRResult(statistic=0.9885471120845948, pvalue=1.617995144270007e-118)
100 PearsonRResult(statistic=0.9762139673387682, pvalue=3.6175278953543356e-96)
50 PearsonRResult(statistic=0.955518637991524, pvalue=3.505482822511442e-77)


### Ground Truth Distribution Shift Hospitals

In [9]:
def _load_npz(path):
    """Open a saved ``.npz`` results archive (pickled object arrays allowed)."""
    return np.load(path, allow_pickle=True)


def _auc_column(npz_file, column):
    """Build a one-column DataFrame from the dict stored under 'AUC'.

    The dict keys become the index; the values go into *column*.
    """
    return pd.DataFrame.from_dict(npz_file['AUC'].item(), orient='index', columns=[column])


hos = 443
# Empty suffix: f"sequential-n{file_n}" resolves to the plain `sequential-n` tree.
file_n = ''
model = "LogisticRegression"

# Best/worst archives carry the `-gt` suffix; the mixture and single-hospital
# archives are the ones without it.
n_avg = _load_npz(f"YAIB/results/sequential-n/{hos}/{hos}-{model}-avg.npz")
n_std = _load_npz(f"YAIB/results/sequential-n/{hos}/{hos}-{model}-std.npz")
sequential_best_avg = _load_npz(f"YAIB/results/sequential-n{file_n}/{hos}/{hos}-{model}-best-hospitals-gt-avg.npz")
sequential_best_std = _load_npz(f"YAIB/results/sequential-n{file_n}/{hos}/{hos}-{model}-best-hospitals-gt-std.npz")
sequential_worst_avg = _load_npz(f"YAIB/results/sequential-n{file_n}/{hos}/{hos}-{model}-worst-hospitals-gt-avg.npz")
sequential_worst_std = _load_npz(f"YAIB/results/sequential-n{file_n}/{hos}/{hos}-{model}-worst-hospitals-gt-std.npz")
mixture_avg = _load_npz(f"YAIB/results/sequential-n{file_n}/{hos}/{hos}-mixture-{model}-avg.npz")
mixture_std = _load_npz(f"YAIB/results/sequential-n{file_n}/{hos}/{hos}-mixture-{model}-std.npz")

# Single-hospital curve: mean AUC and its std, then an explicit 'n' column.
avg_df = _auc_column(n_avg, 'AUC')
std_df = _auc_column(n_std, 'std')
plot_df = pd.concat([avg_df, std_df], axis=1)
new_df = plot_df.reset_index(names='n')
new_df['train_hospital'] = hos

# Best / worst / mixture runs: means first, then the matching std frames.
best_df = _auc_column(sequential_best_avg, 'Best3-AUC')
worst_df = _auc_column(sequential_worst_avg, 'Worst3-AUC')
mixture_df = _auc_column(mixture_avg, 'Mixture-AUC')
best_df_std = _auc_column(sequential_best_std, 'Best3-std')
worst_df_std = _auc_column(sequential_worst_std, 'Worst3-std')
mixture_df_std = _auc_column(mixture_std, 'Mixture-std')

# `plot_df` is rebound here to the combined table that the next cell shows.
combined_frames = [best_df, worst_df, mixture_df, best_df_std, worst_df_std, mixture_df_std]
plot_df = pd.concat(combined_frames, axis=1)
addition_df = plot_df.reset_index(names='n')

In [10]:
# Display the combined Best3 / Worst3 / Mixture table as the cell output.
plot_df

Unnamed: 0,Best3-AUC,Worst3-AUC,Mixture-AUC,Best3-std,Worst3-std,Mixture-std
3000,0.754913,0.732681,0.743667,0.008487,0.010288,0.010278
4500,0.765146,0.71871,0.747519,0.009459,0.012256,0.009649
6000,0.743729,0.719298,0.752954,0.008696,0.009804,0.009281
