In [1]:
import os
import pickle

import pandas as pd
import scipy
import datetime
import numpy as np
import tensorflow as tf

from definitions import ROOT_DIR

seed = 2
tf.random.set_seed(seed)
np.random.seed(seed)

import simclr_models
import simclr_utitlities

In [2]:
# Dataset-specific
# working_directory = 'SleepEEG/'
# data_folder = 'SleepEEG'
working_directory = 'MIMIC/'
data_folder = 'MIMIC'
model_name = '20230227-170733_finetuning.hdf5'

In [3]:
# Load preprocessed data
np_train = (np.load(os.path.join(data_folder, 'train_x.npy')),
           np.load(os.path.join(data_folder, 'train_y.npy')))
np_val = (np.load(os.path.join(data_folder, 'val_x.npy')),
           np.load(os.path.join(data_folder, 'val_y.npy')))
np_test = (np.load(os.path.join(data_folder, 'test_x.npy')),
           np.load(os.path.join(data_folder, 'test_y.npy')))

pretrained_model = tf.keras.models.load_model(os.path.join(working_directory, model_name), compile=False)
probs = pretrained_model.predict(np_test[0])
predictions = np.argmax(probs, axis=1)
# print(simclr_utitlities.evaluate_model_simple(pretrained_model.predict(np_test[0]), np_test[1], return_dict=True))

In [5]:
os.chdir(ROOT_DIR)
subjects = pd.read_csv(os.path.join('datasets', data_folder, 'demographics_rich.csv'))
subjects.head()

Unnamed: 0,SUBJECT_ID,LANGUAGE,INSURANCE,RELIGION,ETHNICITY,GENDER,AGE
0,22,,Private,UNOBTAINABLE,WHITE,F,1
1,23,ENGL,Medicare,CATHOLIC,WHITE,M,0
2,24,,Private,PROTESTANT QUAKER,WHITE,M,1
3,25,,Private,UNOBTAINABLE,WHITE,M,1
4,26,,Medicare,CATHOLIC,UNKNOWN/NOT SPECIFIED,M,0


In [6]:
test_listfile = pd.read_csv(os.path.join('datasets', data_folder, 'test_listfile.csv'))
test_listfile.head(20)

Unnamed: 0,stay,y_true
0,10011_episode1_timeseries.csv,1
1,10026_episode1_timeseries.csv,0
2,10030_episode1_timeseries.csv,0
3,10042_episode1_timeseries.csv,0
4,10094_episode1_timeseries.csv,0
5,10094_episode2_timeseries.csv,1
6,1009_episode1_timeseries.csv,0
7,10102_episode1_timeseries.csv,1
8,10149_episode1_timeseries.csv,0
9,10149_episode2_timeseries.csv,0


In [7]:
# Get subject from test
test_listfile.loc[:, "subject"] = test_listfile.stay.apply(lambda stay: stay.split("_")[0]).astype(int)
test_listfile.head()

Unnamed: 0,stay,y_true,subject
0,10011_episode1_timeseries.csv,1,10011
1,10026_episode1_timeseries.csv,0,10026
2,10030_episode1_timeseries.csv,0,10030
3,10042_episode1_timeseries.csv,0,10042
4,10094_episode1_timeseries.csv,0,10094


In [8]:
# merge demographics in test df
test_listfile = test_listfile.merge(subjects, left_on="subject", right_on="SUBJECT_ID", how="left")
test_listfile.head()

Unnamed: 0,stay,y_true,subject,SUBJECT_ID,LANGUAGE,INSURANCE,RELIGION,ETHNICITY,GENDER,AGE
0,10011_episode1_timeseries.csv,1,10011,10011,,Private,CATHOLIC,UNKNOWN/NOT SPECIFIED,F,1
1,10026_episode1_timeseries.csv,0,10026,10026,,Medicare,OTHER,WHITE,F,0
2,10030_episode1_timeseries.csv,0,10030,10030,,Medicare,CATHOLIC,WHITE,M,0
3,10042_episode1_timeseries.csv,0,10042,10042,,Medicare,UNOBTAINABLE,WHITE,M,0
4,10094_episode1_timeseries.csv,0,10094,10094,,Medicare,UNOBTAINABLE,BLACK/AFRICAN AMERICAN,M,0


In [9]:
print("Test rows with null demograpphics: {} ({}%)".format(test_listfile.SUBJECT_ID.isna().sum(), test_listfile.SUBJECT_ID.isna().sum()/test_listfile.shape[0]))
test_listfile.drop(columns=['SUBJECT_ID'], inplace=True)
test_listfile.head()

Test rows with null demograpphics: 0 (0.0%)


Unnamed: 0,stay,y_true,subject,LANGUAGE,INSURANCE,RELIGION,ETHNICITY,GENDER,AGE
0,10011_episode1_timeseries.csv,1,10011,,Private,CATHOLIC,UNKNOWN/NOT SPECIFIED,F,1
1,10026_episode1_timeseries.csv,0,10026,,Medicare,OTHER,WHITE,F,0
2,10030_episode1_timeseries.csv,0,10030,,Medicare,CATHOLIC,WHITE,M,0
3,10042_episode1_timeseries.csv,0,10042,,Medicare,UNOBTAINABLE,WHITE,M,0
4,10094_episode1_timeseries.csv,0,10094,,Medicare,UNOBTAINABLE,BLACK/AFRICAN AMERICAN,M,0


In [10]:
test_listfile.loc[:, "y_predicted"] = predictions
test_listfile.head()

Unnamed: 0,stay,y_true,subject,LANGUAGE,INSURANCE,RELIGION,ETHNICITY,GENDER,AGE,y_predicted
0,10011_episode1_timeseries.csv,1,10011,,Private,CATHOLIC,UNKNOWN/NOT SPECIFIED,F,1,0
1,10026_episode1_timeseries.csv,0,10026,,Medicare,OTHER,WHITE,F,0,0
2,10030_episode1_timeseries.csv,0,10030,,Medicare,CATHOLIC,WHITE,M,0,0
3,10042_episode1_timeseries.csv,0,10042,,Medicare,UNOBTAINABLE,WHITE,M,0,0
4,10094_episode1_timeseries.csv,0,10094,,Medicare,UNOBTAINABLE,BLACK/AFRICAN AMERICAN,M,0,0


In [11]:
# Indicatively testing performance on publicly insured (1 for majority) versus privately insured (0 for minority) subjects
test_listfile_Medicare = test_listfile[test_listfile.INSURANCE == "Medicare"]
test_listfile_Private = test_listfile[test_listfile.INSURANCE == "Private"]
test_listfile_Medicaid = test_listfile[test_listfile.INSURANCE == "Medicaid"]
test_listfile_Government = test_listfile[test_listfile.INSURANCE == "Government"]
test_listfile_Self = test_listfile[test_listfile.INSURANCE == "Self Pay"]

In [21]:
# Overall performance
print("\n--- Train Performance Overall ---")
train_listfile = pd.read_csv(os.path.join('datasets', data_folder, 'train_listfile.csv'))
train_listfile.loc[:, "y_predicted"] = np.argmax(pretrained_model.predict(np_train[0]),axis=1)
print(simclr_utitlities.evaluate_model_simple(train_listfile.y_predicted, train_listfile.y_true, is_one_hot=False, return_dict=True))
print("\n--- Validation Performance Overall ---")
val_listfile = pd.read_csv(os.path.join('datasets', data_folder, 'val_listfile.csv'))
val_listfile.loc[:, "y_predicted"] = np.argmax(pretrained_model.predict(np_val[0]),axis=1)
print(simclr_utitlities.evaluate_model_simple(val_listfile.y_predicted, val_listfile.y_true, is_one_hot=False, return_dict=True))
# todo buggy: should return same results
print("\n--- Test Performance Overall ---")
print(simclr_utitlities.evaluate_model_simple(pretrained_model.predict(np_test[0]), np_test[1], return_dict=True))
print(simclr_utitlities.evaluate_model_simple(test_listfile.y_predicted, test_listfile.y_true, is_one_hot=False, return_dict=True))


--- Train Performance Overall ---
{'Accuracy': 0.8633608064845719, 'AUROC': 0.5334233729260677, 'AUPRC': 0.16263444368264365, 'Confusion Matrix': array([[12514,   180],
       [ 1826,   161]], dtype=int64), 'F1 Macro': 0.5320566477697551, 'F1 Micro': 0.8633608064845719, 'F1 Weighted': 0.8192155263708678, 'Precision': 0.6724023198648654, 'Recall': 0.5334233729260677, 'Kappa': 0.10273974647795148}

--- Validation Performance Overall ---
{'Accuracy': 0.8594040968342644, 'AUROC': 0.5327398789491362, 'AUPRC': 0.15834061296877144, 'Confusion Matrix': array([[2732,   54],
       [ 399,   37]], dtype=int64), 'F1 Macro': 0.5319291951053456, 'F1 Micro': 0.8594040968342646, 'F1 Weighted': 0.8174824489320465, 'Precision': 0.6395790412079138, 'Recall': 0.5327398789491362, 'Kappa': 0.09827744491987733}

--- Test Performance Overall ---
{'Accuracy': 0.8788627935723115, 'AUROC': 0.7053460987978191, 'AUPRC': 0.5887859983195709, 'Confusion Matrix': array([[2815,   47],
       [ 345,   29]], dtype=int64

In [26]:
y1 = pd.Series(np.argmax(np_test[1], axis=1))
y2 = test_listfile.y_true
y1.equals(y2)

True

In [27]:
x1 = pd.Series(np.argmax(pretrained_model.predict(np_test[0]), axis=1))
x2 = test_listfile.y_predicted
x1.equals(x2)

True

In [37]:
import sklearn
# todo what is happening?
#     test_auroc = sklearn.metrics.roc_auc_score(truth, pred, multi_class='ovr')
print(sklearn.metrics.roc_auc_score(np_test[1], pretrained_model.predict(np_test[0])))  # version from github code
print(sklearn.metrics.roc_auc_score(np.argmax(np_test[1], axis=1), np.argmax(pretrained_model.predict(np_test[0]), axis=1)))  # our version

0.7053460987978191
0.5305590122460266


In [43]:
pretrained_model.predict(np_test[0])

array([[0.8113033 , 0.18869668],
       [0.91866565, 0.0813344 ],
       [0.9861631 , 0.01383692],
       ...,
       [0.24069574, 0.7593042 ],
       [0.958743  , 0.04125697],
       [0.9296661 , 0.07033388]], dtype=float32)

In [16]:
# Fairness Comparisons conditioned on Insurance Type
print("\n--- Performance on Medicare Insured Subjects ---")
print(simclr_utitlities.evaluate_model_simple(test_listfile_Medicare.y_predicted, test_listfile_Medicare.y_true, is_one_hot=False, return_dict=True))
print("\n--- Performance on Private Insured Subjects ---")
print(simclr_utitlities.evaluate_model_simple(test_listfile_Private.y_predicted, test_listfile_Private.y_true, is_one_hot=False, return_dict=True))
print("\n--- Performance on Medicaid Insured Subjects ---")
print(simclr_utitlities.evaluate_model_simple(test_listfile_Medicaid.y_predicted, test_listfile_Medicaid.y_true, is_one_hot=False, return_dict=True))
print("\n--- Performance on Government Insured Subjects ---")
print(simclr_utitlities.evaluate_model_simple(test_listfile_Government.y_predicted, test_listfile_Government.y_true, is_one_hot=False, return_dict=True))
print("\n--- Performance on Self-Insured Subjects ---")
print(simclr_utitlities.evaluate_model_simple(test_listfile_Self.y_predicted, test_listfile_Self.y_true, is_one_hot=False, return_dict=True))


--- Performance Overall ---
{'Accuracy': 0.8841161928306551, 'AUROC': 0.5044740785584293, 'AUPRC': 0.11909210625854266, 'Confusion Matrix': array([[2857,    5],
       [ 370,    4]], dtype=int64), 'F1 Macro': 0.47965063052965007, 'F1 Micro': 0.8841161928306551, 'F1 Weighted': 0.8323706862621775, 'Precision': 0.6648934338739111, 'Recall': 0.5044740785584293, 'Kappa': 0.015540426526135409}

--- Performance on Medicare Insured Subjects ---
{'Accuracy': 0.8665265370467683, 'AUROC': 0.5069329430801447, 'AUPRC': 0.14086059016354835, 'Confusion Matrix': array([[1645,    3],
       [ 251,    4]], dtype=int64), 'F1 Macro': 0.47943196112556646, 'F1 Micro': 0.8665265370467683, 'F1 Weighted': 0.808025955189085, 'Precision': 0.7195223025919228, 'Recall': 0.5069329430801447, 'Kappa': 0.02354267336813365}

--- Performance on Private Insured Subjects ---
{'Accuracy': 0.911042944785276, 'AUROC': 0.4994394618834081, 'AUPRC': 0.08793456032719836, 'Confusion Matrix': array([[891,   1],
       [ 86,   0]]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
