In [1]:
import numpy as np
import pandas as pd
import os
import pprint
import sklearn
from sklearn import preprocessing
from sklearn.decomposition import PCA
import warnings
import matplotlib
from os import listdir
import seaborn as sns
import scipy
import sys
import pickle
import miceforest as mf
warnings.filterwarnings("ignore")
import functools
from functools import reduce
from mc4.algorithm import mc4_aggregator
from copy import deepcopy

import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

from sklearn.preprocessing import StandardScaler, MinMaxScaler
import missingpy
from sklearn.experimental import enable_iterative_imputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.metrics import brier_score_loss, precision_score, recall_score, f1_score, log_loss, roc_auc_score
from collections import Counter
import torch
from torch import nn
import torch.nn.functional as F
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import NearMiss
from collections import Counter

import shap
import matplotlib.pyplot as plt
import statsmodels.api as sm
from torch.utils.data import Dataset, DataLoader

In [2]:
# Shared random seed: passed as `random_state` to the imblearn resamplers and
# used by `_init_fn` to seed NumPy.
rs = 42

In [3]:
import random

def _init_fn(worker_id):
    """Seed NumPy's global RNG with the notebook-wide seed ``rs``.

    ``worker_id`` is accepted but never used, so every call applies the same
    seed.
    NOTE(review): the signature looks like a DataLoader ``worker_init_fn`` —
    confirm that identical NumPy seeds across workers are intended.
    """
    notebook_seed = int(rs)
    np.random.seed(notebook_seed)
    
def seed_torch(seed=42):
    """Seed every RNG used in this notebook and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to ``random``, NumPy and PyTorch (CPU and
            CUDA). It is also exported as the ``PYTHONHASHSEED`` environment
            variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers multi-GPU runs
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Disable cuDNN autotuning and ask for deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed all RNGs once, up front, with seed_torch's default seed (42).
seed_torch()

In [4]:
# Evaluation / loss switches (0 = off, 1 = on).
# NOTE(review): neither flag is read in the visible cells; judging by the names
# they control test-set undersampling and class weighting — confirm downstream.
test_under_sampler = 0
do_class_weight = 1

# Training-set resampling switches (0 = off, 1 = on), read by
# do_training_modifications. Several may be enabled at once; they are applied
# in the order listed here.
do_smote = 0
do_random_under_sampler = 0
do_random_over_sampler = 0
do_tomek_links = 0
do_near_miss = 0

In [5]:
def find_csv_filenames( path_to_dir, suffix=".csv" ):
    """Return the names of the entries in ``path_to_dir`` that end with ``suffix``.

    Args:
        path_to_dir: Directory to scan (top level only, no recursion).
        suffix: File-name ending to keep; defaults to ``".csv"``.

    Returns:
        Alphabetically sorted list of bare file names (no directory part).
    """
    # listdir gives no ordering guarantee; sorting keeps everything derived
    # from this list (hospital ids, loop order) reproducible across machines.
    return sorted(name for name in listdir(path_to_dir) if name.endswith(suffix))

In [6]:
# Step up one directory; presumably so the relative 'AKI Data/...' and
# 'AKI Models/...' paths below resolve — confirm against the repo layout.
# NOTE(review): '..' is relative to the current working directory, so
# re-running this cell moves up one more level each time.
os.chdir('..')

In [7]:
# Window label spliced into the data folder name
# ('observation<window>_prediction24h/') and into the model folder path.
prediction_window = "48h"

In [8]:
# Root of the CSV exports and the sub-folder for the selected window.
data_directory = 'AKI Data/eicu_aki/new data/'
prediction_directory = 'observation' + prediction_window + '_prediction24h/'

# Every CSV in that folder; presumably one file per hospital — confirm.
data_files = find_csv_filenames(data_directory + prediction_directory)

# Dictionary keys used throughout: the file name with '.csv' removed.
hosps = [csv_name.replace('.csv', '') for csv_name in data_files]

In [9]:
# Folder path for this window's "Pytorch LR" models (not read or written in
# the visible cells).
model_directory = 'AKI Models/' + prediction_window + '/Pytorch LR/'

#### Loading Cleaned Data

In [10]:
def clean_outliers(data, lower_q=0.01, upper_q=0.99):
    """Replace extreme values in the numeric columns of ``data`` with NaN.

    The numeric columns are every column positioned before ``'is_female'``
    plus ``'age'`` and ``'BMI'`` when they are present. For each of them,
    values strictly below the ``lower_q`` quantile or strictly above the
    ``upper_q`` quantile become NaN. Both cut-offs are computed from the
    column before anything is blanked.

    Args:
        data: DataFrame containing an ``'is_female'`` column. Its columns are
            reassigned in place.
        lower_q: Lower quantile cut-off (default 0.01).
        upper_q: Upper quantile cut-off (default 0.99).

    Returns:
        The same DataFrame object, with the outliers set to NaN.

    Raises:
        ValueError: if ``data`` has no ``'is_female'`` column.
    """
    columns = list(data)
    medical_signs = columns[:columns.index('is_female')]
    # Skip 'age'/'BMI' if they already sit before 'is_female', so that no
    # column is trimmed twice.
    extra_numeric = [c for c in ('age', 'BMI')
                     if c in columns and c not in medical_signs]
    numerical_col = medical_signs + extra_numeric

    for col in numerical_col:
        values = data[col]
        min_value = values.quantile(lower_q)
        max_value = values.quantile(upper_q)
        # Series.mask plus whole-column assignment replaces the chained
        # `data[col][cond] = None` form, which pandas may apply to a
        # temporary copy (and which is a no-op under copy-on-write).
        data[col] = values.mask((values < min_value) | (values > max_value))
    return data

def get_features_diag(data):
    """Split the column names of ``data`` into predictors and the outcome.

    The names are taken from the ``data`` argument itself, not from the
    notebook-level ``data_dictionary[hosp]``.

    Args:
        data: DataFrame whose last column is the outcome column.

    Returns:
        ``(features, AKI_diag)``: the list of every column name except the
        last, and the name of the last column.

    Raises:
        IndexError: if ``data`` has no columns.
    """
    columns = list(data)
    AKI_diag = columns[-1]
    features = columns[:-1]
    return features, AKI_diag

def do_training_modifications(X_train, y_train):
    """Resample the training set according to the notebook-level ``do_*`` flags.

    Each enabled flag adds one imblearn resampler; the resamplers are applied
    one after another in this order: SMOTE, RandomUnderSampler,
    RandomOverSampler, TomekLinks, NearMiss. With every flag off the inputs
    are returned unchanged.

    Args:
        X_train: Training features accepted by imblearn's ``fit_resample``.
        y_train: Training labels aligned with ``X_train``.

    Returns:
        ``(X_train, y_train)`` after all enabled resamplers have run.
    """
    resamplers = []
    if do_smote:
        resamplers.append(SMOTE(random_state=rs))
    if do_random_under_sampler:
        resamplers.append(RandomUnderSampler(replacement=True, random_state=rs))
    if do_random_over_sampler:
        resamplers.append(RandomOverSampler(random_state=rs))
    # TomekLinks and NearMiss are built without random_state: current
    # imbalanced-learn constructors do not accept that keyword, so passing
    # it raises TypeError.
    if do_tomek_links:
        resamplers.append(TomekLinks())
    if do_near_miss:
        resamplers.append(NearMiss())

    for resampler in resamplers:
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    return X_train, y_train

def normalize_data(data_dictionary, hosp):
    """Return a normalized copy of ``data_dictionary[hosp]``.

    Continuous features (every column before ``'is_female'`` plus ``'age'``
    and ``'BMI'``) are z-scored with that frame's own mean and sample
    standard deviation. Binary features (the columns from ``'ACETAMIN'`` up
    to, but excluding, the last column, plus the demographic indicators) have
    0 replaced by -1. The stored frame is not modified.

    Args:
        data_dictionary: Mapping from hospital id to DataFrame.
        hosp: Key of the frame to normalize.

    Returns:
        A new DataFrame with the same columns as the stored frame.

    Raises:
        ValueError: if ``'is_female'`` or ``'ACETAMIN'`` is not a column.
        KeyError: if ``hosp`` or one of the expected feature columns is missing.
    """
    df = deepcopy(data_dictionary[hosp])
    columns = list(df)
    medical_signs = columns[:columns.index('is_female')]
    # The slice bound is taken from this frame's own column count rather than
    # from a notebook-level `data` frame that may belong to another hospital.
    medications = columns[columns.index('ACETAMIN'):len(columns) - 1]
    demographic_binary = ['is_female', 'race_black', 'race_hispanic', 'race_asian',
                          'race_other', 'electivesurgery']
    binary_features = medications + demographic_binary
    continuous_features = medical_signs + ['age', 'BMI']

    continuous = df[continuous_features]
    df[continuous_features] = (continuous - continuous.mean()) / continuous.std()
    df[binary_features] = df[binary_features].replace([0], -1)
    return df

In [11]:
# One slot per hospital id, filled in by the loading loop:
#   temp_data_dictionary -> outlier-cleaned raw frame
#   missing_dictionary   -> per-feature missingness table
temp_data_dictionary = {hosp_id: None for hosp_id in hosps}
missing_dictionary = {hosp_id: None for hosp_id in hosps}

In [12]:
# Read every raw CSV, blank its outliers, tabulate per-feature missingness and
# collect the features that are almost entirely missing in at least one file.
data_files = find_csv_filenames(data_directory+prediction_directory)
missing_features = []

for file in data_files:
    data = pd.read_csv(data_directory + prediction_directory + file)
    hosp = file.replace('.csv', '')

    data = clean_outliers(data)

    # Keep only the columns with at least one missing value. Filtering the
    # integer Series directly keeps "Number Missing" numeric; packing names and
    # counts into one NumPy array would coerce the counts to strings.
    missing_counts = data.isnull().sum()
    missing_counts = missing_counts[missing_counts > 0]
    df_missing = pd.DataFrame(data={"Feature": missing_counts.index.values,
                                    "Number Missing": missing_counts.values})
    df_missing["Frequency Missing"] = df_missing["Number Missing"] / data.shape[0]
    missing_dictionary[hosp] = df_missing

    # Features missing in more than 99.9% of this file's rows.
    large_missing_features = df_missing[df_missing["Frequency Missing"] > 0.999]
    missing_features.append(large_missing_features["Feature"].values)

    temp_data_dictionary[hosp] = data

# Sorted union across files. np.concatenate rejects an empty list, so the
# no-file case is handled explicitly.
if missing_features:
    missing_features = list(np.unique(np.concatenate(missing_features)))
else:
    missing_features = []

# Drop those features from every file so all frames share the same columns.
for hosp in temp_data_dictionary:
    data = temp_data_dictionary[hosp]
    data = data.drop(list(missing_features), axis=1)
    temp_data_dictionary[hosp] = data

In [13]:
# Load the cleaned frame for every hospital id, discarding columns whose name
# starts with 'Unnamed' (presumably index columns saved by an earlier to_csv).
data_dictionary = dict.fromkeys(hosps)

for hosp in data_dictionary:
    file = str(hosp) + '.csv'
    data = pd.read_csv(data_directory + prediction_directory + "cleaned/" + file)
    data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
    data_dictionary[hosp] = data

# Derive the feature groups from the column layout of one reference frame.
# `hosp` is the last id visited by the loop above; all frames are assumed to
# share the same column order — TODO confirm.
df = deepcopy(data_dictionary[hosp])
demographic_index = list(df).index('is_female')
medical_signs = list(df)[:demographic_index]
medication_index = list(df).index('ACETAMIN')
# Medications run from 'ACETAMIN' up to, but excluding, the last column. The
# bound is taken from `df` itself rather than from the leftover loop variable.
medications = list(df)[medication_index:len(list(df))-1]
demographic_binary = ['is_female', 'race_black', 'race_hispanic', 'race_asian', 
                    'race_other', 'electivesurgery']
binary_features = medications + demographic_binary
continuous_features = medical_signs + ['age', 'BMI']

# to_csv does not create missing folders, so make sure the target exists.
unscaled_directory = data_directory + prediction_directory + "cleaned/unscaled/"
os.makedirs(unscaled_directory, exist_ok=True)

for hosp in hosps:
    orig_data = temp_data_dictionary[hosp]
    data = data_dictionary[hosp]

    # Map each continuous column back with x * (max - min) + min, using the
    # min/max of the outlier-cleaned raw frame.
    # NOTE(review): assumes the cleaned files were min-max scaled with these
    # same per-hospital bounds — confirm.
    for col in continuous_features:
        maximum = orig_data[col].max(axis=0)
        minimum = orig_data[col].min(axis=0)
        data[col] = data[col] * (maximum - minimum) + minimum

    data_dictionary[hosp] = data

    file = str(hosp) + '.csv'
    data.to_csv(unscaled_directory + file)