In [29]:
#data preprocessing
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import collections
from collections import defaultdict

In [30]:
# NN
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
from torch.utils.data import TensorDataset, DataLoader


from sklearn.calibration import calibration_curve
from sklearn.metrics import roc_curve, precision_recall_curve, roc_auc_score, auc, accuracy_score
import sklearn.metrics as metrics
from sklearn.impute import SimpleImputer
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
from matplotlib import pyplot as plt

In [31]:
# the full files pathes are here
# time dependent
DATA_PATH_stages="../data/kdigo_stages_measured.csv" 
DATA_PATH_labs = "../data/labs-kdigo_stages_measured.csv" 
DATA_PATH_vitals = "../data/vitals-kdigo_stages_measured.csv" 
DATA_PATH_vents = "../data/vents-vasopressor-sedatives-kdigo_stages_measured.csv"
#no time dependent
DATA_PATH_detail="../data/icustay_detail-kdigo_stages_measured.csv" #age constraint
#DATA_PATH_icd = "../data/diagnoses_icd_aki_measured.csv" #AL was "...measured 2.csv"

SEPARATOR=";"

In [32]:
# Set parameter as constant 

TESTING = False # Use 1% of data for testing. IF false - full dataset takes more time
TEST_SIZE = 0.01
LOS = True # additional code to cut the records longer than 32 days

#which classifier to use, only run one classifier at one time 
ALL_STAGES = False
CLASS1 = True   #AnyAKI
#CLASS2 = False    #ModerateSevereAKI
#CLASS3 = False    #SevereAKI
    
SELECTED_FEATURE_SET = False
MAX_FEATURE_SET = True
#DIAGNOSIS = False

MAX_DAYS = 35

NORMALIZATION = True # worsen results for XGB and RF
CAPPING = True

if CAPPING:
    CAPPING_THRESHOLD_UPPER = 0.99
    CAPPING_THRESHOLD_LOWER = 0.01

#Age constraints: adults
ADULTS_MIN_AGE = 18
ADULTS_MAX_AGE = -1

#Two options to deal with time series data
FIRST_TURN_POS = True #True #False # the first charttime that turn pos as bechmark for Target
LAST_CHARTTIME = False #the last charttime as bechmark for Target

# resampling or not
TIME_SAMPLING = True 
SAMPLING_INTERVAL = '6H'
RESAMPLE_LIMIT = 16 # 4 days*6h interval
MOST_COMMON = False #resampling with most common
# if MOST_COMMON is not applied,sampling with different strategies per kind of variable, 
# numeric variables use mean value, categorical variables use max value
IMPUTE_EACH_ID = False # imputation within each icustay_id
IMPUTE_COLUMN = False # imputation based on whole column
IMPUTE_METHOD = 'most_frequent' 
FILL_VALUE = 0 #fill missing value and ragged part of 3d array

# How much time the prediction should occur (hours)
HOURS_AHEAD = 48
LONG_STAY = 32 #days

# precentage of dataset to be used for testing model
TEST_SIZE = 0.3

NORM_TYPE = 'min_max'

# set constant for function build_basic_LR
C = 0.1
PENALTY = "l1"
SOLVER = 'saga' 
MAX_ITERATION = 1000 
CLASS_WEIGHT = "balanced"
RANDOM = 42

In [33]:
#set changable info corresponding to each classifier as variables

min_set =  ["icustay_id", "charttime", "creat", "uo_rt_6hr", "uo_rt_12hr", "uo_rt_24hr", "aki_stage"]


#selected_set = 


max_set = ['icustay_id', 'charttime', 'aki_stage', 'hadm_id', 'albumin_avg','aniongap_avg', 'bicarbonate_avg', 
           'bilirubin_avg', 'bun_avg','chloride_avg', 'creat', 'diasbp_mean', 'glucose_avg', 'heartrate_mean',
           'hematocrit_avg', 'hemoglobin_avg', 'potassium_avg', 'resprate_mean','sodium_avg', 'spo2_mean', 'sysbp_mean', 
           'uo_rt_12hr', 'uo_rt_24hr','uo_rt_6hr', 'wbc_avg', 'sedative', 'vasopressor', 'vent', 'age', 'F','M', 
           'asian', 'black', 'hispanic', 'native', 'other', 'unknown','white', 'ELECTIVE', 'EMERGENCY', 'URGENT']

# naming model and plot
classifier_name = "None vs. Any AKI"    ###change every time #Moderate vs. Severe #None vs. Any #Others vs. Severe
plot_name = "adult_AnyAKI_LR"    ###change every time

In [34]:
# Some functions used later

if CAPPING:
    def cap_data(df):
        print("Capping between the {} and {} quantile".format(CAPPING_THRESHOLD_LOWER, CAPPING_THRESHOLD_UPPER))
        cap_mask = df.columns.difference(['icustay_id', 'charttime', 'aki_stage'])
        df[cap_mask] = df[cap_mask].clip(df[cap_mask].quantile(CAPPING_THRESHOLD_LOWER),
                                         df[cap_mask].quantile(CAPPING_THRESHOLD_UPPER),
                                         axis=1)

        return df

# impute missing value in resampleing data with most common based on each id
def fast_mode(df, key_cols, value_col):
    """ Calculate a column mode, by group, ignoring null values. 
    
    key_cols : list of str - Columns to groupby for calculation of mode.
    value_col : str - Column for which to calculate the mode. 

    Return
    pandas.DataFrame
        One row for the mode of value_col per key_cols group. If ties, returns the one which is sorted first. """
    return (df.groupby(key_cols + [value_col]).size() 
              .to_frame('counts').reset_index() 
              .sort_values('counts', ascending=False) 
              .drop_duplicates(subset=key_cols)).drop('counts',axis=1)


def normalise_data(df, norm_mask):
    print("Normalizing in [0,1] with {} normalization".format(NORMALIZATION))

    if NORM_TYPE == 'min_max':
        df[norm_mask] = (df[norm_mask] - df[norm_mask].min()) / (df[norm_mask].max() - df[norm_mask].min())
    else:
        df[norm_mask] = (df[norm_mask] - df[norm_mask].mean()) / df[norm_mask].std()

    return df


#get max shape of 3d array
def get_dimensions(array, level=0):   
    yield level, len(array)
    try:
        for row in array:
            yield from get_dimensions(row, level + 1)
    except TypeError: #not an iterable
        pass

def get_max_shape(array):
    dimensions = defaultdict(int)
    for level, length in get_dimensions(array):
        dimensions[level] = max(dimensions[level], length)
    return [value for _, value in sorted(dimensions.items())]

#pad the ragged 3d array to rectangular shape based on max size
def iterate_nested_array(array, index=()):
    try:
        for idx, row in enumerate(array):
            yield from iterate_nested_array(row, (*index, idx))
    except TypeError: # final level            
        yield (*index, slice(len(array))), array

def pad(array, fill_value):
    dimensions = get_max_shape(array)
    result = np.full(dimensions, fill_value)
    for index, value in iterate_nested_array(array):
        result[index] = value
    return result

def bin_total(y_true, y_prob, n_bins):
    bins = np.linspace(0., 1. + 1e-8, n_bins + 1)

    # In sklearn.calibration.calibration_curve,
    # the last value in the array is always 0.
    binids = np.digitize(y_prob, bins) - 1

    return np.bincount(binids, minlength=len(bins))

def missing_bin(bin_array):
    midpoint = " "    
    if bin_array[0]==0:
        midpoint = "5%, "
    if bin_array[1]==0:
        midpoint = midpoint + "15%, "
    if bin_array[2]==0:
        midpoint = midpoint + "25%, "
    if bin_array[3]==0:
        midpoint = midpoint + "35%, " 
    if bin_array[4]==0:
        midpoint = midpoint + "45%, "
    if bin_array[5]==0:
        midpoint = midpoint + "55%, "
    if bin_array[6]==0:
        midpoint = midpoint + "65%, "
    if bin_array[7]==0:
        midpoint = midpoint + "75%, "
    if bin_array[8]==0:
        midpoint = midpoint + "85%, "
    if bin_array[9]==0:
        midpoint = midpoint + "95%, "
    return "The missing bins have midpoint values of "+ str(midpoint)


In [35]:
print("read csv files")
#reading csv files
X = pd.read_csv(DATA_PATH_stages, sep= SEPARATOR)
X.drop(["aki_stage_creat", "aki_stage_uo"], axis = 1, inplace = True)
#remove totally empty rows 
X = X.dropna(how = 'all', subset = ['creat','uo_rt_6hr','uo_rt_12hr','uo_rt_24hr','aki_stage'])
print("convert charttime to timestamp")
X['charttime'] = pd.to_datetime(X['charttime'])
# AL it substitutes missing values with zero!
#merge rows if they have exact timestamp within same icustay_id
#X = X.groupby(['icustay_id', 'charttime']).sum().reset_index(['icustay_id', 'charttime'])

dataset_detail = pd.read_csv(DATA_PATH_detail, sep= SEPARATOR)  #age constraint
dataset_detail.drop(['dod', 'admittime','dischtime', 'los_hospital','ethnicity','hospital_expire_flag', 'hospstay_seq',
       'first_hosp_stay', 'intime', 'outtime', 'los_icu', 'icustay_seq','first_icu_stay'], axis = 1, inplace = True)


read csv files
convert charttime to timestamp


In [36]:
#3
dataset_labs = pd.read_csv(DATA_PATH_labs, sep= SEPARATOR) # 'bands lactate platelet ptt inr pt
dataset_labs.drop(['albumin_min', 'albumin_max','bilirubin_min', 'bilirubin_max','bands_min', 'bands_max',
                   'lactate_min', 'lactate_max','platelet_min', 'platelet_max','ptt_min', 'ptt_max', 
                   'inr_min', 'inr_max', 'pt_min', 'pt_max'], axis = 1, inplace = True)
dataset_labs = dataset_labs.dropna(subset=['charttime'])
dataset_labs = dataset_labs.dropna(subset=dataset_labs.columns[4:], how='all')
dataset_labs['charttime'] = pd.to_datetime(dataset_labs['charttime'])
dataset_labs = dataset_labs.sort_values(by=['icustay_id', 'charttime'])

if SELECTED_FEATURE_SET or MAX_FEATURE_SET:
    #4,5,6
    dataset_vitals = pd.read_csv(DATA_PATH_vitals, sep= SEPARATOR)  #'meanbp_mean', 'tempc_mean',
    dataset_vents = pd.read_csv(DATA_PATH_vents , sep= SEPARATOR)
    #dataset_icd = pd.read_csv(DATA_PATH_icd, sep= SEPARATOR)

    dataset_vitals.drop(["heartrate_min", "heartrate_max","sysbp_min", "sysbp_max","diasbp_min", "diasbp_max",
                        'meanbp_min','meanbp_max', 'meanbp_mean','tempc_min', 'tempc_max', 'tempc_mean',
                        "resprate_min", "resprate_max", "spo2_min", "spo2_max", "glucose_min", "glucose_max"], axis = 1, inplace = True)
          
    print("convert charttime to timestamp")
    dataset_vitals['charttime'] = pd.to_datetime(dataset_vitals['charttime'])
    dataset_vents['charttime'] = pd.to_datetime(dataset_vents['charttime'])
    
    dataset_vitals = dataset_vitals.sort_values(by=['icustay_id', 'charttime'])
    dataset_vents = dataset_vents.sort_values(by=['icustay_id', 'charttime'])
    
    # AL drop those where all columns are nan
    dataset_vitals = dataset_vitals.dropna(subset=dataset_vitals.columns[4:], how='all')   
     

convert charttime to timestamp


In [37]:
# Labs file: instead of min and max their avg
counter = 0
col1 = 4
col2 = 5
null_l = [] # no null values in those that are different
changed = 0 # 4316 records changed to avg

while counter < 11:
    row = 0
# find where min and max are different and save their row indices 
    while row < len(dataset_labs):
        a = dataset_labs.iloc[row,col1]
        b = dataset_labs.iloc[row,col2]
        if a==b or (np.isnan(a) and np.isnan(b)):
            pass
        elif a!=b:
            changed +=1
            avg = (a+b)/2
            dataset_labs.iloc[row,col1] = avg
            if (np.isnan(a) and ~np.isnan(b)) or (np.isnan(b) and ~np.isnan(a)):
                null_l.append(row)
        else:
            print(a)
            print(b)
        row +=1       
    # delete the redundant column max, update counters
    dataset_labs.drop(dataset_labs.columns[col2], axis=1, inplace = True)
    counter = counter+1
    col1 = col1+1
    col2 = col2+1

dataset_labs.columns = ['subject_id','hadm_id', 'icustay_id', 'charttime', 'aniongap_avg', 'bicarbonate_avg', 
                        'creatinine_avg', 'chloride_avg', 'glucose_avg', 'hematocrit_avg','hemoglobin_avg',
                        'potassium_avg', 'sodium_avg', 'bun_avg', 'wbc_avg']
if len(null_l)>0:
    print("null values encountered")

In [38]:
print("Merge creatinine and glucose.")
# merge creatinine from labs and set with labels
creat_l = dataset_labs[['icustay_id','charttime','creatinine_avg']].copy()
creat_l = creat_l.dropna(subset=['creatinine_avg'])
creat = X[['icustay_id','charttime', 'creat']].copy()
creat = creat.dropna(subset=['creat'])
creat_l = creat_l.rename(columns={"creatinine_avg": "creat"})
creat = creat.append(creat_l, ignore_index=True)
creat.drop_duplicates(inplace = True)
#delete old columns
dataset_labs.drop(["creatinine_avg"], axis = 1, inplace = True)
dataset_labs = dataset_labs.dropna(subset=dataset_labs.columns[4:], how='all')
X.drop(["creat"], axis = 1, inplace = True)
#merge new column
X = pd.merge(X, creat, on = ["icustay_id", "charttime"], sort = True, how= "outer", copy = False)

if SELECTED_FEATURE_SET or MAX_FEATURE_SET:
    # merge glucose from vitals and labs
    glucose_v = dataset_vitals[['subject_id','hadm_id','icustay_id','charttime', 'glucose_mean']].copy()
    glucose_v = glucose_v.dropna(subset=['glucose_mean'])
    glucose = dataset_labs[['subject_id','hadm_id','icustay_id','charttime', 'glucose_avg']].copy()
    glucose = glucose.dropna(subset=['glucose_avg'])
    glucose_v = glucose_v.rename(columns={"glucose_mean": "glucose_avg"})
    glucose = glucose.append(glucose_v, ignore_index=True)
    glucose.drop_duplicates(inplace = True)
    #delete old columns
    dataset_labs.drop(["glucose_avg"], axis = 1, inplace = True)
    dataset_vitals.drop(["glucose_mean"], axis = 1, inplace = True)
    dataset_vitals = dataset_vitals.dropna(subset=dataset_vitals.columns[4:], how='all')
    #merge new column
    dataset_labs = pd.merge(dataset_labs, glucose, on = ['subject_id','hadm_id','icustay_id','charttime',], sort = True, how= "outer", copy = False)
    
dataset_labs = dataset_labs.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)
X = X.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)

Merge creatinine and glucose.


In [39]:
print("Merging labs, vitals and vents files")
#merge files with time-dependent data, based on icustay_id and charttime
if SELECTED_FEATURE_SET or MAX_FEATURE_SET:
    X = pd.merge(X, dataset_labs, on = ["icustay_id", "charttime"], how= "outer", copy = False)
    X = pd.merge(X, dataset_vitals, on = ["icustay_id", "charttime","subject_id", "hadm_id"], how= "outer", copy = False)
    X = pd.merge(X, dataset_vents, on = ["icustay_id", "charttime"], how= "outer", copy = False) 
    X.drop(["subject_id"], axis = 1, inplace = True) 


Merging labs, vitals and vents files


In [40]:
print("start preprocessing time dependent data") # AL removed a line where rows with missing labels are deleted (we will ffil)
print("Removing patients under the min age")
dataset_detail = dataset_detail.loc[dataset_detail['age'] >= ADULTS_MIN_AGE]
adults_icustay_id_list = dataset_detail['icustay_id'].unique()
X = X[X.icustay_id.isin(adults_icustay_id_list)].sort_values(by=['icustay_id'], ignore_index = True)
X = X.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)
adults_icustay_id_list = np.sort(adults_icustay_id_list)

start preprocessing time dependent data
Removing patients under the min age


In [41]:
print("drop icustay_id with time span less than 48hrs")
def more_than_HOURS_ahead(adults_icustay_id_list, X):
    drop_list = []
    los_list = [] # calculating LOS ICU based on charttime
    long_stays_id = [] # LOS longer than MAX DAYS days
    last_charttime_list = []
    seq_length = X.groupby(['icustay_id'],as_index=False).size().to_frame('size')
    id_count = 0
    first_row_index = 0

    while id_count < len(adults_icustay_id_list):
        icustay_id = adults_icustay_id_list[id_count]
        last_row_index = first_row_index + seq_length.iloc[id_count,0]-1
        first_time = X.iat[first_row_index, X.columns.get_loc('charttime')]
        last_time = X.iat[last_row_index, X.columns.get_loc('charttime')]
        los = round(float((last_time - first_time).total_seconds()/60/60/24),4) # in days
        if los < HOURS_AHEAD/24:
            drop_list.append(icustay_id)
        else:
            los_list.append(los)
            if los > MAX_DAYS:
                long_stays_id.append(icustay_id)
                last_charttime_list.append(last_time)
        # udpate for the next icustay_id
        first_row_index = last_row_index+1
        id_count +=1
    if len(long_stays_id) != len(last_charttime_list):
        print('ERROR')
    print("%d long stays" % len(long_stays_id))
    # drop all the rows with the saved icustay_id
    print("there are %d id-s shorter than 48 hours" % len(drop_list))
    X = X[~X.icustay_id.isin(drop_list)]
    id_list = X['icustay_id'].unique()
    X = X.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)
    
    return id_list, X, long_stays_id,last_charttime_list

id_list, X, long_stays_id,last_charttime_list  = more_than_HOURS_ahead(adults_icustay_id_list, X)


long = pd.DataFrame()
long['icustay_id']  = long_stays_id
long['last_time']  = last_charttime_list


drop icustay_id with time span less than 48hrs
2302 long stays
there are 5214 id-s shorter than 48 hours


In [42]:
# deleting rows that are not within MAX_DAYS (35) period
i = 0 # long df index
drop_long_time = []
    
while i < len(long_stays_id):
    j = 0
    all_rows = X.index[X['icustay_id'] == long.loc[i,'icustay_id']].tolist()
    while j < len(all_rows):
        time = X.iat[all_rows[j], X.columns.get_loc('charttime')]
        # if keep last MAX_DAYS 
        if (long.loc[i,'last_time'] - time).total_seconds() > MAX_DAYS*24*60*60:
            drop_long_time.append(all_rows[j])
            j +=1
        else:
            break
    i +=1       
X.drop(X.index[drop_long_time], inplace=True) 

# checking for 48h min length again
id_list, X, long_stays_id,last_charttime_list  = more_than_HOURS_ahead(id_list, X)
dataset_detail = dataset_detail[dataset_detail.icustay_id.isin(id_list)].sort_values(by=['icustay_id'], ignore_index = True)


0 long stays
there are 1 id-s shorter than 48 hours


In [43]:
if SELECTED_FEATURE_SET or MAX_FEATURE_SET:
    # AL create a dictionary for hadm
    hadm = dataset_detail.filter(['hadm_id','icustay_id'],axis = 1)
    dict_hadm = pd.Series(hadm.hadm_id.values,index=hadm.icustay_id).to_dict()
    # fill in the missing values (to ensure correct merging of icd below)
    X.hadm_id = X.hadm_id.fillna(FILL_VALUE)
    # AL change the type to prevent warning of merging int on float
    X = X.astype({"hadm_id": int})
    a = -1
    while a < X.shape[0]-1:
        a = a+1
        if X.iat[a, X.columns.get_loc('hadm_id')] !=-1 :
            continue
        elif X.iat[a, X.columns.get_loc('hadm_id')]==-1:
            X.iat[a, X.columns.get_loc('hadm_id')] = dict_hadm[X.iat[a, X.columns.get_loc('icustay_id')]]


# For testing purpose, use small amount of data first

In [44]:
#For testing purpose, use small amount of data first
if TESTING:
    rest, id_list = train_test_split(id_list, test_size= TEST_SIZE, random_state=42)
    X = X[X.icustay_id.isin(id_list)].sort_values(by=['icustay_id'])
    dataset_detail = dataset_detail[dataset_detail.icustay_id.isin(id_list)].sort_values(by=['icustay_id'])

# Resample  and impute

In [45]:
if (TIME_SAMPLING and MOST_COMMON):
    print("resampling: MOST_COMMON")
    # Resample the data using assigned interval,mode() for most common
    X = X.set_index('charttime').groupby('icustay_id').resample(SAMPLING_INTERVAL).mode().reset_index()
elif TIME_SAMPLING:
    print("resampling: MEAN & ZERO")
    # Sampling with different strategies per kind of variable
    label = ['aki_stage']
    skip = ['icustay_id', 'charttime', 'aki_stage']
    if SELECTED_FEATURE_SET or MAX_FEATURE_SET:
        discrete_feat = ['sedative', 'vasopressor', 'vent', 'hadm_id']
        skip.extend(discrete_feat)    
    # all features that are not in skip are numeric
    numeric_feat = list(X.columns.difference(skip))
    
    # Applying aggregation to features depending on their type
    X = X.set_index('charttime').groupby('icustay_id').resample(SAMPLING_INTERVAL)
    if SELECTED_FEATURE_SET or MAX_FEATURE_SET:
        X_discrete = X[discrete_feat].max().fillna(FILL_VALUE).astype(np.int64)
    X_numeric = X[numeric_feat].mean() 
    X_label = X['aki_stage'].max()
    print("Merging sampled features")
    try:
        X = pd.concat([X_numeric, X_discrete,X_label], axis=1).reset_index()
    except:
        X = pd.concat([X_numeric,X_label], axis=1).reset_index()
print(X.shape)
#Label forward fill
X['aki_stage'] = X['aki_stage'].ffill(limit=RESAMPLE_LIMIT)

resampling: MEAN & ZERO
Merging sampled features
(2156523, 26)


In [46]:
print("Imputation.")
# do imputation of label with zero if there are still missing values
X['aki_stage'] = X['aki_stage'].fillna(0)
# using most common within each icustay_id
if IMPUTE_EACH_ID:
    # set a new variable so won't change the orginial X
    column_name = list(X.columns)
    column_name.remove(column_name[0]) 
    for feature in column_name:
        X.loc[X[feature].isnull(), feature] = X.icustay_id.map(fast_mode(X, ['icustay_id'], feature).set_index('icustay_id')[feature])       

# imputation based on whole column
if IMPUTE_COLUMN:
    imp = SimpleImputer(missing_values=np.nan, strategy= IMPUTE_METHOD)
    cols = list(X.columns)
    cols = cols[2:23]
    X[cols]=imp.fit_transform(X[cols])  

# If no imputation method selected or only impute each id, for the remaining nan impute direclty with FILL_VALUE
X = X.fillna(FILL_VALUE) 

Imputation.


In [47]:
# more comfortable to review in this order
try:
    cols = ['icustay_id', 'charttime','aki_stage','hadm_id','aniongap_avg','bicarbonate_avg', 'bun_avg','chloride_avg',
            'creat','diasbp_mean', 'glucose_avg', 'heartrate_mean', 'hematocrit_avg','hemoglobin_avg', 
            'potassium_avg', 'resprate_mean', 'sodium_avg','spo2_mean', 'sysbp_mean', 'uo_rt_12hr', 
            'uo_rt_24hr', 'uo_rt_6hr','wbc_avg', 'sedative', 'vasopressor', 'vent' ]
    X = X[cols]
    print("success")
except:
    try:
        cols = ['icustay_id', 'charttime','aki_stage','creat','uo_rt_12hr', 'uo_rt_24hr', 'uo_rt_6hr']
        X = X[cols]
    except:
        print("error")

success


In [48]:
print("binarise labels")
if ALL_STAGES:
    pass
elif CLASS1:
    X.loc[X['aki_stage'] > 1, 'aki_stage'] = 1
elif CLASS2:
    X.loc[X['aki_stage'] < 2, 'aki_stage'] = 0
    X.loc[X['aki_stage'] > 1, 'aki_stage'] = 1
elif CLASS3:
    X.loc[X['aki_stage'] < 3, 'aki_stage'] = 0
    X.loc[X['aki_stage'] > 2, 'aki_stage'] = 1

binarise labels


In [49]:
X['aki_stage'].value_counts()

0.0    1769822
1.0     386701
Name: aki_stage, dtype: int64

#  Cap features between 0.01 / 0.99 quantile and normalisation

In [50]:
if CAPPING:
    X = cap_data(X)
if NORMALIZATION:
    X = normalise_data(X, numeric_feat)

Capping between the 0.01 and 0.99 quantile
Normalizing in [0,1] with True normalization


In [51]:
X.shape

(2156523, 26)

In [52]:
X['icustay_id'].nunique()

47751

# Shift labels for HOURS_AHEAD

In [53]:
#print("Shifting the labels 48 h") # by 8 position : 6h sampling*8=48h and ffil 8 newly empty ones
# group by
X['aki_stage'] = X.groupby('icustay_id')['aki_stage'].shift(-(HOURS_AHEAD // int(SAMPLING_INTERVAL[:-1])))
X = X.dropna(subset=['aki_stage'])
X['icustay_id'].nunique()
X.shape

47751

# Last charttime (option1)

In [102]:
if LAST_CHARTTIME:
    print("Here we choose the last charttime as bechmark for Target")
    print("find last charttime for each icustay_id")
    #extract the aki_stage value of row of "last" charttime of each icustay_id as the label/target set
    #find last charttime for each icustay_id

    target_list = [] #to store target(y)
    last_charttime_list= [] #a list to store last charttime for every icustay_id
    last_charttime_index_list = [] #a list to store index when last charttime for every icustay_id

    dataset_size = X.shape[0]
    k=0 #index
    h=0 #the times that id matches

    for extract_id in id_list:
        while k < X:
            if X.iat[k, X.columns.get_loc('icustay_id')] == extract_id:
                if h==0 and k != dataset_size-1:
                    k = k+1
                    h = h+1
                elif h !=0:
                    if k == dataset_size-1:
                        target_list.append(X.iat[k, X.columns.get_loc('aki_stage')])
                        last_charttime_list.append(X.iat[k, X.columns.get_loc('charttime')])
                        last_charttime_index_list.append(k)
                    k = k+1
            elif X.iat[k, X.columns.get_loc('icustay_id')] != extract_id:
                target_list.append(X.iat[k-1, X.columns.get_loc('aki_stage')])
                last_charttime_list.append(X.iat[k-1, X.columns.get_loc('charttime')])
                last_charttime_index_list.append(k-1)
                h=0
                break
                


# First Turn Pos (option 2)

In [68]:
if FIRST_TURN_POS:
    # create one label per icustay_id - first turn positive approach

    X = X.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)
    id_list.sort()
    last_charttime_list= []

    index_list = []
    label_list = []

    first_row_index = 0
    id_count = 0
    seq_length = X.groupby(['icustay_id'],as_index=False).size().to_frame('size')

    for ID in id_list:
        last_row_index = first_row_index + seq_length.iloc[id_count,0]-1
        a = X.loc[X['icustay_id']==ID].aki_stage
        if 1 not in a.values:
            label_list.append(0)
            last_charttime_list.append(X.iat[last_row_index, X.columns.get_loc('charttime')]) 
            index_list.append(last_row_index)
        elif 1 in a.values:
            label_list.append(1)
            row = first_row_index
            while row != last_row_index+1:
                if X.iat[row, X.columns.get_loc('aki_stage')]==0:
                    row +=1
                elif X.iat[row, X.columns.get_loc('aki_stage')]==1:
                    last_charttime_list.append(X.iat[row, X.columns.get_loc('charttime')])
                    index_list.append(row)
                    break
        first_row_index = last_row_index+1
        id_count +=1

    X = X.drop(['aki_stage'], axis=1)
    Thresholds = pd.DataFrame({'icustay_id':id_list, 'charttime': last_charttime_list, 'final_label':label_list})
    X = (Thresholds.merge(X, on='icustay_id', how='left',suffixes=('_x','')).query("charttime_x >= charttime").reindex(columns=X.columns))        
    print(X.shape)
    X['icustay_id'].nunique()

(784755, 25)


In [72]:
Thresholds['final_label'].value_counts()

1    28097
0    19654
Name: final_label, dtype: int64

# Add categorical features (details)

In [73]:
print("start preprocessing not time dependent data")
if SELECTED_FEATURE_SET or MAX_FEATURE_SET:
    #extract datasets based on id_list
    dataset_detail = dataset_detail.loc[dataset_detail['icustay_id'].isin(id_list)]
    #sort by ascending order
    dataset_detail = dataset_detail.sort_values(by=['icustay_id'])
    subject_id = dataset_detail["subject_id"].unique()
    
    #transfrom categorical data to binary form
    dataset_detail = dataset_detail.join(pd.get_dummies(dataset_detail.pop('gender')))
    dataset_detail = dataset_detail.join(pd.get_dummies(dataset_detail.pop("ethnicity_grouped")))
    dataset_detail = dataset_detail.join(pd.get_dummies(dataset_detail.pop('admission_type')))
    dataset_detail = dataset_detail.drop(['subject_id', 'hadm_id'], axis=1)
    X =  pd.merge(X, dataset_detail, on = ["icustay_id"], how= "left", copy = False) 
    #dataset_detail = dataset_detail.drop(["icustay_id",'subject_id'], axis=1)
    #!!!!!X =  pd.merge(X, dataset_detail, on = ["icustay_id"], how= "left") 
    
    #convert dataframe to array for later concontenate
    #dataset_detail = np.array((dataset_detail.fillna(FILL_VALUE)).to_numpy())
    #X = np.concatenate((X, dataset_detail), axis=1)

start preprocessing not time dependent data


In [74]:
print("select features used in the paper")
#select only features that used in paper
if SELECTED_FEATURE_SET:
    X = X[selected_set]

select features used in the paper


In [75]:
#AL re-write as try except to make it work as hadm_id is not used if only one csv file is used and none are merged
try:
    X = X.drop(['hadm_id', 'charttime'], axis=1)
except:
    X = X.drop(['charttime'], axis=1)

In [76]:
# set target

if LAST_CHARTTIME:
    y = pd.DataFrame(target_list)
if FIRST_TURN_POS:
    y = Thresholds['final_label']

In [79]:
len(y) == X['icustay_id'].nunique()

True

In [80]:
rows = X.groupby(['icustay_id'],as_index=False).size()
sequence_length = rows.max()
print(sequence_length)

133


In [45]:
#print("reshape 2D dataframe to 3D Array, group by icustay_id")
X = np.array(list(X.groupby(['icustay_id'],as_index=False).apply(pd.DataFrame.to_numpy)))
print(X.shape)

(8553,)


In [46]:
# could change after I add batching!

print("fill ragged part(missing value) of 3d array with zeroes")
# fill ragged part(missing value) of 3d array with 0
X = pad(X, 0)
print(X.shape)


fill ragged part(missing value) of 3d array with zeroes
(8553, 133, 38)


In [47]:
#AL to see the balance of the classes
try:
    z = list(y[0])
except:
    z = y
counter=collections.Counter(z)
print(counter)

Counter({0: 4417, 1: 4136})


In [48]:
print("divide dataset into train, test sets")
# divide dataset into train, test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 42)

divide dataset into train, test sets


In [49]:
# from numpy to tensors
X_train = Variable(torch.from_numpy(X_train).float()) # if requires_grad=True will mean its an optimizable variable
X_test = Variable(torch.from_numpy(X_test).float())
X_val = Variable(torch.from_numpy(X_val).float())

y_train = Variable(torch.from_numpy(y_train).float())
y_test = Variable(torch.from_numpy(y_test).float())
y_val = Variable(torch.from_numpy(y_val).float())


In [50]:
y_train.shape

torch.Size([6842])

In [51]:
X_train.shape

torch.Size([6842, 133, 38])

In [52]:
print("Now, the dataset is divided to X_train, X_test, y_train, y_test")


Now, the dataset is divided to X_train, X_test, y_train, y_test


# CNN

In [162]:
if (torch.cuda.is_available()):
    print('Training on GPU')
else:
    print('Training on CPU') # On mac book GPU is not possible =() 
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')


Training on CPU


In [163]:
X_train = X_train.permute((0, 2, 1))
X_test = X_test.permute((0, 2, 1))
X_val = X_val.permute((0, 2, 1))
print(X_train.shape)

torch.Size([1138, 38, 422])


In [164]:
train_loader = DataLoader(TensorDataset(X_train, y_train), shuffle=True, batch_size=1)
val_loader = DataLoader(TensorDataset(X_val, y_val), shuffle=False, batch_size=1)
test_loader = DataLoader(TensorDataset(X_test, y_test), shuffle=False, batch_size=1)

In [192]:
# to calculate the output sequence length use this cell (depends on the sequence length that is every time different)

F = 10 # kernel size
l_in = sequence_length #features
S = 1
P = 1


l_out = ((l_in+2*P-(F-1)-1)/S)+1
print(l_in,l_out)
while (l_in != l_out):
    if F==1:
        print("kernel 1")
        break
    else:
        F=F-1
        l_out = ((l_in+2*P-(F-1)-1)/S)+1
        print(l_in,l_out, F)
    
print("the kernel is",F)

422 415.0
422 416.0 9
422 417.0 8
422 418.0 7
422 419.0 6
422 420.0 5
422 421.0 4
422 422.0 3
the kernel is 3


In [193]:
class Net(nn.Module):
    def __init__(self, input_size, in_channels, out_channels):
        super(Net, self).__init__()
        self.input_size = input_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        
        self.fc1 = nn.Linear(self.input_size, self.in_channels)
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=F, stride = S, padding = P)
        self.relu1 = nn.ReLU()
        
        
    def forward(self, x):
        h = self.fc1(x)
        h = self.conv1(x) 
        h = self.relu1(h)
        #h, _ = self.fc2(h) # h, _ : as I have 2outputs (tuple), only take the real output [0]. 
        #print(type(h)) # Underscore throughs away the rest, _ "I do not care" variable notation in python
        #h = self.activation(h)
        return h

In [197]:
input_size = X_train.shape[1]
if input_size == 5:
    in_channels =5
elif input_size == 38:
    in_channels = 38
elif input_size>400:
    in_channels =256
else:
    print('something is wrong')

out_channels = in_channels
#output_size = 2


nn_model = Net(input_size,in_channels,out_channels)
out = nn_model(X_train)

RuntimeError: size mismatch, m1: [43244 x 422], m2: [38 x 38] at /Users/distiller/project/conda/conda-bld/pytorch_1595629449223/work/aten/src/TH/generic/THTensorMath.cpp:41

In [276]:
out

tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [4.5865e+04, 6.1958e+04, 2.0539e+04,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [1.0810e+04, 2.2959e+04, 0.0000e+00,  ..., 3.3661e-02,
          3.3661e-02, 3.3661e-02],
         [1.1705e+04, 1.6910e+04, 2.3068e+04,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [5.0932e+03, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00]],

        [[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [5.7785e+04, 7.4728e+04, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [1.3620e+04, 6.3335e+03, 0.0000e+00,  ..., 3.3661e-02,
          3.3661e-02, 3.3661e-02],
         [1.4747e+04, 1.8482e+04, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00],
         [6.4168e+03, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
          0.0000e+00, 0.0000e+00]],

      

In [270]:
out.shape

torch.Size([1117, 5, 518])

In [144]:
a = torch.randn(32, 100, 1)  
m = nn.Conv1d(100, 100, 1) 
out = m(a)
print(out.size())
print(m)

torch.Size([32, 100, 1])
Conv1d(100, 100, kernel_size=(1,), stride=(1,))
