In [1]:
TESTING = False # Use 1% of data for testing. IF false - full dataset takes more time
MAX_FEATURE_SET = True
#DIAGNOSIS = False
IMPUTE_EACH_ID = False # imputation within each icustay_id
IMPUTE_COLUMN = False # imputation based on whole column

In [2]:
#data preprocessing
from datetime import datetime
print(datetime.now())
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import collections
from collections import defaultdict
#import time
from datetime import datetime

#logistic regression model: train and evaluation and XGB 
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from  sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn import metrics
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score, roc_auc_score, auc, accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.impute import SimpleImputer
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
from matplotlib import pyplot as plt
import seaborn as sns
import shap
import joblib

2023-01-25 09:25:27.320308


In [3]:
# the full files pathes are here
# time dependent
DATA_PATH_stages="../data/kdigo_stages_measured.csv" 
DATA_PATH_labs = "../data/labs-kdigo_stages_measured.csv" 
DATA_PATH_vitals = "../data/vitals-kdigo_stages_measured.csv" 
DATA_PATH_vents = "../data/vents-vasopressor-sedatives-kdigo_stages_measured.csv"
#no time dependent
DATA_PATH_detail="../data/icustay_detail-kdigo_stages_measured.csv" #age constraint
#DATA_PATH_icd = "../data/diagnoses_icd_aki_measured.csv" #AL was "...measured 2.csv"

SEPARATOR=";"

In [4]:
# Set parameter as constant 

#which classifier to use, only run one classifier at one time 
ALL_STAGES = False
CLASS1 = True   #AnyAKI
#CLASS2 = True    #ModerateSevereAKI
#CLASS3 = False    #SevereAKI

MAX_DAYS = 35

NORMALIZATION = 'min-max' 

CAPPING_THRESHOLD_UPPER = 0.99
CAPPING_THRESHOLD_LOWER = 0.01

#Age constraints: adults
ADULTS_MIN_AGE = 18
ADULTS_MAX_AGE = -1


SPLIT_SIZE = 0.2

#Two options to deal with time series data
FIRST_TURN_POS = True #True #False # the first charttime that turn pos as bechmark for Target
LAST_CHARTTIME = False #the last charttime as bechmark for Target

# resampling or not
TIME_SAMPLING = True 
SAMPLING_INTERVAL = '6H'
RESAMPLE_LIMIT = 16 # 4 days*6h interval
MOST_COMMON = False #resampling with most common
# if MOST_COMMON is not applied,sampling with different strategies per kind of variable, 
# numeric variables use mean value, categorical variables use max value

IMPUTE_METHOD = 'most_frequent' 
FILL_VALUE = 0 #fill missing value and ragged part of 3d array

# How much time the prediction should occur (hours)
HOURS_AHEAD = 48

# precentage of dataset to be used for testing model
TEST_SIZE = 0.1


In [5]:
#set changable info corresponding to each classifier as variables

min_set =  ["icustay_id", "charttime", "creat", "uo_rt_6hr", "uo_rt_12hr", "uo_rt_24hr", "aki_stage"]

max_set = ['icustay_id', 'charttime', 'aki_stage', 'hadm_id','aniongap_avg', 'bicarbonate_avg', 
           'bun_avg','chloride_avg', 'creat', 'diasbp_mean', 'glucose_avg', 'heartrate_mean',
           'hematocrit_avg', 'hemoglobin_avg', 'potassium_avg', 'resprate_mean','sodium_avg', 'spo2_mean', 'sysbp_mean', 
           'uo_rt_12hr', 'uo_rt_24hr','uo_rt_6hr', 'wbc_avg', 'sedative', 'vasopressor', 'vent', 'age', 'F','M', 
           'asian', 'black', 'hispanic', 'native', 'other', 'unknown','white', 'ELECTIVE', 'EMERGENCY', 'URGENT']

In [6]:
# Some functions used later


def cap_data(df):
    print("Capping between the {} and {} quantile".format(CAPPING_THRESHOLD_LOWER, CAPPING_THRESHOLD_UPPER))
    cap_mask = df.columns.difference(['icustay_id', 'charttime', 'aki_stage'])
    df[cap_mask] = df[cap_mask].clip(df[cap_mask].quantile(CAPPING_THRESHOLD_LOWER),
                                     df[cap_mask].quantile(CAPPING_THRESHOLD_UPPER),
                                     axis=1)

    return df

# impute missing value in resampleing data with most common based on each id
def fast_mode(df, key_cols, value_col):
    """ Calculate a column mode, by group, ignoring null values. 
    
    key_cols : list of str - Columns to groupby for calculation of mode.
    value_col : str - Column for which to calculate the mode. 

    Return
    pandas.DataFrame
        One row for the mode of value_col per key_cols group. If ties, returns the one which is sorted first. """
    return (df.groupby(key_cols + [value_col]).size() 
              .to_frame('counts').reset_index() 
              .sort_values('counts', ascending=False) 
              .drop_duplicates(subset=key_cols)).drop('counts',axis=1)

def normalise_data(df, norm_mask):
    print("Normalizing in [0,1] with {} normalization".format(NORMALIZATION))
    
    df[norm_mask] = (df[norm_mask] - df[norm_mask].min()) / (df[norm_mask].max() - df[norm_mask].min())
    
    return df


#get max shape of 3d array
def get_dimensions(array, level=0):   
    yield level, len(array)
    try:
        for row in array:
            yield from get_dimensions(row, level + 1)
    except TypeError: #not an iterable
        pass

def get_max_shape(array):
    dimensions = defaultdict(int)
    for level, length in get_dimensions(array):
        dimensions[level] = max(dimensions[level], length)
    return [value for _, value in sorted(dimensions.items())]

#pad the ragged 3d array to rectangular shape based on max size
def iterate_nested_array(array, index=()):
    try:
        for idx, row in enumerate(array):
            yield from iterate_nested_array(row, (*index, idx)) 
    except TypeError: # final level            
        yield (*index, slice(len(array))), array # think of the types

def pad(array, fill_value):
    dimensions = get_max_shape(array)
    result = np.full(dimensions, fill_value, dtype = np.float64)  
    for index, value in iterate_nested_array(array):
        result[index] = value 
    return result



In [7]:
print("read csv files")
#reading csv files
X = pd.read_csv(DATA_PATH_stages, sep= SEPARATOR)
X.drop(["aki_stage_creat", "aki_stage_uo"], axis = 1, inplace = True)
#remove totally empty rows 
X = X.dropna(how = 'all', subset = ['creat','uo_rt_6hr','uo_rt_12hr','uo_rt_24hr','aki_stage'])
print("convert charttime to timestamp")
X['charttime'] = pd.to_datetime(X['charttime'])
#merge rows if they have exact timestamp within same icustay_id AL: t substitutes missing values with zero
#X = X.groupby(['icustay_id', 'charttime']).sum().reset_index(['icustay_id', 'charttime'])

dataset_detail = pd.read_csv(DATA_PATH_detail, sep= SEPARATOR)  #age constraint
dataset_detail.drop(['dod', 'admittime','dischtime', 'los_hospital','ethnicity','hospital_expire_flag', 'hospstay_seq',
       'first_hosp_stay', 'intime', 'outtime', 'los_icu', 'icustay_seq','first_icu_stay'], axis = 1, inplace = True)


read csv files
convert charttime to timestamp


In [8]:
dataset_labs = pd.read_csv(DATA_PATH_labs, sep= SEPARATOR) # 'bands lactate platelet ptt inr pt
dataset_labs.drop(['albumin_min', 'albumin_max','bilirubin_min', 'bilirubin_max','bands_min', 'bands_max',
                   'lactate_min', 'lactate_max','platelet_min', 'platelet_max','ptt_min', 'ptt_max', 
                   'inr_min', 'inr_max', 'pt_min', 'pt_max'], axis = 1, inplace = True)
dataset_labs = dataset_labs.dropna(subset=['charttime'])
dataset_labs = dataset_labs.dropna(subset=dataset_labs.columns[4:], how='all')
dataset_labs['charttime'] = pd.to_datetime(dataset_labs['charttime'])
dataset_labs = dataset_labs.sort_values(by=['icustay_id', 'charttime'])

if  MAX_FEATURE_SET:
    dataset_vitals = pd.read_csv(DATA_PATH_vitals, sep= SEPARATOR)  
    dataset_vents = pd.read_csv(DATA_PATH_vents , sep= SEPARATOR)
    #dataset_icd = pd.read_csv(DATA_PATH_icd, sep= SEPARATOR)

    dataset_vitals.drop(["heartrate_min", "heartrate_max","sysbp_min", "sysbp_max","diasbp_min", "diasbp_max",
                        'meanbp_min','meanbp_max', 'meanbp_mean','tempc_min', 'tempc_max', 'tempc_mean',
                        "resprate_min", "resprate_max", "spo2_min", "spo2_max", "glucose_min", "glucose_max"], axis = 1, inplace = True)
          
    print("convert charttime to timestamp")
    dataset_vitals['charttime'] = pd.to_datetime(dataset_vitals['charttime'])
    dataset_vents['charttime'] = pd.to_datetime(dataset_vents['charttime'])
    
    dataset_vitals = dataset_vitals.sort_values(by=['icustay_id', 'charttime'])
    dataset_vents = dataset_vents.sort_values(by=['icustay_id', 'charttime'])
    
    # AL drop those where all columns are nan
    dataset_vitals = dataset_vitals.dropna(subset=dataset_vitals.columns[4:], how='all')   
     

convert charttime to timestamp


In [9]:
# Labs file: instead of min and max their avg
counter = 0
col1 = 4
col2 = 5
null_l = [] # no null values in those that are different
changed = 0 # 4316 records changed to avg

while counter < 11:
    row = 0
# find where min and max are different and save their row indices 
    while row < len(dataset_labs):
        a = dataset_labs.iloc[row,col1]
        b = dataset_labs.iloc[row,col2]
        if a==b or (np.isnan(a) and np.isnan(b)):
            pass
        elif a!=b:
            changed +=1
            avg = (a+b)/2
            dataset_labs.iloc[row,col1] = avg
            if (np.isnan(a) and ~np.isnan(b)) or (np.isnan(b) and ~np.isnan(a)):
                null_l.append(row)
        else:
            print(a)
            print(b)
        row +=1       
    # delete the redundant column max, update counters
    dataset_labs.drop(dataset_labs.columns[col2], axis=1, inplace = True)
    counter = counter+1
    col1 = col1+1
    col2 = col2+1

dataset_labs.columns = ['subject_id','hadm_id', 'icustay_id', 'charttime', 'aniongap_avg', 'bicarbonate_avg', 
                        'creatinine_avg', 'chloride_avg', 'glucose_avg', 'hematocrit_avg','hemoglobin_avg',
                        'potassium_avg', 'sodium_avg', 'bun_avg', 'wbc_avg']
if len(null_l)>0:
    print("null values encountered")

In [10]:
print("Merge creatinine and glucose.")
# merge creatinine from labs and set with labels
creat_l = dataset_labs[['icustay_id','charttime','creatinine_avg']].copy()
creat_l = creat_l.dropna(subset=['creatinine_avg'])
creat = X[['icustay_id','charttime', 'creat']].copy()
creat = creat.dropna(subset=['creat'])
creat_l = creat_l.rename(columns={"creatinine_avg": "creat"})
creat = creat.append(creat_l, ignore_index=True)
creat.drop_duplicates(inplace = True)
#delete old columns
dataset_labs.drop(["creatinine_avg"], axis = 1, inplace = True)
dataset_labs = dataset_labs.dropna(subset=dataset_labs.columns[4:], how='all')
X.drop(["creat"], axis = 1, inplace = True)
#merge new column
X = pd.merge(X, creat, on = ["icustay_id", "charttime"], sort = True, how= "outer", copy = False)

if MAX_FEATURE_SET:
    # merge glucose from vitals and labs
    glucose_v = dataset_vitals[['subject_id','hadm_id','icustay_id','charttime', 'glucose_mean']].copy()
    glucose_v = glucose_v.dropna(subset=['glucose_mean'])
    glucose = dataset_labs[['subject_id','hadm_id','icustay_id','charttime', 'glucose_avg']].copy()
    glucose = glucose.dropna(subset=['glucose_avg'])
    glucose_v = glucose_v.rename(columns={"glucose_mean": "glucose_avg"})
    glucose = glucose.append(glucose_v, ignore_index=True)
    glucose.drop_duplicates(inplace = True)
    #delete old columns
    dataset_labs.drop(["glucose_avg"], axis = 1, inplace = True)
    dataset_vitals.drop(["glucose_mean"], axis = 1, inplace = True)
    dataset_vitals = dataset_vitals.dropna(subset=dataset_vitals.columns[4:], how='all')
    #merge new column
    dataset_labs = pd.merge(dataset_labs, glucose, on = ['subject_id','hadm_id','icustay_id','charttime',], sort = True, how= "outer", copy = False)
    
dataset_labs = dataset_labs.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)
X = X.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)

Merge creatinine and glucose.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


In [11]:
print("Merging labs, vitals and vents files")
#merge files with time-dependent data, based on icustay_id and charttime
if MAX_FEATURE_SET:
    X = pd.merge(X, dataset_labs, on = ["icustay_id", "charttime"], how= "outer", copy = False)
    X = pd.merge(X, dataset_vitals, on = ["icustay_id", "charttime","subject_id", "hadm_id"], how= "outer", copy = False)
    X = pd.merge(X, dataset_vents, on = ["icustay_id", "charttime"], how= "outer", copy = False) 
    X.drop(["subject_id"], axis = 1, inplace = True) 


Merging labs, vitals and vents files


In [12]:
print("start preprocessing time dependent data") 
print("Removing patients under the min age")
dataset_detail = dataset_detail.loc[dataset_detail['age'] >= ADULTS_MIN_AGE]
adults_icustay_id_list = dataset_detail['icustay_id'].unique()
X = X[X.icustay_id.isin(adults_icustay_id_list)].sort_values(by=['icustay_id'], ignore_index = True)
X = X.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)
adults_icustay_id_list = np.sort(adults_icustay_id_list)

start preprocessing time dependent data
Removing patients under the min age


In [13]:
print("drop icustay_id with time span less than 48hrs")
def more_than_HOURS_ahead(adults_icustay_id_list, X):
    drop_list = []
    los_list = [] # calculating LOS ICU based on charttime
    long_stays_id = [] # LOS longer than MAX DAYS days
    last_charttime_list = []
    seq_length = X.groupby(['icustay_id'],as_index=False).size()
    id_count = 0
    first_row_index = 0

    while id_count < len(adults_icustay_id_list):
        icustay_id = adults_icustay_id_list[id_count]
        last_row_index = first_row_index + seq_length.iloc[id_count,1]-1
        first_time = X.iat[first_row_index, X.columns.get_loc('charttime')]
        last_time = X.iat[last_row_index, X.columns.get_loc('charttime')]
        los = round(float((last_time - first_time).total_seconds()/60/60/24),4) # in days
        if los < HOURS_AHEAD/24:
            drop_list.append(icustay_id)
        else:
            los_list.append(los)
            if los > MAX_DAYS:
                long_stays_id.append(icustay_id)
                last_charttime_list.append(last_time)
        # udpate for the next icustay_id
        first_row_index = last_row_index+1
        id_count +=1
    if len(long_stays_id) != len(last_charttime_list):
        print('ERROR')
    print("%d long stays" % len(long_stays_id))
    # drop all the rows with the saved icustay_id
    print("there are %d id-s shorter than 48 hours" % len(drop_list))
    X = X[~X.icustay_id.isin(drop_list)]
    id_list = X['icustay_id'].unique()
    X = X.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)
    
    return id_list, X, long_stays_id,last_charttime_list

id_list, X, long_stays_id,last_charttime_list  = more_than_HOURS_ahead(adults_icustay_id_list, X)


long = pd.DataFrame()
long['icustay_id']  = long_stays_id
long['last_time']  = last_charttime_list


drop icustay_id with time span less than 48hrs
2302 long stays
there are 5214 id-s shorter than 48 hours


In [14]:
# deleting rows that are not within MAX_DAYS (35) period
i = 0 # long df index
drop_long_time = []
    
while i < len(long_stays_id):
    j = 0
    all_rows = X.index[X['icustay_id'] == long.loc[i,'icustay_id']].tolist()
    while j < len(all_rows):
        time = X.iat[all_rows[j], X.columns.get_loc('charttime')]
        # if keep last MAX_DAYS 
        if (long.loc[i,'last_time'] - time).total_seconds() > MAX_DAYS*24*60*60:
            drop_long_time.append(all_rows[j])
            j +=1
        else:
            break
    i +=1       
X.drop(X.index[drop_long_time], inplace=True) 

# checking for 48h min length again
id_list, X, long_stays_id,last_charttime_list  = more_than_HOURS_ahead(id_list, X)
dataset_detail = dataset_detail[dataset_detail.icustay_id.isin(id_list)].sort_values(by=['icustay_id'], ignore_index = True)


0 long stays
there are 1 id-s shorter than 48 hours


In [15]:
#For testing purpose, use small amount of data first
if TESTING:
    rest, id_list = train_test_split(id_list, test_size= 0.01, random_state=42)
    X = X[X.icustay_id.isin(id_list)].sort_values(by=['icustay_id'])
    dataset_detail = dataset_detail[dataset_detail.icustay_id.isin(id_list)].sort_values(by=['icustay_id'])

In [16]:
if TIME_SAMPLING:
    print("resampling: MEAN & ZERO")
    # Sampling with different strategies per kind of variable
    label = ['aki_stage']
    skip = ['icustay_id', 'charttime', 'aki_stage']
    if MAX_FEATURE_SET:
        discrete_feat = ['sedative', 'vasopressor', 'vent', 'hadm_id']
        skip.extend(discrete_feat)    
    # all features that are not in skip are numeric
    numeric_feat = list(X.columns.difference(skip))

resampling: MEAN & ZERO


In [18]:
X_1 = X # without resample or imputation

In [19]:
# Applying aggregation to features depending on their type
X = X.set_index('charttime').groupby('icustay_id').resample(SAMPLING_INTERVAL)
if MAX_FEATURE_SET:
    X_discrete = X[discrete_feat].max().fillna(FILL_VALUE).astype(np.int64)
X_numeric = X[numeric_feat].mean() 
X_label = X['aki_stage'].max()
print("Merging sampled features")
try:
    X = pd.concat([X_numeric, X_discrete,X_label], axis=1).reset_index()
except:
    X = pd.concat([X_numeric,X_label], axis=1).reset_index()
print(X.shape)
#Label forward fill
X['aki_stage'] = X['aki_stage'].ffill(limit=RESAMPLE_LIMIT)

Merging sampled features
(2156523, 26)


In [20]:
X_2 = X # after resample but without imputation

In [21]:
print("Imputation.")
# do imputation of label with zero if there are still missing values
X['aki_stage'] = X['aki_stage'].fillna(0)
# using most common within each icustay_id
if IMPUTE_EACH_ID:
    # set a new variable so won't change the orginial X
    column_name = list(X.columns)
    column_name.remove(column_name[0]) 
    for feature in column_name:
        X.loc[X[feature].isnull(), feature] = X.icustay_id.map(fast_mode(X, ['icustay_id'], feature).set_index('icustay_id')[feature])       

# imputation based on whole column
if IMPUTE_COLUMN:
    imp = SimpleImputer(missing_values=np.nan, strategy= IMPUTE_METHOD)
    cols = list(X.columns)
    cols = cols[2:23]
    X[cols]=imp.fit_transform(X[cols])  

# If no imputation method selected or only impute each id, for the remaining nan impute direclty with FILL_VALUE
X = X.fillna(FILL_VALUE) 

Imputation.


In [22]:
X_3 = X # with both resamaple and imputation

In [43]:
print("Imputation.")
# do imputation of label with zero if there are still missing values
X_4 = X_1
X_4['aki_stage'] = X_4['aki_stage'].fillna(0)
# using most common within each icustay_id
if IMPUTE_EACH_ID:
    # set a new variable so won't change the orginial X
    column_name = list(X_4.columns)
    column_name.remove(column_name[0]) 
    for feature in column_name:
        X_4.loc[X_4[feature].isnull(), feature] = X_4.icustay_id.map(fast_mode(X, ['icustay_id'], feature).set_index('icustay_id')[feature])       

# imputation based on whole column
if IMPUTE_COLUMN:
    imp = SimpleImputer(missing_values=np.nan, strategy= IMPUTE_METHOD)
    cols = list(X_4.columns)
    cols = cols[2:23]
    X_4[cols]=imp.fit_transform(X_4[cols])  

# If no imputation method selected or only impute each id, for the remaining nan impute direclty with FILL_VALUE
X_4 = X_4.fillna(FILL_VALUE) 

Imputation.


In [44]:
X_4 # imputation but without resample

Unnamed: 0,icustay_id,charttime,uo_rt_6hr,uo_rt_12hr,uo_rt_24hr,aki_stage,creat,hadm_id,aniongap_avg,bicarbonate_avg,...,wbc_avg,glucose_avg,heartrate_mean,sysbp_mean,diasbp_mean,resprate_mean,spo2_mean,vent,vasopressor,sedative
0,200001,2181-11-18 11:10:00,0.0000,0.0000,0.0000,0.0,2.5,152234.0,11.0,30.0,...,2.6,90.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,200001,2181-11-18 11:20:00,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,200001,2181-11-18 15:29:00,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
3,200001,2181-11-18 23:40:00,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
4,200001,2181-11-19 06:00:00,0.0000,0.0000,0.0000,0.0,2.7,152234.0,14.0,27.0,...,2.7,74.0,0.0,0.0,0.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10378676,299999,2117-09-01 16:00:00,0.2354,0.4426,1.9528,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
10378677,299999,2117-09-02 05:20:00,0.0000,0.0000,0.0000,0.0,1.0,129161.0,12.0,28.0,...,10.0,104.0,0.0,0.0,0.0,0.0,0.0,0,0,0
10378678,299999,2117-09-03 05:28:00,0.0000,0.0000,0.0000,0.0,0.8,129161.0,0.0,0.0,...,10.6,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
10378679,299999,2117-09-03 21:36:00,0.0000,0.0000,0.0000,0.0,0.9,129161.0,13.0,27.0,...,0.0,88.0,0.0,0.0,0.0,0.0,0.0,0,0,0


In [49]:
# more comfortable to review in this order
def change_order(X):
    try:
        cols = ['icustay_id', 'charttime','aki_stage','hadm_id','aniongap_avg','bicarbonate_avg', 'bun_avg','chloride_avg',
                'creat','diasbp_mean', 'glucose_avg', 'heartrate_mean', 'hematocrit_avg','hemoglobin_avg', 
                'potassium_avg', 'resprate_mean', 'sodium_avg','spo2_mean', 'sysbp_mean', 'uo_rt_12hr', 
                'uo_rt_24hr', 'uo_rt_6hr','wbc_avg', 'sedative', 'vasopressor', 'vent' ]
        X = X[cols]
        print("success")
    except:
        try:
            cols = ['icustay_id', 'charttime','aki_stage','creat','uo_rt_12hr', 'uo_rt_24hr', 'uo_rt_6hr']
            X = X[cols]
        except:
            print("error")
            
change_order(X)
change_order(X_1)
change_order(X_2)
change_order(X_3)
change_order(X_4)

success
success
success
success
success


In [48]:
print("binarise labels")
def binarise_lable(X):
    if ALL_STAGES:
        pass
    elif CLASS1:
        X.loc[X['aki_stage'] > 1, 'aki_stage'] = 1
    elif CLASS2:
        X.loc[X['aki_stage'] < 2, 'aki_stage'] = 0
        X.loc[X['aki_stage'] > 1, 'aki_stage'] = 1
    elif CLASS3:
        X.loc[X['aki_stage'] < 3, 'aki_stage'] = 0
        X.loc[X['aki_stage'] > 2, 'aki_stage'] = 1
    print('success')
        
binarise_lable(X)
binarise_lable(X_1)
binarise_lable(X_2)
binarise_lable(X_3)
binarise_lable(X_4)


binarise labels
success
success
success
success
success


In [25]:
#print("Shifting the labels 48 h") # by 8 position : 6h sampling*8=48h and ffil 8 newly empty ones
# group by
X['aki_stage'] = X.groupby('icustay_id')['aki_stage'].shift(-(HOURS_AHEAD // int(SAMPLING_INTERVAL[:-1])))
X = X.dropna(subset=['aki_stage'])
X['icustay_id'].unique()
X.shape

(1774515, 26)

In [68]:
print(X_1[X_1['icustay_id'].isin([200001])])
# print(X_1[X_1['icustay_id'].isin([200001])].isna().sum())
print(X_1.isna().sum())
print(X_1.shape[0])

# print(X_2[X_2['icustay_id'].isin([200001])])
# print(X_2[X_2['icustay_id'].isin([200001])].isna().sum())
# print(X_2.isna().sum())

# print(X_3[X_3['icustay_id'].isin([200001])])
# print(X_3[X_3['icustay_id'].isin([200001])].isna().sum())
# print(X_3.isna().sum())

# print(X_1.shape)
# print(X_4.shape)
# print(X_4[X_4['icustay_id'].isin([200001])])
# print(X_4.columns[2:])
# print(len(X_4.columns[2:]))


     icustay_id           charttime  uo_rt_6hr  uo_rt_12hr  uo_rt_24hr  \
0        200001 2181-11-18 11:10:00        NaN         NaN         NaN   
1        200001 2181-11-18 11:20:00        NaN         NaN         NaN   
2        200001 2181-11-18 15:29:00        NaN         NaN         NaN   
3        200001 2181-11-18 23:40:00        NaN         NaN         NaN   
4        200001 2181-11-19 06:00:00        NaN         NaN         NaN   
..          ...                 ...        ...         ...         ...   
150      200001 2181-12-01 13:20:00        NaN         NaN         NaN   
151      200001 2181-12-01 21:00:00        NaN         NaN         NaN   
152      200001 2181-12-02 05:30:00        NaN         NaN         NaN   
153      200001 2181-12-03 06:10:00        NaN         NaN         NaN   
154      200001 2181-12-04 05:58:00        NaN         NaN         NaN   

     aki_stage  creat   hadm_id  aniongap_avg  bicarbonate_avg  ...  wbc_avg  \
0          0.0    2.5  152234.0

In [None]:
# check how many NA
for i in X['icustay_id'].unique():
    X['creat'][]

In [69]:
# aggregate timestamps
def delete_timestamps(X):
    tempX = pd.DataFrame()
    for i in X['icustay_id'].unique():
        tempa = []
        for j in X.columns[2:]:
            tempa.append(X[j][X['icustay_id'].isin([i])].mean())
        tempa = pd.DataFrame(tempa)
        tempX.append(tempa,ignore_index = True)
    return tempX

X_del = delete_timestamps(X_4)

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


In [None]:
X_3.plot(figsize=(15, 6))
plt.show()

from statsmodels.tsa.seasonal import seasonal_decompose
from matplotlib import pyplot
decomposition = seasonal_decompose(X_3, model='additive')
fig = decomposition.plot()
pyplot.show()