In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install captum

Collecting captum
  Downloading captum-0.7.0-py3-none-any.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.6->captum)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.6->captum)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.6->captum)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.6->captum)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.6->captum)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.6->captum)
  Using cached nvidia_cufft_cu12-11.0.2

In [None]:
from datetime import datetime
print(datetime.now())
#data preprocessing
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import collections
from collections import defaultdict
# NN
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
import math
from sklearn.calibration import calibration_curve
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score, roc_auc_score, auc, accuracy_score
from sklearn.metrics import average_precision_score
import sklearn.metrics as metrics
from sklearn.impute import SimpleImputer
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
from matplotlib import pyplot as plt
import seaborn as sns
from captum.attr import IntegratedGradients

# the full files pathes are here
DATA_PATH_stages="/content/drive/MyDrive/Colab Notebooks/AKI/data/kdigo_stages_measured.csv"
DATA_PATH_labs = "/content/drive/MyDrive/Colab Notebooks/AKI/data/labs-kdigo_stages_measured.csv"
DATA_PATH_vitals = "/content/drive/MyDrive/Colab Notebooks/AKI/data/vitals-kdigo_stages_measured.csv"
DATA_PATH_vents = "/content/drive/MyDrive/Colab Notebooks/AKI/data/vents-vasopressor-sedatives-kdigo_stages_measured.csv"
DATA_PATH_detail="/content/drive/MyDrive/Colab Notebooks/AKI/data/icustay_detail-kdigo_stages_measured.csv"
SEPARATOR=";"

2024-08-16 12:29:00.666106


In [None]:
IMPUTE_EACH_ID = False # imputation within each icustay_id with most common value
IMPUTE_COLUMN = True # imputation based on whole column

In [None]:
# Set parameter as constant

TESTING = False
TEST_SIZE = 0.05

SPLIT_SIZE = 0.2
MAX_DAYS = 31

#which classifier to use, only run one classifier at one time
CLASS1 = True   #AnyAKI
#CLASS2 = False    #ModerateSevereAKI
#CLASS3 = False    #SevereAKI
# ALL_STAGES = True # not binary label, each class separately 0,1,2,3

MAX_FEATURE_SET = True
#DIAGNOSIS = False

FIRST_TURN_POS = True # creating one label per one ICU stay id

# resampling  and imputing
TIME_SAMPLING = True
SAMPLING_INTERVAL = '6H'
RESAMPLE_LIMIT = 16 # 4 days*6h interval
MOST_COMMON = False #resampling with most common
# if MOST_COMMON is not applied,sampling with different strategies per kind of variable,
# numeric variables use mean value, categorical variables use max value

IMPUTE_METHOD = 'most_frequent'
FILL_VALUE = 0 #fill missing value and ragged part of 3d array

#Age constraints: adults
ADULTS_MIN_AGE = 18
ADULTS_MAX_AGE = -1

NORMALIZATION = 'min-max'

CAPPING_THRESHOLD_UPPER = 0.99
CAPPING_THRESHOLD_LOWER = 0.01

# How much time the prediction should occur (hours)
HOURS_AHEAD = 48

NORM_TYPE = 'min_max'

RANDOM = 42

#set changable info corresponding to each classifier as variables

min_set =  ["icustay_id", "charttime", "creat", "uo_rt_6hr", "uo_rt_12hr", "uo_rt_24hr", "aki_stage"]

max_set = ['icustay_id', 'charttime', 'aki_stage', 'hadm_id','aniongap_avg', 'bicarbonate_avg',
           'bun_avg','chloride_avg', 'creat', 'diasbp_mean', 'glucose_avg', 'heartrate_mean',
           'hematocrit_avg', 'hemoglobin_avg', 'potassium_avg', 'resprate_mean','sodium_avg', 'spo2_mean', 'sysbp_mean',
           'uo_rt_12hr', 'uo_rt_24hr','uo_rt_6hr', 'wbc_avg', 'sedative', 'vasopressor', 'vent', 'age', 'F','M',
           'asian', 'black', 'hispanic', 'native', 'other', 'unknown','white']

# LSTM
batch_size = 5

# naming model and plot
classifier_name = "None vs. Any AKI"    ###change every time #Moderate vs. Severe #None vs. Any #Others vs. Severe
plot_name = "adult_AnyAKI_LR"    ###change every time

In [None]:
# Some functions used later

# impute missing value in resampleing data with most common based on each id
def fast_mode(df, key_cols, value_col):
    """ Calculate a column mode, by group, ignoring null values.

    key_cols : list of str - Columns to groupby for calculation of mode.
    value_col : str - Column for which to calculate the mode.

    Return
    pandas.DataFrame
        One row for the mode of value_col per key_cols group. If ties, returns the one which is sorted first. """
    return (df.groupby(key_cols + [value_col]).size()
              .to_frame('counts').reset_index()
              .sort_values('counts', ascending=False)
              .drop_duplicates(subset=key_cols)).drop('counts',axis=1)


#get max shape of 3d array
def get_dimensions(array, level=0):
    yield level, len(array)
    try:
        for row in array:
            yield from get_dimensions(row, level + 1)
    except TypeError: #not an iterable
        pass

def get_max_shape(array):
    dimensions = defaultdict(int)
    for level, length in get_dimensions(array):
        dimensions[level] = max(dimensions[level], length)
    return [value for _, value in sorted(dimensions.items())]

#pad the ragged 3d array to rectangular shape based on max size
def iterate_nested_array(array, index=()):
    try:
        for idx, row in enumerate(array):
            yield from iterate_nested_array(row, (*index, idx))
    except TypeError: # final level
        yield (*index, slice(len(array))), array # think of the types

def pad(array, fill_value):
    dimensions = get_max_shape(array)
    result = np.full(dimensions, fill_value, dtype = np.float64)
    for index, value in iterate_nested_array(array):
        result[index] = value
    return result

def bin_total(y_true, y_prob, n_bins):
    bins = np.linspace(0., 1. + 1e-8, n_bins + 1)

    # In sklearn.calibration.calibration_curve,
    # the last value in the array is always 0.
    binids = np.digitize(y_prob, bins) - 1

    return np.bincount(binids, minlength=len(bins))

def missing_bin(bin_array):
    midpoint = " "
    if bin_array[0]==0:
        midpoint = "5%, "
    if bin_array[1]==0:
        midpoint = midpoint + "15%, "
    if bin_array[2]==0:
        midpoint = midpoint + "25%, "
    if bin_array[3]==0:
        midpoint = midpoint + "35%, "
    if bin_array[4]==0:
        midpoint = midpoint + "45%, "
    if bin_array[5]==0:
        midpoint = midpoint + "55%, "
    if bin_array[6]==0:
        midpoint = midpoint + "65%, "
    if bin_array[7]==0:
        midpoint = midpoint + "75%, "
    if bin_array[8]==0:
        midpoint = midpoint + "85%, "
    if bin_array[9]==0:
        midpoint = midpoint + "95%, "
    return "The missing bins have midpoint values of "+ str(midpoint)


In [None]:
print("read csv files")
#reading csv files
X = pd.read_csv(DATA_PATH_stages, sep= SEPARATOR)
X.drop(["aki_stage_creat", "aki_stage_uo"], axis = 1, inplace = True)
#remove totally empty rows
X = X.dropna(how = 'all', subset = ['creat','uo_rt_6hr','uo_rt_12hr','uo_rt_24hr','aki_stage'])
print("convert charttime to timestamp")
X['charttime'] = pd.to_datetime(X['charttime'])

#merge rows if they have exact timestamp within same icustay_id AL : it substitutes missing values with zero
#X = X.groupby(['icustay_id', 'charttime']).sum().reset_index(['icustay_id', 'charttime'])

dataset_detail = pd.read_csv(DATA_PATH_detail, sep= SEPARATOR)  #age constraint

# Rename the column 'admission_age' to 'age'
dataset_detail.rename(columns={'admission_age': 'age'}, inplace=True)

dataset_detail.drop(['dod', 'admittime','dischtime', 'los_hospital','ethnicity','hospital_expire_flag', 'hospstay_seq',
       'first_hosp_stay', 'intime', 'outtime', 'los_icu', 'icustay_seq','first_icu_stay'], axis = 1, inplace = True)


read csv files
convert charttime to timestamp


In [None]:
dataset_labs = pd.read_csv(DATA_PATH_labs, sep= SEPARATOR) # 'bands lactate platelet ptt inr pt
dataset_labs.drop(['albumin_min', 'albumin_max','bilirubin_min', 'bilirubin_max','bands_min', 'bands_max',
                   'lactate_min', 'lactate_max','platelet_min', 'platelet_max','ptt_min', 'ptt_max',
                   'inr_min', 'inr_max', 'pt_min', 'pt_max'], axis = 1, inplace = True)
dataset_labs = dataset_labs.dropna(subset=['charttime'])
dataset_labs = dataset_labs.dropna(subset=dataset_labs.columns[4:], how='all')
dataset_labs['charttime'] = pd.to_datetime(dataset_labs['charttime'])
dataset_labs = dataset_labs.sort_values(by=['icustay_id', 'charttime'])

if  MAX_FEATURE_SET:
    dataset_vitals = pd.read_csv(DATA_PATH_vitals, sep= SEPARATOR)
    dataset_vents = pd.read_csv(DATA_PATH_vents , sep= SEPARATOR)
    #dataset_icd = pd.read_csv(DATA_PATH_icd, sep= SEPARATOR)
    dataset_vitals.drop(["heartrate_min", "heartrate_max","sysbp_min", "sysbp_max","diasbp_min", "diasbp_max",
                        'meanbp_min','meanbp_max', 'meanbp_mean','tempc_min', 'tempc_max', 'tempc_mean',
                        "resprate_min", "resprate_max", "spo2_min", "spo2_max", "glucose_min", "glucose_max"], axis = 1, inplace = True)
    print("convert charttime to timestamp")
    dataset_vitals['charttime'] = pd.to_datetime(dataset_vitals['charttime'])
    dataset_vents['charttime'] = pd.to_datetime(dataset_vents['charttime'])
    dataset_vitals = dataset_vitals.sort_values(by=['icustay_id', 'charttime'])
    dataset_vents = dataset_vents.sort_values(by=['icustay_id', 'charttime'])
    # AL drop those where all columns are nan (empty rows)
    dataset_vitals = dataset_vitals.dropna(subset=dataset_vitals.columns[4:], how='all')


convert charttime to timestamp


In [None]:
print(datetime.now())
# Labs file: instead of min and max their avg
counter = 0
col1 = 4
col2 = 5
null_l = [] # no null values in those that are different
changed = 0 # 4316 records changed to avg

while counter < 11:
    row = 0
# find where min and max are different and save their row indices
    while row < len(dataset_labs):
        a = dataset_labs.iloc[row,col1]
        b = dataset_labs.iloc[row,col2]
        if a==b or (np.isnan(a) and np.isnan(b)):
            pass
        elif a!=b:
            changed +=1
            avg = (a+b)/2
            dataset_labs.iloc[row,col1] = avg
            if (np.isnan(a) and ~np.isnan(b)) or (np.isnan(b) and ~np.isnan(a)):
                null_l.append(row)
        else:
            print(a)
            print(b)
        row +=1
    # delete the redundant column max, update counters
    dataset_labs.drop(dataset_labs.columns[col2], axis=1, inplace = True)
    counter = counter+1
    col1 = col1+1
    col2 = col2+1

dataset_labs.columns = ['subject_id','hadm_id', 'icustay_id', 'charttime', 'aniongap_avg', 'bicarbonate_avg',
                        'creatinine_avg', 'chloride_avg', 'glucose_avg', 'hematocrit_avg','hemoglobin_avg',
                        'potassium_avg', 'sodium_avg', 'bun_avg', 'wbc_avg']
if len(null_l)>0:
    print("null values encountered")
print(datetime.now())

2024-08-16 12:29:48.124731
2024-08-16 12:37:14.043576


In [None]:
print("Merge creatinine and glucose.")
# merge creatinine from labs and set with labels
creat_l = dataset_labs[['icustay_id','charttime','creatinine_avg']].copy()
creat_l = creat_l.dropna(subset=['creatinine_avg'])
creat = X[['icustay_id','charttime', 'creat']].copy()
creat = creat.dropna(subset=['creat'])
creat_l = creat_l.rename(columns={"creatinine_avg": "creat"})
creat = pd.concat([creat, creat_l], ignore_index=True)
creat.drop_duplicates(inplace = True)
#delete old columns
dataset_labs.drop(["creatinine_avg"], axis = 1, inplace = True)
dataset_labs = dataset_labs.dropna(subset=dataset_labs.columns[4:], how='all')
X.drop(["creat"], axis = 1, inplace = True)
#merge new column
X = pd.merge(X, creat, on = ["icustay_id", "charttime"], sort = True, how= "outer", copy = False)

if MAX_FEATURE_SET:
    # merge glucose from vitals and labs
    glucose_v = dataset_vitals[['subject_id','hadm_id','icustay_id','charttime', 'glucose_mean']].copy()
    glucose_v = glucose_v.dropna(subset=['glucose_mean'])
    glucose = dataset_labs[['subject_id','hadm_id','icustay_id','charttime', 'glucose_avg']].copy()
    glucose = glucose.dropna(subset=['glucose_avg'])
    glucose_v = glucose_v.rename(columns={"glucose_mean": "glucose_avg"})
    glucose = pd.concat([glucose, glucose_v], ignore_index=True)
    glucose.drop_duplicates(inplace = True)
    #delete old columns
    dataset_labs.drop(["glucose_avg"], axis = 1, inplace = True)
    dataset_vitals.drop(["glucose_mean"], axis = 1, inplace = True)
    dataset_vitals = dataset_vitals.dropna(subset=dataset_vitals.columns[4:], how='all')
    #merge new column
    dataset_labs = pd.merge(dataset_labs, glucose, on = ['subject_id','hadm_id','icustay_id','charttime',], sort = True, how= "outer", copy = False)

dataset_labs = dataset_labs.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)
X = X.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)

Merge creatinine and glucose.


In [None]:
print("Merging labs, vitals and vents files")
if MAX_FEATURE_SET:
    X = pd.merge(X, dataset_labs, on = ["icustay_id", "charttime"], how= "outer", copy = False)
    X = pd.merge(X, dataset_vitals, on = ["icustay_id", "charttime","subject_id", "hadm_id"], how= "outer", copy = False)
    X = pd.merge(X, dataset_vents, on = ["icustay_id", "charttime"], how= "outer", copy = False)
    X.drop(["subject_id"], axis = 1, inplace = True)


Merging labs, vitals and vents files


In [None]:
print("start preprocessing time dependent data")
print("Removing patients under the min age")
dataset_detail = dataset_detail.loc[dataset_detail['age'] >= ADULTS_MIN_AGE]
adults_icustay_id_list = dataset_detail['icustay_id'].unique()
X = X[X.icustay_id.isin(adults_icustay_id_list)].sort_values(by=['icustay_id'], ignore_index = True)
X = X.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)
adults_icustay_id_list = np.sort(adults_icustay_id_list)

start preprocessing time dependent data
Removing patients under the min age


In [None]:
print("drop icustay_id with time span less than 48hrs")
def more_than_HOURS_ahead(adults_icustay_id_list, X):
    drop_list = []
    los_list = [] # calculating LOS ICU based on charttime
    long_stays_id = [] # LOS longer than MAX DAYS days
    last_charttime_list = []
    seq_length = X.groupby(['icustay_id'],as_index=False).size()
    id_count = 0
    first_row_index = 0

    while id_count < len(adults_icustay_id_list):
        icustay_id = adults_icustay_id_list[id_count]
        last_row_index = first_row_index + seq_length.iloc[id_count,1]-1
        first_time = X.iat[first_row_index, X.columns.get_loc('charttime')]
        last_time = X.iat[last_row_index, X.columns.get_loc('charttime')]
        los = round(float((last_time - first_time).total_seconds()/60/60/24),4) # in days
        if los < HOURS_AHEAD/24:
            drop_list.append(icustay_id)
        else:
            los_list.append(los)
            if los > MAX_DAYS:
                long_stays_id.append(icustay_id)
                last_charttime_list.append(last_time)
        # udpate for the next icustay_id
        first_row_index = last_row_index+1
        id_count +=1
    if len(long_stays_id) != len(last_charttime_list):
        print('ERROR')
    print("%d long stays" % len(long_stays_id))
    # drop all the rows with the saved icustay_id
    print("there are %d id-s shorter than 48 hours" % len(drop_list))
    X = X[~X.icustay_id.isin(drop_list)]
    id_list = X['icustay_id'].unique()
    X = X.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)

    return id_list, X, long_stays_id,last_charttime_list

id_list, X, long_stays_id,last_charttime_list  = more_than_HOURS_ahead(adults_icustay_id_list, X)

long = pd.DataFrame()
long['icustay_id']  = long_stays_id
long['last_time']  = last_charttime_list


drop icustay_id with time span less than 48hrs
2936 long stays
there are 5417 id-s shorter than 48 hours


In [None]:
# deleting rows that are not within MAX_DAYS (31) period
i = 0 # long df index
drop_long_time = []

while i < len(long_stays_id):
    j = 0
    all_rows = X.index[X['icustay_id'] == long.loc[i,'icustay_id']].tolist()
    while j < len(all_rows):
        time = X.iat[all_rows[j], X.columns.get_loc('charttime')]
        # if keep last MAX_DAYS
        if (long.loc[i,'last_time'] - time).total_seconds() > MAX_DAYS*24*60*60:
            drop_long_time.append(all_rows[j])
            j +=1
        else:
            break
    i +=1
X.drop(X.index[drop_long_time], inplace=True)

# checking for 48h min length again
id_list, X, long_stays_id,last_charttime_list  = more_than_HOURS_ahead(id_list, X)
dataset_detail = dataset_detail[dataset_detail.icustay_id.isin(id_list)].sort_values(by=['icustay_id'], ignore_index = True)


0 long stays
there are 1 id-s shorter than 48 hours


In [None]:
# For testing purpose, use small amount of data first
if TESTING:
    rest, id_list = train_test_split(id_list, test_size= TEST_SIZE, random_state=42)
    X = X[X.icustay_id.isin(id_list)].sort_values(by=['icustay_id'])
    dataset_detail = dataset_detail[dataset_detail.icustay_id.isin(id_list)].sort_values(by=['icustay_id'])

# Resampling , imputing

In [None]:
if (TIME_SAMPLING and MOST_COMMON):
    print("resampling: MOST_COMMON")
    # Resample the data using assigned interval,mode() for most common
    X = X.set_index('charttime').groupby('icustay_id').resample(SAMPLING_INTERVAL).mode().reset_index()
elif TIME_SAMPLING:
    print("resampling: MEAN & ZERO")
    # Sampling with different strategies per kind of variable
    label = ['aki_stage']
    skip = ['icustay_id', 'charttime', 'aki_stage']
    if MAX_FEATURE_SET:
        discrete_feat = ['sedative', 'vasopressor', 'vent', 'hadm_id']
        skip.extend(discrete_feat)
    # all features that are not in skip are numeric
    numeric_feat = list(X.columns.difference(skip))

    # Applying aggregation to features depending on their type
    X = X.set_index('charttime').groupby('icustay_id').resample(SAMPLING_INTERVAL)
    if MAX_FEATURE_SET:
        X_discrete = X[discrete_feat].max().fillna(FILL_VALUE).astype(np.int64)
    X_numeric = X[numeric_feat].mean()
    X_label = X['aki_stage'].max()
    print("Merging sampled features")
    try:
        X = pd.concat([X_numeric, X_discrete,X_label], axis=1).reset_index()
    except:
        X = pd.concat([X_numeric,X_label], axis=1).reset_index()
print(X.shape)
#Label forward fill
X['aki_stage'] = X['aki_stage'].ffill(limit=RESAMPLE_LIMIT)

resampling: MEAN & ZERO
Merging sampled features
(2028990, 26)


In [None]:
print("Imputation.")
# do imputation of label with zero if there are still missing values
X['aki_stage'] = X['aki_stage'].fillna(0)

# using most common within each icustay_id
if IMPUTE_EACH_ID:
    column_name = list(X.columns)
    column_name.remove(column_name[0])
    for feature in column_name:
        X.loc[X[feature].isnull(), feature] = X.icustay_id.map(fast_mode(X, ['icustay_id'], feature).set_index('icustay_id')[feature])

# imputation based on whole column
if IMPUTE_COLUMN:
    imp = SimpleImputer(missing_values=np.nan, strategy= IMPUTE_METHOD)
    cols = list(X.columns)
    cols = cols[2:23]
    X[cols]=imp.fit_transform(X[cols])

# If no imputation method selected or only impute each id, for the remaining nan impute direclty with FILL_VALUE
X = X.fillna(FILL_VALUE)

Imputation.


In [None]:
# more comfortable to review in this order
try:
    cols = ['icustay_id', 'charttime','aki_stage','hadm_id','aniongap_avg','bicarbonate_avg', 'bun_avg','chloride_avg',
            'creat','diasbp_mean', 'glucose_avg', 'heartrate_mean', 'hematocrit_avg','hemoglobin_avg',
            'potassium_avg', 'resprate_mean', 'sodium_avg','spo2_mean', 'sysbp_mean', 'uo_rt_12hr',
            'uo_rt_24hr', 'uo_rt_6hr','wbc_avg', 'sedative', 'vasopressor', 'vent' ]
    X = X[cols]
    print("success")
except:
    try:
        cols = ['icustay_id', 'charttime','aki_stage','creat','uo_rt_12hr', 'uo_rt_24hr', 'uo_rt_6hr']
        X = X[cols]
    except:
        print("error")

success


In [None]:
print("binarise labels")
if ALL_STAGES:
    pass
elif CLASS1:
    X.loc[X['aki_stage'] > 1, 'aki_stage'] = 1
elif CLASS2:
    X.loc[X['aki_stage'] < 2, 'aki_stage'] = 0
    X.loc[X['aki_stage'] > 1, 'aki_stage'] = 1
elif CLASS3:
    X.loc[X['aki_stage'] < 3, 'aki_stage'] = 0
    X.loc[X['aki_stage'] > 2, 'aki_stage'] = 1

binarise labels


In [None]:
print(X.columns)

Index(['icustay_id', 'charttime', 'aki_stage', 'hadm_id', 'aniongap_avg',
       'bicarbonate_avg', 'bun_avg', 'chloride_avg', 'creat', 'diasbp_mean',
       'glucose_avg', 'heartrate_mean', 'hematocrit_avg', 'hemoglobin_avg',
       'potassium_avg', 'resprate_mean', 'sodium_avg', 'spo2_mean',
       'sysbp_mean', 'uo_rt_12hr', 'uo_rt_24hr', 'uo_rt_6hr', 'wbc_avg',
       'sedative', 'vasopressor', 'vent'],
      dtype='object')


# SHIFTING labels

In [None]:
#print("Shifting the labels 48 h") # by 8 position : 6h sampling*8=48h and ffil 8 newly empty ones
X['aki_stage'] = X.groupby('icustay_id')['aki_stage'].shift(-(HOURS_AHEAD // int(SAMPLING_INTERVAL[:-1])))
X = X.dropna(subset=['aki_stage'])
X['icustay_id'].nunique()

47250

In [None]:
# Create one label per icustay_id - first turn positive approach
if FIRST_TURN_POS:
    X = X.sort_values(by=['icustay_id', 'charttime'], ignore_index=True)
    id_list.sort()
    last_charttime_list = []
    index_list = []
    label_list = []
    filtered_id_list = []

    first_row_index = 0
    id_count = 0
    seq_length = X.groupby(['icustay_id']).size().reset_index(name='count')

    for ID in id_list:
        if id_count >= len(seq_length):
            break

        last_row_index = first_row_index + seq_length.iloc[id_count]['count'] - 1
        a = X.loc[X['icustay_id'] == ID].aki_stage
        if 1 not in a.values:
            label_list.append(0)
            last_charttime_list.append(X.iat[last_row_index, X.columns.get_loc('charttime')])
            index_list.append(last_row_index)
            filtered_id_list.append(ID)
        else:
            row = first_row_index
            while row <= last_row_index:
                if X.iat[row, X.columns.get_loc('aki_stage')] == 1:
                    label_list.append(1)
                    last_charttime_list.append(X.iat[row, X.columns.get_loc('charttime')])
                    index_list.append(row)
                    filtered_id_list.append(ID)
                    break
                row += 1
        first_row_index = last_row_index + 1
        id_count += 1

    # Debugging statements
    print(f"Length of filtered_id_list: {len(filtered_id_list)}")
    print(f"Length of last_charttime_list: {len(last_charttime_list)}")
    print(f"Length of label_list: {len(label_list)}")

    if len(filtered_id_list) == len(last_charttime_list) == len(label_list):
        X = X.drop(['aki_stage'], axis=1)
        Thresholds = pd.DataFrame({'icustay_id': filtered_id_list, 'charttime': last_charttime_list, 'final_label': label_list})
        X = (Thresholds.merge(X, on='icustay_id', how='left', suffixes=('_x', '')).query("charttime_x >= charttime").reindex(columns=X.columns))
        print(X.shape)
        print(X['icustay_id'].nunique())
    else:
        print("Error: The lengths of the lists do not match.")

# Assuming label_register code follows the successful creation of Thresholds
if len(filtered_id_list) == len(label_list):
    label_register = pd.DataFrame()
    label_register['icustay_id'] = filtered_id_list
    label_register['label'] = label_list
else:
    print("Error: Unable to create label_register due to length mismatch.")

Length of filtered_id_list: 35781
Length of last_charttime_list: 35781
Length of label_list: 35781
(577930, 25)
18054


In [None]:
print(X.columns)

Index(['icustay_id', 'charttime', 'hadm_id', 'aniongap_avg', 'bicarbonate_avg',
       'bun_avg', 'chloride_avg', 'creat', 'diasbp_mean', 'glucose_avg',
       'heartrate_mean', 'hematocrit_avg', 'hemoglobin_avg', 'potassium_avg',
       'resprate_mean', 'sodium_avg', 'spo2_mean', 'sysbp_mean', 'uo_rt_12hr',
       'uo_rt_24hr', 'uo_rt_6hr', 'wbc_avg', 'sedative', 'vasopressor',
       'vent'],
      dtype='object')


# Add categorical features (details)

In [None]:
print("start preprocessing not time dependent data")
if MAX_FEATURE_SET:
    #extract datasets based on id_list
    dataset_detail = dataset_detail.loc[dataset_detail['icustay_id'].isin(id_list)]
    #sort by ascending order
    dataset_detail = dataset_detail.sort_values(by=['icustay_id'])
    subject_id = dataset_detail["subject_id"].unique()
    #transfrom categorical data to binary form
    dataset_detail = dataset_detail.join(pd.get_dummies(dataset_detail.pop('gender')))
    dataset_detail = dataset_detail.join(pd.get_dummies(dataset_detail.pop("ethnicity_grouped")))
    #dataset_detail = dataset_detail.join(pd.get_dummies(dataset_detail.pop('admission_type')))
    dataset_detail = dataset_detail.drop(['subject_id', 'hadm_id'], axis=1)
    X =  pd.merge(X, dataset_detail, on = ["icustay_id"], how= "left", copy = False)
    numeric_feat.append('age')

start preprocessing not time dependent data


In [None]:
feature_names =['Anion gap', 'Bicarbonate', 'Blood Urea Nitrogen', 'Chloride', 'Creatinine', 'Diastolic BP', 'Glucose', 'Heart rate',
            'Hematocrit', 'Hemoglobin', 'Potassium', 'Respiratory rate', 'Sodium', 'Oxygen saturation', 'Systolic BP', 'Urine output 12h', 'Urine output 24h', 'Urine output 6h',
            'White cell count', 'Sedative', 'Vasopressor', 'Ventilation', 'Age', 'Female gender', 'Male gender', 'Asian ethnicity', 'Black ethnicity', 'Hispanic ethnicity', 'Native american',
            'Other ethnicity', 'Ethnicity unknown', 'White ethnicity']



In [None]:
X

Unnamed: 0,icustay_id,charttime,hadm_id,aniongap_avg,bicarbonate_avg,bun_avg,chloride_avg,creat,diasbp_mean,glucose_avg,...,age,F,M,asian,black,hispanic,native,other,unknown,white
0,200001,2181-11-18 06:00:00,152234.0,11.0,30.0,42.0,107.0,2.5,59.000000,90.000000,...,61.071279,True,False,True,False,False,False,False,False,False
1,200001,2181-11-18 12:00:00,0.0,13.0,25.0,15.0,104.0,0.7,59.000000,111.000000,...,61.071279,True,False,True,False,False,False,False,False,False
2,200001,2181-11-18 18:00:00,0.0,13.0,25.0,15.0,104.0,0.7,59.000000,111.000000,...,61.071279,True,False,True,False,False,False,False,False,False
3,200001,2181-11-19 00:00:00,0.0,13.0,25.0,15.0,104.0,0.7,59.000000,111.000000,...,61.071279,True,False,True,False,False,False,False,False,False
4,200001,2181-11-19 06:00:00,152234.0,14.0,27.0,46.0,105.0,2.7,59.000000,74.000000,...,61.071279,True,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
577925,299818,2163-06-29 00:00:00,157997.0,10.0,26.0,18.0,109.0,1.0,47.500000,118.666667,...,72.405247,True,False,False,False,False,False,False,False,True
577926,299818,2163-06-29 06:00:00,157997.0,13.0,25.0,15.0,104.0,0.7,51.200000,122.000000,...,72.405247,True,False,False,False,False,False,False,False,True
577927,299818,2163-06-29 12:00:00,157997.0,13.0,25.0,15.0,106.0,0.7,48.333333,118.000000,...,72.405247,True,False,False,False,False,False,False,False,True
577928,299818,2163-06-29 18:00:00,157997.0,13.0,25.0,15.0,104.0,0.7,55.000000,160.000000,...,72.405247,True,False,False,False,False,False,False,False,True


In [None]:
print(X.columns)

Index(['icustay_id', 'charttime', 'hadm_id', 'aniongap_avg', 'bicarbonate_avg',
       'bun_avg', 'chloride_avg', 'creat', 'diasbp_mean', 'glucose_avg',
       'heartrate_mean', 'hematocrit_avg', 'hemoglobin_avg', 'potassium_avg',
       'resprate_mean', 'sodium_avg', 'spo2_mean', 'sysbp_mean', 'uo_rt_12hr',
       'uo_rt_24hr', 'uo_rt_6hr', 'wbc_avg', 'sedative', 'vasopressor', 'vent',
       'age', 'F', 'M', 'asian', 'black', 'hispanic', 'native', 'other',
       'unknown', 'white'],
      dtype='object')


In [None]:
X.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_init-ord.csv', index=False)
# X.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_init.csv', index=False)

#  Cap features between 0.01 / 0.99 quantile and normalisation

In [None]:
# Replace values in the 'age' column greater than 89 with 90
X['age'] = X['age'].apply(lambda x: 90.0 if x > 89 else x)

# Check the number of occurrences of the value 90
num_ninety_values = (X['age'] == 90.0).sum()

# Alternatively, using value_counts:
# num_ninety_values = X['age'].value_counts().get(90.0, 0)

# Print the result
print(f"Number of values equal to 90 in the 'age' column: {num_ninety_values}")

# Verify the changes
print(X['age'].unique())


Number of values equal to 90 in the 'age' column: 22748
[61.07127869 48.29627051 54.07230806 ... 80.46649905 70.30398583
 72.40524744]


In [None]:
# X.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_crt-age-ord.csv', index=False)
X.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_crt-age_flat.csv', index=False)

In [None]:
# X = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_crt-age-ord.csv')
X = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_crt-age_flat.csv')

In [None]:
# Define the features that need capping and normalization, including 'age'
numeric_features = [
    'aniongap_avg', 'bicarbonate_avg', 'bun_avg', 'chloride_avg', 'creat',
    'diasbp_mean', 'glucose_avg', 'heartrate_mean', 'hematocrit_avg',
    'hemoglobin_avg', 'potassium_avg', 'resprate_mean', 'sodium_avg',
    'spo2_mean', 'sysbp_mean', 'uo_rt_12hr', 'uo_rt_24hr', 'uo_rt_6hr',
    'wbc_avg', 'age'
]

# Convert binary features to 0/1 (not included in capping or normalization)
binary_features = [
    'F', 'M', 'asian', 'black', 'hispanic', 'native', 'other', 'unknown',
    'white', 'sedative', 'vasopressor', 'vent'
]
X[binary_features] = X[binary_features].astype(int)

# Function to cap numeric data
def cap_data(df):
    print(f"Capping between the {CAPPING_THRESHOLD_LOWER} and {CAPPING_THRESHOLD_UPPER} quantile")

    # Apply the clipping to numeric features only
    df[numeric_features] = df[numeric_features].clip(
        df[numeric_features].quantile(CAPPING_THRESHOLD_LOWER),
        df[numeric_features].quantile(CAPPING_THRESHOLD_UPPER),
        axis=1
    )
    return df

# Function to normalize numeric data
def normalise_data(df):
    print(f"Normalizing in [0,1] with {NORMALIZATION} normalization")

    # Apply normalization to numeric features only
    df[numeric_features] = (df[numeric_features] - df[numeric_features].min()) / \
                            (df[numeric_features].max() - df[numeric_features].min())
    return df

# Print the first few rows of the aki_stage and sedatives columns before applying capping and normalization
print("Before capping and normalization:")
print(X['aki_stage'].head())
print(X['sedative'].head())

# Apply capping
X = cap_data(X)

# Print the first few rows of the aki_stage and sedatives columns after capping to check if it was modified
print("After capping:")
print(X['aki_stage'].head())
print(X['sedative'].head())

# Apply normalization
X = normalise_data(X)

# Print the first few rows of the aki_stage and sedatives columns after normalization to check if it was modified
print("After normalization:")
print(X['aki_stage'].head())
print(X['sedative'].head())


In [None]:
X = X.sort_values(by=['icustay_id', 'charttime'])
seq_lengths = X.groupby(['icustay_id'],as_index=False).size().sort_values(by = ['size'],ascending=False)
sequence_length = seq_lengths.max() # the longest sequence per icustay-id
print(sequence_length)

icustay_id    299818
size             227
dtype: int64


In [None]:
#AL re-write as try except to make it work as hadm_id is not used if only one csv file is used and none are merged
try:
    X.drop(['hadm_id'], axis=1, inplace = True)
except:
    pass

In [None]:
features = X.shape[1]-3
print(features)

32


In [None]:
print("divide dataset into train, test and validation sets")
id_train, id_test_val = train_test_split(id_list, test_size = SPLIT_SIZE, random_state = 42) # train set is 80%)
print("train is %d" % len(id_train))
# remaining 20% split in halves as test and validation 10% and 10%
id_valid, id_test = train_test_split(id_test_val, test_size = 0.5, random_state = 42) # test 10% valid 10%
print("val and test are %d" %len(id_test))


divide dataset into train, test and validation sets
train is 37868
val and test are 4734


In [None]:
train = X[X.icustay_id.isin(id_train)].sort_values(by=['icustay_id'])
test = X[X.icustay_id.isin(id_test)].sort_values(by=['icustay_id'], ignore_index = True)
validation = X[X.icustay_id.isin(id_valid)].sort_values(by=['icustay_id'])

test = test.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)
train = train.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)
validation = validation.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)

In [None]:
test.shape

(58036, 35)

In [None]:
test['icustay_id'].nunique()

1793

In [None]:
train['icustay_id'].nunique()

14490

In [None]:
validation['icustay_id'].nunique()

1771

In [None]:
test_seq = test.groupby(['icustay_id'],as_index=False).size()

# Generate the test set before onset

In [None]:
# Check if 'aki_stage' exists in the test DataFrame
if 'aki_stage' not in test.columns:
    raise KeyError("'aki_stage' column not found in the test DataFrame")

KeyError: "'aki_stage' column not found in the test DataFrame"

In [None]:
Z = test.copy(deep = True)
test = test.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)
id_test.sort()


index_list = []
label_list = []

first_row_index = 0
id_count = 0
seq_length = Z.groupby(['icustay_id'], as_index=False).size()

# Filter id_test to only include IDs present in Z (i.e., in seq_length)
id_test_filtered = [ID for ID in id_test if ID in seq_length['icustay_id'].values]

# Loop through the filtered id_test
for ID in id_test_filtered:
    # Get the number of rows for the current ID in Z
    last_row_index = first_row_index + seq_length[seq_length['icustay_id'] == ID].iloc[0, 1] - 1

    # Extract the aki_stage column for the current ID
    aki_stages = Z.loc[Z['icustay_id'] == ID, 'aki_stage'].values

    # Default to the last index with the maximum AKI stage (3), if present
    if 3 in aki_stages:
        label_list.append(3)
        index_list.append(np.argmax(aki_stages == 3) + first_row_index)
    elif 2 in aki_stages:
        label_list.append(2)
        index_list.append(np.argmax(aki_stages == 2) + first_row_index)
    elif 1 in aki_stages:
        label_list.append(1)
        index_list.append(np.argmax(aki_stages == 1) + first_row_index)
    else:
        label_list.append(0)
        index_list.append(last_row_index)

    # Update the first_row_index for the next iteration
    first_row_index = last_row_index + 1
    id_count += 1

# Now index_list and label_list contain the results of the processed IDs
print("Processed index_list:", index_list)
print("Processed label_list:", label_list)

print(f"Length of id_test: {len(id_test)}")
print(f"Length of seq_length: {len(seq_length)}")

# Get the number of elements in index_list
num_elements_index_list = len(index_list)

# Get the number of elements in label_list
num_elements_label_list = len(label_list)

# Print the results
print(f"Number of elements in index_list: {num_elements_index_list}")
print(f"Number of elements in label_list: {num_elements_label_list}")


In [None]:
Z = test.copy(deep = True)
test = test.sort_values(by=['icustay_id', 'charttime'], ignore_index = True)
id_test.sort()
seq_length = Z.groupby(['icustay_id'],as_index=False).size()
#last_charttime_list= []

# Step 1: Filter id_test to keep only IDs that exist in seq_length
id_test_filtered = [ID for ID in id_test if ID in seq_length['icustay_id'].values]

# Step 2: Reinitialize the variables
index_list = []
label_list = []

first_row_index = 0
id_count = 0

# Step 3: Run the loop with the filtered id_test
for ID in id_test_filtered:
    last_row_index = first_row_index + seq_length.iloc[id_count, 1] - 1
    a = Z.loc[Z['icustay_id'] == ID].aki_stage

    if 1 not in a.values:
        label_list.append(0)
        index_list.append(last_row_index)
    elif 1 in a.values:
        row = first_row_index
        while row != last_row_index + 1:
            if Z.iat[row, Z.columns.get_loc('aki_stage')] == 0:
                row += 1
            elif Z.iat[row, Z.columns.get_loc('aki_stage')] == 1:
                index_list.append(row)
                break

    first_row_index = last_row_index + 1
    id_count += 1

# Proceed with the rest of your analysis


AttributeError: 'DataFrame' object has no attribute 'aki_stage'

In [None]:
# Merge label_register with test data to get the labels
Z = test.merge(label_register, on='icustay_id', how='left')

# Sort test data if needed
Z = Z.sort_values(by=['icustay_id', 'charttime'], ignore_index=True)
id_test.sort()
seq_length = Z.groupby(['icustay_id'], as_index=False).size()

# Filter id_test to keep only IDs that exist in seq_length
id_test_filtered = [ID for ID in id_test if ID in seq_length['icustay_id'].values]

# Reset label_list and index_list
label_list = []
index_list = []

# Ensure the first_row_index is reset before the loop
first_row_index = 0
id_count = 0

for ID in filtered_id_list:
    # Ensure the ID exists in seq_length before attempting to access it
    filtered_seq_length = seq_length[seq_length['icustay_id'] == ID]

    if filtered_seq_length.empty:
        print(f"Warning: ID {ID} not found in seq_length.")
        continue  # Skip this ID if it's not found

    last_row_index = first_row_index + filtered_seq_length.iloc[0, 1] - 1

    # Use the 'label' from the label_register directly since 'aki_stage' is dropped
    label = label_register.loc[label_register['icustay_id'] == ID, 'label'].values[0]
    label_list.append(label)
    index_list.append(last_row_index if label == 0 else first_row_index)

    first_row_index = last_row_index + 1
    id_count += 1

# After the loop, check the lengths again
print(f"New Length of filtered_id_list: {len(filtered_id_list)}")
print(f"New Length of label_list: {len(label_list)}")

# Check if they now match
if len(filtered_id_list) != len(label_list):
    print("Error: Lengths still do not match.")
else:
    print("Lengths match. Proceeding with further processing.")



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
New Length of filtered_id_list: 35781
New Length of label_list: 1793
Error: Lengths still do not match.


In [None]:
print(f"Length of id_test: {len(id_test)}")
print(f"Length of seq_length: {len(seq_length)}")


Length of id_test: 4734
Length of seq_length: 1793


In [None]:
# Get the number of elements in index_list
num_elements_index_list = len(index_list)

# Get the number of elements in label_list
num_elements_label_list = len(label_list)

# Print the results
print(f"Number of elements in index_list: {num_elements_index_list}")
print(f"Number of elements in label_list: {num_elements_label_list}")


Number of elements in index_list: 1793
Number of elements in label_list: 1363


In [None]:
test.drop(['charttime'], axis=1, inplace = True)
train.drop(['charttime'], axis=1, inplace = True)
validation.drop(['charttime'], axis=1, inplace = True)

In [None]:
np.version.version

'1.26.4'

In [None]:
# Group by 'icustay_id' and convert each group to a 2D array
train_sequences = [group.to_numpy() for _, group in train.groupby('icustay_id')]
test_sequences = [group.to_numpy() for _, group in test.groupby('icustay_id')]
validation_sequences = [group.to_numpy() for _, group in validation.groupby('icustay_id')]

# The result is a list of 2D numpy arrays, where each array corresponds to a sequence for a specific icustay_id.

# Example: Print the number of sequences and the shape of the first sequence in each set
print(f"Number of training sequences: {len(train_sequences)}")
print(f"Shape of the first training sequence: {train_sequences[0].shape}")

print(f"Number of test sequences: {len(test_sequences)}")
print(f"Shape of the first test sequence: {test_sequences[0].shape}")

print(f"Number of validation sequences: {len(validation_sequences)}")
print(f"Shape of the first validation sequence: {validation_sequences[0].shape}")


Number of training sequences: 14490
Shape of the first training sequence: (28, 33)
Number of test sequences: 1793
Shape of the first test sequence: (71, 33)
Number of validation sequences: 1771
Shape of the first validation sequence: (17, 33)


In [None]:
print(train.columns)

Index(['icustay_id', 'aniongap_avg', 'bicarbonate_avg', 'bun_avg',
       'chloride_avg', 'creat', 'diasbp_mean', 'glucose_avg', 'heartrate_mean',
       'hematocrit_avg', 'hemoglobin_avg', 'potassium_avg', 'resprate_mean',
       'sodium_avg', 'spo2_mean', 'sysbp_mean', 'uo_rt_12hr', 'uo_rt_24hr',
       'uo_rt_6hr', 'wbc_avg', 'sedative', 'vasopressor', 'vent', 'age', 'F',
       'M', 'asian', 'black', 'hispanic', 'native', 'other', 'unknown',
       'white'],
      dtype='object')


In [None]:
print(validation.shape)

(58966, 33)


In [None]:
def pad_sequences(sequences, fill_value):
    max_length = max(seq.shape[0] for seq in sequences)
    padded_sequences = []
    for seq in sequences:
        padded_seq = np.pad(seq, ((0, max_length - seq.shape[0]), (0, 0)), mode='constant', constant_values=fill_value)
        padded_sequences.append(padded_seq)
    return np.array(padded_sequences)

FILL_VALUE = 0  # Define the fill value for padding
print("Pad the sequences to ensure uniform length")
X_train_padded = pad_sequences(train_sequences, FILL_VALUE)
X_test_padded = pad_sequences(test_sequences, FILL_VALUE)
X_validation_padded = pad_sequences(validation_sequences, FILL_VALUE)

print(f"Padded training data shape: {X_train_padded.shape}")
print(f"Padded test data shape: {X_test_padded.shape}")
print(f"Padded validation data shape: {X_validation_padded.shape}")


Pad the sequences to ensure uniform length
Padded training data shape: (14490, 227, 33)
Padded test data shape: (1793, 117, 33)
Padded validation data shape: (1771, 117, 33)


In [None]:
print("Flatten the 3D arrays back to 2D arrays")
nsamples, nx, ny = X_train_padded.shape
X_train_flattened = X_train_padded.reshape((nsamples, nx * ny))
print(f"Flattened training data shape: {X_train_flattened.shape}")

nsamples, nx, ny = X_test_padded.shape
X_test_flattened = X_test_padded.reshape((nsamples, nx * ny))
print(f"Flattened test data shape: {X_test_flattened.shape}")

nsamples, nx, ny = X_validation_padded.shape
X_validation_flattened = X_validation_padded.reshape((nsamples, nx * ny))
print(f"Flattened validation data shape: {X_validation_flattened.shape}")

Flatten the 3D arrays back to 2D arrays
Flattened training data shape: (14490, 7491)
Flattened test data shape: (1793, 3861)
Flattened validation data shape: (1771, 3861)


In [None]:
# Check the lengths of filtered_id_list and label_list
print(f"Length of filtered_id_list: {len(filtered_id_list)}")
print(f"Length of label_list: {len(label_list)}")

# If they do not match, identify where the issue might be
if len(filtered_id_list) != len(label_list):
    print("Error: The lengths of filtered_id_list and label_list do not match.")

    # Optional: Print out the differences or examine the lists
    for i, (id, label) in enumerate(zip(filtered_id_list, label_list)):
        print(f"Index {i}: ID = {id}, Label = {label}")

    # Further debugging steps could include checking unique counts or values in each list
    unique_ids = set(filtered_id_list)
    unique_labels = set(label_list)
    print(f"Unique IDs: {len(unique_ids)}")
    print(f"Unique Labels: {len(unique_labels)}")

    # Exit the code if there is a mismatch to prevent further errors
    raise ValueError("Cannot proceed due to mismatched lengths.")
else:
    print("No length mismatch found.")

# Proceed to create the DataFrame if lengths match
label_register = pd.DataFrame({'icustay_id': filtered_id_list, 'label': label_list})

# Now, merge labels with the original train, test, and validation sets
y_train = label_register.loc[label_register['icustay_id'].isin(id_train), 'label'].values
y_test = label_register.loc[label_register['icustay_id'].isin(id_test), 'label'].values
y_validation = label_register.loc[label_register['icustay_id'].isin(id_valid), 'label'].values

# Convert y values to DataFrames if needed
y_train_df = pd.DataFrame(y_train, columns=['aki_stage'])
y_test_df = pd.DataFrame(y_test, columns=['aki_stage'])
y_validation_df = pd.DataFrame(y_validation, columns=['aki_stage'])


Length of filtered_id_list: 35781
Length of label_list: 1363
Error: The lengths of filtered_id_list and label_list do not match.
Index 0: ID = 200001, Label = 0
Index 1: ID = 200003, Label = 0
Index 2: ID = 200006, Label = 0
Index 3: ID = 200007, Label = 0
Index 4: ID = 200009, Label = 0
Index 5: ID = 200014, Label = 0
Index 6: ID = 200019, Label = 0
Index 7: ID = 200024, Label = 0
Index 8: ID = 200025, Label = 0
Index 9: ID = 200028, Label = 0
Index 10: ID = 200030, Label = 0
Index 11: ID = 200033, Label = 0
Index 12: ID = 200034, Label = 0
Index 13: ID = 200035, Label = 0
Index 14: ID = 200036, Label = 0
Index 15: ID = 200038, Label = 0
Index 16: ID = 200039, Label = 0
Index 17: ID = 200041, Label = 0
Index 18: ID = 200045, Label = 0
Index 19: ID = 200047, Label = 0
Index 20: ID = 200049, Label = 0
Index 21: ID = 200052, Label = 0
Index 22: ID = 200053, Label = 0
Index 23: ID = 200055, Label = 0
Index 24: ID = 200059, Label = 0
Index 25: ID = 200061, Label = 0
Index 26: ID = 200062, 

ValueError: Cannot proceed due to mismatched lengths.

In [None]:
# Aggregating each sequence by taking the mean across the time dimension
# Ensure that features like 'icustay_id', 'aki_stage', 'age', and binary features (F, M, ethnicity columns) are preserved and not averaged
aggregated_train_data = []
aggregated_test_data = []
aggregated_validation_data = []

# Define columns to preserve (not average)
preserve_columns = ['icustay_id', 'aki_stage', 'F', 'M', 'asian', 'black',
                    'hispanic', 'native', 'other', 'unknown', 'white', 'sedative', 'vasopressor', 'vent' ]

# Get the indices of columns to preserve
preserve_indices = [train.columns.get_loc(col) for col in preserve_columns]

for seq in train_sequences:
    # Preserve the first occurrence of categorical and identifier columns
    preserved_values = seq[0, preserve_indices]
    # Aggregate the rest by taking the mean
    aggregated_values = np.mean(np.delete(seq, preserve_indices, axis=1), axis=0)
    # Combine preserved values and aggregated values
    combined_values = np.concatenate([preserved_values, aggregated_values])
    aggregated_train_data.append(combined_values)

for seq in test_sequences:
    preserved_values = seq[0, preserve_indices]
    aggregated_values = np.mean(np.delete(seq, preserve_indices, axis=1), axis=0)
    combined_values = np.concatenate([preserved_values, aggregated_values])
    aggregated_test_data.append(combined_values)

for seq in validation_sequences:
    preserved_values = seq[0, preserve_indices]
    aggregated_values = np.mean(np.delete(seq, preserve_indices, axis=1), axis=0)
    combined_values = np.concatenate([preserved_values, aggregated_values])
    aggregated_validation_data.append(combined_values)

# Convert lists to numpy arrays
X_train_agg = np.array(aggregated_train_data)
X_test_agg = np.array(aggregated_test_data)
X_validation_agg = np.array(aggregated_validation_data)

# Print the shape of the aggregated arrays
print(f"Shape of aggregated training data: {X_train_agg.shape}")
print(f"Shape of aggregated test data: {X_test_agg.shape}")
print(f"Shape of aggregated validation data: {X_validation_agg.shape}")

Shape of aggregated training data: (37802, 34)
Shape of aggregated test data: (4724, 34)
Shape of aggregated validation data: (4724, 34)


In [None]:
print(feature_names)

['Anion gap', 'Bicarbonate', 'Blood Urea Nitrogen', 'Chloride', 'Creatinine', 'Diastolic BP', 'Glucose', 'Heart rate', 'Hematocrit', 'Hemoglobin', 'Potassium', 'Respiratory rate', 'Sodium', 'Oxygen saturation', 'Systolic BP', 'Urine output 12h', 'Urine output 24h', 'Urine output 6h', 'White cell count', 'Sedative', 'Vasopressor', 'Ventilation', 'Age', 'Female gender', 'Male gender', 'Asian ethnicity', 'Black ethnicity', 'Hispanic ethnicity', 'Native american', 'Other ethnicity', 'Ethnicity unknown', 'White ethnicity']


In [None]:
import pandas as pd
import numpy as np

# Assuming X_train_agg, X_test_agg, X_validation_agg, y_train, y_test, and y_validation are already defined
# and that X_train_agg, X_test_agg, X_validation_agg are numpy arrays
# and that y_train, y_test, y_validation are also numpy arrays

# Step 4: Define feature names including 'icustay_id' and the preserved columns
feature_names_with_id = [
    'icustay_id',         # Identifier
    'aki_stage',          # Target label
    'F', 'M',             # Gender binary features
    'asian', 'black', 'hispanic', 'native', 'other', 'unknown', 'white',  # Ethnicity binary features
    'sedative', 'vasopressor', 'vent',  # Categorical medical interventions
    'aniongap_avg', 'bicarbonate_avg', 'bun_avg', 'chloride_avg', 'creat',
    'diasbp_mean', 'glucose_avg', 'heartrate_mean', 'hematocrit_avg',
    'hemoglobin_avg', 'potassium_avg', 'resprate_mean', 'sodium_avg',
    'spo2_mean', 'sysbp_mean', 'uo_rt_12hr', 'uo_rt_24hr', 'uo_rt_6hr',
    'wbc_avg', 'age'  # Numeric features
]

# Convert the numpy arrays back to pandas DataFrames
X_train = pd.DataFrame(X_train_agg, columns=feature_names_with_id)
X_test = pd.DataFrame(X_test_agg, columns=feature_names_with_id)
X_validation = pd.DataFrame(X_validation_agg, columns=feature_names_with_id)

# Remove 'icustay_id' from the DataFrames
X_train = X_train.drop(['icustay_id'], axis=1)
X_test = X_test.drop(['icustay_id'], axis=1)
X_validation = X_validation.drop(['icustay_id'], axis=1)

# Save the combined dataset with 'icustay_id' to 'final_dataset.csv'
final_dataset = pd.concat([X_train, X_test, X_validation], ignore_index=True)
final_dataset.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/final_dataset_ord.csv', index=False)
# final_dataset.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/final_dataset_final.csv', index=False)
# final_dataset.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/final_dataset_raw.csv', index=False)


# Explicitly identify the index of the 'aki_stage' column
aki_stage_index = list(train.columns).index('aki_stage')

# Extract the labels from the correct 'aki_stage' column
y_train = X_train_agg[:, aki_stage_index]
y_test = X_test_agg[:, aki_stage_index]
y_validation = X_validation_agg[:, aki_stage_index]

# Convert the labels to DataFrames
y_train = pd.DataFrame(y_train.reshape(-1, 1), columns=['aki_stage'])
y_test = pd.DataFrame(y_test.reshape(-1, 1), columns=['aki_stage'])
y_validation = pd.DataFrame(y_validation.reshape(-1, 1), columns=['aki_stage'])

# Remove 'icustay_id' from the DataFrames
X_train = X_train.drop(['aki_stage'], axis=1)
X_test = X_test.drop(['aki_stage'], axis=1)
X_validation = X_validation.drop(['aki_stage'], axis=1)

(X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_validation.shape, y_validation.shape)


((37802, 32), (37802, 1), (4724, 32), (4724, 1), (4724, 32), (4724, 1))

In [None]:
import pandas as pd
import numpy as np

# Assuming X_train_flattened, X_test_flattened, X_validation_flattened are the flattened data arrays
# and y_train, y_test, y_validation contain the aki_stage values

# Define the original column names including 'icustay_id' and 'aki_stage'
feature_names_with_id = [
    'icustay_id',         # Identifier
    'aki_stage',          # Target label
    'F', 'M',             # Gender binary features
    'asian', 'black', 'hispanic', 'native', 'other', 'unknown', 'white',  # Ethnicity binary features
    'sedative', 'vasopressor', 'vent',  # Categorical medical interventions
    'aniongap_avg', 'bicarbonate_avg', 'bun_avg', 'chloride_avg', 'creat',
    'diasbp_mean', 'glucose_avg', 'heartrate_mean', 'hematocrit_avg',
    'hemoglobin_avg', 'potassium_avg', 'resprate_mean', 'sodium_avg',
    'spo2_mean', 'sysbp_mean', 'uo_rt_12hr', 'uo_rt_24hr', 'uo_rt_6hr',
    'wbc_avg', 'age'  # Numeric features
]

# Step 1: Convert the flattened numpy arrays back to pandas DataFrames
# Ensure all columns, including 'aki_stage', are preserved in the order
X_train_df = pd.DataFrame(X_train_flattened, columns=feature_names_with_id[1:])  # Skip 'icustay_id'
X_test_df = pd.DataFrame(X_test_flattened, columns=feature_names_with_id[1:])    # Skip 'icustay_id'
X_validation_df = pd.DataFrame(X_validation_flattened, columns=feature_names_with_id[1:])  # Skip 'icustay_id'

# Step 2: Merge the DataFrames back with the 'icustay_id' column and the 'aki_stage' column
# Assuming you have the 'icustay_id' and 'aki_stage' from your y variables, merge them back
final_dataset_train = pd.concat([pd.DataFrame(X_train_flattened, columns=feature_names_with_id[1:]), pd.DataFrame(y_train, columns=['aki_stage'])], axis=1)
final_dataset_test = pd.concat([pd.DataFrame(X_test_flattened, columns=feature_names_with_id[1:]), pd.DataFrame(y_test, columns=['aki_stage'])], axis=1)
final_dataset_validation = pd.concat([pd.DataFrame(X_validation_flattened, columns=feature_names_with_id[1:]), pd.DataFrame(y_validation, columns=['aki_stage'])], axis=1)

# Combine the DataFrames if needed
final_dataset = pd.concat([final_dataset_train, final_dataset_test, final_dataset_validation], ignore_index=True)

# Save the final dataset to CSV, preserving the 'aki_stage' column
final_dataset.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/final_dataset-raw_flat.csv', index=False)

# Step 3: Use the `aki_stage` column from the dataset as y variables for further processing

# Extract the labels from the correct 'aki_stage' column
y_train = final_dataset_train['aki_stage'].values
y_test = final_dataset_test['aki_stage'].values
y_validation = final_dataset_validation['aki_stage'].values

# Remove 'aki_stage' from feature datasets
X_train_final = final_dataset_train.drop(['aki_stage'], axis=1)
X_test_final = final_dataset_test.drop(['aki_stage'], axis=1)
X_validation_final = final_dataset_validation.drop(['aki_stage'], axis=1)

(X_train_final.shape, y_train.shape, X_test_final.shape, y_test.shape, X_validation_final.shape, y_validation.shape)


In [None]:
print(list(train.columns))

['icustay_id', 'aki_stage', 'aniongap_avg', 'bicarbonate_avg', 'bun_avg', 'chloride_avg', 'creat', 'diasbp_mean', 'glucose_avg', 'heartrate_mean', 'hematocrit_avg', 'hemoglobin_avg', 'potassium_avg', 'resprate_mean', 'sodium_avg', 'spo2_mean', 'sysbp_mean', 'uo_rt_12hr', 'uo_rt_24hr', 'uo_rt_6hr', 'wbc_avg', 'sedative', 'vasopressor', 'vent', 'age', 'F', 'M', 'asian', 'black', 'hispanic', 'native', 'other', 'unknown', 'white']


In [None]:
print(list(X_train.columns))

['F', 'M', 'asian', 'black', 'hispanic', 'native', 'other', 'unknown', 'white', 'sedative', 'vasopressor', 'vent', 'aniongap_avg', 'bicarbonate_avg', 'bun_avg', 'chloride_avg', 'creat', 'diasbp_mean', 'glucose_avg', 'heartrate_mean', 'hematocrit_avg', 'hemoglobin_avg', 'potassium_avg', 'resprate_mean', 'sodium_avg', 'spo2_mean', 'sysbp_mean', 'uo_rt_12hr', 'uo_rt_24hr', 'uo_rt_6hr', 'wbc_avg', 'age']


In [None]:
# Explicitly identify the index of the 'aki_stage' column
aki_stage_index = list(train.columns).index('aki_stage')
print(aki_stage_index)

1


In [None]:
print(list(X_train.shape))

[37802, 32]


In [None]:
print(y_test)

     aki_stage
0          0.0
1          3.0
2          0.0
3          0.0
4          0.0
...        ...
4719       0.0
4720       2.0
4721       0.0
4722       0.0
4723       0.0

[4724 rows x 1 columns]


In [None]:
# Save the X and y variables to CSV files
X_train.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_train-raw.csv', index=False)
X_test.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_test-raw.csv', index=False)
X_validation.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_validation-raw.csv', index=False)

y_train.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_train-raw.csv', index=False)
y_test.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_test-raw.csv', index=False)
y_validation.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_validation-raw.csv', index=False)

print("X and y variables saved to CSV files.")

# Load the variables from the CSV files for modeling
X_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_train-raw.csv')
X_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_test-raw.csv')
X_validation = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_validation-raw.csv')

y_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_train-raw.csv')
y_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_test-raw.csv')
y_validation = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_validation-raw.csv')

print("X and y variables loaded from CSV files.")

# Check that the loaded data matches the original
print(f"Loaded X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"Loaded X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
print(f"Loaded X_validation shape: {X_validation.shape}, y_validation shape: {y_validation.shape}")

In [None]:
# Save the X and y variables to CSV files
X_train.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_train-final.csv', index=False)
X_test.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_test-final.csv', index=False)
X_validation.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_validation-final.csv', index=False)

y_train.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_train-final.csv', index=False)
y_test.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_test-final.csv', index=False)
y_validation.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_validation-final.csv', index=False)

print("X and y variables saved to CSV files.")

# Load the variables from the CSV files for modeling
X_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_train-final.csv')
X_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_test-final.csv')
X_validation = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_validation-final.csv')

y_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_train-final.csv')
y_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_test-final.csv')
y_validation = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_validation-final.csv')

print("X and y variables loaded from CSV files.")

# Check that the loaded data matches the original
print(f"Loaded X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"Loaded X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
print(f"Loaded X_validation shape: {X_validation.shape}, y_validation shape: {y_validation.shape}")

X and y variables saved to CSV files.
X and y variables loaded from CSV files.
Loaded X_train shape: (37802, 32), y_train shape: (37802, 1)
Loaded X_test shape: (4724, 32), y_test shape: (4724, 1)
Loaded X_validation shape: (4724, 32), y_validation shape: (4724, 1)


In [None]:
# Save the X and y variables to CSV files
X_train.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_train-ord.csv', index=False)
X_test.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_test-ord.csv', index=False)
X_validation.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_validation-ord.csv', index=False)

y_train.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_train-ord.csv', index=False)
y_test.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_test-ord.csv', index=False)
y_validation.to_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_validation-ord.csv', index=False)

print("X and y variables saved to CSV files.")

# Load the variables from the CSV files for modeling
X_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_train-ord.csv')
X_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_test-ord.csv')
X_validation = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/X_validation-ord.csv')

y_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_train-ord.csv')
y_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_test-ord.csv')
y_validation = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/AKI/final/input_data/y_validation-ord.csv')

print("X and y variables loaded from CSV files.")

# Check that the loaded data matches the original
print(f"Loaded X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"Loaded X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
print(f"Loaded X_validation shape: {X_validation.shape}, y_validation shape: {y_validation.shape}")

X and y variables saved to CSV files.
X and y variables loaded from CSV files.
Loaded X_train shape: (37802, 32), y_train shape: (37802, 1)
Loaded X_test shape: (4724, 32), y_test shape: (4724, 1)
Loaded X_validation shape: (4724, 32), y_validation shape: (4724, 1)
