# Config

In [None]:
# Input CSV (on the mounted Google Drive) that is read with pd.read_csv further down.
file_path = "/content/drive/MyDrive/mimic-iii-clinicalnote-v4.1/bq-results-20231215-074521-1702626519432.csv"
# Folder that receives the processed train/validation/test CSV files at the end of the notebook.
output_folder_path = "/content/drive/MyDrive/mimic-iii-clinicalnote-v4.1/processed_set"

In [None]:
# Mount Google Drive in the Colab runtime so that the /content/drive/... paths resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import packages and load the data

In [None]:
import numpy as np
import torch
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import nltk
from tqdm import tqdm
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the whole export into memory; the frame is displayed below as a quick visual check.
data = pd.read_csv(file_path)

data

Unnamed: 0,subject_id,hadm_id,icustay_id,slice_start,slice_end,first_AKI_time,concatenated_notes,avg_pCO2,avg_Glucose,avg_Platelet_Count,...,avg_Respiratory_rate,avg_Heartrate,avg_Arterial_blood_pressure_mean,avg_Arterial_blood_pressure_systolic,avg_Central_venous_pressure,avg_Arterial_blood_pressure_diastolic,avg_Tidal_volume_obs,avg_Temp_C,avg_Height_cm,AKI_Status
0,47271,116612,275725,2122-04-01 00:47:34,2122-04-01 02:47:34,2122-04-01 09:00:00,Chief Complaint: Myasthenia [**Last Name (un)...,,,,...,19.0,87.333333,,,,,751.0,36.000000,,Before AKI
1,47271,116612,275725,2122-04-01 02:47:34,2122-04-01 04:47:34,2122-04-01 09:00:00,"Airway, Inability to Protect (Risk for Aspirat...",,143.0,161.0,...,22.0,97.000000,,,,,750.0,36.166667,,Before AKI
2,47271,116612,275725,2122-04-01 04:47:34,2122-04-01 06:47:34,2122-04-01 09:00:00,Uo decreased to 25 cc total for 2 hours\n tx w...,42.0,,,...,20.0,91.000000,,,,,,,,Before AKI
3,47271,116612,275725,2122-04-01 06:47:34,2122-04-01 08:47:34,2122-04-01 09:00:00,,,,,...,21.5,92.000000,,,,,712.0,,175.13,Before AKI
4,47271,116612,275725,2122-04-01 08:47:34,2122-04-01 10:47:34,2122-04-01 09:00:00,Chief Complaint: Respiratory Failure\n I saw...,,,,...,22.0,100.500000,,,,,449.0,36.888889,,After AKI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527191,22624,110384,225423,2118-10-11 11:46:00,2118-10-11 13:46:00,2118-10-11 03:00:00,,,113.0,,...,17.0,80.333333,,,,,,36.777778,,After AKI
527192,22624,110384,225423,2118-10-11 13:46:00,2118-10-11 15:46:00,2118-10-11 03:00:00,,,,,...,16.5,87.000000,,,,,,,,After AKI
527193,22624,110384,225423,2118-10-11 15:46:00,2118-10-11 17:46:00,2118-10-11 03:00:00,"Pneumonia, probable aspiration\n Assessment:...",,,,...,17.0,98.500000,,,,,,37.111111,,After AKI
527194,22624,110384,225423,2118-10-11 17:46:00,2118-10-11 19:46:00,2118-10-11 03:00:00,,,,,...,16.5,96.500000,,,,,,,,After AKI


In [None]:
# Number of distinct ICU stays in the raw export.
len(data['icustay_id'].unique())

43933

# Remove patients who develop AKI within the first 12 hours

In [None]:
def keep_sixth_row(df):
    """Return the sixth row (position 5) of ``df``, or ``None`` when it has fewer than six rows."""
    has_sixth_row = len(df) >= 6
    return df.iloc[5] if has_sixth_row else None

# Order the slices chronologically, then take the sixth slice of every ICU stay
# (keep_sixth_row returns None for stays with fewer than six slices).
# NOTE(review): despite its name, this variable holds the *sixth* row per stay, not the first.
# NOTE(review): the displayed slices look 2 hours long, so the sixth slice would end about
# 12 hours after the first slice starts — confirm against the export query.
unique_id_keep_first_row = data.sort_values(by="slice_start", ascending=True)
unique_id_keep_first_row = unique_id_keep_first_row.groupby('icustay_id').apply(keep_sixth_row).reset_index(drop=True)

In [None]:
# Number of ICU stays for which a sixth slice was returned.
len(unique_id_keep_first_row)

43933

In [None]:
# Distribution of AKI_Status at the sixth slice of each stay.
unique_id_keep_first_row['AKI_Status'].value_counts()

non-AKI       17801
Before AKI    16756
After AKI      9376
Name: AKI_Status, dtype: int64

In [None]:
# Drop the stays whose sixth slice is already labelled 'After AKI'.
unique_id_keep_first_row = unique_id_keep_first_row[unique_id_keep_first_row['AKI_Status']!='After AKI']
# Confirm that only 'non-AKI' and 'Before AKI' remain.
unique_id_keep_first_row["AKI_Status"].value_counts()

non-AKI       17801
Before AKI    16756
Name: AKI_Status, dtype: int64

In [None]:
# Restrict the slice-level data to the ICU stays that were kept.
# Note: this rebinds `data`, so the unfiltered frame is no longer available afterwards.
data = data[data['icustay_id'].isin(unique_id_keep_first_row['icustay_id'])]
data

Unnamed: 0,subject_id,hadm_id,icustay_id,slice_start,slice_end,first_AKI_time,concatenated_notes,avg_pCO2,avg_Glucose,avg_Platelet_Count,...,avg_Respiratory_rate,avg_Heartrate,avg_Arterial_blood_pressure_mean,avg_Arterial_blood_pressure_systolic,avg_Central_venous_pressure,avg_Arterial_blood_pressure_diastolic,avg_Tidal_volume_obs,avg_Temp_C,avg_Height_cm,AKI_Status
12,47271,160335,220894,2122-04-27 00:48:12,2122-04-27 02:48:12,,,,,,...,19.0,64.000000,,,,,,36.388889,,non-AKI
13,47271,160335,220894,2122-04-27 02:48:12,2122-04-27 04:48:12,,PMH: Ms. [**Known lastname 6512**] is a 65 ye...,,,,...,17.0,56.666667,,,,,,,,non-AKI
14,47271,160335,220894,2122-04-27 04:48:12,2122-04-27 06:48:12,,,,93.0,172.0,...,18.0,67.000000,,,,,,36.111111,,non-AKI
15,47271,160335,220894,2122-04-27 06:48:12,2122-04-27 08:48:12,,Chief Complaint:\n 24 Hour Events:\n No ev...,,,,...,25.5,64.500000,,,,,,35.944444,,non-AKI
16,47271,160335,220894,2122-04-27 08:48:12,2122-04-27 10:48:12,,Chief Complaint: Generalized weakness\n HPI...,,,,...,22.5,75.500000,,,,,,,,non-AKI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
527179,22623,185466,244142,2193-07-04 04:40:08,2193-07-04 06:40:08,2193-07-04 10:00:00,,,,,...,,,,,,,,,,Before AKI
527180,22623,185466,244142,2193-07-04 06:40:08,2193-07-04 08:40:08,2193-07-04 10:00:00,,,,,...,,,,,,,,,,Before AKI
527181,22623,185466,244142,2193-07-04 08:40:08,2193-07-04 10:40:08,2193-07-04 10:00:00,,,,,...,,,,,,,,,,After AKI
527182,22623,185466,244142,2193-07-04 10:40:08,2193-07-04 12:40:08,2193-07-04 10:00:00,,,,,...,,,,,,,,,,After AKI


# Censor clinical notes and structured data after AKI has happened

In [None]:
def update_notes_for_AKI_status(df):
    """
    Replace note text with placeholder tokens where no usable note is available.

    Rows whose 'AKI_Status' equals 'After AKI' get 'concatenated_notes' set to
    'censoring'. Rows whose note is still null after that step get 'missing'.
    The input DataFrame is not modified.

    :param df: pandas DataFrame containing 'AKI_Status' and 'concatenated_notes' columns.
    :return: copy of ``df`` with updated 'concatenated_notes' values.
    """
    result = df.copy()

    # Censor first, so that notes written after AKI onset are never marked 'missing'.
    after_aki = result['AKI_Status'] == 'After AKI'
    notes = result['concatenated_notes'].mask(after_aki, 'censoring')

    # Whatever is still null had no note at all.
    result['concatenated_notes'] = notes.fillna('missing')

    return result

def update_features_for_AKI_status(df, structured_features=None):
    """
    Censor structured measurements on rows recorded after AKI onset.

    Every selected feature column is set to NaN on rows whose 'AKI_Status'
    equals 'After AKI'. The input DataFrame is not modified.

    :param df: pandas DataFrame containing an 'AKI_Status' column and the feature columns.
    :param structured_features: optional iterable of column names to censor.
        Defaults to all fifteen 'avg_*' measurement columns of the dataset.
        Names that are not columns of ``df`` are skipped.
    :return: copy of ``df`` with the selected features censored.
    """
    if structured_features is None:
        # The lab values (potassium, total CO2, urea nitrogen) must be censored too,
        # otherwise post-AKI measurements leak into the features.
        structured_features = [
            'avg_pCO2', 'avg_Glucose', 'avg_Platelet_Count',
            'avg_Potassium', 'avg_Calculated_total_CO2', 'avg_Urea_nitrogen',
            'avg_Respiratory_rate', 'avg_Heartrate',
            'avg_Arterial_blood_pressure_mean',
            'avg_Arterial_blood_pressure_systolic', 'avg_Central_venous_pressure',
            'avg_Arterial_blood_pressure_diastolic', 'avg_Tidal_volume_obs',
            'avg_Temp_C', 'avg_Height_cm'
        ]

    df_copy = df.copy()

    # Only touch columns that exist, so that no new all-NaN columns are created.
    columns_to_censor = [col for col in structured_features if col in df_copy.columns]

    if columns_to_censor:
        # Set all selected features to NaN where 'AKI_Status' is 'After AKI'
        df_copy.loc[df_copy['AKI_Status'] == 'After AKI', columns_to_censor] = np.nan

    return df_copy


# Apply both censoring steps; `data` itself is left untouched because each helper works on a copy.
data_censor = update_notes_for_AKI_status(data)
data_censor = update_features_for_AKI_status(data_censor)

In [None]:
# Spot-check one ICU stay before censoring.
data[data['icustay_id']==210867]

Unnamed: 0,subject_id,hadm_id,icustay_id,slice_start,slice_end,first_AKI_time,concatenated_notes,avg_pCO2,avg_Glucose,avg_Platelet_Count,...,avg_Respiratory_rate,avg_Heartrate,avg_Arterial_blood_pressure_mean,avg_Arterial_blood_pressure_systolic,avg_Central_venous_pressure,avg_Arterial_blood_pressure_diastolic,avg_Tidal_volume_obs,avg_Temp_C,avg_Height_cm,AKI_Status
90665,1611,168370,210867,2109-04-17 20:47:29,2109-04-17 22:47:29,,,,,,...,,,,,,,,,,non-AKI
90666,1611,168370,210867,2109-04-17 22:47:29,2109-04-18 00:47:29,,,,69.0,143.0,...,,,,,,,,,,non-AKI
90667,1611,168370,210867,2109-04-18 00:47:29,2109-04-18 02:47:29,,,,,,...,,,,,,,,,,non-AKI
90668,1611,168370,210867,2109-04-18 02:47:29,2109-04-18 04:47:29,,\n\n\n\n\n,,181.0,,...,,,,,,,,,,non-AKI
90669,1611,168370,210867,2109-04-18 04:47:29,2109-04-18 06:47:29,,NEURO PT. REMAINS LETHARGIC BUT THIS AM EASILY...,,,,...,,,,,,,,,,non-AKI
90670,1611,168370,210867,2109-04-18 06:47:29,2109-04-18 08:47:29,,,,,,...,,,,,,,,,,non-AKI
90671,1611,168370,210867,2109-04-18 08:47:29,2109-04-18 10:47:29,,,,,,...,,,,,,,,,,non-AKI
90672,1611,168370,210867,2109-04-18 10:47:29,2109-04-18 12:47:29,,[**2109-4-18**] 10:54 AM\n CHEST (PORTABLE AP)...,,,,...,,,,,,,,,,non-AKI
90673,1611,168370,210867,2109-04-18 12:47:29,2109-04-18 14:47:29,,[**2109-4-18**] 1:08 PM\n CHEST (PORTABLE AP);...,,,,...,,,,,,,,,,non-AKI
90674,1611,168370,210867,2109-04-18 14:47:29,2109-04-18 16:47:29,,,,,,...,,,,,,,,,,non-AKI


In [None]:
# Same ICU stay after censoring, for comparison with the frame above.
data_censor[data_censor['icustay_id']==210867]

Unnamed: 0,subject_id,hadm_id,icustay_id,slice_start,slice_end,first_AKI_time,concatenated_notes,avg_pCO2,avg_Glucose,avg_Platelet_Count,...,avg_Respiratory_rate,avg_Heartrate,avg_Arterial_blood_pressure_mean,avg_Arterial_blood_pressure_systolic,avg_Central_venous_pressure,avg_Arterial_blood_pressure_diastolic,avg_Tidal_volume_obs,avg_Temp_C,avg_Height_cm,AKI_Status
90665,1611,168370,210867,2109-04-17 20:47:29,2109-04-17 22:47:29,,missing,,,,...,,,,,,,,,,non-AKI
90666,1611,168370,210867,2109-04-17 22:47:29,2109-04-18 00:47:29,,missing,,69.0,143.0,...,,,,,,,,,,non-AKI
90667,1611,168370,210867,2109-04-18 00:47:29,2109-04-18 02:47:29,,missing,,,,...,,,,,,,,,,non-AKI
90668,1611,168370,210867,2109-04-18 02:47:29,2109-04-18 04:47:29,,\n\n\n\n\n,,181.0,,...,,,,,,,,,,non-AKI
90669,1611,168370,210867,2109-04-18 04:47:29,2109-04-18 06:47:29,,NEURO PT. REMAINS LETHARGIC BUT THIS AM EASILY...,,,,...,,,,,,,,,,non-AKI
90670,1611,168370,210867,2109-04-18 06:47:29,2109-04-18 08:47:29,,missing,,,,...,,,,,,,,,,non-AKI
90671,1611,168370,210867,2109-04-18 08:47:29,2109-04-18 10:47:29,,missing,,,,...,,,,,,,,,,non-AKI
90672,1611,168370,210867,2109-04-18 10:47:29,2109-04-18 12:47:29,,[**2109-4-18**] 10:54 AM\n CHEST (PORTABLE AP)...,,,,...,,,,,,,,,,non-AKI
90673,1611,168370,210867,2109-04-18 12:47:29,2109-04-18 14:47:29,,[**2109-4-18**] 1:08 PM\n CHEST (PORTABLE AP);...,,,,...,,,,,,,,,,non-AKI
90674,1611,168370,210867,2109-04-18 14:47:29,2109-04-18 16:47:29,,missing,,,,...,,,,,,,,,,non-AKI


# Process text

In [None]:
# Translation table that maps every ASCII punctuation character to a space.
# Built once here instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def preprocess_text(text):
    """
    Normalise one clinical note into a space-separated string of lemmatised tokens.

    Steps: lowercase, replace punctuation with spaces, tokenise, drop English
    stop words, lemmatise. Relies on the module-level ``stop_words`` and
    ``lemmatizer`` objects, which must exist before this function is called.

    :param text: the raw note; may be NaN/None.
    :return: the processed text, or 'missing' when the input is null or no
        token survives the processing.
    """
    # Check for NaN values
    if pd.isna(text):
        return 'missing'

    # Lowercase the text (str() guards against non-string cell values)
    text = str(text).lower()

    # Replace punctuation with spaces rather than deleting it, so that words joined
    # by punctuation ("admit/progress") are not fused into one token ("admitprogress").
    text = text.translate(_PUNCT_TO_SPACE)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize the words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # A note that is only whitespace/punctuation/stop words would give '', which is
    # written as an empty CSV field and read back as NaN. Use the placeholder instead.
    if not tokens:
        return 'missing'

    return ' '.join(tokens)  # Join tokens back into one string

# Fetch the NLTK resources used by word_tokenize, WordNetLemmatizer and stopwords.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Module-level objects that preprocess_text reads at call time.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Register progress_apply on pandas objects, then process every note.
# This adds the 'processed_text' column to data_censor in place.
tqdm.pandas(desc="Processing Text")
data_censor['processed_text'] = data_censor['concatenated_notes'].progress_apply(preprocess_text)

#for testing purpose
# data_censor['processed_text'] = "text"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Processing Text: 100%|██████████| 414684/414684 [07:15<00:00, 952.23it/s] 


# Convert AKI_Status to a binary AKI_Label

In [None]:
# Binary target: 0 for rows whose status is 'non-AKI', 1 for every other status.
is_aki_row = data_censor['AKI_Status'] != 'non-AKI'
# Append the label, then drop the original status column (done on a new frame).
data_censor = (
    data_censor
    .assign(AKI_Label=is_aki_row.astype('int64'))
    .drop(columns=['AKI_Status'])
)
del is_aki_row
data_censor.columns

Index(['subject_id', 'hadm_id', 'icustay_id', 'slice_start', 'slice_end',
       'first_AKI_time', 'concatenated_notes', 'avg_pCO2', 'avg_Glucose',
       'avg_Platelet_Count', 'avg_Potassium', 'avg_Calculated_total_CO2',
       'avg_Urea_nitrogen', 'avg_Respiratory_rate', 'avg_Heartrate',
       'avg_Arterial_blood_pressure_mean',
       'avg_Arterial_blood_pressure_systolic', 'avg_Central_venous_pressure',
       'avg_Arterial_blood_pressure_diastolic', 'avg_Tidal_volume_obs',
       'avg_Temp_C', 'avg_Height_cm', 'processed_text', 'AKI_Label'],
      dtype='object')

# Split the data into training, testing and validation set

In [None]:
# Keep only the stay id, slice start time, structured features, processed text and label.
data_final = data_censor.drop(['subject_id', 'hadm_id', 'slice_end', 'first_AKI_time', 'concatenated_notes'], axis=1)
# Sort so that the slices of each stay are contiguous and in chronological order.
data_final = data_final.sort_values(by=['icustay_id', 'slice_start'])

In [None]:
# One (icustay_id, AKI_Label) pair per stay, taken from the first row of each stay.
Xy_id = data_final.drop_duplicates(subset=['icustay_id'])[['icustay_id', 'AKI_Label']]
Xy_id.head(20)

Unnamed: 0,icustay_id,AKI_Label
196956,200003,0
325344,200006,1
508475,200007,0
221328,200009,1
206472,200012,0
310080,200014,1
155795,200016,0
519023,200019,1
176636,200021,0
493091,200025,1


In [None]:
# Split at the ICU-stay level: 20% of the stays go to test, then 25% of the remaining 80%
# go to validation, which gives 60/20/20 train/validation/test overall.
# NOTE(review): the split is not stratified on AKI_Label and is keyed on icustay_id, so
# different stays of the same subject_id may end up in different splits — confirm this is intended.
X_train_id, X_test_id, y_train_id, y_test_id = train_test_split(Xy_id['icustay_id'], Xy_id['AKI_Label'], test_size=0.2, random_state=42)
X_train_id, X_val_id, y_train_id, y_val_id = train_test_split(X_train_id, y_train_id, test_size=0.25, random_state=42)

In [None]:
# Wrap the stay ids of each split in a one-column frame so they can be merged on 'icustay_id'.
X_train_id_df = pd.DataFrame(X_train_id, columns=['icustay_id'])
X_test_id_df = pd.DataFrame(X_test_id, columns=['icustay_id'])
X_val_id_df = pd.DataFrame(X_val_id, columns=['icustay_id'])

# Create a DataFrame from IDs and labels for merging
# NOTE(review): these three frames are not referenced again in the visible part of the notebook.
train_id_label_df = pd.DataFrame({'icustay_id': X_train_id, 'AKI_Label': y_train_id})
test_id_label_df = pd.DataFrame({'icustay_id': X_test_id, 'AKI_Label': y_test_id})
val_id_label_df = pd.DataFrame({'icustay_id': X_val_id, 'AKI_Label': y_val_id})

# Merge to align with the X DataFrames
# (default inner merge: brings in every slice row of data_final that belongs to the split's stays)
X_train_aligned = X_train_id_df.merge(data_final, on='icustay_id')
X_test_aligned = X_test_id_df.merge(data_final, on='icustay_id')
X_val_aligned = X_val_id_df.merge(data_final, on='icustay_id')


# Put each split back into per-stay chronological order after the merge.
X_train_aligned = X_train_aligned.sort_values(by=['icustay_id', 'slice_start'])
X_test_aligned = X_test_aligned.sort_values(by=['icustay_id', 'slice_start'])
X_val_aligned = X_val_aligned.sort_values(by=['icustay_id', 'slice_start'])
X_train_aligned

Unnamed: 0,icustay_id,slice_start,avg_pCO2,avg_Glucose,avg_Platelet_Count,avg_Potassium,avg_Calculated_total_CO2,avg_Urea_nitrogen,avg_Respiratory_rate,avg_Heartrate,avg_Arterial_blood_pressure_mean,avg_Arterial_blood_pressure_systolic,avg_Central_venous_pressure,avg_Arterial_blood_pressure_diastolic,avg_Tidal_volume_obs,avg_Temp_C,avg_Height_cm,processed_text,AKI_Label
89940,200006,2159-09-03 11:28:14,,,,,,,,,,,,,,,,missing,1
89941,200006,2159-09-03 13:28:14,,,,,,,,,,,,,,,,missing,1
89942,200006,2159-09-03 15:28:14,,,,,,,,,,,,,,,,missing,1
89943,200006,2159-09-03 17:28:14,,,,,,,,,,,,,,,,hospital ward name 4 icu nursing admitprogress...,1
89944,200006,2159-09-03 19:28:14,,,,,,,,,,,,,,,,patient admitted hospital ward name 6 ett stom...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58663,299995,2116-03-05 07:44:39,,,,,,,,,,,,,,,,respiratory care pt extubated today 0745 post ...,0
58664,299995,2116-03-05 09:44:39,,,,,,,,,,,,,,,,missing,0
58665,299995,2116-03-05 11:44:39,,,,,,,,,,,,,,,,missing,0
58666,299995,2116-03-05 13:44:39,,,,,,,,,,,,,,,,missing,0


In [None]:
# Extract the labels from the merged DataFrames
# (one label per stay, taken from the first row of each stay; the order follows the
# sorted icustay_id order of the aligned frames)
y_train_aligned = X_train_aligned.drop_duplicates(subset='icustay_id')['AKI_Label']
y_test_aligned = X_test_aligned.drop_duplicates(subset='icustay_id')['AKI_Label']
y_val_aligned = X_val_aligned.drop_duplicates(subset='icustay_id')['AKI_Label']

# Drop the labels from the X DataFrames
# (the X frames stay at slice level, i.e. several rows per stay)
X_train = X_train_aligned.drop(['AKI_Label'], axis=1)
X_test = X_test_aligned.drop(['AKI_Label'], axis=1)
X_val = X_val_aligned.drop(['AKI_Label'], axis=1)
X_train

Unnamed: 0,icustay_id,slice_start,avg_pCO2,avg_Glucose,avg_Platelet_Count,avg_Potassium,avg_Calculated_total_CO2,avg_Urea_nitrogen,avg_Respiratory_rate,avg_Heartrate,avg_Arterial_blood_pressure_mean,avg_Arterial_blood_pressure_systolic,avg_Central_venous_pressure,avg_Arterial_blood_pressure_diastolic,avg_Tidal_volume_obs,avg_Temp_C,avg_Height_cm,processed_text
89940,200006,2159-09-03 11:28:14,,,,,,,,,,,,,,,,missing
89941,200006,2159-09-03 13:28:14,,,,,,,,,,,,,,,,missing
89942,200006,2159-09-03 15:28:14,,,,,,,,,,,,,,,,missing
89943,200006,2159-09-03 17:28:14,,,,,,,,,,,,,,,,hospital ward name 4 icu nursing admitprogress...
89944,200006,2159-09-03 19:28:14,,,,,,,,,,,,,,,,patient admitted hospital ward name 6 ett stom...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58663,299995,2116-03-05 07:44:39,,,,,,,,,,,,,,,,respiratory care pt extubated today 0745 post ...
58664,299995,2116-03-05 09:44:39,,,,,,,,,,,,,,,,missing
58665,299995,2116-03-05 11:44:39,,,,,,,,,,,,,,,,missing
58666,299995,2116-03-05 13:44:39,,,,,,,,,,,,,,,,missing


In [None]:
# Number of training labels (one per training stay).
np.shape(y_train_aligned)

(20733,)

## Mean imputation (values calculated by training set)

In [None]:
def impute_missing_values_efficiently(df, population_means=None):
    """
    Fill missing feature values in two stages.

    1. Within each ICU stay, NaNs are replaced by that stay's own mean of the column.
    2. Values that are still NaN (the stay never had the measurement) are replaced
       by ``population_means``.

    The columns 'icustay_id', 'slice_start' and 'processed_text' are never imputed.
    The input DataFrame is not modified.

    :param df: pandas DataFrame with an 'icustay_id' column and the feature columns.
    :param population_means: optional per-column fallback values (e.g. a Series of
        training-set means) for stage 2. When omitted, the column means of ``df``
        before any imputation are used, as in the original behaviour.
    :return: copy of ``df`` with the feature columns imputed.
    """
    non_feature_columns = ['icustay_id', 'slice_start', 'processed_text']
    df_copy = df.copy()

    # Exclude non-imputation columns
    columns_to_impute = df_copy.columns.difference(non_feature_columns)

    # Fallback means are taken from the raw (not yet imputed) values unless supplied.
    if population_means is None:
        population_means = df_copy[columns_to_impute].mean()

    # Compute group means and fill NaNs within each group, then fill whatever is
    # still missing with the population means.
    group_means = df_copy.groupby('icustay_id')[columns_to_impute].transform('mean')
    df_copy[columns_to_impute] = (
        df_copy[columns_to_impute]
        .fillna(group_means)
        .fillna(population_means)
    )

    return df_copy

In [None]:
# Impute the training set: per-stay means first, then the training population means.
X_train_imputed = impute_missing_values_efficiently(X_train)

In [None]:
# Per-feature means of the *imputed* training set, used below as fill values for the
# validation and test sets.
population_means = X_train_imputed.drop(['icustay_id', 'slice_start', 'processed_text'], axis=1).mean()

In [None]:
# Impute the validation and test sets in the same way as the training set:
# first with each stay's own mean, then with the values calculated on the training set.
# Skipping the per-stay step here would give these splits a different feature
# distribution from the one the model is trained on.
columns_to_impute = X_train_imputed.columns.difference(['icustay_id', 'slice_start', 'processed_text'])

X_val_imputed = X_val.copy()
X_test_imputed = X_test.copy()

X_val_imputed[columns_to_impute] = (
    X_val[columns_to_impute]
    .fillna(X_val.groupby('icustay_id')[columns_to_impute].transform('mean'))
    .fillna(population_means)
)
X_test_imputed[columns_to_impute] = (
    X_test[columns_to_impute]
    .fillna(X_test.groupby('icustay_id')[columns_to_impute].transform('mean'))
    .fillna(population_means)
)

## Normalization (parameters are fitted on the training set)

In [None]:
from sklearn.preprocessing import StandardScaler

def normalize_data(X_train, X_val, X_test, columns_to_normalize):
    """
    Standardise the selected columns of all three splits with one scaler.

    The StandardScaler is fitted on the training split only and then applied to
    the training, validation and test splits. The inputs are not modified.

    :param X_train: training DataFrame (used for fitting and transformed).
    :param X_val: validation DataFrame (transformed only).
    :param X_test: test DataFrame (transformed only).
    :param columns_to_normalize: columns to scale; all other columns are copied unchanged.
    :return: tuple ``(X_train_normalized, X_val_normalized, X_test_normalized)``.
    """
    # Fit on training data only, so no validation/test statistics enter the scaler.
    fitted_scaler = StandardScaler().fit(X_train[columns_to_normalize])

    def _apply_scaler(frame):
        # Work on a copy so the caller's frame keeps its original values.
        scaled = frame.copy()
        scaled[columns_to_normalize] = fitted_scaler.transform(frame[columns_to_normalize])
        return scaled

    return _apply_scaler(X_train), _apply_scaler(X_val), _apply_scaler(X_test)

# Normalize the datasets
# Every column except identifiers, timestamps, text and label is scaled. Names in the
# exclusion list that are not columns of the frame are simply ignored by Index.difference.
columns_to_normalize = X_train_imputed.columns.difference(['icustay_id', 'slice_start', 'AKI_Label', 'subject_id', 'hadm_id', 'slice_end', 'first_AKI_time', 'concatenated_notes', 'processed_text'])
X_train_normalized, X_val_normalized, X_test_normalized = normalize_data(X_train_imputed, X_val_imputed, X_test_imputed, columns_to_normalize)

In [None]:
# # Inspect row 34179 of the 'processed_text' column in X_train
# row_index = 34179
# suspect_text = X_train_normalized['processed_text'].iloc[row_index]

# print("Content at row 34179:")
# print(suspect_text)
# print("Type of the content:", type(suspect_text))


In [None]:
# X_train_normalized.iloc[row_index]

# Save training, testing and validation set to folder

In [None]:
# Summary statistics of the scaled validation features, as a sanity check.
X_val_normalized.describe()

Unnamed: 0,icustay_id,avg_pCO2,avg_Glucose,avg_Platelet_Count,avg_Potassium,avg_Calculated_total_CO2,avg_Urea_nitrogen,avg_Respiratory_rate,avg_Heartrate,avg_Arterial_blood_pressure_mean,avg_Arterial_blood_pressure_systolic,avg_Central_venous_pressure,avg_Arterial_blood_pressure_diastolic,avg_Tidal_volume_obs,avg_Temp_C,avg_Height_cm
count,82944.0,82944.0,82944.0,82944.0,82944.0,82944.0,82944.0,82944.0,82944.0,82944.0,82944.0,82944.0,82944.0,82944.0,82944.0,82944.0
mean,250684.448929,0.000379,0.01302,-0.011776,0.025414,-0.011045,0.010686,-0.001284,-0.001162,-0.004658,-0.004143,0.011839,-0.003338,-0.004172,0.009015,-0.000606
std,28846.70492,0.518005,0.536028,0.379142,0.562656,0.522106,0.455208,0.717638,0.845352,0.696729,0.80964,0.928351,0.060553,0.597146,1.526368,0.279485
min,200007.0,-4.48462,-2.003206,-1.959467,-4.308026,-6.025965,-1.257054,-4.433859,-6.968314,-14.151686,-12.977008,-3.528275,-0.344064,-10.91559,-43.773617,-28.152725
25%,225676.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,251496.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,275595.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,299999.0,13.749265,23.471135,13.580224,11.10254,8.320005,16.679544,69.32473,35.546749,33.350269,13.862507,41.274487,15.529861,10.844408,400.57469,12.037768


In [None]:
# don't forget to revise process text function

In [None]:
# Save the datasets
# Create the destination folder first: DataFrame.to_csv does not create missing directories.
os.makedirs(output_folder_path, exist_ok=True)

# Feature files are at slice level (several rows per stay).
X_train_normalized.to_csv(os.path.join(output_folder_path, 'X_train_raw.csv'), index=False)
X_test_normalized.to_csv(os.path.join(output_folder_path, 'X_test_raw.csv'), index=False)
X_val_normalized.to_csv(os.path.join(output_folder_path, 'X_val_raw.csv'), index=False)
# Label files hold one AKI_Label per stay, in the same stay order as the feature files.
y_train_aligned.to_csv(os.path.join(output_folder_path, 'y_train_id.csv'), index=False)
y_test_aligned.to_csv(os.path.join(output_folder_path, 'y_test_id.csv'), index=False)
y_val_aligned.to_csv(os.path.join(output_folder_path, 'y_val_id.csv'), index=False)

In [None]:
# Stay ids of the training features, in saved order, for comparison with the labels below.
X_train_normalized.drop_duplicates(subset=['icustay_id'])['icustay_id']

89940     200006
75756     200009
18204     200014
49560     200025
103392    200028
           ...  
9564      299981
150180    299984
245076    299986
136824    299988
58656     299995
Name: icustay_id, Length: 20733, dtype: int64

In [None]:
# Training labels; the index should match the index of the stay ids shown above.
y_train_aligned

89940     1
75756     1
18204     1
49560     1
103392    1
         ..
9564      0
150180    1
245076    0
136824    1
58656     0
Name: AKI_Label, Length: 20733, dtype: int64