In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Data Preprocessing

In [2]:
df_raw = pd.read_csv('train_test.csv')

In [7]:
drop_cols = ['charttime','hosp_admittime','hosp_dischtime','icu_intime','icu_outtime','los_icu','text_embeddings',
             'gender','admission_age','weight_admit','height','admission_type','charlson_score','atrial_fibrillation',
             'malignant_cancer','chf','ckd','cld','copd','diabetes','hypertension','ihd','stroke','los_icu_class']
df_raw = df_raw.drop(columns=drop_cols)

In [8]:
df_full = df_raw.groupby('id').transform(lambda x: x.fillna(x.mean()))

In [9]:
df = df_full.fillna(df_full.mean())

# Prepare data for training

In [19]:
X = df.drop(columns=['icu_death'])
y = df['icu_death']

time_steps = 25
X = np.array([X[i:i + time_steps] for i in range(0, len(X), time_steps)])
y = np.array([y[i] for i in range(time_steps - 1, len(y), time_steps)])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
X_train_2d = X_train.reshape(-1, X_train.shape[-1])
X_test_2d = X_test.reshape(-1, X_test.shape[-1])

In [35]:
# min-max standardization
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_train_2d)
X_train_2d_scaled = scaler.transform(X_train_2d)
X_test_2d_scaled = scaler.transform(X_test_2d)

In [41]:
X_train = np.array([X_train_2d_scaled[i:i + time_steps] for i in range(0, len(X_train_2d_scaled), time_steps)])
X_test = np.array([X_test_2d_scaled[i:i + time_steps] for i in range(0, len(X_test_2d_scaled), time_steps)])

# Modeling

In [42]:
# Construction
model = Sequential()
model.add(LSTM(units=25, return_sequences=False, input_shape=(time_steps, X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

  super().__init__(**kwargs)


In [43]:
model.compile(optimizer='adam', loss='binary_crossentropy')

In [44]:
# calculate weight of each classification
total_samples = len(y_train)
class_count = np.bincount(y_train)
class_weights = total_samples / (len(class_count) * class_count)
# change into dictionary format
class_weights_dict = dict(enumerate(class_weights))

In [45]:
class_weights_dict

{0: 0.5592341956484496, 1: 4.720535068691251}

In [46]:
# fit model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), class_weight=class_weights_dict)

Epoch 1/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 8ms/step - loss: 0.6870 - val_loss: 0.4874
Epoch 2/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.6220 - val_loss: 0.4776
Epoch 3/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.5950 - val_loss: 0.4748
Epoch 4/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.5810 - val_loss: 0.5154
Epoch 5/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.5791 - val_loss: 0.5309
Epoch 6/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.5821 - val_loss: 0.6236
Epoch 7/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.5696 - val_loss: 0.5607
Epoch 8/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.5519 - val_loss: 0.5387
Epoch 9/50
[1m409/409[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x22e80206790>

In [29]:
# # save the trained model
# model.save('lstm_mortality.keras')

In [47]:
# on testing set
y_test_pred = model.predict(X_test)

# classify based on probability
y_pred_class = (y_test_pred > 0.5).astype(int)

# evaluate the result
auroc = roc_auc_score(y_test, y_pred_class)
precision = precision_score(y_test, y_pred_class)
recall = recall_score(y_test, y_pred_class)
f1 = f1_score(y_test, y_pred_class)

print("AUROC:", auroc)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
AUROC: 0.6778598757532128
Precision: 0.18357933579335795
Recall: 0.6546052631578947
F1 Score: 0.28674351585014407


In [31]:
# # predict the whole X
# X_scaled = np.concatenate((X_train, X_test), axis=0)
# y_pred = model.predict(X_scaled)

# # classify based on probability
# y_pred_class = (y_pred > 0.5).astype(int)

# # evaluate the result
# auroc = roc_auc_score(y, y_pred_class)
# precision = precision_score(y, y_pred_class)
# recall = recall_score(y, y_pred_class)
# f1 = f1_score(y, y_pred_class)

# print("AUROC:", auroc)
# print("Precision:", precision)
# print("Recall:", recall)
# print("F1 Score:", f1)

[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Precision: 0.31702544031311153
Recall: 0.8642560758743332
F1 Score: 0.4638880050906777


In [32]:
result_df = pd.DataFrame(y_pred, columns=['predictions'])

# output death probability of all patients
result_df.to_csv('lstm_probs_of_death_traintest.csv', index=False)