In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Data Processing

In [2]:
df_raw = pd.read_csv('train_test.csv')

In [3]:
df_raw['los_icu_class'] = df_raw['los_icu_class'].apply(lambda x: 0 if x == 'less than 3 days' else 1)

In [4]:
drop_cols = ['charttime','hosp_admittime','hosp_dischtime','icu_intime','icu_outtime','los_icu','text_embeddings',
             'gender','admission_age','weight_admit','height','admission_type','charlson_score','atrial_fibrillation',
             'malignant_cancer','chf','ckd','cld','copd','diabetes','hypertension','ihd','stroke']
df_raw = df_raw.drop(columns=drop_cols)

In [5]:
df_full = df_raw.groupby('id').transform(lambda x: x.fillna(x.mean()))

In [6]:
df = df_full.fillna(df_full.mean())

# Prepare data for modeling 

In [7]:
X = df.drop(columns=['los_icu_class'])
y = df['los_icu_class']

time_steps = 25
X = np.array([X[i:i + time_steps] for i in range(0, len(X), time_steps)])
y = np.array([y[i] for i in range(time_steps - 1, len(y), time_steps)])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
X_train_2d = X_train.reshape(-1, X_train.shape[-1])
X_test_2d = X_test.reshape(-1, X_test.shape[-1])

In [9]:
# min-max standardization
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_train_2d)
X_train_2d_scaled = scaler.transform(X_train_2d)
X_test_2d_scaled = scaler.transform(X_test_2d)

In [10]:
X_train = np.array([X_train_2d_scaled[i:i + time_steps] for i in range(0, len(X_train_2d_scaled), time_steps)])
X_test = np.array([X_test_2d_scaled[i:i + time_steps] for i in range(0, len(X_test_2d_scaled), time_steps)])

# Modeling

In [11]:
model = Sequential()
model.add(LSTM(units=25, return_sequences=False, input_shape=(time_steps, X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

  super().__init__(**kwargs)


In [12]:
model.compile(optimizer='adam', loss='binary_crossentropy')

In [13]:
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 0.6947 - val_loss: 0.6827
Epoch 2/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.6855 - val_loss: 0.6815
Epoch 3/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0.6826 - val_loss: 0.6808
Epoch 4/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.6812 - val_loss: 0.6806
Epoch 5/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.6794 - val_loss: 0.6781
Epoch 6/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.6763 - val_loss: 0.6808
Epoch 7/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.6720 - val_loss: 0.6772
Epoch 8/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 0.6719 - val_loss: 0.6767
Epoch 9/50
[1m409/409[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x2021729f130>

In [14]:
# # save the trained model
# model.save('lstm_los.keras')

In [15]:
# on testing set
y_test_pred = model.predict(X_test)

# classify based on probability
y_pred_class = (y_test_pred > 0.5).astype(int)

# evaluate the result
auroc = roc_auc_score(y_test, y_pred_class)
precision = precision_score(y_test, y_pred_class)
recall = recall_score(y_test, y_pred_class)
f1 = f1_score(y_test, y_pred_class)

print("AUROC:", auroc)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step
AUROC: 0.5547304973706231
Precision: 0.511178247734139
Recall: 0.5662650602409639
F1 Score: 0.5373134328358209


In [16]:
# # prediction on whole X
# predictions = model.predict(X_scaled)
# result_df = pd.DataFrame(predictions, columns=['probs'])

# # output probability to csv
# result_df.to_csv('lstm_probs_of_los_traintest.csv', index=False)