In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Data Processing

In [4]:
df_raw = pd.read_csv('train_test.csv')

In [8]:
df_raw['los_num'] = df_raw['los_icu_class'].apply(lambda x: 0 if x == 'less than 3 days' else 1)

In [9]:
select_cols = [col for col in df_raw.columns if col not in ('charttime','hosp_admittime','hosp_dischtime','icu_intime','icu_outtime','los_icu','text_embeddings','los_icu_class')]

In [10]:
df_full = df_raw[select_cols].groupby('id').transform(lambda x: x.fillna(x.mean()))

In [11]:
df = df_full.fillna(df_full.mean())

In [21]:
df.to_csv('filled.csv', index=False)

# Prepare data for modeling 

In [16]:
feature_cols = df.columns.drop(['los_num'])
target = 'los_num'
X = df[feature_cols]
y = df[target]

# numerical columns
num = df.select_dtypes(include=['float']).columns
X_num = X[num]

# min-max standardization
scaler = MinMaxScaler(feature_range=(0, 1))
X_num_scaled = scaler.fit_transform(X_num)
X_num_scaled = pd.DataFrame(X_num_scaled, columns=num, index=X_num.index)
X_scaled = X.copy()
X_scaled[num] = X_num_scaled[num]

time_steps = 25
X_scaled = np.array([X_scaled[i:i + time_steps] for i in range(0, len(X_scaled), time_steps)])
y = np.array([y[i] for i in range(time_steps - 1, len(y), time_steps)])

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
y_train = y_train.reshape(-1, 1)

# Modeling

In [17]:
model = Sequential()
model.add(LSTM(units=25, return_sequences=False, input_shape=(time_steps, X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

  super().__init__(**kwargs)


In [18]:
model.compile(optimizer='adam', loss='binary_crossentropy')

In [19]:
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 0.6883 - val_loss: 0.6832
Epoch 2/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.6803 - val_loss: 0.6801
Epoch 3/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.6773 - val_loss: 0.6807
Epoch 4/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.6766 - val_loss: 0.6804
Epoch 5/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.6749 - val_loss: 0.6812
Epoch 6/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.6726 - val_loss: 0.6789
Epoch 7/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.6684 - val_loss: 0.6764
Epoch 8/50
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.6597 - val_loss: 0.6755
Epoch 9/50
[1m409/409[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x25e3e9cdc10>

In [20]:
# save the trained model
model.save('lstm_los.keras')

In [12]:
# on testing set
y_test_pred = model.predict(X_test)

# classify based on probability
y_pred_class = (y_test_pred > 0.5).astype(int)

# evaluate the result
precision = precision_score(y_test, y_pred_class)
recall = recall_score(y_test, y_pred_class)
f1 = f1_score(y_test, y_pred_class)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Precision: 0.5738137082601055
Recall: 0.43708165997322623
F1 Score: 0.49620060790273557


In [13]:
# prediction on whole X
predictions = model.predict(X_scaled)

[1m511/511[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [14]:
result_df = pd.DataFrame(predictions, columns=['probs'])

# output probability to csv
result_df.to_csv('lstm_probs_of_los_traintest.csv', index=False)