In [1]:
import pandas as pd
import numpy as np

rng = np.random.RandomState(42)

In [2]:
data = pd.read_csv('train_test.csv')

# Preprocessing

In [4]:
# drop features with 50% null
missing = data.isnull().sum()

threshold = 0.5 * len(data)
col_to_drop = missing[missing > threshold].index
data = data.drop(columns=col_to_drop)

In [5]:
# choose the last row for each patient
data = data.sort_values(by=['id', 'charttime'], ascending=False)
data = data.groupby('id').first().reset_index()
data.shape

(16322, 111)

In [6]:
# fill nan with mean
num = data.select_dtypes(include=['float']).columns

data[num] = data[num].fillna(data[num].mean())

In [7]:
# classify the los_icu
data.loc[data['los_icu_class'] == 'less than 3 days', 'los_icu_class'] = 0
data.loc[data['los_icu_class'] == 'greater than or equal to 3 days', 'los_icu_class'] = 1
data['los_icu_class'] = data['los_icu_class'].astype('int')

In [8]:
# split train and test before oversampling and feature selection
from sklearn.model_selection import train_test_split

# construct X and y
X = data.drop(columns=['id', 'charttime', 'hosp_admittime', 'hosp_dischtime', 'icu_intime', 'icu_outtime', 
                       'los_icu', 'text_embeddings', 'los_icu_class'])
y = data['los_icu_class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rng, stratify=y)

In [9]:
from collections import Counter
Counter(y_train) # do not exist sample imbalance problem

Counter({0: 6912, 1: 6145})

# Feature Selection

In [10]:
# numerical features
num = X.select_dtypes(include=['float']).columns

X_num_train = X_train[num]
X_num_test = X_test[num]

data_trained = pd.concat([X_num_train, y_train], axis=1)

In [11]:
# Z-score standardization
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
fit_scaler = scaler.fit(X_num_train)

train_num_scaled = fit_scaler.transform(X_num_train)
test_num_scaled = fit_scaler.transform(X_num_test)

In [12]:
# merge scaled numeric feature with other categorical features
train_num_scaled = pd.DataFrame(train_num_scaled, columns=num, index=X_num_train.index)
test_num_scaled = pd.DataFrame(test_num_scaled, columns=num, index=X_num_test.index)

X_train[num] = train_num_scaled[num]
X_test[num] = test_num_scaled[num]

In [13]:
from sklearn.ensemble import RandomForestClassifier

n_selected = 20

rf_model = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=rng)
rf_model.fit(X_train, y_train)

feature_importances = rf_model.feature_importances_
feature_importances = pd.concat([
                                pd.DataFrame({'feature': X_train.columns}),
                                pd.DataFrame({'importance': feature_importances})], 
                                axis=1)
rf_selected = feature_importances.sort_values(by='importance', ascending=False).head(n_selected)['feature']

In [41]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(penalty='l1', C=1/5, solver='liblinear', random_state=rng)
rfe = RFE(log_reg, n_features_to_select=20)
fit = rfe.fit(X_train, y_train)

rfe_selected = X_train.columns[fit.support_]

# Modeling

In [14]:
# Grid Search for hyperparameters
from sklearn.model_selection import GridSearchCV

def grid_search(X_train, y_train, model, param, cv=5, scoring='recall'):
    search = GridSearchCV(model, param, cv=cv, scoring=scoring)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    return best_model

In [15]:
# Model Evaluation
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

def model_evaluate(pred, test, model):
    prec = precision_score(test, pred)
    recall = recall_score(test, pred)
    f1 = f1_score(test, pred)
    class_report = classification_report(test, pred, labels=[0, 1])

    print(f"Precision: {prec:.3f}, ", f"Recall: {recall:.3f}, ", f"F1: {f1:.3f}")
    print("\nClassification Report:")
    print(class_report)

In [16]:
X_train_sub = X_train[rf_selected]
X_test_sub = X_test[rf_selected]

In [17]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [5, 10, 15]
}

rf_model = RandomForestClassifier(random_state=rng)

rf_best = grid_search(X_train_sub, y_train, rf_model, param_grid)
rf_best.fit(X_train_sub, y_train)

In [18]:
y_pred = rf_best.predict(X_test_sub)

model_evaluate(y_pred, y_test, model="Random Forest")

Precision: 0.646,  Recall: 0.569,  F1: 0.605

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.72      0.69      1729
           1       0.65      0.57      0.61      1536

    accuracy                           0.65      3265
   macro avg       0.65      0.65      0.65      3265
weighted avg       0.65      0.65      0.65      3265



In [21]:
import xgboost as xgb

xg = xgb.XGBClassifier(objective='multi:softmax', num_class=2, seed=42)

xg.fit(X_train_sub, y_train)

y_pred = xg.predict(X_test_sub)
model_evaluate(y_pred, y_test, model="XGBoost")

Precision: 0.609,  Recall: 0.564,  F1: 0.585

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.68      0.66      1729
           1       0.61      0.56      0.59      1536

    accuracy                           0.62      3265
   macro avg       0.62      0.62      0.62      3265
weighted avg       0.62      0.62      0.62      3265



In [20]:
rf_probs = rf_best.predict_proba(X[rf_selected])
rf_probs = pd.DataFrame(rf_probs)
rf_probs['id'] = data['id']
rf_probs.to_csv('los_probability_sd_rf_traintest.csv', index=False)

In [23]:
xg_probs = xg.predict_proba(X[rf_selected])
xg_probs = pd.DataFrame(xg_probs)
xg_probs['id'] = data['id']
xg_probs.to_csv('los_probability_sd_xgb_traintest.csv', index=False)