In [1]:
import pandas as pd
import numpy as np

In [2]:
rng = np.random.RandomState(42)

In [3]:
data = pd.read_csv('train_test.csv')
data.shape

(408050, 112)

In [14]:
data.head()

Unnamed: 0,id,charttime,hosp_admittime,hosp_dischtime,icu_intime,icu_outtime,los_icu,icu_death,gender,admission_age,...,glucose_bg,d_dimer,fibrinogen,thrombin,inr,pt,ptt,urineoutput,text_embeddings,los_icu_class
0,20001305,1978-03-25 02:00:00,1978-03-25 02:58:00,1978-03-27 19:23:00,1978-03-25 02:59:00,1978-03-27 21:46:00,2.78,1,1.0,84.22776,...,,,,,,,,,[ 4.95544821e-02 -3.71760167e-02 -1.27426326e-...,less than 3 days
1,20001305,1978-03-25 03:00:00,1978-03-25 02:58:00,1978-03-27 19:23:00,1978-03-25 02:59:00,1978-03-27 21:46:00,2.78,1,1.0,84.22776,...,,,,,,,,,[ 4.95544821e-02 -3.71760167e-02 -1.27426326e-...,less than 3 days
2,20001305,1978-03-25 04:00:00,1978-03-25 02:58:00,1978-03-27 19:23:00,1978-03-25 02:59:00,1978-03-27 21:46:00,2.78,1,1.0,84.22776,...,,,,,,,,,[ 4.95544821e-02 -3.71760167e-02 -1.27426326e-...,less than 3 days
3,20001305,1978-03-25 05:00:00,1978-03-25 02:58:00,1978-03-27 19:23:00,1978-03-25 02:59:00,1978-03-27 21:46:00,2.78,1,1.0,84.22776,...,,,,,,,,,[ 4.95544821e-02 -3.71760167e-02 -1.27426326e-...,less than 3 days
4,20001305,1978-03-25 06:00:00,1978-03-25 02:58:00,1978-03-27 19:23:00,1978-03-25 02:59:00,1978-03-27 21:46:00,2.78,1,1.0,84.22776,...,,,,,,,,,[ 4.95544821e-02 -3.71760167e-02 -1.27426326e-...,less than 3 days


In [15]:
data.dtypes

id                   int64
charttime           object
hosp_admittime      object
hosp_dischtime      object
icu_intime          object
                    ...   
pt                 float64
ptt                float64
urineoutput        float64
text_embeddings     object
los_icu_class       object
Length: 112, dtype: object

In [4]:
cats = ['icu_death', 'gender', 'admission_type', 'atrial_fibrillation', 'malignant_cancer',
       'chf', 'ckd', 'cld', 'copd', 'diabetes', 'hypertension', 'ihd', 'stroke']
for col in cats:
   data[col] = data[col].astype('int')

In [5]:
data.count()

id                 408050
charttime          408050
hosp_admittime     408050
hosp_dischtime     408050
icu_intime         408050
                    ...  
pt                 408039
ptt                408039
urineoutput        407894
text_embeddings    408050
los_icu_class      408050
Length: 112, dtype: int64

In [None]:
data.to_csv('train_test.csv', index=False)

# Data Preparation

In [6]:
# drop features with 50% null
missing = data.isnull().sum()

threshold = 0.5 * len(data)
col_to_drop = missing[missing > threshold].index
data = data.drop(columns=col_to_drop)

In [7]:
data.shape

(408050, 111)

In [8]:
# choose the last row for each patient
data = data.sort_values(by=['id', 'charttime'], ascending=False)
data = data.groupby('id').first().reset_index()
data.shape

(16322, 111)

In [9]:
# fill nan with mean
num = data.select_dtypes(include=['float']).columns

data[num] = data[num].fillna(data[num].mean())

In [65]:
# split train and test before oversampling and feature selection
from sklearn.model_selection import train_test_split

# construct X and y
# for prediction of icu_death
X = data.drop(columns=['id', 'charttime', 'hosp_admittime', 'hosp_dischtime', 'icu_intime', 'icu_outtime', 
                       'los_icu', 'icu_death', 'text_embeddings', 'los_icu_class'])
y = data['icu_death']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rng, stratify=y)

In [28]:
from collections import Counter
Counter(y_train) # exist sample imbalance problem

Counter({0: 11707, 1: 1350})

In [66]:
# deal with sample imbalance, skip this part when predicting los
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=rng)
X_train, y_train = smote.fit_resample(X_train, y_train)
Counter(y_train)

Counter({0: 11707, 1: 11707})

In [67]:
X_test

Unnamed: 0,gender,admission_age,weight_admit,height,admission_type,charlson_score,atrial_fibrillation,malignant_cancer,chf,ckd,...,sodium_bg,lactate_bg,glucose_bg,d_dimer,fibrinogen,thrombin,inr,pt,ptt,urineoutput
8707,0,49.972871,105.0,173.0,0,6.0,0,1,0,1,...,132.0,1.4,131.0,4207.000000,305.0,61.6,1.2,12.8,24.8,15.0
2713,1,54.730927,45.3,170.0,1,5.0,0,0,0,0,...,140.0,1.3,146.0,1718.000000,201.0,16.3,1.1,11.9,28.6,100.0
10616,1,86.388185,67.6,168.0,1,6.0,0,0,0,0,...,136.0,1.1,161.0,17202.000000,307.0,20.7,1.4,15.6,32.0,85.0
1114,0,62.945085,80.4,168.0,0,2.0,0,0,0,0,...,137.0,1.1,102.0,4014.070198,124.0,16.2,1.2,12.6,26.0,300.0
3121,1,82.574104,89.0,170.0,1,5.0,0,0,0,0,...,133.0,10.0,152.0,1954.000000,563.0,16.3,1.1,11.6,32.3,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12124,1,70.664353,38.7,152.0,1,4.0,0,0,0,0,...,138.0,1.0,70.0,15335.000000,371.0,10.0,1.0,11.3,30.3,20.0
10171,0,76.853155,50.5,170.0,0,9.0,0,0,1,1,...,139.0,0.6,142.0,3758.000000,983.0,20.7,1.3,14.3,31.9,280.0
8088,0,52.814742,143.0,170.0,1,4.0,1,0,1,1,...,131.0,1.7,159.0,9713.000000,123.0,150.0,1.6,17.1,36.8,25.0
11277,1,78.405795,66.7,170.0,0,10.0,0,0,0,0,...,144.0,0.9,369.0,1701.000000,363.0,18.1,1.1,12.3,35.4,15.0


# Feature Selection

In [68]:
# numerical features
num = X.select_dtypes(include=['float']).columns

X_num_train = X_train[num]
X_num_test = X_test[num]

data_trained = pd.concat([X_num_train, y_train], axis=1)

In [69]:
# using correlation analysis to select features, but all the features show a low relationship with death
corr = data_trained.corr()
threshold = 0.5
high_corr = corr['icu_death'][abs(corr['icu_death']) > threshold].index.tolist()
high_corr

['icu_death']

In [71]:
# t-test
from scipy.stats import ttest_ind

low_risk = data_trained[data_trained['icu_death'] == 0]
high_risk = data_trained[data_trained['icu_death'] == 1]
ttest_selected = []

for c in data_trained.columns:
    t_statistic, p_value = ttest_ind(low_risk[c], high_risk[c])
    if p_value < 0.05:
        ttest_selected.append(c)

print(len(ttest_selected))

69


In [72]:
# Z-score standardization
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
fit_scaler = scaler.fit(X_num_train)

train_num_scaled = fit_scaler.transform(X_num_train)
test_num_scaled = fit_scaler.transform(X_num_test)

In [73]:
# merge scaled numeric feature with other categorical features
train_num_scaled = pd.DataFrame(train_num_scaled, columns=num, index=X_num_train.index)
test_num_scaled = pd.DataFrame(test_num_scaled, columns=num, index=X_num_test.index)

X_train[num] = train_num_scaled[num]
X_test[num] = test_num_scaled[num]

RFE

In [54]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(penalty='l1', C=1/5, solver='liblinear', random_state=rng)
rfe = RFE(log_reg, n_features_to_select=20)
fit = rfe.fit(X_train, y_train)

rfe_selected = X_train.columns[fit.support_]

RF

In [57]:
from sklearn.ensemble import RandomForestClassifier

n_selected = 20

rf_model = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=rng)
rf_model.fit(X_train, y_train)

feature_importances = rf_model.feature_importances_
feature_importances = pd.concat([
                                pd.DataFrame({'feature': X_train.columns}),
                                pd.DataFrame({'importance': feature_importances})], 
                                axis=1)
rf_selected = feature_importances.sort_values(by='importance', ascending=False).head(n_selected)['feature']

In [58]:
# intersection
inter_selected = list(set(ttest_selected) & set(rfe_selected) & set(rf_selected))
print(len(inter_selected))

5


# Modeling

In [61]:
# Grid Search for hyperparameters
from sklearn.model_selection import GridSearchCV

def grid_search(X_train, y_train, model, param, cv=5, scoring='recall'):
    search = GridSearchCV(model, param, cv=cv, scoring=scoring)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    return best_model

In [62]:
# Model Evaluation
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

def model_evaluate(pred, test, model):
    prec = precision_score(test, pred)
    recall = recall_score(test, pred)
    f1 = f1_score(test, pred)
    class_report = classification_report(test, pred, labels=[0, 1])

    print(f"Precision: {prec:.3f}, ", f"Recall: {recall:.3f}, ", f"F1: {f1:.3f}")
    print("\nClassification Report:")
    print(class_report)

In [91]:
X_train_sub = X_train[rf_selected]
X_test_sub = X_test[rf_selected]

In [92]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [5, 10, 15]
}

rf_model = RandomForestClassifier(random_state=rng)

rf_best = grid_search(X_train_sub, y_train, rf_model, param_grid)
rf_best.fit(X_train_sub, y_train)

In [93]:
y_pred = rf_best.predict(X_test_sub)

model_evaluate(y_pred, y_test, model="Random Forest")

Precision: 0.413,  Recall: 0.365,  F1: 0.387

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.94      0.93      2928
           1       0.41      0.36      0.39       337

    accuracy                           0.88      3265
   macro avg       0.67      0.65      0.66      3265
weighted avg       0.87      0.88      0.88      3265



In [89]:
X_train_sub = X_train[rf_selected]
X_test_sub = X_test[rf_selected]

In [90]:
import xgboost as xgb

xg = xgb.XGBClassifier(objective='multi:softmax', num_class=2, seed=42)

xg.fit(X_train_sub, y_train)

y_pred = xg.predict(X_test_sub)
model_evaluate(y_pred, y_test, model="XGBoost")

Precision: 0.486,  Recall: 0.252,  F1: 0.332

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      2928
           1       0.49      0.25      0.33       337

    accuracy                           0.90      3265
   macro avg       0.70      0.61      0.64      3265
weighted avg       0.87      0.90      0.88      3265



In [95]:
rf_probs = rf_best.predict_proba(X[rf_selected])
rf_probs = pd.DataFrame(rf_probs)
rf_probs['id'] = data['id']
rf_probs.to_csv('death_probability_sd_rf_traintest.csv', index=False)

In [96]:
xg_probs = xg.predict_proba(X[rf_selected])
xg_probs = pd.DataFrame(xg_probs)
xg_probs['id'] = data['id']
xg_probs.to_csv('death_probability_sd_rf_traintest.csv', index=False)