In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingRegressor

In [2]:
# load data
X_train_raw = pd.read_csv('./train_features.csv')
y_train = pd.read_csv('./train_labels.csv')
X_test_raw = pd.read_csv('./test_features.csv')

In [3]:
print(X_train_raw.shape, y_train.shape)
print(X_test_raw.shape)
print(X_train_raw.describe())

(227940, 37) (18995, 16)
(151968, 37)
                 pid           Time            Age        EtCO2           PTT  \
count  227940.000000  227940.000000  227940.000000  9783.000000  10299.000000   
mean    15788.831219       7.014399      62.073809    32.883114     40.091310   
std      9151.896286       4.716103      16.451854     7.802065     26.034961   
min         1.000000       1.000000      15.000000    10.000000     12.500000   
25%      7879.000000       4.000000      52.000000    28.500000     27.800000   
50%     15726.000000       7.000000      64.000000    33.000000     32.200000   
75%     23725.000000      10.000000      74.000000    38.000000     40.600000   
max     31658.000000     315.000000     100.000000   100.000000    250.000000   

                BUN       Lactate          Temp           Hgb         HCO3  \
count  20105.000000  10756.000000  81115.000000  22295.000000  12559.00000   
mean      23.192664      2.859716     36.852136     10.628208     23.48810  

In [4]:
# extract statistics from patients: median, mean, var, std, min, max
def describe_patients(df):
    s = 12
    new_df = []
    n = [np.nanmedian, np.nanmean, np.nanvar, np.nanstd, np.nanmin, np.nanmax]
    for i in range(int(len(df)/s)):
        p_data = df.iloc[i*s:(i+1)*s, 2:]
        features = np.zeros((len(n), df.shape[1]-2))
        for j in range(len(n)):
            features[j] = n[j](p_data, axis=0)
        new_df.append(features.ravel())
    
    new_df = np.array(new_df)
    return new_df

In [5]:
X_train = describe_patients(X_train_raw)
X_test = describe_patients(X_test_raw)

  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,
  features[j] = n[j](p_data, axis=0)
  features[j] = n[j](p_data, axis=0)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  features[j] = n[j](p_data, axis=0)


In [6]:
print(X_train.shape, y_train.shape)
print(X_test.shape)

(18995, 210) (18995, 16)
(12664, 210)


In [7]:
# subtask 1
sub1_labels = y_train.columns[1:11]
y_sub1 = y_train[sub1_labels]

In [8]:
# cross validation subtask 1
for i in range(len(sub1_labels)):
    y = y_sub1.iloc[:, i]
    pipeline = make_pipeline(SimpleImputer(strategy='median'),
                             StandardScaler(),
                             HistGradientBoostingClassifier())
    scores = cross_val_score(pipeline, X_train, y, cv=5, scoring='roc_auc')
    print('{:.6f} (+/-{:.04f}) - {} ROC AUC cross validation'.format(scores.mean(),
                                                                     scores.std()*2, sub1_labels[i]))

0.928249 (+/-0.0070) - LABEL_BaseExcess ROC AUC cross validation
0.802415 (+/-0.0234) - LABEL_Fibrinogen ROC AUC cross validation
0.742763 (+/-0.0072) - LABEL_AST ROC AUC cross validation
0.743751 (+/-0.0075) - LABEL_Alkalinephos ROC AUC cross validation
0.743376 (+/-0.0137) - LABEL_Bilirubin_total ROC AUC cross validation
0.803267 (+/-0.0145) - LABEL_Lactate ROC AUC cross validation
0.892865 (+/-0.0074) - LABEL_TroponinI ROC AUC cross validation
0.829418 (+/-0.0138) - LABEL_SaO2 ROC AUC cross validation
0.760311 (+/-0.0418) - LABEL_Bilirubin_direct ROC AUC cross validation
0.933957 (+/-0.0106) - LABEL_EtCO2 ROC AUC cross validation


In [9]:
# full model & prediction subtask 1
all_predictions = pd.DataFrame({'pid': X_test_raw.iloc[0::12, 0].values})
for i in range(len(sub1_labels)):
    y = y_sub1.iloc[:, i]
    pipeline = pipeline.fit(X_train, y)
    y_predict = pipeline.predict_proba(X_train)[:, 1]
    predictions = pipeline.predict_proba(X_test)[:, 1]
    all_predictions[sub1_labels[i]] = predictions
    score = metrics.roc_auc_score(y, y_predict)
    print('{:.6f} - {} ROC AUC'.format(score, sub1_labels[i]))

0.967764 - LABEL_BaseExcess ROC AUC
0.957451 - LABEL_Fibrinogen ROC AUC
0.859260 - LABEL_AST ROC AUC
0.860479 - LABEL_Alkalinephos ROC AUC
0.864130 - LABEL_Bilirubin_total ROC AUC
0.890365 - LABEL_Lactate ROC AUC
0.978385 - LABEL_TroponinI ROC AUC
0.925872 - LABEL_SaO2 ROC AUC
0.952244 - LABEL_Bilirubin_direct ROC AUC
0.989054 - LABEL_EtCO2 ROC AUC


In [10]:
# subtask 2
sub2_label = y_train.columns[11]
y_sub2 = y_train[sub2_label]

In [11]:
# cross validation subtask 2
pipeline = make_pipeline(SimpleImputer(strategy='median'),
                         StandardScaler(),
                         HistGradientBoostingClassifier())
scores = cross_val_score(pipeline, X_train, y_sub2, cv=5, scoring='roc_auc')
print('{:.6f} (+/-{:.04f}) - {} ROC AUC cross validation'.format(scores.mean(),
                                                                 scores.std()*2, sub2_label))

0.708812 (+/-0.0536) - LABEL_Sepsis ROC AUC cross validation


In [12]:
# full model & prediction subtask 2
pipeline = pipeline.fit(X_train, y_sub2)
y_predict = pipeline.predict_proba(X_train)[:, 1]
predictions = pipeline.predict_proba(X_test)[:, 1]
all_predictions[sub2_label] = predictions

score = metrics.roc_auc_score(y_sub2, y_predict)
print('{:.6f} - {} ROC AUC'.format(score, sub2_label))

0.900596 - LABEL_Sepsis ROC AUC


In [13]:
# subtask 3
sub3_labels = y_train.columns[12:]
y_sub3 = y_train[sub3_labels]

In [14]:
# cross validation subtask 3
for i in range(len(sub3_labels)):
    y = y_sub3.iloc[:, i]
    pipeline = make_pipeline(SimpleImputer(strategy='median'),
                             StandardScaler(),
                             HistGradientBoostingRegressor())
    scores = cross_val_score(pipeline, X_train, y, cv=5, scoring='r2')
    print('{:.6f} (+/-{:.04f}) - {} R2 cross validation'.format(scores.mean(),
                                                                scores.std()*2, sub3_labels[i]))

0.412141 (+/-0.0166) - LABEL_RRate R2 cross validation
0.619322 (+/-0.0312) - LABEL_ABPm R2 cross validation
0.380660 (+/-0.0299) - LABEL_SpO2 R2 cross validation
0.633261 (+/-0.0281) - LABEL_Heartrate R2 cross validation


In [15]:
# full model & prediction subtask 3
for i in range(len(sub3_labels)):
    y = y_sub3.iloc[:, i]
    pipeline = pipeline.fit(X_train, y)
    y_predict = pipeline.predict(X_train)
    predictions = pipeline.predict(X_test)
    all_predictions[sub3_labels[i]] = predictions
    score = metrics.r2_score(y, y_predict)
    print('{:.6f} - {} R2'.format(score, sub3_labels[i]))

0.568123 - LABEL_RRate R2
0.725160 - LABEL_ABPm R2
0.526574 - LABEL_SpO2 R2
0.708299 - LABEL_Heartrate R2


In [16]:
# create submission file
all_predictions.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')