In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, r2_score

In [2]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

In [3]:
# Input files, resolved relative to the current working directory.
TRAIN_FEATURES_CSV = 'train_features.csv'
TRAIN_LABELS_CSV = 'train_labels.csv'
TEST_FEATURES_CSV = 'test_features.csv'

# Raw tables, loaded unchanged; later cells slice them by column position.
train_data = pd.read_csv(TRAIN_FEATURES_CSV)
labels = pd.read_csv(TRAIN_LABELS_CSV)
test_data = pd.read_csv(TEST_FEATURES_CSV)

In [4]:
# Column positions used to split the raw tables.
# NOTE(review): presumably column 2 is the patient age and columns 3+ are the
# measurements (going by the variable names) - confirm against the CSV header.
AGE_COLUMN = 2
FIRST_FEATURE_COLUMN = 3

train_data_age = train_data.iloc[:, AGE_COLUMN]
train_data_feature = train_data.iloc[:, FIRST_FEATURE_COLUMN:]

test_data_age = test_data.iloc[:, AGE_COLUMN]
test_data_feature = test_data.iloc[:, FIRST_FEATURE_COLUMN:]

In [5]:
def extract_age_features(df, time_series_length = 12):
    """Collapse each consecutive window of rows into its NaN-ignoring mean.

    The input is cut into back-to-back windows of ``time_series_length`` rows
    and ``np.nanmean`` is taken over each window. In this notebook it is
    applied to the age column, one window per 12-row block.

    Parameters
    ----------
    df : object exposing ``to_numpy()``
        Expected to hold one value per row (e.g. a pandas Series); the final
        reshape to a single column fails for wider inputs.
    time_series_length : int, default 12
        Number of consecutive rows that form one window.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, 1)`` with one mean per complete window.
        Trailing rows that do not fill a whole window are ignored.
    """
    values = df.to_numpy()
    n_windows = values.shape[0] // time_series_length

    window_means = np.array([
        np.nanmean(values[w * time_series_length:(w + 1) * time_series_length], axis=0)
        for w in range(n_windows)
    ])

    # Return a column vector so it can be concatenated with 2-D feature blocks.
    return window_means.reshape((window_means.shape[0], 1))

In [6]:
# One mean-age value per window (default window length), as column vectors.
train_age = extract_age_features(df=train_data_age)
test_age = extract_age_features(df=test_data_age)

In [7]:
def extract_features(df, time_series_length = 12):
    """Summarise each consecutive window of rows with NaN-ignoring statistics.

    The rows of ``df`` are cut into back-to-back windows of
    ``time_series_length`` rows. For every window and every column, seven
    statistics are computed along the row axis while ignoring NaNs.

    Parameters
    ----------
    df : object exposing ``to_numpy()``
        Expected to be 2-D (rows x columns), e.g. a pandas DataFrame.
    time_series_length : int, default 12
        Number of consecutive rows that form one window.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_windows, 7 * n_columns)``. The column blocks are,
        in order: median, mean, variance, max, min, 25% quantile, 75%
        quantile, each block keeping the column order of ``df``. Trailing
        rows that do not fill a whole window are ignored.

    Notes
    -----
    NumPy may emit RuntimeWarnings for a window in which a column is entirely
    NaN; the affected outputs are then NaN.
    """
    values = df.to_numpy()
    n_windows = values.shape[0] // time_series_length

    # The reducer order fixes both the per-window evaluation order and the
    # layout of the column blocks in the returned array.
    reducers = (
        lambda window: np.nanmedian(window, axis=0),
        lambda window: np.nanmean(window, axis=0),
        lambda window: np.nanvar(window, axis=0),
        lambda window: np.nanmax(window, axis=0),
        lambda window: np.nanmin(window, axis=0),
        lambda window: np.nanquantile(window, 0.25, axis=0),
        lambda window: np.nanquantile(window, 0.75, axis=0),
    )
    per_statistic = [[] for _ in reducers]

    for w in range(n_windows):
        start = w * time_series_length
        window = values[start:start + time_series_length]
        for collected, reduce_window in zip(per_statistic, reducers):
            collected.append(reduce_window(window))

    # Each list becomes an (n_windows, n_columns) block; blocks sit side by side.
    blocks = tuple(np.array(collected) for collected in per_statistic)
    return np.concatenate(blocks, axis=1)

In [9]:
# Per-window summary statistics of the measurement columns (default window length).
train_feature = extract_features(df=train_data_feature)
test_feature = extract_features(df=test_data_feature)

  mean_collection = np.nanmean(df_np[k:k+time_series_length], axis = 0)
  var_collection = np.nanvar(df_np[k:k+time_series_length], axis = 0)
  max_collection = np.nanmax(df_np[k:k+time_series_length], axis = 0)
  min_collection = np.nanmin(df_np[k:k+time_series_length], axis = 0)


In [10]:
# Final design matrices: the age column first, followed by the window statistics.
X_train = np.concatenate([train_age, train_feature], axis=1)
X_test = np.concatenate([test_age, test_feature], axis=1)

In [11]:
# Take column 0 of every 12th test row, i.e. the first row of each 12-row
# window (12 matches the default window length used to build X_test).
test_pids = test_data.iloc[0::12, 0].values
df_prediction = pd.DataFrame({'pid': test_pids})

Subtask 1: binary classification - predict a probability for each label in subtask1_labels

In [12]:
# Subtask 1 targets: the columns at positions 1-10 of `labels`, one per label.
Y_train_task1 = labels.iloc[:,1:11].to_numpy()

In [13]:
# Output column names for subtask 1; the i-th name is paired with the i-th
# target column when the models are fitted.
subtask1_labels = [
    'LABEL_BaseExcess',
    'LABEL_Fibrinogen',
    'LABEL_AST',
    'LABEL_Alkalinephos',
    'LABEL_Bilirubin_total',
    'LABEL_Lactate',
    'LABEL_TroponinI',
    'LABEL_SaO2',
    'LABEL_Bilirubin_direct',
    'LABEL_EtCO2',
]

In [23]:
# Classifier for the binary subtasks: standardise the features, then fit
# histogram-based gradient-boosted trees for a fixed number of iterations
# (early stopping disabled).
pipeline_task1 = make_pipeline(StandardScaler(),
                        HistGradientBoostingClassifier(early_stopping=False))

# Exhaustive grid: 4 * 4 * 4 = 64 candidates, each scored with 3-fold CV.
hyperparameter_list = {
    'histgradientboostingclassifier__max_depth': [2, 4, 6, 8],
    'histgradientboostingclassifier__max_bins': [50, 100, 200, 250],
    'histgradientboostingclassifier__min_samples_leaf': [10, 100, 200, 400]}

# NOTE: `cv` is also used by the subtask 2 cell further down.
cv = GridSearchCV(pipeline_task1, hyperparameter_list, n_jobs=-1, cv=3, scoring='roc_auc')

# Targets are taken from `labels` by position (columns 1-10) but predictions
# are stored under the names in `subtask1_labels`. Fail loudly if the two
# disagree instead of silently training on / reporting the wrong column.
# NOTE(review): this also assumes the rows of `labels` are in the same patient
# order as the windows of the training features - confirm.
assert list(labels.columns[1:11]) == subtask1_labels, (
    "label columns %r do not match subtask1_labels %r"
    % (list(labels.columns[1:11]), subtask1_labels))

for i, label in enumerate(subtask1_labels):
    # Runs the full grid search for this label; with the default refit the
    # best candidate is retrained on all of X_train before predicting.
    cv.fit(X_train, Y_train_task1[:, i])
    # Column 1 of predict_proba is the second class in `classes_`
    # (the positive class for 0/1 targets).
    predictions = cv.predict_proba(X_test)[:, 1]
    df_prediction[label] = predictions

Subtask 2: binary classification - predict the probability of LABEL_Sepsis

In [24]:
# Subtask 2 target: the single column at position 11 of `labels`.
Y_train_task2 = labels.iloc[:,11].to_numpy()

In [25]:
# Output column name for subtask 2 (a single label).
subtask2_labels = [
    'LABEL_Sepsis',
]

In [26]:
# Subtask 2 reuses `cv`, the classifier grid search built in the subtask 1 cell.
# The target is the single column at position 11 of `labels`, and every entry
# of `subtask2_labels` is fitted against that same target. Fail loudly if the
# list is not exactly that one column name.
assert [labels.columns[11]] == subtask2_labels, (
    "label column %r does not match subtask2_labels %r"
    % (labels.columns[11], subtask2_labels))

for i, label in enumerate(subtask2_labels):
    cv.fit(X_train, Y_train_task2)
    # Column 1 of predict_proba is the second class in `classes_`
    # (the positive class for 0/1 targets).
    predictions = cv.predict_proba(X_test)[:, 1]
    df_prediction[label] = predictions

Subtask 3: regression - predict a value for each label in subtask3_labels

In [27]:
# Subtask 3 targets: every column of `labels` from position 12 onward.
Y_train_task3 = labels.iloc[:,12:].to_numpy()

In [28]:
# Output column names for subtask 3; the i-th name is paired with the i-th
# target column when the models are fitted.
subtask3_labels = [
    'LABEL_RRate',
    'LABEL_ABPm',
    'LABEL_SpO2',
    'LABEL_Heartrate',
]

In [29]:
# Regressor for subtask 3: standardise the features, then fit histogram-based
# gradient-boosted trees for a fixed number of iterations (early stopping
# disabled).
pipeline_task3 = make_pipeline(StandardScaler(),
                        HistGradientBoostingRegressor(early_stopping=False))

# Exhaustive grid: 4 * 4 * 4 = 64 candidates, each scored with 3-fold CV.
# This rebinds `hyperparameter_list`, which held the classifier grid before.
hyperparameter_list = {
    'histgradientboostingregressor__max_depth': [2, 4, 6, 8],
    'histgradientboostingregressor__max_bins': [50, 100, 200, 250],
    'histgradientboostingregressor__min_samples_leaf': [10, 100, 200, 400]}

cv_reg = GridSearchCV(pipeline_task3, hyperparameter_list, n_jobs=-1, cv=3, scoring='r2')

# Targets are taken from `labels` by position (columns 12 onward) but
# predictions are stored under the names in `subtask3_labels`. Fail loudly if
# the two disagree instead of silently training on / reporting the wrong column.
assert list(labels.columns[12:]) == subtask3_labels, (
    "label columns %r do not match subtask3_labels %r"
    % (list(labels.columns[12:]), subtask3_labels))

for i, label in enumerate(subtask3_labels):
    # Runs the full grid search for this label; with the default refit the
    # best candidate is retrained on all of X_train before predicting.
    cv_reg.fit(X_train, Y_train_task3[:, i])
    predictions = cv_reg.predict(X_test)
    df_prediction[label] = predictions

In [30]:
# Show the assembled prediction table (pid plus one column per label) as the cell output.
df_prediction

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0,0.971705,0.332272,0.898902,0.891144,0.839011,0.535326,0.003273,0.411632,0.115691,0.018276,0.069226,14.407236,84.975998,99.023374,84.511634
1,10001,0.030526,0.025343,0.256814,0.304349,0.225634,0.039633,0.079742,0.068925,0.022205,0.015303,0.021315,17.480967,84.933554,94.937720,101.063791
2,10003,0.003146,0.023493,0.119264,0.066782,0.092931,0.153233,0.030325,0.232407,0.031991,0.018608,0.020701,17.659165,80.014698,98.363566,89.572704
3,10004,0.017343,0.030885,0.287863,0.257228,0.245222,0.055249,0.042371,0.088792,0.023782,0.045382,0.016291,16.264747,76.599885,95.591901,87.582601
4,10005,0.122561,0.032320,0.122121,0.097058,0.125068,0.069876,0.004492,0.061036,0.008751,0.000724,0.021569,19.368770,74.904270,95.894995,63.193428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12659,9989,0.623467,0.075279,0.194044,0.152247,0.162494,0.427577,0.001609,0.171285,0.009642,0.000568,0.080980,20.113246,79.499337,95.881810,107.370443
12660,9991,0.486830,0.068563,0.117401,0.153926,0.118809,0.349632,0.010652,0.129664,0.015524,0.003803,0.054655,18.864902,93.307269,98.544420,76.677856
12661,9992,0.604651,0.026770,0.062203,0.055202,0.085309,0.290854,0.004077,0.684722,0.008426,0.005557,0.045083,18.945241,68.635762,96.842251,79.548213
12662,9994,0.988688,0.672701,0.790128,0.771806,0.818453,0.966887,0.006783,0.963037,0.158132,0.015611,0.275708,14.927344,92.476597,98.415322,97.623084


In [31]:
# Write the predictions as a zip-compressed CSV, without the index and with
# floats rounded to three decimals.
# NOTE(review): no explicit archive member name is given, so pandas derives the
# name of the file inside the zip from the output path - confirm the consumer
# does not expect a specific inner file name.
df_prediction.to_csv(
    'prediction.zip',
    index=False,
    float_format='%.3f',
    compression='zip',
)