# Models (sepsis-pics)


## Common


In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from common_eicu import KEY_IDENTITY, KEY_FLAG, CATEGORICAL_COLUMNS_FULL
from common_model import SEED, SCORING, cv


In [2]:
from pathlib import Path  # cell-local: only needed for the existence check below

MODEL_RESULT_PATH = './data/model_result.csv'

# Row labels of the result table; every column holds the scores of one model.
RESULT_INDEX = ['accuracy mean', 'accuracy std', 'AUC mean', 'AUC std']

if Path(MODEL_RESULT_PATH).is_file():
    # Continue from the results saved by a previous run.
    df_result = pd.read_csv(
        MODEL_RESULT_PATH,
        index_col='index',
    )
else:
    # The result file is written by this notebook itself ("Save Result"),
    # so it does not exist on a first run: start from an empty table
    # with the same row labels instead of failing in `read_csv`.
    df_result = pd.DataFrame(index=pd.Index(RESULT_INDEX, name='index'))
print(df_result.T.sort_values(by=['accuracy mean', 'AUC mean']))

def update_result(name, *, accuracy_mean, accuracy_std, auc_mean, auc_std):
    '''
    Store the scores of one model in the global `df_result` table.

    The four values are written as column `name` (replacing an
    existing column of that name) in the order: accuracy mean,
    accuracy std, AUC mean, AUC std.
    '''
    column_values = [accuracy_mean, accuracy_std, auc_mean, auc_std]
    df_result[name] = column_values


index                 accuracy mean  accuracy std  AUC mean   AUC std
NaiveBayes                 0.355021      0.077871  0.847471  0.032685
KNN                        0.951654      0.006188  0.538089  0.014292
LDA                        0.960859      0.007866  0.952716  0.008640
Ridge                      0.961338      0.006027  0.952718  0.008641
ExtraTrees                 0.961367      0.006078  0.979020  0.005014
Logistic                   0.965813      0.006286  0.954961  0.005054
SVM                        0.973148      0.004429  0.956572  0.005167
MLP                        0.980292      0.002775  0.971080  0.008732
DecisionTree               0.990703      0.001290  0.949481  0.006011
AdaBoost                   0.993110      0.001999  0.994568  0.002271
HistGradientBoosting       0.994059      0.001413  0.995656  0.002128
XGBoost                    0.995220      0.000405  0.995103  0.002424
LightGB                    0.995400      0.000900  0.996000  0.001600
CatBoost            

In [3]:
def plot_roc_file(title, input_path, auc_mean, auc_std):
    '''
    Plot a ROC curve stored in a tab-separated file.

    Parameters
    ----------
    title
        Title of the plot.
    input_path
        File passed to `pd.read_csv` (tab-separated); its
        `FPR` and `TPR` columns are plotted as x and y.
    auc_mean, auc_std
        Numbers shown in the legend label of the curve
        (formatted with two decimals).
    '''
    roc_points = pd.read_csv(input_path, sep='\t')

    fig, ax = plt.subplots(figsize=(7, 6))
    ax.set(
        title=title,
        xlabel='False Positive Rate',
        ylabel='True Positive Rate',
    )

    # dashed diagonal from (0, 0) to (1, 1) as reference line
    diagonal = [0, 1]
    ax.plot(
        diagonal,
        diagonal,
        '--',
        c='lightgray',
        label='chance level (AUC = 0.5)',
    )

    roc_label = fr'ROC (AUC = {auc_mean:.2f} $\pm$ {auc_std:.2f})'
    ax.plot(roc_points['FPR'], roc_points['TPR'], 'b-', label=roc_label)

    ax.legend(loc='lower right')


In [4]:
def test_sklearn(name, model, *, encode=False, n_jobs=None):
    '''
    Test the given model using sklearn's
    cross validation, print the result
    and pass it to `update_result`.

    Parameters
    ----------
    name : str
        Model name.
    model : scikit-learn compact model
        The model to test.
    encode : False, 'ordinal' or 'dummy', default=False
        Category encoding option.
        False - No encoding.
        'ordinal' - Use ordinal encoder.
        'dummy' - Use dummy encoder.
    n_jobs : int or None, default=None
        Number of jobs to run in parallel.
    '''

    if encode == 'dummy':
        X = X_dummy
    elif encode == 'ordinal':
        X = X_ordinal
    else:
        X = X_raw

    scores = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=SCORING,
        groups=groups,
        n_jobs=n_jobs,
    )

    scores_accuracy = scores['test_accuracy']
    accuracy_mean = scores_accuracy.mean()
    accuracy_std = scores_accuracy.std()
    scores_auc = scores['test_roc_auc']
    auc_mean = scores_auc.mean()
    auc_std = scores_auc.std()
    update_result(
        name,
        accuracy_mean=accuracy_mean,
        accuracy_std=accuracy_std,
        auc_mean=auc_mean,
        auc_std=auc_std,
    )

    print(f'>>> CV Result ({name})')
    print(f'accuracy_mean: {accuracy_mean:.4f}')
    print(f'accuracy_std:  {accuracy_std:.4f}')
    print(f'auc_mean:      {auc_mean:.4f}')
    print(f'auc_std:       {auc_std:.4f}')


## Prepare Data


In [5]:
df_data = (
    pd.read_csv('./data/data_eicu_full.csv.gz')
    # clamp infinite values
    # NOTE(review): only +inf is replaced here; -inf would pass through unchanged.
    .replace(np.inf, 9999)
    # set categorical columns
    .astype({
        column_name: 'category'
        for column_name in CATEGORICAL_COLUMNS_FULL
    })
)


In [6]:
# Feature frame: everything except the identity and flag columns
# (those are used for `groups` and `y` further below).
NON_FEATURE_COLUMNS = [KEY_IDENTITY, KEY_FLAG]
X_raw = df_data.drop(NON_FEATURE_COLUMNS, axis='columns')
X_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100133 entries, 0 to 100132
Data columns (total 59 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   gender              100133 non-null  category
 1   age                 100133 non-null  float64 
 2   ethnicity           100133 non-null  category
 3   offset              100133 non-null  int64   
 4   vasopressor         100133 non-null  category
 5   heparin             100133 non-null  category
 6   urine               100133 non-null  float64 
 7   PEEP                100133 non-null  float64 
 8   creatinine          100133 non-null  float64 
 9   platelet            100133 non-null  float64 
 10  INR                 100133 non-null  float64 
 11  PT                  100133 non-null  float64 
 12  PTT                 100133 non-null  float64 
 13  lactate             100133 non-null  float64 
 14  RDW                 100133 non-null  float64 
 15  total bilirubin  

In [7]:
# Dummy (one-hot) encoded features, used by `test_sklearn(..., encode='dummy')`.
X_dummy = pd.get_dummies(X_raw)

# Ordinal encoded features, used by `test_sklearn(..., encode='ordinal')`.
# `X_ordinal` is read by `test_sklearn` but was not defined by any cell,
# so the ordinal-encoded model cell failed with a NameError on a fresh kernel.
# Every categorical column is replaced by its integer category code.
# NOTE(review): reconstructed definition - confirm that it matches the
# encoding that produced the recorded results.
X_ordinal = X_raw.copy()
for column_name in CATEGORICAL_COLUMNS_FULL:
    X_ordinal[column_name] = X_ordinal[column_name].cat.codes

X_dummy.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100133 entries, 0 to 100132
Data columns (total 68 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   age                         100133 non-null  float64
 1   offset                      100133 non-null  int64  
 2   urine                       100133 non-null  float64
 3   PEEP                        100133 non-null  float64
 4   creatinine                  100133 non-null  float64
 5   platelet                    100133 non-null  float64
 6   INR                         100133 non-null  float64
 7   PT                          100133 non-null  float64
 8   PTT                         100133 non-null  float64
 9   lactate                     100133 non-null  float64
 10  RDW                         100133 non-null  float64
 11  total bilirubin             100133 non-null  float64
 12  direct bilirubin            100133 non-null  float64
 13  bicarbonate   

In [8]:
# Target vector as a NumPy array.
# `Series.ravel()` is deprecated in recent pandas releases;
# `to_numpy()` is the supported way to get the underlying array.
y = df_data[KEY_FLAG].to_numpy()
y


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [9]:
# Group label of every row (passed as `groups` to `cross_validate`).
# `Series.ravel()` is deprecated in recent pandas releases;
# `to_numpy()` is the supported way to get the underlying array.
groups = df_data[KEY_IDENTITY].to_numpy()
groups


array([ 141288,  141288,  141288, ..., 3353251, 3353251, 3353251],
      dtype=int64)

## CatBoost


In [18]:
%%time
from catboost import CatBoostClassifier

catboost_model = CatBoostClassifier(
    cat_features=CATEGORICAL_COLUMNS_FULL,
    task_type='GPU',
    devices='0',
    random_state=SEED,
    verbose=False,
)

test_sklearn(
    'CatBoost',
    catboost_model,
)


>>> CV Result (CatBoost)
accuracy_mean: 0.9957
accuracy_std:  0.0007
auc_mean:      0.9957
auc_std:       0.0021
CPU times: total: 5min 54s
Wall time: 4min 28s


## Light Gradient Boosting


In [28]:
%%time
from lightgbm import LGBMClassifier

lgb_model = LGBMClassifier(
    # Categorical features will be automatically detected.
    random_state=SEED,
)

test_sklearn(
    'LightGB',
    lgb_model,
)


>>> CV Result (LightGB)
accuracy_mean: 0.9954
accuracy_std:  0.0009
auc_mean:      0.9960
auc_std:       0.0016
CPU times: total: 1min 37s
Wall time: 18.8 s


## Extreme Gradient Boosting


In [22]:
%%time
from xgboost import XGBClassifier

xgboost_model = XGBClassifier(
    enable_categorical=True,
    tree_method='gpu_hist',
    random_state=SEED,
)

test_sklearn(
    'XGBoost',
    xgboost_model,
)


>>> CV Result (XGBoost)
accuracy_mean: 0.9952
accuracy_std:  0.0004
auc_mean:      0.9951
auc_std:       0.0024
CPU times: total: 1min 15s
Wall time: 22.8 s


## Hist Gradient Boosting


In [24]:
%%time
from sklearn.ensemble import HistGradientBoostingClassifier

histGB_model = HistGradientBoostingClassifier(
    random_state=SEED,
    categorical_features=CATEGORICAL_COLUMNS_FULL,
)

test_sklearn(
    'HistGradientBoosting',
    histGB_model,
    encode='ordinal',
)


>>> CV Result (GradientBoosting)
accuracy_mean: 0.9941
accuracy_std:  0.0014
auc_mean:      0.9957
auc_std:       0.0021
CPU times: total: 2min 31s
Wall time: 28.2 s


## Extra Trees


In [34]:
%%time
from sklearn.ensemble import ExtraTreesClassifier

extra_trees_model = ExtraTreesClassifier(
    class_weight='balanced',
    random_state=SEED,
    n_jobs=-1,  # use all processors
)

test_sklearn(
    'ExtraTrees',
    extra_trees_model,
    encode='dummy',
)


>>> CV Result (ExtraTrees)
accuracy_mean: 0.9614
accuracy_std:  0.0061
auc_mean:      0.9790
auc_std:       0.0050
CPU times: total: 24.1 s
Wall time: 43.1 s


## Ridge Classifier


In [39]:
%%time
from sklearn.linear_model import RidgeClassifier

ridge_model = RidgeClassifier(
    # class_weight='balanced',  # worse...
    random_state=SEED,
)

test_sklearn(
    'Ridge',
    ridge_model,
    encode='dummy',
)


>>> CV Result (Ridge)
accuracy_mean: 0.9613
accuracy_std:  0.0060
auc_mean:      0.9527
auc_std:       0.0086
CPU times: total: 14.6 s
Wall time: 13.5 s


## Logistic Regression


In [43]:
%%time
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(
    solver='newton-cholesky',
    # class_weight='balanced',  # worse...
    random_state=SEED,
    n_jobs=-1,  # use all processors
)

test_sklearn(
    'Logistic',
    logistic_model,
    encode='dummy',
)


>>> CV Result (Logistic)
accuracy_mean: 0.9658
accuracy_std:  0.0063
auc_mean:      0.9550
auc_std:       0.0051
CPU times: total: 13 s
Wall time: 27.7 s


## K-Nearest Neighbors


In [44]:
%%time
from sklearn.neighbors import KNeighborsClassifier

KNN_model = KNeighborsClassifier(
    n_jobs=-1,  # use all processors
)

test_sklearn(
    'KNN',
    KNN_model,
    encode='dummy',
)


>>> CV Result (KNN)
accuracy_mean: 0.9517
accuracy_std:  0.0062
auc_mean:      0.5381
auc_std:       0.0143
CPU times: total: 8min 22s
Wall time: 56 s


## AdaBoost


In [47]:
%%time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

adaboost_model = AdaBoostClassifier(
    random_state=SEED,
)

test_sklearn(
    'AdaBoost',
    adaboost_model,
    encode='dummy',
)


>>> CV Result (AdaBoost)
accuracy_mean: 0.9931
accuracy_std:  0.0020
auc_mean:      0.9946
auc_std:       0.0023
CPU times: total: 2min 25s
Wall time: 2min 25s


## Linear Discriminant Analysis


In [49]:
%%time
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda_model = LinearDiscriminantAnalysis()

test_sklearn(
    'LDA',
    lda_model,
    encode='dummy',
)


>>> CV Result (LDA)
accuracy_mean: 0.9609
accuracy_std:  0.0079
auc_mean:      0.9527
auc_std:       0.0086
CPU times: total: 53 s
Wall time: 22 s


## MLP


In [52]:
%%time
from sklearn.neural_network import MLPClassifier

mlp_model = make_pipeline(
    StandardScaler(),
    MLPClassifier(
        hidden_layer_sizes=(100,),
        max_iter=1000,
        random_state=SEED,
    ),
)

test_sklearn(
    'MLP',
    mlp_model,
    encode='dummy',
    n_jobs=6,
)


>>> CV Result (MLP)
accuracy_mean: 0.9803
accuracy_std:  0.0028
auc_mean:      0.9711
auc_std:       0.0087
CPU times: total: 23min 41s
Wall time: 11min 35s


## Decision Tree


In [56]:
%%time
from sklearn.tree import DecisionTreeClassifier

decision_tree_model = DecisionTreeClassifier(
    class_weight='balanced',
    random_state=SEED,
)

test_sklearn(
    'DecisionTree',
    decision_tree_model,
    encode='dummy',
    n_jobs=6,
)


>>> CV Result (DecisionTree)
accuracy_mean: 0.9907
accuracy_std:  0.0013
auc_mean:      0.9495
auc_std:       0.0060
CPU times: total: 11.3 s
Wall time: 19.2 s


## SVM (With RBF Kernel)


In [57]:
%%time
from sklearn.svm import SVC

svm_model = make_pipeline(
    StandardScaler(),
    SVC(
        kernel='rbf',  # default
        random_state=SEED,
    ),
)

test_sklearn(
    'SVM',
    svm_model,
    encode='dummy',
    n_jobs=6,
)


>>> CV Result (SVM)
accuracy_mean: 0.9731
accuracy_std:  0.0044
auc_mean:      0.9566
auc_std:       0.0052
CPU times: total: 11.3 s
Wall time: 6min 5s


## Naive Bayes


In [16]:
%%time
from sklearn.naive_bayes import MultinomialNB

nb_model = make_pipeline(
    MinMaxScaler(),
    MultinomialNB(),
)

test_sklearn(
    'NaiveBayes',
    nb_model,
    encode='dummy',
    n_jobs=6,
)


>>> CV Result (NaiveBayes)
accuracy_mean: 0.9614
accuracy_std:  0.0061
auc_mean:      0.6899
auc_std:       0.0657
CPU times: total: 11.3 s
Wall time: 12 s


## Save Result


In [18]:
# Persist the result table (including the scores added by the cells
# executed above) to the file it was loaded from.
df_result.to_csv(path_or_buf=MODEL_RESULT_PATH)


## Summary


In [17]:
# One row per model, best accuracy first (ties broken by AUC).
SUMMARY_SORT_KEYS = ['accuracy mean', 'AUC mean']
df_summary = df_result.transpose().sort_values(
    SUMMARY_SORT_KEYS,
    ascending=False,
)

# Render every score with four decimals.
df_summary.style.format('{:.4f}')


index,accuracy mean,accuracy std,AUC mean,AUC std
CatBoost,0.9957,0.0007,0.9957,0.0021
LightGB,0.9954,0.0009,0.996,0.0016
XGBoost,0.9952,0.0004,0.9951,0.0024
HistGradientBoosting,0.9941,0.0014,0.9957,0.0021
AdaBoost,0.9931,0.002,0.9946,0.0023
DecisionTree,0.9907,0.0013,0.9495,0.006
MLP,0.9803,0.0028,0.9711,0.0087
SVM,0.9731,0.0044,0.9566,0.0052
Logistic,0.9658,0.0063,0.955,0.0051
ExtraTrees,0.9614,0.0061,0.979,0.005
