# Models (sepsis-pics)


## Common


In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from common_eicu import KEY_IDENTITY, KEY_FLAG, CATEGORICAL_COLUMNS
from common_model import SEED, SCORING, cv


In [50]:
MODEL_RESULT_PATH = './data/model_result.csv'

# Metric table shared by the whole notebook: one row per metric
# ('Accuracy Mean', 'Accuracy Std', 'AUC Mean', 'AUC Std') and
# one column per model.
try:
    df_result = pd.read_csv(
        MODEL_RESULT_PATH,
        index_col='Index',
    )
except FileNotFoundError:
    # No saved results yet (e.g. first run on a fresh checkout):
    # start from an empty table with the expected row labels so the
    # model cells below can fill it in.
    df_result = pd.DataFrame(
        index=pd.Index(
            ['Accuracy Mean', 'Accuracy Std', 'AUC Mean', 'AUC Std'],
            name='Index',
        ),
    )

# Show the stored results, models as rows, best accuracy first.
print(
    df_result.T.sort_values(
        by=['Accuracy Mean', 'AUC Mean'],
        ascending=False,
    )
)

def update_result(name, *, accuracy_mean, accuracy_std, auc_mean, auc_std):
    '''
    Store the cross-validation summary of one model in `df_result`.

    The values are written to the column `name` (created or
    overwritten) and matched to the rows of `df_result` by label,
    so the outcome does not depend on the row order of the table.

    Parameters
    ----------
    name : str
        Model name, used as the column label.
    accuracy_mean, accuracy_std : float
        Mean and standard deviation of the accuracy scores.
    auc_mean, auc_std : float
        Mean and standard deviation of the ROC-AUC scores.
    '''
    # Assigning a labelled Series aligns on the index of `df_result`
    # instead of relying on positional order.
    df_result[name] = pd.Series({
        'Accuracy Mean': accuracy_mean,
        'Accuracy Std': accuracy_std,
        'AUC Mean': auc_mean,
        'AUC Std': auc_std,
    })


Index         Accuracy Mean  Accuracy Std  AUC Mean   AUC Std
CatBoost           0.995700      0.000700  0.995700  0.002100
LightGB            0.995264      0.001002  0.996036  0.001771
XGBoost            0.995069      0.000750  0.995147  0.001838
HistGB             0.994180      0.001242  0.995606  0.002084
AdaBoost           0.992804      0.002055  0.994584  0.002278
DecisionTree       0.990253      0.001654  0.948244  0.007490
MLP                0.982250      0.003155  0.974170  0.007848
SVM                0.972904      0.004621  0.955998  0.005210
Logistic           0.965833      0.006287  0.954735  0.005334
ExtraTrees         0.961367      0.006078  0.977292  0.004562
NaiveBayes         0.961367      0.006078  0.690024  0.064777
Ridge              0.961338      0.006027  0.952407  0.008768
LDA                0.960990      0.007581  0.952406  0.008767
KNN                0.950666      0.006280  0.539846  0.017137


In [43]:
def plot_roc_file(title, input_path, auc_mean, auc_std):
    '''
    Draw a ROC curve stored in a tab-separated file.

    Parameters
    ----------
    title : str
        Title of the figure.
    input_path : str
        Path of a tab-separated file with the columns
        'FPR' and 'TPR'.
    auc_mean : float
        Mean AUC shown in the legend (two decimals).
    auc_std : float
        AUC standard deviation shown in the legend (two decimals).
    '''
    roc_points = pd.read_csv(input_path, sep='\t')

    ax = plt.figure(figsize=(7, 6)).add_subplot()
    ax.set_title(title)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')

    # Diagonal reference line.
    ax.plot(
        [0, 1],
        [0, 1],
        '--',
        c='lightgray',
        label='chance level (AUC = 0.5)',
    )

    # The ROC curve itself.
    ax.plot(
        roc_points['FPR'],
        roc_points['TPR'],
        'b-',
        label=fr'ROC (AUC = {auc_mean:.2f} $\pm$ {auc_std:.2f})',
    )

    ax.legend(loc='lower right')


In [44]:
def test_sklearn(name, model, *, encode=False, n_jobs=None):
    '''
    Test the given model using sklearn's
    cross validation, print the result
    and pass it to `update_result`.

    Parameters
    ----------
    name : str
        Model name.
    model : scikit-learn compact model
        The model to test.
    encode : False, 'ordinal' or 'dummy', default=False
        Category encoding option.
        False - No encoding.
        'ordinal' - Use ordinal encoder.
        'dummy' - Use dummy encoder.
    n_jobs : int or None, default=None
        Number of jobs to run in parallel.
    '''

    if encode == 'dummy':
        X = X_dummy
    elif encode == 'ordinal':
        X = X_ordinal
    else:
        X = X_raw

    scores = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=SCORING,
        groups=groups,
        n_jobs=n_jobs,
    )

    scores_accuracy = scores['test_accuracy']
    accuracy_mean = scores_accuracy.mean()
    accuracy_std = scores_accuracy.std()
    scores_auc = scores['test_roc_auc']
    auc_mean = scores_auc.mean()
    auc_std = scores_auc.std()
    update_result(
        name,
        accuracy_mean=accuracy_mean,
        accuracy_std=accuracy_std,
        auc_mean=auc_mean,
        auc_std=auc_std,
    )

    print(f'>>> CV Result ({name})')
    print(f'accuracy_mean: {accuracy_mean:.4f}')
    print(f'accuracy_std:  {accuracy_std:.4f}')
    print(f'auc_mean:      {auc_mean:.4f}')
    print(f'auc_std:       {auc_std:.4f}')


## Prepare Data


In [5]:
df_data = pd.read_csv('./data/data_eicu_full.csv.gz')

# clamp infinite values: +inf -> 9999 and -inf -> -9999
# (previously only +inf was replaced, so a -inf would have been
# passed on to the models unchanged)
df_data = df_data.replace([np.inf, -np.inf], [9999, -9999])

# set categorical columns
for column_name in CATEGORICAL_COLUMNS:
    df_data[column_name] = df_data[column_name].astype('category')


In [6]:
# Feature table: everything except the identity column (used as the
# CV groups below) and the flag column (used as the target below).
X_raw = df_data.drop([KEY_IDENTITY, KEY_FLAG], axis='columns')
X_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100133 entries, 0 to 100132
Data columns (total 60 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   gender              100133 non-null  category
 1   age                 100133 non-null  float64 
 2   ethnicity           100133 non-null  category
 3   Apache-IV           100133 non-null  float64 
 4   offset              100133 non-null  int64   
 5   vasopressor         100133 non-null  category
 6   heparin             100133 non-null  category
 7   urine               100133 non-null  float64 
 8   PEEP                100133 non-null  float64 
 9   creatinine          100133 non-null  float64 
 10  platelet            100133 non-null  float64 
 11  INR                 100133 non-null  float64 
 12  PT                  100133 non-null  float64 
 13  PTT                 100133 non-null  float64 
 14  lactate             100133 non-null  float64 
 15  RDW              

In [20]:
# Variant of the feature table whose categorical columns are replaced
# by uint8 codes from an ordinal encoder; other columns are copied as is.
ordinal_encoder = OrdinalEncoder(dtype=np.uint8)
X_ordinal = X_raw.copy()
X_ordinal[CATEGORICAL_COLUMNS] = ordinal_encoder.fit_transform(
    X_raw[CATEGORICAL_COLUMNS]
)
X_ordinal.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100133 entries, 0 to 100132
Data columns (total 60 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   gender              100133 non-null  uint8  
 1   age                 100133 non-null  float64
 2   ethnicity           100133 non-null  uint8  
 3   Apache-IV           100133 non-null  float64
 4   offset              100133 non-null  int64  
 5   vasopressor         100133 non-null  uint8  
 6   heparin             100133 non-null  uint8  
 7   urine               100133 non-null  float64
 8   PEEP                100133 non-null  float64
 9   creatinine          100133 non-null  float64
 10  platelet            100133 non-null  float64
 11  INR                 100133 non-null  float64
 12  PT                  100133 non-null  float64
 13  PTT                 100133 non-null  float64
 14  lactate             100133 non-null  float64
 15  RDW                 100133 non-nul

In [7]:
# Variant of the feature table with dummy (indicator) columns
# generated by pandas from `X_raw`.
X_dummy = pd.get_dummies(data=X_raw)
X_dummy.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100133 entries, 0 to 100132
Data columns (total 69 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   age                         100133 non-null  float64
 1   Apache-IV                   100133 non-null  float64
 2   offset                      100133 non-null  int64  
 3   urine                       100133 non-null  float64
 4   PEEP                        100133 non-null  float64
 5   creatinine                  100133 non-null  float64
 6   platelet                    100133 non-null  float64
 7   INR                         100133 non-null  float64
 8   PT                          100133 non-null  float64
 9   PTT                         100133 non-null  float64
 10  lactate                     100133 non-null  float64
 11  RDW                         100133 non-null  float64
 12  total bilirubin             100133 non-null  float64
 13  direct bilirub

In [8]:
# Target vector as a 1-D numpy array.
# `Series.ravel()` is deprecated in recent pandas; `to_numpy()` is the
# supported way to get the underlying array.
y = df_data[KEY_FLAG].to_numpy()
y


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [9]:
# Group label of every row, passed to `cross_validate(groups=...)`.
# `Series.ravel()` is deprecated in recent pandas; `to_numpy()` is the
# supported way to get the underlying array.
groups = df_data[KEY_IDENTITY].to_numpy()
groups


array([ 141288,  141288,  141288, ..., 3353251, 3353251, 3353251],
      dtype=int64)

## CatBoost


In [11]:
%%time
from catboost import CatBoostClassifier

catboost_model = CatBoostClassifier(
    cat_features=CATEGORICAL_COLUMNS,
    task_type='GPU',
    devices='0',
    random_state=SEED,
    verbose=False,
)

test_sklearn(
    'CatBoost',
    catboost_model,
)


>>> CV Result (CatBoost)
accuracy_mean: 0.9956
accuracy_std:  0.0009
auc_mean:      0.9956
auc_std:       0.0022
CPU times: total: 6min 5s
Wall time: 4min 39s


## Light Gradient Boosting


In [14]:
%%time
from lightgbm import LGBMClassifier

lgb_model = LGBMClassifier(
    # Categorical features will be automatically detected.
    random_state=SEED,
)

test_sklearn(
    'LightGB',
    lgb_model,
)


>>> CV Result (LightGB)
accuracy_mean: 0.9953
accuracy_std:  0.0010
auc_mean:      0.9960
auc_std:       0.0018
CPU times: total: 1min 37s
Wall time: 19.4 s


## Extreme Gradient Boosting


In [15]:
%%time
from xgboost import XGBClassifier

xgboost_model = XGBClassifier(
    enable_categorical=True,
    tree_method='gpu_hist',
    random_state=SEED,
)

test_sklearn(
    'XGBoost',
    xgboost_model,
)


>>> CV Result (XGBoost)
accuracy_mean: 0.9951
accuracy_std:  0.0008
auc_mean:      0.9951
auc_std:       0.0018
CPU times: total: 1min 5s
Wall time: 22.3 s


## Hist Gradient Boosting


In [21]:
%%time
from sklearn.ensemble import HistGradientBoostingClassifier

histGB_model = HistGradientBoostingClassifier(
    random_state=SEED,
    categorical_features=CATEGORICAL_COLUMNS,
)

test_sklearn(
    'HistGB',
    histGB_model,
    encode='ordinal',
)


>>> CV Result (HistGradientBoosting)
accuracy_mean: 0.9942
accuracy_std:  0.0012
auc_mean:      0.9956
auc_std:       0.0021
CPU times: total: 2min 36s
Wall time: 28.7 s


## Extra Trees


In [22]:
%%time
from sklearn.ensemble import ExtraTreesClassifier

extra_trees_model = ExtraTreesClassifier(
    class_weight='balanced',
    random_state=SEED,
    n_jobs=-1,  # use all processors
)

test_sklearn(
    'ExtraTrees',
    extra_trees_model,
    encode='dummy',
)


>>> CV Result (ExtraTrees)
accuracy_mean: 0.9614
accuracy_std:  0.0061
auc_mean:      0.9773
auc_std:       0.0046
CPU times: total: 22.3 s
Wall time: 43.1 s


## Ridge Classifier


In [23]:
%%time
from sklearn.linear_model import RidgeClassifier

ridge_model = RidgeClassifier(
    # class_weight='balanced',  # worse...
    random_state=SEED,
)

test_sklearn(
    'Ridge',
    ridge_model,
    encode='dummy',
)


>>> CV Result (Ridge)
accuracy_mean: 0.9613
accuracy_std:  0.0060
auc_mean:      0.9524
auc_std:       0.0088
CPU times: total: 13.8 s
Wall time: 12.5 s


## Logistic Regression


In [24]:
%%time
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(
    solver='newton-cholesky',
    # class_weight='balanced',  # worse...
    random_state=SEED,
    n_jobs=-1,  # use all processors
)

test_sklearn(
    'Logistic',
    logistic_model,
    encode='dummy',
)


>>> CV Result (Logistic)
accuracy_mean: 0.9658
accuracy_std:  0.0063
auc_mean:      0.9547
auc_std:       0.0053
CPU times: total: 11.8 s
Wall time: 26.9 s


## K-Nearest Neighbors


In [25]:
%%time
from sklearn.neighbors import KNeighborsClassifier

KNN_model = KNeighborsClassifier(
    n_jobs=-1,  # use all processors
)

test_sklearn(
    'KNN',
    KNN_model,
    encode='dummy',
)


>>> CV Result (KNN)
accuracy_mean: 0.9507
accuracy_std:  0.0063
auc_mean:      0.5398
auc_std:       0.0171
CPU times: total: 8min 57s
Wall time: 1min


## AdaBoost


In [27]:
%%time
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

adaboost_model = AdaBoostClassifier(
    random_state=SEED,
)

test_sklearn(
    'AdaBoost',
    adaboost_model,
    encode='dummy',
    n_jobs=-1,
)


>>> CV Result (AdaBoost)
accuracy_mean: 0.9928
accuracy_std:  0.0021
auc_mean:      0.9946
auc_std:       0.0023
CPU times: total: 10.9 s
Wall time: 35.2 s


## Linear Discriminant Analysis


In [29]:
%%time
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda_model = LinearDiscriminantAnalysis()

test_sklearn(
    'LDA',
    lda_model,
    encode='dummy',
    n_jobs=-1,
)


>>> CV Result (LDA)
accuracy_mean: 0.9610
accuracy_std:  0.0076
auc_mean:      0.9524
auc_std:       0.0088
CPU times: total: 10.9 s
Wall time: 17.1 s


## MLP


In [36]:
%%time
from sklearn.neural_network import MLPClassifier

mlp_model = make_pipeline(
    StandardScaler(),
    MLPClassifier(
        hidden_layer_sizes=(100,),
        early_stopping=True,
        max_iter=1000,
        random_state=SEED,
    ),
)

test_sklearn(
    'MLP',
    mlp_model,
    encode='dummy',
    n_jobs=-1,
)


>>> CV Result (MLP)
accuracy_mean: 0.9822
accuracy_std:  0.0032
auc_mean:      0.9742
auc_std:       0.0078
CPU times: total: 11 s
Wall time: 47.3 s


## Decision Tree


In [30]:
%%time
from sklearn.tree import DecisionTreeClassifier

decision_tree_model = DecisionTreeClassifier(
    class_weight='balanced',
    random_state=SEED,
)

test_sklearn(
    'DecisionTree',
    decision_tree_model,
    encode='dummy',
    n_jobs=-1,
)


>>> CV Result (DecisionTree)
accuracy_mean: 0.9903
accuracy_std:  0.0017
auc_mean:      0.9482
auc_std:       0.0075
CPU times: total: 10.9 s
Wall time: 14.3 s


## SVM (With RBF Kernel)


In [37]:
%%time
from sklearn.svm import SVC

svm_model = make_pipeline(
    StandardScaler(),
    SVC(
        kernel='rbf',  # default
        random_state=SEED,
    ),
)

test_sklearn(
    'SVM',
    svm_model,
    encode='dummy',
    n_jobs=-1,
)


>>> CV Result (SVM)
accuracy_mean: 0.9729
accuracy_std:  0.0046
auc_mean:      0.9560
auc_std:       0.0052
CPU times: total: 11 s
Wall time: 4min


## Naive Bayes


In [31]:
%%time
from sklearn.naive_bayes import MultinomialNB

nb_model = make_pipeline(
    MinMaxScaler(),
    MultinomialNB(),
)

test_sklearn(
    'NaiveBayes',
    nb_model,
    encode='dummy',
    n_jobs=-1,
)


>>> CV Result (NaiveBayes)
accuracy_mean: 0.9614
accuracy_std:  0.0061
auc_mean:      0.6900
auc_std:       0.0648
CPU times: total: 10.9 s
Wall time: 11.6 s


## Save Result


In [39]:
# Write the metric table back to the file it was loaded from.
df_result.to_csv(path_or_buf=MODEL_RESULT_PATH)


## Summary


In [51]:
# One row per model, ordered by mean accuracy and then mean AUC,
# both descending.
df_summary = df_result.T.sort_values(
    ascending=False,
    by=['Accuracy Mean', 'AUC Mean'],
).copy()

# Axis titles for the rendered table.
df_summary.index.name = 'Model Name'
df_summary.columns.name = ''

# Render every metric with four decimals.
df_summary.style.format('{:.4f}')


Unnamed: 0_level_0,Accuracy Mean,Accuracy Std,AUC Mean,AUC Std
Model Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CatBoost,0.9957,0.0007,0.9957,0.0021
LightGB,0.9953,0.001,0.996,0.0018
XGBoost,0.9951,0.0008,0.9951,0.0018
HistGB,0.9942,0.0012,0.9956,0.0021
AdaBoost,0.9928,0.0021,0.9946,0.0023
DecisionTree,0.9903,0.0017,0.9482,0.0075
MLP,0.9822,0.0032,0.9742,0.0078
SVM,0.9729,0.0046,0.956,0.0052
Logistic,0.9658,0.0063,0.9547,0.0053
ExtraTrees,0.9614,0.0061,0.9773,0.0046
