# Baseline model

*Du passé faisons table rase*

As a simple baseline, we use a model where only the information at time T is used.

### Import 

In [1]:
from sepsis.io import get_train, get_valid
from sepsis import constants

import pandas as pd

In [2]:
# Load the training data with the project's `get_train` helper.
train = get_train()

In [3]:
# Split the training frame into the target column and the model inputs,
# the latter being restricted to the project's list of training features.
vars = constants.TRAIN_FEATURES
y_train = train['SepsisLabel']
X_train = train[vars]

In [4]:
def add_feat(X):
    """Append per-patient ratio features to the feature frame.

    Every column other than ``ID`` and the static columns (``Gender``,
    ``Age``, ``Unit1``, ``Unit2``, ``HospAdmTime``) is forward-filled within
    its ``ID`` group and then divided by three statistics computed within
    the same group: the running maximum, the running minimum and the
    4-row rolling mean.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain an ``ID`` column and the five static columns. Rows of a
        given ``ID`` are assumed to be in chronological order (TODO confirm
        against the data loader).

    Returns
    -------
    pandas.DataFrame
        ``X`` without ``ID``, followed by the ``<col>_omax``, ``<col>_omin``
        and ``<col>_oroll`` ratio columns. The result has one row per input
        row and carries the same index as ``X``.

    Notes
    -----
    ``<col>_oroll`` is NaN for the first three rows of every group because
    the rolling window needs four observations. A zero denominator follows
    the usual pandas division rules (``inf`` or NaN).
    """
    static_cols = ['Gender', 'Age', 'Unit1', 'Unit2', 'HospAdmTime']

    # Work on a positional index so that every intermediate frame shares the
    # same unique row labels, whatever index the caller passed in; the
    # caller's index is put back on the result at the end.
    original_index = X.index
    X = X.reset_index(drop=True)
    ids = X['ID']

    # Grouping by the ID Series (instead of moving ID into the index) keeps
    # the row labels intact through ffill / cummax / cummin.
    filled = X.drop(columns=static_cols + ['ID']).groupby(ids).ffill()
    per_patient = filled.groupby(ids)

    cummax = per_patient.cummax()
    cummin = per_patient.cummin()
    # groupby().rolling() returns rows ordered by group key under an
    # (ID, row) MultiIndex: drop the ID level and restore the row order so
    # the division below is aligned row for row.
    rollmean = per_patient.rolling(4).mean().droplevel(0).reindex(filled.index)

    features = pd.concat(
        (
            X.drop(columns='ID'),
            (filled / cummax).add_suffix('_omax'),
            (filled / cummin).add_suffix('_omin'),
            (filled / rollmean).add_suffix('_oroll'),
        ),
        axis=1,
    )
    features.index = original_index
    return features

In [5]:
# Build the features from the raw training frame rather than from X_train
# itself, so that re-running this cell does not feed already-transformed
# data (whose 'ID' column has been dropped) back into add_feat.
X_train = add_feat(train[constants.TRAIN_FEATURES])

### Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
# Baseline pipeline: impute missing values (SimpleImputer left on its default
# strategy), standardise the features, then fit a class-weighted logistic
# regression.
pipe = Pipeline(
    steps=[
        ('impute', SimpleImputer(missing_values=np.nan)),
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression(class_weight='balanced', C=0.5)),
    ]
)

In [21]:
from sklearn.model_selection import cross_validate
# 5-fold cross-validation of the baseline pipeline on four classification metrics.
cross_validate(
    pipe,
    X_train,
    y_train,
    cv=5,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
)

{'fit_time': array([11.90558434, 12.38771319, 13.3988018 , 11.73445964, 12.69082808]),
 'score_time': array([1.45044923, 1.10850048, 1.00815701, 0.91079378, 0.8796792 ]),
 'test_accuracy': array([0.62269342, 0.63567996, 0.67609775, 0.73479096, 0.7433023 ]),
 'test_f1': array([0.06161152, 0.06026287, 0.06878901, 0.07325038, 0.07213057]),
 'test_recall': array([0.68884409, 0.64964158, 0.66532258, 0.5828853 , 0.55475924]),
 'test_precision': array([0.03224792, 0.03159695, 0.03626949, 0.03908081, 0.03857294])}

In [None]:
from sklearn.model_selection import GridSearchCV
# Grid over the C parameter of the pipeline's 'lr' step; candidates are
# ranked by F1 score.
param_grid = [{'lr__C': [0.1, 0.5, 1, 10, 100, 1000]}]
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='f1')

In [None]:
# Run the grid search over every candidate in param_grid on the training data.
gs.fit(X_train, y_train)

In [None]:
# Keep the best pipeline found by the search and display its
# cross-validated F1 score (the search was configured with scoring='f1').
best = gs.best_estimator_
gs.best_score_

0.05534478677446324

Not good: the best cross-validated F1 score found by the grid search is only about 0.055.

In [None]:
from sklearn.model_selection import cross_validate
# Re-score the tuned pipeline on the same four metrics, using the default CV splitter.
cross_validate(
    best,
    X_train,
    y_train,
    scoring=['accuracy', 'f1', 'recall', 'precision'],
)

{'fit_time': array([2.56453085, 2.64168382, 2.2443347 , 2.12499452, 2.26426649]),
 'score_time': array([0.41758394, 0.48809052, 0.33798933, 0.37431622, 0.33893943]),
 'test_accuracy': array([0.55792183, 0.55432072, 0.62294316, 0.70410099, 0.71147641]),
 'test_f1': array([0.05035953, 0.05239763, 0.05559087, 0.06171846, 0.05665745]),
 'test_recall': array([0.65188172, 0.68525986, 0.6171595 , 0.54121864, 0.48174692]),
 'test_precision': array([0.02619144, 0.02724026, 0.02910631, 0.03272515, 0.03009865])}

In [None]:
# TODO: evaluate on the validation set; this needs custom metrics.