In [1]:
import pandas as pd
import numpy as np

X_train_df = pd.read_csv('../data/final/X_train_df.csv')
X_test_df = pd.read_csv('../data/final/X_test_df.csv')
y_train_ser = pd.read_csv('../data/final/y_train_ser.csv')
y_test_ser = pd.read_csv('../data/final/y_test_ser.csv')

In [4]:
X_train_df.drop('Unnamed: 0', axis=1, inplace=True)
X_test_df.drop('Unnamed: 0', axis=1, inplace=True)
y_train_ser

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
55755,0
55756,0
55757,0
55758,0


In [7]:
from imblearn.over_sampling import SMOTE
random_state=2020
sm = SMOTE(sampling_strategy='minority', random_state=random_state)
X_train_res, y_train_res = sm.fit_resample(X_train_df.values, y_train_ser.values)
y_train_res.sum

<function ndarray.sum>

In [8]:
y_train_res.sum()/len(y_train_res)

0.5

In [10]:
SEED=2020
# import datetime
from datetime import datetime

# import a host of Scikit-learn models
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# import model metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# import ROC_AUC scoring, we will use area under the ROC curve for comparison
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, classification_report

def get_models():
    """Generate a library of base learners."""
    nb = GaussianNB()
    svc = SVC(C=100, probability=True)
    knn = KNeighborsClassifier(n_neighbors=3)
    lr = LogisticRegression(C=1, random_state=SEED, solver='liblinear')
    nn = MLPClassifier((100, 50), early_stopping=True, random_state=SEED, max_iter=400, activation='tanh')
    gb = GradientBoostingClassifier(n_estimators=100, random_state=SEED)
    rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED)

    models = {#'svm': svc,
              'knn': knn,
              'naive bayes': nb,
              'mlp-nn': nn,
              'random forest': rf,
              'gbm': gb,
              'logistic': lr,
              }

    return models


def train_predict(model_list, xtrain, ytrain, xtest):
    """Fit models in list on training set and return preds"""
    P = np.zeros((y_test.shape[0], len(model_list)))
    P = pd.DataFrame(P)

    print("Fitting models.")
    cols = list()
    for i, (name, m) in enumerate(models.items()):
        current_time = datetime.now() 
        print("%s..." % name, end=" ", flush=False)
        m.fit(xtrain, ytrain)
        P.iloc[:, i] = m.predict_proba(xtest)[:, 1]
        cols.append(name)
        time_elapsed = datetime.now() - current_time
        print("Time elapsed ", time_elapsed)
        print("done")

    P.columns = cols
    print("Done.\n")
    return P


def score_models(P, y):
    """Score model in prediction DF"""
    print("Scoring models.")
    for m in P.columns:
        score = roc_auc_score(y, P.loc[:, m])
        ap = average_precision_score(y, P.loc[:, m])
        print("%-26s: %.3f, %.3f" % (m, score, ap))
    print("Done.\n")

In [13]:
y_test = y_test_ser.values
models = get_models()
P = train_predict(models, X_train_res, y_train_res, X_test_df.values)
score_models(P, y_test)

Fitting models.
knn... Time elapsed  0:03:30.309751
done
naive bayes... Time elapsed  0:00:00.143163
done
mlp-nn... Time elapsed  0:01:08.860761
done
random forest... Time elapsed  0:00:02.780074
done
gbm... Time elapsed  0:02:38.615075
done
logistic... Time elapsed  0:00:08.553231
done
Done.

Scoring models.
knn                       : 0.626, 0.042
naive bayes               : 0.665, 0.077
mlp-nn                    : 0.642, 0.060
random forest             : 0.688, 0.054
gbm                       : 0.736, 0.083
logistic                  : 0.754, 0.100
Done.



In [14]:
from imblearn.over_sampling import ADASYN
sm = ADASYN(sampling_strategy='minority', random_state=random_state)
X_train_res, y_train_res = sm.fit_resample(X_train_df.values, y_train_ser.values)
y_train_res.sum()/len(y_train_res)

0.49998156851903053

In [15]:
y_test = y_test_ser.values
models = get_models()
P = train_predict(models, X_train_res, y_train_res, X_test_df.values)
score_models(P, y_test)

Fitting models.
knn... Time elapsed  0:03:36.086396
done
naive bayes... Time elapsed  0:00:00.152936
done
mlp-nn... Time elapsed  0:00:46.870722
done
random forest... Time elapsed  0:00:02.463693
done
gbm... Time elapsed  0:02:11.727784
done
logistic... Time elapsed  0:00:08.628508
done
Done.

Scoring models.
knn                       : 0.625, 0.041
naive bayes               : 0.663, 0.077
mlp-nn                    : 0.680, 0.061
random forest             : 0.679, 0.056
gbm                       : 0.739, 0.082
logistic                  : 0.753, 0.099
Done.



In [16]:
X_train_df.head()

Unnamed: 0,age,min_result_BUN,min_result_Hct,min_result_Hgb,min_result_RBC,min_result_WBC x 1000,min_result_bicarbonate,min_result_calcium,min_result_chloride,min_result_creatinine,...,ethnicity_Caucasian,ethnicity_Hispanic,ethnicity_Other/Unknown,unittype_CCU-CTICU,unittype_CSICU,unittype_Cardiac ICU,unittype_MICU,unittype_Med-Surg ICU,unittype_Neuro ICU,unittype_SICU
0,0.406268,1.023213,0.262566,0.356537,0.568954,0.561636,-0.061688,0.664046,0.413381,-0.43405,...,1,0,0,0,0,0,0,1,0,0
1,0.346603,-0.677429,0.84041,0.882307,0.452071,-0.262118,-0.620186,0.334288,-0.325351,-0.560845,...,1,0,0,0,0,0,0,1,0,0
2,-1.443338,-0.677429,1.187116,1.182747,0.744279,0.2609,0.869143,-0.325227,0.117888,-0.51858,...,0,0,0,1,0,0,0,0,0,0
3,-0.966021,-1.03173,1.007342,1.332967,0.311811,-0.667457,0.124478,1.983076,-1.507323,-0.43405,...,1,0,0,0,0,0,0,1,0,0
4,0.823921,-0.323129,-1.085736,-1.183217,-1.35962,-0.235967,1.427641,0.664046,0.117888,-0.152282,...,1,0,0,0,0,0,0,1,0,0


In [20]:
#gb = GradientBoostingClassifier(n_estimators=100, random_state=SEED)
#gb.fit(X_train_df.values, y_train_ser.values)
#y_pred = gb.predict_proba(X_test_df.values)[:, 1]
#score = roc_auc_score(y_test_ser.values, y_pred)
ap = average_precision_score(y_test_ser.values, y_pred)
print(score, ap)

0.7785718487032491 0.11621831939118292


In [21]:
gb

GradientBoostingClassifier(random_state=2020)

In [24]:
feat_imp = pd.DataFrame({'features':X_train_df.columns, 'feature_importance':gb.feature_importances_})
feat_imp.sort_values(by='feature_importance', ascending=False)

Unnamed: 0,features,feature_importance
22,max_result_creatinine,0.373892
7,min_result_calcium,0.083989
32,delta_result_bicarbonate,0.040206
5,min_result_WBC x 1000,0.037507
6,min_result_bicarbonate,0.031893
...,...,...
53,ethnicity_Caucasian,0.000000
52,ethnicity_Asian,0.000000
51,ethnicity_African American,0.000000
49,from_or_Yes,0.000000


In [26]:
import pickle
filename = '../models/gbm_initial.pkl'
pickle.dump(gb, open(filename, 'wb'))