Things to check:

-Maybe we have too many features? Try dropping some of the less important
categorical/engineered ones

-Other methods of upsampling? Maybe combine upsampling and downsampling?

-Tuning parameters of SMOTE? A cursory look suggests k_neighbors=2 works best

-Maybe the paper just used training AUC??


In [1]:
from preprocess import surgery_preprocess # A .py with preprocessing code
# The models we'll use, minus XGBoost
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# For eval
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, recall_score, accuracy_score
# imblearn is 'imbalanced learn,' an sklearn-compatible package for
# dealing with imbalanced data
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTENC, BorderlineSMOTE, SMOTE, ADASYN,KMeansSMOTE, SVMSMOTE
from scipy.stats import uniform
# HyperOpt is a hyperparameter tuning package
from hyperopt import fmin, tpe, anneal, hp, Trials, space_eval
import numpy as np

# surgery_preprocess comes from the local preprocess.py; its four return
# values are unpacked as the train/test features and labels used by every
# cell below.
X_train, X_test, y_train, y_test = surgery_preprocess()
# Last expression of the cell, so the notebook displays the feature names.
X_train.columns

Index(['FVC', 'FEV1', 'Performance', 'Pain', 'Haemoptysis', 'Dyspnoea',
       'Cough', 'Weakness', 'Tumor_size', 'Type2_diabetes', 'MI_6months',
       'PAD', 'Smoking', 'Asthma', 'Age', 'FEV1/FVC', 'FVC_deficit',
       'FEV1_deficit', 'FEV1/FVC_deficit', 'FEV1^2', 'FVC^2', 'Age*FVC',
       'Age*FEV1', 'FVC*FEV1', 'FVC^2*FEV1', 'FVC*FEV1^2', '_DGN2', '_DGN3',
       '_DGN4', '_DGN5'],
      dtype='object')

In [None]:
# Initializing a few things that will be used in several cells

# One fixed seed for the CV splitter and both resamplers. With shuffle=True
# and no seed, every cross_val_score call inside a HyperOpt objective would
# draw different folds (and different synthetic samples), so trial scores
# would differ by sampling noise as well as by hyperparameters, and results
# would not be repeatable between runs.
RANDOM_STATE = 42

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
# SMOTENC is built to work with categorical data. ROS randomly resamples the
# minority class. Similar performance so far, but ROS is faster.
# categorical_features are column positions in X_train: 2-13 are
# 'Performance' through 'Asthma', 26-29 are the '_DGN*' dummy columns.
smoter_nc = SMOTENC(
    categorical_features=list(range(2, 14)) + list(range(26, 30)),
    k_neighbors=2,
    random_state=RANDOM_STATE,
)
ros = RandomOverSampler(random_state=RANDOM_STATE)

### Logistic Regression

In [None]:
# HyperOpt requires you to manually build the objective function which
# it will try to minimize. Only argument needed is params, which are the
# hyperparameters it tries. Might be able to use other parameters to the
# objective using functools?
def lr_objective(params):
    """Return the HyperOpt loss for one logistic-regression trial.

    Args:
        params: dict of pipeline parameters sampled from the search space
            (keys such as 'model__C'), applied to the global ``lr_pipe``.

    Returns:
        1 minus the mean cross-validated ROC AUC, so that minimising the
        loss maximises AUC.
    """
    lr_pipe.set_params(**params)
    # Score on the training split with the shared stratified folds, the
    # same data and splitter the other objectives in this notebook use.
    score = cross_val_score(lr_pipe, X_train, y_train, cv=kf, scoring='roc_auc')
    return 1 - score.mean()
# Search space: only the model's C parameter, sampled uniformly on [0, 1000].
lr_params = {
    'model__C': hp.uniform('model__C', 0, 1000),
}
# Trials keeps a log of every evaluation HyperOpt performs.
lr_trials = Trials()

# Pipeline being tuned: SMOTENC upsampling followed by logistic regression.
lr = LogisticRegression()
lr_pipe = Pipeline([
    ('upsample', smoter_nc),
    ('model', lr),
])

# This call is what actually optimizes the parameters: it minimises
# lr_objective over lr_params using the annealing algorithm.
lr_best = fmin(
    fn=lr_objective,
    space=lr_params,
    algo=anneal.suggest,
    max_evals=200,
    trials=lr_trials,
)
# Translate fmin's result back into concrete parameter values.
lr_best_params = space_eval(lr_params, lr_best)

In [None]:
# Show the tuned parameters, refit on the whole training split with them,
# then report test-set ROC AUC and recall.
print(lr_best_params)
lr_pipe.set_params(**lr_best_params)
lr_pipe.fit(X_train, y_train)
# AUC is computed from the second predict_proba column; recall from the
# hard class predictions.
print(roc_auc_score(y_test, lr_pipe.predict_proba(X_test)[:, 1]))
print(recall_score(y_test, lr_pipe.predict(X_test)))

### SVM

In [None]:
def svm_objective(params):
    """Return the HyperOpt loss for one SVM trial.

    Applies ``params`` to the global ``sv_pipe``, cross-validates it on the
    training split with ``kf`` and returns 1 minus the mean ROC AUC.
    """
    sv_pipe.set_params(**params)
    fold_aucs = cross_val_score(
        sv_pipe, X_train, y_train, cv=kf, scoring='roc_auc'
    )
    mean_auc = fold_aucs.mean()
    return 1 - mean_auc

# A gaussian kernel was tried and often failed to fit, hence the linear
# kernel. Sigmoid/poly kernels or tuning class_weights may be worth a look.
# probability=True is needed because the evaluation cell calls predict_proba.
sv = svm.SVC(kernel='linear', probability=True)
sv_pipe = Pipeline([
    ('upsample', ros),
    ('model', sv),
])

# Search space: C sampled uniformly on [0, 200].
sv_params = {
    'model__C': hp.uniform('model__C', 0, 200),
}
sv_trials = Trials()

# Minimise svm_objective with the annealing algorithm for 50 evaluations.
sv_best = fmin(
    fn=svm_objective,
    space=sv_params,
    algo=anneal.suggest,
    max_evals=50,
    trials=sv_trials,
)

# NOTE(review): this reads the best assignment from the Trials object rather
# than from sv_best as the other cells do; presumably the two are the same.
sv_best_params = space_eval(sv_params, sv_trials.argmin)

In [None]:
# Show the tuned parameters, refit on the whole training split with them,
# then report test-set ROC AUC and recall.
print(sv_best_params)
sv_pipe.set_params(**sv_best_params)
sv_pipe.fit(X_train, y_train)
# AUC is computed from the second predict_proba column; recall from the
# hard class predictions.
print(roc_auc_score(y_test, sv_pipe.predict_proba(X_test)[:, 1]))
print(recall_score(y_test, sv_pipe.predict(X_test)))

### Random Forest

In [None]:
def rf_objective(params):
    """Return the HyperOpt loss for one random-forest trial.

    Args:
        params: dict of pipeline parameters sampled from the search space
            (keys such as 'model__max_depth'), applied to the global
            ``rf_pipe``.

    Returns:
        1 minus the mean cross-validated recall.
    """
    rf_pipe.set_params(**params)
    # Cross-validate on X_train, the same frame the pipeline is later fitted
    # on.
    # NOTE(review): this objective optimises recall while the LR and SVM
    # objectives optimise ROC AUC -- confirm that is intentional.
    score = cross_val_score(rf_pipe, X_train, y_train, cv=kf, scoring='recall').mean()
    return 1 - score

# Pipeline being tuned: random oversampling followed by a random forest.
rf = RandomForestClassifier()
rf_pipe = Pipeline([
    ('upsample', ros),
    ('model', rf),
])

# These search spaces are kinda arbitrary. Integer-valued settings are
# picked from a range with hp.choice; max_features is a continuous fraction.
rf_params = {
    'model__n_estimators': hp.choice('model__n_estimators', range(50, 200)),
    'model__min_samples_leaf': hp.choice('model__min_samples_leaf', range(1, 21)),
    'model__min_samples_split': hp.choice('model__min_samples_split', range(2, 21)),
    'model__max_features': hp.uniform('model__max_features', 0.5, 1.0),
    'model__max_depth': hp.choice('model__max_depth', range(3, 10)),
}
rf_trials = Trials()

# Minimise rf_objective with the TPE algorithm for 100 evaluations.
rf_best = fmin(
    fn=rf_objective,
    space=rf_params,
    algo=tpe.suggest,
    max_evals=100,
    trials=rf_trials,
)
# Translate fmin's result back into concrete parameter values.
rf_best_params = space_eval(rf_params, rf_best)

In [None]:
# Show the tuned parameters, refit on the whole training split with them,
# then report test-set ROC AUC and recall.
print(rf_best_params)
rf_pipe.set_params(**rf_best_params)
rf_pipe.fit(X_train, y_train)
# AUC is computed from the second predict_proba column; recall from the
# hard class predictions.
print(roc_auc_score(y_test, rf_pipe.predict_proba(X_test)[:, 1]))
print(recall_score(y_test, rf_pipe.predict(X_test)))