In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import (cross_val_score, train_test_split,
                                    GridSearchCV, RandomizedSearchCV)
from sklearn.preprocessing import Imputer
%load_ext autoreload
%autoreload 2

# SEED = 42 # Initial results were performed with a SEED of 42, but let's change things up.
SEED = 25

In [2]:
train = pd.read_csv('data/train_final.csv')
test = pd.read_csv('data/test_final.csv')
train = train.drop(['id'], axis=1)
test = test.drop(['id'], axis=1)

In [3]:
# Divide dataset into X and y
y = train.Y
X = train.drop(["Y"], axis=1)
X_test = test

In [4]:
# Impute missing features
from sklearn.preprocessing import Imputer

# train = train.apply(lambda x: x.fillna(x.value_counts().index[0]))
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
train_xform = imp.fit_transform(X)
# TODO: Impute dataframe so that F5 uses median
# and F19 uses mean. For now, we'll impute via mean for both.

X = pd.DataFrame(train_xform, columns=X.columns)
test_xform = imp.transform(X_test)
X_test = pd.DataFrame(test_xform, columns=X_test.columns)

In [5]:
X_train = X
y_train = y

In [6]:
import os
os.environ['OMP_NUM_THREADS'] = '4' # Let OpenMP use 1 thread to avoid possible subprocess call hangs
import hyperopt

In [7]:
# Set up logging for XGBoost param tuning.

import logging
logging.basicConfig(filename="logs/xgb_skl_hyperopt.log", level=logging.INFO)

In [22]:
# import xgboost as xgb
# from sklearn.metrics import roc_auc_score

# dtrain = xgb.DMatrix(X_train, label=y_train)
# params = {  'n_estimators': 100, 
#             'eta': 0.1,
#             # A problem with max_depth casted to float instead of int with
#             # the hp.quniform method.
#             'max_depth': 25,
#             'min_child_weight': 6,
#             'subsample': 0.7,
#             'gamma': 0.3,
#             'colsample_bytree': 0.7,
#             'eval_metric': 'auc',
#             'objective': 'binary:logistic',
#             # Increase this number if you have more cores. Otherwise, remove it and it will default 
#             # to the maxium number. 
#             'nthread': 4,
#             'booster': 'gbtree',
#             'tree_method': 'exact',
#             'silent': 1,
#             'seed': SEED
#         }
# scores = xgb.cv(params, dtrain, nfold=5, stratified=True, verbose_eval=500)


[0]	train-auc:0.823153+0.0165849	test-auc:0.815401+0.0144789


In [40]:
# scores.tail(1).iloc[0, 1]

0.0098138053455324153

In [41]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
# Hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Scoring and optimization functions

def score(params):
    logging.info("Training with params: ")
    logging.info(params)
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = xgb.DMatrix(X_train, label=y_train)
    score_history = xgb.cv(params, dtrain, num_round,
                    nfold=5, stratified=True,
                    early_stopping_rounds=250,
                    verbose_eval=500)
    # Only use scores from last boosting round for this set of hyperparams.
    mean_last_round = score_history.tail(1).iloc[0, 0]
    std_last_round = score_history.tail(1).iloc[0, 1]
    # TODO: Add the importance for the selected features
    logging.info("\tMean Score: {0}\n".format(mean_last_round))
    logging.info("\tStd Dev: {0}\n\n".format(std_last_round))
    # The score function should return the loss (1-score)
    # since the optimize function looks for the minimum
    loss = 1 - mean_last_round
    return {'loss': loss, 'status': STATUS_OK}

In [42]:
def optimize(
             #trials, 
             random_state=SEED):
    """
    This is the optimization function that given a space (space here) of 
    hyperparameters and a scoring function (score here), finds the best hyperparameters.
    """
    # To learn more about XGBoost parameters, head to this page: 
    # https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
    space = {
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.quniform('eta', 0.01, 0.1, 0.025),
        # A problem with max_depth casted to float instead of int with
        # the hp.quniform method.
        'max_depth':  hp.choice('max_depth', np.arange(5, 30, 4, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 7, 2),
        'subsample': hp.quniform('subsample', 0.6, 1, 0.1),
        'gamma': hp.quniform('gamma', 0.1, 1, 0.2),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.1),
        'eval_metric': 'auc',
        'objective': 'binary:logistic',
        # Increase this number if you have more cores. Otherwise, remove it and it will default 
        # to the maxium number. 
        'nthread': 4,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
        'seed': random_state
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(score, space, algo=tpe.suggest, 
                # trials=trials, 
                max_evals=250)
    return best

In [43]:
best_hyperparams = optimize(
                            #trials
                            )
print("The best hyperparameters are: ", "\n")
print(best_hyperparams)

[0]	train-auc:0.824938+0.0166366	test-auc:0.813122+0.01288
[0]	train-auc:0.853022+0.00442814	test-auc:0.837581+0.0117243
[0]	train-auc:0.856823+0.0056061	test-auc:0.834943+0.0125085
[0]	train-auc:0.808594+0.0244222	test-auc:0.793251+0.0239726
[0]	train-auc:0.847141+0.00284951	test-auc:0.814121+0.0130268
[0]	train-auc:0.858265+0.00546674	test-auc:0.834697+0.0125382
[0]	train-auc:0.847142+0.00454815	test-auc:0.835843+0.0126234
[0]	train-auc:0.823617+0.016144	test-auc:0.817208+0.0117573
[0]	train-auc:0.815403+0.0125306	test-auc:0.815038+0.0126083
[0]	train-auc:0.854088+0.00518082	test-auc:0.836857+0.012655
[0]	train-auc:0.5+0	test-auc:0.5+0
[0]	train-auc:0.821155+0.0165794	test-auc:0.809751+0.0156826
[0]	train-auc:0.817899+0.0171294	test-auc:0.81166+0.0145074
[0]	train-auc:0.853713+0.00482869	test-auc:0.833493+0.0129354
[0]	train-auc:0.811689+0.0152934	test-auc:0.811075+0.0153527
[0]	train-auc:0.855371+0.00515924	test-auc:0.837334+0.0129644
[0]	train-auc:0.846948+0.00477786	test-auc:0.836

In [None]:
scores