In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import (cross_val_score, train_test_split,
                                    GridSearchCV, RandomizedSearchCV)
from sklearn.preprocessing import Imputer
%load_ext autoreload
%autoreload 2

# Notebook-wide random seed; it is the default `random_state` of optimize(),
# which forwards it to XGBoost as the 'seed' parameter.
# Initial results were produced with SEED = 42; the value was changed to vary the runs.
SEED = 25

In [2]:
# Read the train/test CSV files and discard the 'id' column from each frame.
train = pd.read_csv('data/train_final.csv').drop(['id'], axis=1)
test = pd.read_csv('data/test_final.csv').drop(['id'], axis=1)

In [3]:
# Separate the target column "Y" from the feature columns.
X = train.drop(["Y"], axis=1)
y = train["Y"]
# The test frame is used as-is; X_test refers to the same object as `test`.
X_test = test

In [4]:
# Impute missing features
from sklearn.preprocessing import Imputer

# Column-wise mean imputation. The imputer is fitted on the training features
# only, and the same fitted imputer is then applied to the test features.
# Disabled alternative (fill each column with its most frequent value):
# train = train.apply(lambda x: x.fillna(x.value_counts().index[0]))
# TODO: Impute dataframe so that F5 uses median
# and F19 uses mean. For now, we'll impute via mean for both.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
train_xform, test_xform = imp.transform(X), imp.transform(X_test)

# The imputer returns plain arrays, so wrap them back into DataFrames
# carrying the original column names.
X = pd.DataFrame(train_xform, columns=X.columns)
X_test = pd.DataFrame(test_xform, columns=X_test.columns)

X.head(5)

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27
0,1.0,0.0,0.107576,0.0,0.0,1.0,1.0,1.0,1.0,0.0,...,104.0,22902.0,1.0,0.0,18.0,0.042295,1.0,0.0,27.0,0.02825
1,1.0,0.0,0.142357,0.0,0.0,7.0,1.0,1.0,1.0,1.0,...,144.0,11400.0,1.0,0.0,8.0,0.021417,1.0,0.0,67.0,0.253574
2,1.0,0.0,0.492318,0.0,3.0,4205.0,1.0,1.0,3.0,1.0,...,112.0,4833.0,1.0,0.0,13.0,0.502212,1.0,1.0,35.0,0.373397
3,1.0,0.0,-0.053028,0.0,2.0,2.0,1.0,1.0,5.0,2.0,...,127.0,3250.0,1.0,1.0,8.0,0.0,1.0,0.0,50.0,0.674254
4,1.0,0.0,0.730797,0.0,0.0,11.0,1.0,1.0,1.0,1.0,...,148.0,4000.0,1.0,1.0,5.0,0.787592,1.0,0.0,71.0,0.371157


In [5]:
# Split data: hold out 10% of the rows as a validation set.
# score() evaluates every hyperopt trial on X_val / y_val, so these names must
# be defined here for the tuning cells to run on a freshly restarted kernel.
# The un-split X / y remain available for anything that needs the full data.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, random_state=SEED)

In [6]:
import os
# NOTE(review): this comment used to say "1 thread to avoid possible subprocess
# call hangs", but the value assigned below is '4' — confirm which is intended.
os.environ['OMP_NUM_THREADS'] = '4' # Set the OpenMP thread-count environment variable to 4
import hyperopt

### Credits to this Kaggle post for inspiration:
### https://www.kaggle.com/yassinealouini/predicting-red-hat-business-value/hyperopt-the-xgboost-model/code

In [7]:
# Set up logging for XGBoost param tuning.

import logging
import os

# basicConfig() opens the log file right away and raises FileNotFoundError when
# the target directory does not exist, so make sure "logs/" is there first.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(filename="logs/xgb_hyperopt.log", level=logging.INFO)

In [8]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
# Hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Scoring and optimization functions

def score(params):
    """Train one XGBoost model with ``params`` and return its hyperopt loss.

    Parameters
    ----------
    params : dict
        XGBoost training parameters plus an ``'n_estimators'`` entry.
        ``'n_estimators'`` is used as the number of boosting rounds and is
        not forwarded to XGBoost. The dict passed in is left unmodified.

    Returns
    -------
    dict
        ``{'loss': 1 - auc, 'status': STATUS_OK}``, where ``auc`` is the
        ROC AUC of the model's predictions on the validation data.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and
    ``y_val``; they must be defined before this function is called.
    """
    logging.info("Training with params: ")
    logging.info(params)
    # Work on a shallow copy so the caller's dict keeps its 'n_estimators'
    # entry (deleting it in place makes a second call with the same dict fail).
    train_params = dict(params)
    num_round = int(train_params.pop('n_estimators'))
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm_model = xgb.train(train_params, dtrain, num_round,
                          evals=watchlist,
                          verbose_eval=500)
    predictions = gbm_model.predict(dvalid,
                                    ntree_limit=gbm_model.best_iteration + 1)
    auc = roc_auc_score(y_val, predictions)
    # TODO: Add the importance for the selected features
    logging.info("\tScore {0}\n\n".format(auc))
    # hyperopt minimises the returned loss, so report 1 - AUC.
    loss = 1 - auc
    return {'loss': loss, 'status': STATUS_OK}

In [9]:


def optimize(random_state=SEED, max_evals=250, trials=None):
    """Search the XGBoost hyperparameter space with hyperopt's TPE algorithm.

    Parameters
    ----------
    random_state : int
        Value passed to XGBoost as its ``'seed'`` parameter.
    max_evals : int
        Number of hyperopt evaluations (calls to ``score``) to run.
    trials : hyperopt.Trials or None
        Optional trials object for recording every evaluation; ``None``
        lets ``fmin`` manage its own.

    Returns
    -------
    dict
        The best value found for each searched hyperparameter. ``max_depth``
        is the actual tree depth, not the index of the chosen option.
    """
    # To learn more about XGBoost parameters, head to this page:
    # https://github.com/dmlc/xgboost/blob/master/doc/parameter.md

    # hp.quniform yields floats, so max_depth is drawn with hp.choice from an
    # explicit list of integer depths instead.
    depth_choices = np.arange(5, 30, 4, dtype=int)
    space = {
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        # quniform rounds each draw to a multiple of q. The lower bound must be
        # at least q, otherwise small draws round down to eta == 0.0 and the
        # trial is wasted on a model that cannot learn.
        'eta': hp.quniform('eta', 0.025, 0.1, 0.025),
        'max_depth': hp.choice('max_depth', depth_choices),
        'min_child_weight': hp.quniform('min_child_weight', 1, 7, 2),
        'subsample': hp.quniform('subsample', 0.6, 1, 0.1),
        'gamma': hp.quniform('gamma', 0.1, 1, 0.2),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.1),
        'eval_metric': 'auc',
        'objective': 'binary:logistic',
        # Increase this number if you have more cores. Otherwise, remove it and
        # it will default to the maximum number.
        'nthread': 4,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
        'seed': random_state
    }
    # Use the fmin function from Hyperopt to find the best hyperparameters
    best = fmin(score, space, algo=tpe.suggest,
                trials=trials,
                max_evals=max_evals)
    # For hp.choice parameters fmin reports the index of the selected option;
    # translate it back to the real depth so the result can be used directly.
    best['max_depth'] = int(depth_choices[best['max_depth']])
    return best

In [10]:
# Run the hyperparameter search with its default arguments and print the result.
best_hyperparams = optimize()
print("The best hyperparameters are: ", "\n")
print(best_hyperparams)

[0]	eval-auc:0.828442	train-auc:0.842444
[0]	eval-auc:0.824303	train-auc:0.839469
[0]	eval-auc:0.814955	train-auc:0.846895
[500]	eval-auc:0.8392	train-auc:0.978388
[0]	eval-auc:0.799387	train-auc:0.8315
[0]	eval-auc:0.830811	train-auc:0.854556
[500]	eval-auc:0.82435	train-auc:0.999999
[0]	eval-auc:0.821289	train-auc:0.851162
[0]	eval-auc:0.819121	train-auc:0.846984
[500]	eval-auc:0.838219	train-auc:0.994233
[0]	eval-auc:0.828271	train-auc:0.860166
[500]	eval-auc:0.819217	train-auc:1
[0]	eval-auc:0.821589	train-auc:0.850649
[500]	eval-auc:0.812064	train-auc:0.999999
[0]	eval-auc:0.821473	train-auc:0.851447
[500]	eval-auc:0.822344	train-auc:1
[0]	eval-auc:0.821829	train-auc:0.848365
[500]	eval-auc:0.837029	train-auc:0.991455
[0]	eval-auc:0.828915	train-auc:0.842521
[500]	eval-auc:0.848834	train-auc:0.927116
[0]	eval-auc:0.821121	train-auc:0.850302
[0]	eval-auc:0.828965	train-auc:0.86053
[0]	eval-auc:0.828893	train-auc:0.860531
[500]	eval-auc:0.826926	train-auc:1
[0]	eval-auc:0.819677	tra

KeyboardInterrupt: 