In [1]:
import time
import pandas as pd
import numpy as np
from sklearn import pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from hyperopt import hp

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
X = df.loc[:,0:7]
y = df.loc[:,8]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# GridSearchCV

In [5]:
param_gridsearch = {
    'learning_rate' : [0.01, 0.1, 1],
    'max_depth' : [5, 10, 15],
    'max_leaves' : [5, 25, 50, 100, 500],
    'booster': ['gbtree', 'gblinear','dart'],
    'colsample_bytree' : [0.6, 0.75, 1],
}

In [None]:
start = time.time()

grid_obj = GridSearchCV(estimator=XGBClassifier(),
                        param_grid=param_gridsearch,
                        cv=5,
                        refit=True,
                        return_train_score=False,
                        scoring = 'accuracy',
                               )

grid_obj.fit(X_train, y_train)

estimator = grid_obj.best_estimator_
cvs = cross_val_score(estimator, X_train, y_train, cv=5)
results = pd.DataFrame(grid_obj.cv_results_)

In [7]:
print("##### Results")
print("Score best parameters: ", grid_obj.best_score_)
print("Best parameters: ", grid_obj.best_params_)
print("Cross-validation Score: ", cvs.mean())
print("Test Score: ", estimator.score(X_test, y_test))
print("Time elapsed: ", time.time() - start)

##### Results
Score best parameters:  0.7623084099693456
Best parameters:  {'booster': 'dart', 'colsample_bytree': 0.75, 'learning_rate': 0.01, 'max_depth': 5, 'max_leaves': 5}
Cross-validation Score:  0.7623084099693456
Test Score:  0.7987012987012987
Time elapsed:  312.0681128501892


# RandomSearchCV

In [8]:
param_random = {
    'learning_rate': list(np.logspace(np.log(0.01), np.log(0.99), num = 500, base=3)),
    'max_depth': list(range(5, 15)),
    'max_leaves': list(range(5, 500)),
    'boosting_type': ['gbtree', 'gblinear','dart'],
    'colsample_bytree': list(np.linspace(0.6, 1, 500)),
}

In [None]:
start = time.time()

grid_obj = RandomizedSearchCV(estimator=XGBClassifier(),
                            param_distributions=param_random,
                            cv=5,
                            refit=True,
                            return_train_score=False,
                            scoring = 'accuracy',
                            random_state=1)
grid_obj.fit(X_train, y_train)

estimator = grid_obj.best_estimator_
cvs = cross_val_score(estimator, X_train, y_train, cv=5)
results = pd.DataFrame(grid_obj.cv_results_)

In [10]:
print("##### Results")
print("Score best parameters: ", grid_obj.best_score_)
print("Best parameters: ", grid_obj.best_params_)
print("Cross-validation Score: ", cvs.mean())
print("Test Score: ", estimator.score(X_test, y_test))
print("Time elapsed: ", time.time() - start)

##### Results
Score best parameters:  0.7492736238837798
Best parameters:  {'max_leaves': 329, 'max_depth': 5, 'learning_rate': 0.006612261287076887, 'colsample_bytree': 0.8004008016032064, 'boosting_type': 'dart'}
Cross-validation Score:  0.7492736238837798
Test Score:  0.7987012987012987
Time elapsed:  4.390085220336914


In [None]:
param_hyperopt= {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 15, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 5, 35, 1)),
    'num_leaves': scope.int(hp.quniform('num_leaves', 5, 50, 1)),
    'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart']),
    'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
    'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
}