In [1]:
import time
import pandas as pd
import numpy as np
from sklearn import pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
X = df.loc[:,0:7]
y = df.loc[:,8]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# GridSearchCV

In [6]:
param_gridsearch = {
    'learning_rate' : [0.01, 0.1, 1],
    'max_depth' : [5, 10, 15],
    'max_leaves' : [5, 25, 50, 100, 500],
    'booster': ['gbtree', 'gblinear','dart'],
    'colsample_bytree' : [0.6, 0.75, 1],
}

In [None]:
start = time.time()

grid_obj = GridSearchCV(estimator=XGBClassifier(),
                        param_grid=param_gridsearch,
                        cv=2,
                        refit=True, return_train_score=False, scoring = 'accuracy')

grid_obj.fit(X_train, y_train)

estimator = grid_obj.best_estimator_
cvs = cross_val_score(estimator, X_train, y_train, cv=5)
results = pd.DataFrame(grid_obj.cv_results_)

In [9]:
print("##### Results")
print("Score best parameters: ", grid_obj.best_score_)
print("Best parameters: ", grid_obj.best_params_)
print("Cross-validation Score: ", cvs.mean())
print("Test Score: ", estimator.score(X_test, y_test))
print("Time elapsed: ", time.time() - start)

##### Results
Score best parameters:  0.7817589576547231
Best parameters:  {'booster': 'gblinear', 'colsample_bytree': 0.75, 'learning_rate': 1, 'max_depth': 10, 'max_leaves': 100}
Cross-validation Score:  0.7784752765560442
Test Score:  0.7337662337662337
Time elapsed:  241.50753903388977


# RandomSearchCV

In [10]:
param_random = {
    'learning_rate': list(np.logspace(np.log(0.01), np.log(0.99), num = 500, base=3)),
    'max_depth': list(range(5, 15)),
    'max_leaves': list(range(5, 500)),
    'boosting_type': ['gbtree', 'gblinear','dart'],
    'colsample_bytree': list(np.linspace(0.6, 1, 500)),
}

In [None]:
start = time.time()

grid_obj = RandomizedSearchCV(estimator=XGBClassifier(),
                            param_distributions=param_random,
                            cv=5,
                            refit=True,
                            return_train_score=False,
                            scoring = 'accuracy',
                            random_state=1)
grid_obj.fit(X_train, y_train)

estimator = grid_obj.best_estimator_
cvs = cross_val_score(estimator, X_train, y_train, cv=5)
results = pd.DataFrame(grid_obj.cv_results_)

In [12]:
print("##### Results")
print("Score best parameters: ", grid_obj.best_score_)
print("Best parameters: ", grid_obj.best_params_)
print("Cross-validation Score: ", cvs.mean())
print("Test Score: ", estimator.score(X_test, y_test))
print("Time elapsed: ", time.time() - start)

##### Results
Score best parameters:  0.7524057043849128
Best parameters:  {'max_leaves': 49, 'max_depth': 12, 'learning_rate': 0.18821179167129765, 'colsample_bytree': 0.8036072144288577, 'boosting_type': 'gbtree'}
Cross-validation Score:  0.7524057043849128
Test Score:  0.7467532467532467
Time elapsed:  14.16955280303955


# Bayesian 

In [13]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

In [14]:
# Define search space
param_space = {
    'n_estimators': Integer(50, 500),
    'max_depth': Integer(3, 20),
    'learning_rate': Real(1e-3, 0.2, prior='log-uniform'),
    'subsample': Real(0.5, 1.0),
    'colsample_bytree': Real(0.4, 1.0),
    'gamma': Real(1e-3, 10.0, prior='log-uniform'),
    'reg_alpha': Real(1e-6, 1.0, prior='log-uniform'),
    'reg_lambda': Real(1e-6, 1.0, prior='log-uniform')
}

In [None]:
# Start timing
start = time.time()

# Set up BayesSearchCV
bayes_cv = BayesSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=1),
    search_spaces=param_space,
    n_iter=30,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    refit=True,
    random_state=1,
    return_train_score=False)

# Fit
bayes_cv.fit(X_train, y_train)

# Evaluate best model
best_model = bayes_cv.best_estimator_
cvs = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')
results = pd.DataFrame(bayes_cv.cv_results_)

# End timing
end = time.time()
elapsed = end - start

In [16]:
# Output
print("##### Bayesian Search Results")
print(f"Score best parameters: {bayes_cv.best_score_:.4f}")
print(f"Best parameters: {bayes_cv.best_params_}")
print(f"Cross-validation Score: {cvs.mean():.4f}")
print(f"Test Score: {best_model.score(X_test, y_test):.4f}")
print(f"Time elapsed: {elapsed:.2f} seconds")

##### Bayesian Search Results
Score best parameters: 0.7622
Best parameters: OrderedDict([('colsample_bytree', 0.9896686905508654), ('gamma', 0.031083351083605607), ('learning_rate', 0.07058148871271128), ('max_depth', 7), ('n_estimators', 52), ('reg_alpha', 0.0009206404587371533), ('reg_lambda', 7.466394412768047e-05), ('subsample', 0.5678095465938425)])
Cross-validation Score: 0.7622
Test Score: 0.7468
Time elapsed: 29.79 seconds
