In [None]:
# XGBoost on Otto dataset, Tune learning_rate
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
# load data
data = read_csv('train.csv')
dataset = data.values
# split data into X and y
# Column 94 is the class label.  Column 0 is skipped on purpose: in the Otto
# train.csv it is the row `id` (TODO confirm against the file header), which
# is not a product feature.  Feeding it to the model lets the trees key on row
# order and leaks the target, which shows up as an unrealistically small
# cross-validated log loss.  Only the 93 feature columns (1..93) are used.
X = dataset[:,1:94]
y = dataset[:,94]
# encode string class values as integers
label_encoded_y = LabelEncoder().fit_transform(y)
# grid search
model = XGBClassifier()
# Candidate learning rates to evaluate (currently a single value).
learning_rate = [0.1]
param_grid = dict(learning_rate=learning_rate)
# Stratified, shuffled folds with a fixed seed so the class mix is preserved
# in every split and the scores are reproducible.
kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=7)
# scoring="neg_log_loss": scores are negated log loss, so higher is better.
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X, label_encoded_y)

In [None]:
# Report the best mean CV score and the parameter setting that produced it.
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)


Best: -0.001683 using {'learning_rate': 0.1}


In [None]:
# Pull the per-candidate mean score, score spread and parameter dicts out of
# the grid-search results table.
cv_results = grid_result.cv_results_
means, stds, params = (
  cv_results[column]
  for column in ("mean_test_score", "std_test_score", "params")
)

In [None]:
# One line per candidate: mean score, standard deviation, parameters.
for summary_row in zip(means, stds, params):
  print("%f (%f) with : %r" % summary_row)

-0.001683 (0.000136) with : {'learning_rate': 0.1}


In [None]:
# Mean CV score for each learning rate, with the across-fold standard
# deviation drawn as error bars.
pyplot.errorbar(x=learning_rate, y=means, yerr=stds)
# Axis labels and title.
pyplot.xlabel('learning_rate')
pyplot.ylabel('Log Loss')
pyplot.title("XGBoost learning_rate vs Log Loss")
# Write the chart to disk.
pyplot.savefig('learning_rate.png')

In [None]:
# tuning learning rate and the number of trees

In [None]:
import numpy

In [None]:
# Map the class labels in y to integer codes.
label_encoder = LabelEncoder()
label_encoded_y = label_encoder.fit_transform(y)

In [None]:
# Candidate values for the two hyper-parameters being searched.
learning_rate = [0.1]
n_estimators = [100]
# Base estimator handed to the grid search.
model = XGBClassifier()

In [None]:
# Grid keys must be the estimator's parameter names.  The key for the number
# of trees is `n_estimators`; a misspelled key would not set that parameter,
# so every candidate would be fitted with the default number of trees.
param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)

In [None]:
# 10 stratified, shuffled folds; the fixed seed keeps the splits repeatable.
kfold = StratifiedKFold(
  n_splits=10,
  shuffle=True,
  random_state=7,
)

In [None]:
# Exhaustive search over param_grid, scored by negated log loss (higher is
# better) and run on all available cores.
grid_search = GridSearchCV(
  estimator=model,
  param_grid=param_grid,
  scoring="neg_log_loss",
  n_jobs=-1,
  cv=kfold,
)

In [None]:
# Run the search; fit() returns the fitted GridSearchCV object itself.
grid_result = grid_search.fit(X=X, y=label_encoded_y)

In [None]:
# Report the best mean CV score and the parameter combination behind it.
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)


Best: -0.001258 using {'learning_rate': 0.1, 'n_estimatorss': 100}


In [None]:
# Per-candidate results of the two-parameter search.
cv_results = grid_result.cv_results_
params = cv_results["params"]
stds = cv_results["std_test_score"]
means = cv_results["mean_test_score"]

In [None]:
# One line per candidate: mean score, standard deviation, parameters.
for summary_row in zip(means, stds, params):
  print("%f (%f) with: %r" % summary_row)

-0.001258 (0.001116) with: {'learning_rate': 0.1, 'n_estimatorss': 100}


In [None]:
# Arrange the flat list of mean scores as a 2-D table: one row per
# learning_rate candidate, one column per n_estimators candidate.
grid_shape = (len(learning_rate), len(n_estimators))
scores = numpy.array(means).reshape(grid_shape)

In [None]:
# Start a new figure before drawing.  With a non-interactive backend the
# pyplot "current figure" survives from cell to cell, so without this the
# curves below would be drawn on top of the earlier learning_rate error-bar
# chart (and inherit its title) when the figure is saved.
pyplot.figure()
# One curve per learning rate: mean CV score against the number of trees.
for value, score_row in zip(learning_rate, scores):
  pyplot.plot(n_estimators, score_row, label="learning_rate: " +str(value))

In [None]:
# Add a legend built from the `label=` of each plotted curve.
pyplot.legend()

<matplotlib.legend.Legend at 0x7f50afb01c50>

In [None]:
# Label the x axis with the hyper-parameter plotted along it.
pyplot.xlabel('n_estimators')

Text(0.5, 15.0, 'n_estimators')

In [None]:
# Label the y axis, then write the finished chart to disk.
pyplot.ylabel(ylabel='Log Loss')
pyplot.savefig(fname='n_estimators_vs_learning_rate.png')