In [1]:
# Tuning row subsampling

In [4]:
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot

In [6]:
data = read_csv('train.csv')
dataset = data.values

In [7]:
X = dataset[:,0:94]
y = dataset[:,94]

In [8]:
label_encoded_y = LabelEncoder().fit_transform(y)
model = XGBClassifier()

In [10]:
subsample = [0.5]
param_grid = dict(subsample=subsample)

In [11]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss",n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X, label_encoded_y)

In [12]:
print("Best: %f using %s" %(grid_result.best_score_, grid_result.best_params_))

Best: -0.000626 using {'subsample': 0.5}


In [13]:
means = grid_result.cv_results_["mean_test_score"]
stds = grid_result.cv_results_["std_test_score"]
params = grid_result.cv_results_["params"]

In [14]:
for mean, stdev, param in zip(means, stds, params):
  print("%f (%f) with: %r" %(mean, stdev, param))

-0.000626 (0.000431) with: {'subsample': 0.5}


In [15]:
pyplot.errorbar(subsample, means, yerr=stds)
pyplot.title("XGBoost subsample vs Log Loss")
pyplot.xlabel('subsample')
pyplot.ylabel('Log Loss')
pyplot.savefig('subsample.png')

In [16]:
# tuning by column subsampling

In [17]:
model = XGBClassifier()
colsample_bytree = [0.1]

In [18]:
param_grid = dict(colsample_bytree=colsample_bytree)

In [19]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

In [20]:
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss",n_jobs=-1,cv=kfold)

In [21]:
grid_result = grid_search.fit(X,label_encoded_y)

In [22]:
print("Best: %f using %s" %(grid_result.best_score_,grid_result.best_params_))

Best: -0.235667 using {'colsample_bytree': 0.1}


In [23]:
means = grid_result.cv_results_["mean_test_score"]
stds = grid_result.cv_results_["std_test_score"]
params=grid_result.cv_results_["params"]

In [25]:
for mean, stdev, param in zip(means, stds, params):
  print("%f (%f) with: %r" %(mean, stdev, param))

-0.235667 (0.002086) with: {'colsample_bytree': 0.1}


In [27]:
pyplot.errorbar(colsample_bytree, means, yerr=stds)
pyplot.title("XGBoost colsample_bytree vs Log Loss")
pyplot.xlabel('colsample_bytree')
pyplot.ylabel('Log Loss')
pyplot.savefig('colsample_bytree.png')

In [28]:
# Tuning column sampling by split

In [29]:
model = XGBClassifier()
colsample_bylevel = [0.3]
param_grid = dict(colsample_bylevel=colsample_bylevel)

In [30]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

In [31]:
grid_search=GridSearchCV(model, param_grid, scoring="neg_log_loss",n_jobs=-1, cv=kfold)

In [32]:
grid_result = grid_search.fit(X, label_encoded_y)

In [33]:
print("Best: %f using %s" %(grid_result.best_score_, grid_result.best_params_))

Best: -0.008942 using {'colsample_bylevel': 0.3}


In [34]:
means = grid_result.cv_results_["mean_test_score"]
stds = grid_result.cv_results_["std_test_score"]
params = grid_result.cv_results_["params"]

In [35]:
for mean, stdev, param in zip(means, stds, params):
  print("%f (%f) with: %r" %(mean, stdev, param))

-0.008942 (0.000371) with: {'colsample_bylevel': 0.3}


In [36]:
pyplot.errorbar(colsample_bylevel, means, yerr=stds)

<ErrorbarContainer object of 3 artists>

In [37]:
pyplot.title("XGBoost colsample_bylevel vs Log Loss")
pyplot.xlabel("colsample_bylevel")
pyplot.ylabel('Log Loss')
pyplot.savefig('colsample_bylevel.png')