# Searching for the optimal parameters

In [1]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn import metrics
import numpy as np
import pandas as pd
import pickle
from sklearn.datasets import fetch_openml

  from pandas import MultiIndex, Int64Index


In [6]:
from sklearn.model_selection import GridSearchCV

In [3]:
# Fetch version 1 of the 'mnist_784' dataset from OpenML (downloads on first use).
# The returned object is dict-like, so list its keys to see what it carries.
mnist = fetch_openml(name='mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [7]:
# Pull the feature matrix and the label vector out of the dataset bunch,
# then print both shapes as a quick sanity check.
# NOTE(review): the labels appear to be strings ('8', '4', ...) judging by the
# prediction preview further down; some xgboost releases may insist on
# integer-encoded classes -- confirm before upgrading the library.
X = mnist["data"]
y = mnist["target"]
for _part in (X, y):
    print(_part.shape)

(70000, 784)
(70000,)


In [8]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

## Preprocessing

In [9]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both splits so nothing from the test set leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## XGBoost

There are in general two ways that you can control overfitting in XGBoost:

- The first way is to directly control model complexity.

    - This includes max_depth, min_child_weight and gamma.

- The second way is to add randomness to make training robust to noise.

    - This includes subsample and colsample_bytree.

    - You can also reduce stepsize eta. Remember to increase num_round when you do so.

[source](https://xgboost.readthedocs.io/en/stable/tutorials/param_tuning.html#:~:text=There%20are%20in,you%20do%20so.)

In [10]:
# modeling: XGBoost
# Base estimator built with no arguments, so every hyperparameter starts at
# the library default. The grid search below receives this object and
# varies the parameters named in param_grid.
model = XGBClassifier()

Gridsearch suggestions:  
https://towardsdatascience.com/doing-xgboost-hyper-parameter-tuning-the-smart-way-part-1-of-2-f6d255a45dde

This notebook follows that example but uses a much smaller grid to save time.


In [11]:
# Hyperparameter search space: every combination of the listed values is
# tried (3 learning rates x 2 depths x 1 x 1 x 2 column fractions = 12 candidates).
param_grid = dict(
    learning_rate=[0.10, 0.15, 0.30],   # step-size shrinkage (eta)
    max_depth=[3, 15],                  # shallow vs. deep trees
    min_child_weight=[1],               # held at a single value
    gamma=[0.0],                        # held at a single value
    colsample_bytree=[0.3, 0.5],        # fraction of columns sampled per tree
)

In [12]:
# Wrap the base model in an exhaustive grid search.
#   cv=2       -> only two folds per candidate, chosen to keep the run time down
#   n_jobs=-1  -> run the fits in parallel on all available cores
#   verbose=1  -> print a one-line summary of how many fits will be run
# NOTE(review): an earlier comment here called this cross-validation
# redundant because "xgb has cross-validation built in". xgboost's CV is a
# separate helper (xgboost.cv), not something the estimator's fit() runs, so
# the folds configured here look necessary for ranking candidates -- confirm.
xgb_grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=2,
    n_jobs=-1,
    verbose=1,
)

In [13]:
# conduct the search (this will take a while)
# With the grid and cv settings above this runs 12 candidates x 2 folds = 24
# model fits on the scaled training data. Later cells call predict/score on
# xgb_grid itself, which relies on the search keeping a fitted best model
# (refit is left at its default in the constructor call).
xgb_grid.fit(X_train_scaled, y_train)

Fitting 2 folds for each of 12 candidates, totalling 24 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
























































In [15]:
# Report the winning parameter combination, its mean cross-validated score,
# and how the selected model scores on the held-out test split.
# Each label and its value go on separate lines via sep="\n".
print("Best parameters:", xgb_grid.best_params_, sep="\n")
print("Best score in grid search:", xgb_grid.best_score_, sep="\n")
print("best model from grid search:", xgb_grid.score(X_test_scaled, y_test), sep="\n")

Best parameters:
{'colsample_bytree': 0.3, 'gamma': 0.0, 'learning_rate': 0.3, 'max_depth': 15, 'min_child_weight': 1}
Best score in grid search:
0.9690892857142857
best model from grid search:
0.9759285714285715


In [17]:
# Predict labels for the test split with the selected model, then eyeball the
# first ten predictions against the first ten true labels.
y_preds = xgb_grid.predict(X_test_scaled)
for _labels in (y_preds, y_test):
    print(list(_labels[:10]))

['8', '4', '8', '7', '7', '0', '6', '2', '7', '4']
['8', '4', '8', '7', '7', '0', '6', '2', '7', '4']


In [18]:
# Summarise test-set performance. Accuracy needs no averaging mode; the other
# three metrics are macro-averaged, i.e. each class counts equally.
print('Accuracy:', metrics.accuracy_score(y_test, y_preds))
for _label, _scorer in (
    ('Precision:', metrics.precision_score),
    ('Recall:', metrics.recall_score),
    ('F1 Score:', metrics.f1_score),
):
    print(_label, _scorer(y_test, y_preds, average='macro'))

Accuracy: 0.9759285714285715
Precision: 0.975946634556154
Recall: 0.9758378482860085
F1 Score: 0.9758807315441829


## Pickle the model

In [19]:
# xgboost
# Serialise the whole fitted grid-search object (search results plus the
# selected model) to disk. The `with` block guarantees the file handle is
# closed even if pickle.dump raises part-way through, which the previous
# manual open()/close() pair did not.
# NOTE: only ever unpickle this file from a trusted source -- loading a
# pickle can execute arbitrary code.
with open('xgb_grid_model.pkl', 'wb') as f:
    pickle.dump(xgb_grid, f)