# Searching for the optimal parameters

In [1]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn import metrics
import numpy as np
import pandas as pd
import pickle
from sklearn.datasets import fetch_openml

  from pandas import MultiIndex, Int64Index


In [2]:
from sklearn.model_selection import GridSearchCV

In [3]:
# import the mnist dataset
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [4]:
# separate features and target
X, y = mnist["data"], mnist["target"]
print(X.shape)
print(y.shape)

(70000, 784)
(70000,)


In [5]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                       test_size=0.2, 
                                       random_state=42)

## Preprocessing

In [6]:
# standardize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Single Decision Tree

In [7]:
# instantiate with arbitrary hyperparameters
model = DecisionTreeClassifier()

In [8]:
# define your parameter grid 
param_grid = {
                'max_depth':[5,10,15], 
                'criterion':['entropy','gini'], 
                'min_samples_leaf':[6,10,20],
                'class_weight':['balanced', None]
             }

In [9]:
# establish cross-validation and gridsearch
tree_grid = GridSearchCV(model, param_grid, cv=10, verbose=1,n_jobs=-1)

In [10]:
# conduct the search
tree_grid.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                         'max_depth': [5, 10, 15],
                         'min_samples_leaf': [6, 10, 20]},
             verbose=1)

In [12]:
# see the best parameters and their score
print("Best parameters:")
print(tree_grid.best_params_)
print("Best score in grid search:")
print(tree_grid.best_score_)
print("best model from grid search:")
print(tree_grid.score(X_test_scaled, y_test))

Best parameters:
{'class_weight': None, 'criterion': 'entropy', 'max_depth': 15, 'min_samples_leaf': 6}
Best score in grid search:
0.8764285714285714
best model from grid search:
0.881


In [13]:
# predict
y_preds=tree_grid.predict(X_test_scaled)
print(list(y_preds[:10]))
print(list(y_test[:10]))

['8', '4', '9', '7', '7', '0', '6', '2', '7', '4']
['8', '4', '8', '7', '7', '0', '6', '2', '7', '4']


In [14]:
# evaluate
print('Accuracy:', metrics.accuracy_score(y_test, y_preds))
print('Precision:', metrics.precision_score(y_test, y_preds,average='macro'))
print('Recall:', metrics.recall_score(y_test, y_preds,average='macro'))
print('F1 Score:', metrics.f1_score(y_test, y_preds,average='macro'))

Accuracy: 0.881
Precision: 0.8792545759314961
Recall: 0.8792557961991131
F1 Score: 0.8791306621883453


## pickle the model

In [18]:
# pickle
f = open('tree_grid_model.pkl', 'wb')
pickle.dump(tree_grid, f)
f.close() 