In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Classification Algorithms. Part III

# Efficiently searching for optimal hyper-parameters: GridSearchCV

In [4]:
# Load the Pima Indians dataset straight from a raw GitHub URL
# (this cell needs network access to run).
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/pima.csv'
pima = pd.read_csv(url)
# Preview the first rows to check the columns that were read.
pima.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [17]:
# Build the feature matrix X and the target vector y.
# NOTE(review): 'pedigree' is present in the data but is not used as a
# feature here -- confirm that this omission is intentional.
feature_cols = [
    'pregnant', 'glucose', 'bp', 'skin',
    'insulin', 'bmi', 'age',
]
X = pima[feature_cols]
y = pima['label']

In [11]:
# import the knn classifier
from sklearn.neighbors import KNeighborsClassifier
# Base estimator with default settings; the grid search defined in later
# cells supplies the hyper-parameter values that are actually evaluated.
knn_clf = KNeighborsClassifier()

In [12]:
# import GridSearch
from sklearn.model_selection import GridSearchCV

In [13]:
# Candidate values for n_neighbors: every integer from 1 to 50 inclusive.
k_range = [*range(1, 51)]
print(k_range)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]


In [30]:
# Parameter grid: maps each hyper-parameter name to the list of values
# that the search should try.
param_grid = dict(n_neighbors=k_range)
print(param_grid)

{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]}


In [31]:
# Initialize the grid search: 10-fold cross-validation, scored by accuracy.
grid = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
)

In [32]:
# Run the cross-validated search over every candidate in param_grid.
grid.fit(X,y)

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30, ...]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [33]:
# Collect the cross-validation results into a DataFrame and keep only the
# mean test score and the parameter setting of each candidate.
pd.DataFrame(grid.cv_results_).loc[:, ['mean_test_score', 'params']]

Unnamed: 0,mean_test_score,params
0,0.679665,{'n_neighbors': 1}
1,0.712235,{'n_neighbors': 2}
2,0.70176,{'n_neighbors': 3}
3,0.718712,{'n_neighbors': 4}
4,0.720062,{'n_neighbors': 5}
5,0.735714,{'n_neighbors': 6}
6,0.739627,{'n_neighbors': 7}
7,0.738312,{'n_neighbors': 8}
8,0.737047,{'n_neighbors': 9}
9,0.743472,{'n_neighbors': 10}


In [34]:
# Report the best mean cross-validated score and the parameters that
# achieved it, one per line.
print(grid.best_score_, grid.best_params_, sep='\n')

0.7552973342447027
{'n_neighbors': 17}


## Searching multiple parameters simultaneously

In [35]:
# Define the parameter values that should be searched.
# `k_range` is (re)defined here so this section does not silently depend on
# the variable created in the single-parameter search above (the original
# cell assigned to a misspelt name that nothing else read).
k_range = list(range(1, 51))               # candidate n_neighbors: 1..50
weights_options = ['uniform', 'distance']  # candidate neighbor weightings

In [36]:
# Parameter grid covering both hyper-parameters; the search evaluates every
# (n_neighbors, weights) combination.
param_grid = dict(n_neighbors=k_range, weights=weights_options)
print(param_grid)

{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50], 'weights': ['uniform', 'distance']}


In [37]:
# Instantiate the two-parameter grid search (10-fold CV, accuracy) and fit it.
grid = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
)
grid.fit(X, y)

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [38]:
# View the results: mean test score and parameter setting per candidate.
pd.DataFrame(grid.cv_results_).loc[:, ['mean_test_score', 'params']]

Unnamed: 0,mean_test_score,params
0,0.679665,"{'n_neighbors': 1, 'weights': 'uniform'}"
1,0.679665,"{'n_neighbors': 1, 'weights': 'distance'}"
2,0.712235,"{'n_neighbors': 2, 'weights': 'uniform'}"
3,0.679665,"{'n_neighbors': 2, 'weights': 'distance'}"
4,0.701760,"{'n_neighbors': 3, 'weights': 'uniform'}"
...,...,...
95,0.744891,"{'n_neighbors': 48, 'weights': 'distance'}"
96,0.731921,"{'n_neighbors': 49, 'weights': 'uniform'}"
97,0.744874,"{'n_neighbors': 49, 'weights': 'distance'}"
98,0.722796,"{'n_neighbors': 50, 'weights': 'uniform'}"


In [39]:
# Report the best mean cross-validated score and the winning parameter
# combination, one per line.
print(grid.best_score_, grid.best_params_, sep='\n')

0.7552973342447027
{'n_neighbors': 17, 'weights': 'uniform'}
