# K Nearest Neighbors Classifier

**Basic steps:**

1. Import the learning algorithm
2. Instantiate the model (choose hyper-parameters)
3. Learn the model
4. Predict the response

# Get Example Data

In [102]:
# target = InMichelin, whether or not a restaurant is in the Michelin guide
import pandas as pd
data = pd.read_csv("http://gattonweb.uky.edu/sheather/book/docs/datasets/MichelinNY.csv" , encoding="latin_1")
data.head()

Unnamed: 0,InMichelin,Restaurant Name,Food,Decor,Service,Price
0,0,14 Wall Street,19,20,19,50
1,0,212,17,17,16,43
2,0,26 Seats,23,17,21,35
3,1,44,19,23,16,52
4,0,A,23,12,19,24


In [103]:
# Delete extra variable that is not continuous
data = data.loc[:, data.columns != 'Restaurant Name']

data.head()

Unnamed: 0,InMichelin,Food,Decor,Service,Price
0,0,19,20,19,50
1,0,17,17,16,43
2,0,23,17,21,35
3,1,19,23,16,52
4,0,23,12,19,24


# Change variable names to X, y to create train/test split

In [121]:
y = data['InMichelin']
X = data.loc[:, data.columns != 'InMichelin']

print(y[0:5])
X.head()

0    0
1    0
2    0
3    1
4    0
Name: InMichelin, dtype: int64


Unnamed: 0,Food,Decor,Service,Price
0,19,20,19,50
1,17,17,16,43
2,23,17,21,35
3,19,23,16,52
4,23,12,19,24


# Train test split

In [122]:
from sklearn.model_selection import train_test_split

# Use train_test_split(X,y) to create four new data sets, defaults to .75/.25 split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)

print(X.shape)
X_train.shape

(164, 4)


(123, 4)

### Train model with k=5

In [17]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

#Print accuracy rounded to two digits to the right of decimal
print("accuracy: {:.2f}".format(knn.score(X_test, y_test)))

y_pred = knn.predict(X_test) # y_pred includes your predictions

accuracy: 0.83


### Train model with k=10

In [37]:
knn = KNeighborsClassifier(n_neighbors=21)
knn.fit(X_train, y_train)

#Print accuracy rounded to two digits to the right of decimal
print("accuracy: {:.2f}".format(knn.score(X_test, y_test)))
y_pred = knn.predict(X_test)

accuracy: 0.83


In [38]:
y_pred # view predictions for test data

array([0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1],
      dtype=int64)

## Using Cross validation for model evaluation

In [101]:
#import cross validation functions from sk learn

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import KFold
import numpy as np
# Set up function parameters for diff't cross validation strategies
kfold = KFold(n_splits=5)
skfold = StratifiedKFold(n_splits=5, shuffle=True)
rkf = RepeatedKFold(n_splits=5, n_repeats=10)

from statistics import mean 

print("KFold:\n{}".format(
mean(cross_val_score(KNeighborsClassifier(n_neighbors=5), X_train, y_train, cv=kfold))))

print("StratifiedKFold:\n{}".format(
mean(cross_val_score(KNeighborsClassifier(n_neighbors=5), X_train, y_train, cv=skfold))))

print("RepeatedKFold:\n{}".format(
mean(cross_val_score(KNeighborsClassifier(n_neighbors=5), X_train, y_train, cv=rkf))))

KFold:
0.7966666666666666
StratifiedKFold:
0.797
RepeatedKFold:
0.7817333333333334


## Grid Search with CV using loop

In [117]:
from sklearn.model_selection import cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)
neighbors = list(range(1, 100, 2))
cross_val_scores = []
for i in neighbors:
    knn = KNeighborsClassifier(n_neighbors=i)
    scores = cross_val_score(knn, X_train, y_train, cv=10)
    cross_val_scores.append(np.mean(scores))
print('best cross-validation score: {:.3f}'.format(np.max(cross_val_scores)))
best_n_neighbors = neighbors[np.argmax(cross_val_scores)]
print('best n_neighbors: {}'.format(best_n_neighbors))

knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)
knn.fit(X_train, y_train)
print('test-set score: {:.3f}'.format(knn.score(X_test, y_test)))

best cross-validation score: 0.821
best n_neighbors: 23
test-set score: 0.829


In [86]:
from sklearn.model_selection import cross_val_score
seed = list(range(100))
neighbors = list(range(1, 100, 2))
cross_val_scores = []
for j in seed:
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=j)
    neighbors = list(range(1, 30, 2))
    cross_val_scores = []
    for i in neighbors:
        knn = KNeighborsClassifier(n_neighbors=29)
        scores = cross_val_score(knn, X_train, y_train, cv=10)
        cross_val_scores.append(np.mean(scores))
        # print(j, cross_val_scores)
print('best cross-validation score: {:.3f}'.format(np.max(cross_val_scores)))
best_n_seed = neighbors[np.argmax(cross_val_scores)]
print('best seed: {}'.format(best_n_seed))
best_n_neighbors = neighbors[np.argmax(cross_val_scores)]
print('best n_neighbors: {}'.format(best_n_neighbors))

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=best_n_seed)
knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)
knn.fit(X_train, y_train)
print('test-set score: {:.3f}'.format(knn.score(X_test, y_test)))

best cross-validation score: 0.813
best seed: 1
best n_neighbors: 1
test-set score: 0.854


## GridSearchCV

In [136]:
from sklearn.model_selection import GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=23)

param_grid = {'n_neighbors': np.arange(1, 15, 2)}

grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)

print('best mean cross-validation score: {:.3f}'.format(np.max(grid.best_score_)))
print('best parameter: {}'.format(grid.best_params_))

print('test-set score: {:.3f}'.format(grid.score(X_test, y_test)))

best mean cross-validation score: 0.796
best parameter: {'n_neighbors': 7}
test-set score: 0.829


array([ 1,  3,  5,  7,  9, 11, 13])

## Tuning models with grid search

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

#create dictionary data object with keys equal to parameter name 'n_neighbors' 
#for knn model and values equal to range of k values to create models for

param_grid = {'n_neighbors': [1,3,5,7,9]} #np.arange creates sequence of numbers for each k value

grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=10)

#use meta model methods to fit score and predict model:
grid.fit(X_train, y_train)

#extract best score and parameter by calling objects "best_score_" and "best_params_"
print("best mean cross-validation score: {:.3f}".format(grid.best_score_))
print("best parameters: {}".format(grid.best_params_))
print("test-set score: {:.3f}".format(grid.score(X_test, y_test)))


best mean cross-validation score: 0.788
best parameters: {'n_neighbors': 5}
test-set score: 0.854


In [None]:
np.arange(1, 15, 2)

array([ 1,  3,  5,  7,  9, 11, 13])

In [None]:
# view data with complete tuning results
results = pd.DataFrame(grid.cv_results_)
results


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002016,0.000281,0.002462,0.000193,1,{'n_neighbors': 1},0.692308,1.0,0.615385,0.666667,0.583333,0.333333,0.583333,0.833333,0.666667,0.916667,0.689103,0.180178,5
1,0.002069,0.000264,0.002759,0.000726,3,{'n_neighbors': 3},0.769231,0.692308,0.769231,0.666667,0.75,0.583333,0.583333,0.833333,0.666667,0.916667,0.723077,0.100492,4
2,0.001858,9.3e-05,0.002262,9.1e-05,5,{'n_neighbors': 5},0.846154,0.923077,0.692308,0.666667,0.75,0.666667,0.75,0.916667,0.75,0.916667,0.787821,0.098828,1
3,0.001928,0.000209,0.002372,0.000141,7,{'n_neighbors': 7},0.846154,0.923077,0.615385,0.583333,0.75,0.583333,0.833333,0.833333,0.75,0.916667,0.763462,0.123585,3
4,0.001967,7.6e-05,0.002604,0.000294,9,{'n_neighbors': 9},0.846154,0.923077,0.615385,0.583333,0.75,0.583333,0.833333,0.916667,0.833333,0.916667,0.780128,0.131623,2
