## K-Nearest Neighbors Using Iris Dataset

Author: Thomas James Tiam-Lee

### Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo 

### Load the Dataset

In [3]:
# Fetch dataset id 53 (the Iris dataset used throughout this notebook) via
# ucimlrepo. The returned object is indexed below as
# iris['data']['features'] and iris['data']['targets'].
# NOTE(review): presumably this downloads from the UCI repository and needs
# network access on every fresh run - confirm, and cache locally if so.
iris = fetch_ucirepo(id=53) 

In [4]:
# Place the feature columns and the target column side by side in one frame
# so each row carries both the measurements and its class label.
df_iris = pd.concat(
    [
        iris['data']['features'],
        iris['data']['targets'],
    ],
    axis='columns',
)
df_iris

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


### Separate Training, Validation, and Test Sets

In [13]:
# Feature matrix: the four measurement columns, in this fixed order,
# converted to a plain NumPy array for scikit-learn.
X = df_iris.loc[
    :,
    [
        'sepal length',
        'sepal width',
        'petal length',
        'petal width',
    ],
].to_numpy()

# Target vector: the species label of each row.
y = df_iris.loc[:, 'class'].to_numpy()

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=30, random_state=1)

In [24]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=30, random_state=1)

### Train the Model

In [39]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline classifier: each prediction is decided by the 5 nearest training
# samples. Other values of k are compared in the tuning section further down.
model = KNeighborsClassifier(n_neighbors=5)

In [40]:
# Fit on the training split only; the validation and test samples stay
# unseen by the model.
model.fit(X=X_train, y=y_train)

### Make a Single Prediction

In [41]:
# Show the first test sample (its four measurements) next to its true label,
# so the prediction in the next cell can be checked by eye.
X_test[0], y_test[0]

(array([5.8, 4. , 1.2, 0.2]), 'Iris-setosa')

In [42]:
# predict() works on a 2-D array of samples, so pass the first test sample as
# a one-row slice (shape (1, 4)) rather than as a flat 1-D vector.
model.predict(X_test[:1])

array(['Iris-setosa'], dtype=object)

In [43]:
# Ask the model which training samples drove the prediction above. The result
# is a (distances, indices) pair: the distances from the first test sample to
# its 5 nearest neighbors, and those neighbors' row positions in X_train.
model.kneighbors(X_test[:1])

(array([[0.54772256, 0.55677644, 0.58309519, 0.59160798, 0.65574385]]),
 array([[43, 80, 74, 42, 83]], dtype=int64))

In [46]:
# Look the neighbors up by the indices kneighbors() reports instead of typing
# them in by hand: hard-coded row numbers silently point at the wrong samples
# as soon as the split, the random seed, or k changes.
# return_distance=False yields only the index array, shape (1, n_neighbors);
# [0] selects the row belonging to the single query sample.
neighbor_idx = model.kneighbors(X_test[0].reshape(1, -1), return_distance=False)[0]

# Measurements and labels of the neighbors that voted on the prediction.
X_train[neighbor_idx], y_train[neighbor_idx]

(array([[5.7, 4.4, 1.5, 0.4],
        [5.7, 3.8, 1.7, 0.3],
        [5.4, 3.7, 1.5, 0.2],
        [5.5, 3.5, 1.3, 0.2],
        [5.3, 3.7, 1.5, 0.2]]),
 array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
        'Iris-setosa'], dtype=object))

### Make Multiple Predictions

In [51]:
# Predict a label for every sample of the test split in a single call.
# NOTE(review): this scores the untuned k=5 model on the test split before the
# hyperparameter search below. The test set is normally reserved for the
# final model, so treat these numbers as a demonstration only.
y_pred = model.predict(X=X_test)
y_pred

array(['Iris-setosa', 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-setosa', 'Iris-setosa', 'Iris-virginica', 'Iris-versicolor',
       'Iris-setosa', 'Iris-virginica', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [50]:
import sklearn.metrics as metrics

In [52]:
# Rows are the true classes, columns the predicted classes; off-diagonal
# counts are misclassifications. Keyword arguments make the (true, predicted)
# order explicit, since swapping them transposes the matrix.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[11,  0,  0],
       [ 0, 13,  0],
       [ 0,  0,  6]], dtype=int64)

In [53]:
# Fraction of test samples whose predicted label matches the true label.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

### Hyperparameter Tuning

In [57]:
# Compare k = 1..10 by accuracy on the validation split.
#
# The loop uses its own names (candidate, val_pred) so it does not overwrite
# the notebook-level `model` and `y_pred`. X_val and X_test both hold 30
# samples, so a clobbered `y_pred` would be scored against y_test without any
# error if an earlier evaluation cell were re-run after this one.
val_accuracy = {}  # k -> validation accuracy, kept for later inspection
for k in range(1, 11):
    candidate = KNeighborsClassifier(n_neighbors=k)
    candidate.fit(X_train, y_train)
    val_pred = candidate.predict(X_val)
    accuracy = metrics.accuracy_score(y_val, val_pred)
    val_accuracy[k] = accuracy
    print(f'k = {k}    Accuracy = {accuracy}')

k = 1    Accuracy = 0.9666666666666667
k = 2    Accuracy = 0.9666666666666667
k = 3    Accuracy = 0.9666666666666667
k = 4    Accuracy = 0.9666666666666667
k = 5    Accuracy = 0.9333333333333333
k = 6    Accuracy = 0.9666666666666667
k = 7    Accuracy = 0.9666666666666667
k = 8    Accuracy = 1.0
k = 9    Accuracy = 1.0
k = 10    Accuracy = 1.0


### Choose the Final Model

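Although the differences are minor (each step of about 0.033 corresponds to a single validation sample), $k = 8$, $9$, and $10$ all reach the highest validation accuracy of 1.0. We choose $k = 8$, the smallest of the tied values, as the final model.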

In [59]:
# Build the chosen k = 8 classifier and fit it in one step (fit() returns the
# estimator itself). It is trained on the training split only; the validation
# samples are not added back in.
final_model = KNeighborsClassifier(n_neighbors=8).fit(X_train, y_train)
final_model

### Evaluate Using the Test Set

In [62]:
# Predict the held-out test split with the final k = 8 model. This replaces
# the earlier y_pred, which came from the k = 5 model.
y_pred = final_model.predict(X=X_test)

In [63]:
# Test-set accuracy of the final model: the share of test samples whose
# predicted label equals the true label.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

1.0