# k-Nearest Neighbors (KNN)

In [None]:
from sklearn.datasets import make_blobs 
#https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
%matplotlib inline

In [None]:
# Build a small two-class toy dataset: 50 points drawn from 2 blobs
# (cluster standard deviation 2), reproducible via the fixed random_state.
X, y = make_blobs(n_samples=50, centers=2, cluster_std=2, random_state=0)

# Plot defaults used from here on: blue/white/red colormap and tight
# bounding boxes for saved figures.
plt.rcParams['image.cmap'] = "bwr"
plt.rcParams['savefig.bbox'] = "tight"
# figure.dpi is a numeric setting; assign a number rather than the string
# "150" so the value does not depend on the rcParams validator coercing it.
plt.rcParams['figure.dpi'] = 150

# Scatter the first feature against the second, colored by class label,
# using equal axis scaling.
plt.figure()
plt.gca().set_aspect("equal")
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.savefig("images/knn_boundary_dataset.png", bbox_inches='tight')

In [None]:
# Three hand-picked query points to be classified later.
X_test = np.array([[0.5, 3], [4, 2], [-.5, .5]])

# New figure with equal axis scaling.
plt.figure().gca().set_aspect("equal")
# Training data colored by class; X.T unpacks into the two feature columns.
plt.scatter(*X.T, c=y)
# Query points drawn as black stars on top.
plt.scatter(*X_test.T, c='k', marker="*")
plt.savefig("images/knn_boundary_test_points.png", bbox_inches='tight')

In [None]:
from sklearn.metrics import euclidean_distances
from sklearn.neighbors import KNeighborsClassifier

def plot_n_neighbors(X, y, n_neighbors=1, X_test=None):
    """Plot training points, query points, and each query's nearest neighbors.

    A new figure is created in which every query point is joined by a line
    segment to its ``n_neighbors`` closest training points (Euclidean
    distance).  Query points are drawn as stars colored by the prediction of
    a ``KNeighborsClassifier`` fitted on ``(X, y)``; training points are
    colored by ``y``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Training features.  Columns 0 and 1 are used as plot coordinates.
    y : array-like of shape (n_samples,)
        Numeric class labels for ``X``; they are also used as color values.
    n_neighbors : int, default=1
        Number of neighbors to draw and to use in the classifier.
    X_test : array-like of shape (n_queries, n_features), optional
        Query points.  Defaults to the three fixed points
        ``[[0.5, 3], [4, 2], [-.5, .5]]``.

    Returns
    -------
    None
        The plot is drawn on the current matplotlib figure.
    """
    if X_test is None:
        X_test = np.array([[0.5, 3], [4, 2], [-.5, .5]])
    else:
        X_test = np.asarray(X_test)

    # dist[i, j] is the distance from training point i to query point j, so
    # sorting along axis 0 ranks the training points for each query column.
    dist = euclidean_distances(X, X_test)
    closest = np.argsort(dist, axis=0)

    plt.figure()
    plt.gca().set_aspect("equal")
    for x, neighbors in zip(X_test, closest.T):
        for neighbor in neighbors[:n_neighbors]:
            # head_width=0 turns the arrow into a plain line segment.
            plt.arrow(x[0], x[1], X[neighbor, 0] - x[0],
                      X[neighbor, 1] - x[1], head_width=0, fc='k', ec='k')

    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)

    # Pin both scatters to the same color range.  Left to autoscale, a set of
    # predictions that all share one class would have vmin == vmax and be
    # drawn in the low-end colormap color regardless of the predicted class.
    vmin, vmax = np.min(y), np.max(y)
    plt.scatter(X_test[:, 0], X_test[:, 1], c=clf.predict(X_test),
                marker="*", vmin=vmin, vmax=vmax)
    plt.scatter(X[:, 0], X[:, 1], c=y, vmin=vmin, vmax=vmax)
# Draw the single-nearest-neighbor picture and save it to disk.
plot_n_neighbors(X, y, n_neighbors=1)
plt.savefig("images/knn_boundary_k1.png", bbox_inches='tight')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Split the blob data into a training part and a held-out test part
# (reproducible through the fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# 1-nearest-neighbor classifier, constructed and fitted in one expression.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Report accuracy on the data the model was fitted on and on the held-out data.
print(f"accuracy on training set: {knn.score(X_train, y_train):.2f}")
print(f"accuracy on test set: {knn.score(X_test, y_test):.2f}")

In [None]:
# Same picture as before, now linking each query point to its 3 nearest neighbors.
plot_n_neighbors(X, y, 3)
plt.savefig("images/knn_boundary_k3.png", bbox_inches='tight')

In [None]:
# Refit with 3 neighbors on the same split and compare the two accuracies.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print(f"accuracy on training set: {knn.score(X_train, y_train):.2f}")
print(f"accuracy on test set: {knn.score(X_test, y_test):.2f}")

### Another Application: Classifying Iris Species
![sepal_petal](images/iris_petal_sepal.png)

#### Meet the Data

In [None]:
from sklearn.datasets import load_iris
# Load the iris dataset; the returned object is indexed by string keys in the cells below.
iris_dataset = load_iris()

In [None]:
# Bare expression: as the last line of a notebook cell it displays the whole dataset object.
iris_dataset

In [None]:
# List the keys available on the dataset object.
print("Keys of iris_dataset:\n", iris_dataset.keys())

In [None]:
# Show only the first 193 characters of the description, followed by an ellipsis line.
print(iris_dataset['DESCR'][:193] + "\n...")

In [None]:
# Names stored under 'target_names' (later indexed by the predicted label).
print("Target names:", iris_dataset['target_names'])

In [None]:
# Names stored under 'feature_names' (later used as DataFrame column labels).
print("Feature names:\n", iris_dataset['feature_names'])

In [None]:
# Report the Python type of the 'data' entry.
print("Type of data:", type(iris_dataset['data']))

In [None]:
# Report the shape of the 'data' entry.
print("Shape of data:", iris_dataset['data'].shape)

In [None]:
# Peek at the first five rows of the 'data' entry.
print("First five rows of data:\n", iris_dataset['data'][:5])

In [None]:
# Report the Python type of the 'target' entry.
print("Type of target:", type(iris_dataset['target']))

In [None]:
# Report the shape of the 'target' entry.
print("Shape of target:", iris_dataset['target'].shape)

In [None]:
# Print the complete 'target' entry.
print("Target:\n", iris_dataset['target'])

#### Measuring Success: Training and Testing Data

In [None]:
from sklearn.model_selection import train_test_split
# Split iris features and labels into training and test parts; the fixed
# random_state makes the split reproducible. This rebinds X_train, X_test,
# y_train and y_test from the earlier blob example.
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset['data'], iris_dataset['target'], random_state=0)

In [None]:
# Shapes of the training features and training labels.
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

In [None]:
# Shapes of the test features and test labels.
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

#### First Things First: Look at Your Data

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Switch the default colormap so the iris classes get viridis colors.
plt.rcParams['image.cmap'] = "viridis"

# Wrap the training features in a DataFrame whose columns carry the
# iris feature names.
iris_dataframe = pd.DataFrame(data=X_train,
                              columns=iris_dataset.feature_names)

# Pairwise scatter plots of all features, colored by the training labels,
# with 20-bin histograms on the diagonal.
pd.plotting.scatter_matrix(
    iris_dataframe,
    figsize=(8, 8),
    c=y_train,
    s=60,
    marker='o',
    alpha=0.8,
    hist_kwds=dict(bins=20),
)

#### Building Your First Model: k-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Create (but do not yet fit) a classifier that uses a single nearest neighbor.
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
# Fit the classifier on the iris training split.
knn.fit(X_train, y_train)

#### Making Predictions

In [None]:
import numpy as np

In [None]:
# One new measurement with four feature values, built directly as a
# two-dimensional array with a single row (ndmin=2 adds the leading row axis).
X_new = np.array([5, 2.9, 1, 0.2], ndmin=2)
print("X_new.shape:", X_new.shape)

In [None]:
# Classify the new measurement with the fitted model.
prediction = knn.predict(X_new)
print("Prediction:", prediction)
# The predicted label is used as an index into the target names to get a
# readable class name.
print("Predicted target name:",
       iris_dataset['target_names'][prediction])

#### Evaluating the Model

In [None]:
# Predict a label for every row of the test split.
y_pred = knn.predict(X_test)
print("Test set predictions:\n", y_pred)

In [None]:
# True test labels, for side-by-side comparison with the predictions.
print(y_test)

In [None]:
# Accuracy computed by hand: the mean of the element-wise prediction == truth comparison.
print(f"Test set score: {np.mean(y_pred == y_test):.2f}")

In [None]:
# The same test-set figure obtained from the estimator's own score() method.
print(f"Test set score: {knn.score(X_test, y_test):.2f}")

### Summary and Outlook

In [None]:
# Whole iris workflow in one cell: split, fit, evaluate.
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset['data'],
    iris_dataset['target'],
    random_state=0,
)

# Construct and fit the 1-nearest-neighbor classifier in one expression.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

print(f"Training set score: {knn.score(X_train, y_train):.2f}")
print(f"Test set score: {knn.score(X_test, y_test):.2f}")