In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import numpy as np
from ipywidgets import interactive
from IPython.display import display
%matplotlib widget

# Part 1 - Intuition

The data set consists of 50 samples from each of three species of *Iris* ([Iris setosa](https://en.wikipedia.org/wiki/Iris_setosa), [Iris virginica](https://en.wikipedia.org/wiki/Iris_virginica) and [Iris versicolor](https://en.wikipedia.org/wiki/Iris_versicolor)). Four features were measured from each sample: the length and the width of the [sepals](https://en.wikipedia.org/wiki/Sepal) and [petals](https://en.wikipedia.org/wiki/Petal), in centimeters.

This interactive demo lets you explore the K-Nearest Neighbors (kNN) algorithm for classification. The demo fits a kNN model on the Iris dataset.

We can visualize the classifier decision boundary. A decision boundary is a line (in the case of two features), where all (or most) samples of one class are on one side of that line, and all samples of the other class are on the opposite side of the line. The line separates one class from the other. If you have more than two features, the decision boundary is not a line, but a (hyper)-plane in the dimension of your feature space.
Each point in the plane is colored with the class that would be assigned to it using the K-Nearest Neighbors algorithm.

In [None]:
def fit_knn(n_neighbors, weights, X, y):
    """Create a k-nearest-neighbors classifier and fit it on ``(X, y)``.

    Parameters
    ----------
    n_neighbors
        Number of neighbors, forwarded as the first argument of
        ``KNeighborsClassifier``.
    weights
        Weighting scheme, forwarded as the ``weights`` keyword.
    X, y
        Training samples and their labels, forwarded to ``fit``.

    Returns
    -------
    The fitted classifier (``fit`` returns the estimator itself).
    """
    return neighbors.KNeighborsClassifier(n_neighbors, weights=weights).fit(X, y)

In [None]:
# Load the Iris data; swap in another scikit-learn dataset to experiment.
dataset = datasets.load_iris()

# Keep only the first two feature columns so the samples can be drawn in 2D.
X, y = dataset.data[:, :2], dataset.target

In [None]:
# Color maps: light tones fill the decision regions, bold tones mark the samples.
cmap_light = ListedColormap(["orange", "cyan", "cornflowerblue"])
cmap_bold = ["darkorange", "c", "darkblue"]

# Turn interactive mode off so the figure is only rendered when explicitly displayed.
plt.ioff()

fig, ax = plt.subplots()
fig.canvas.header_visible = False  # hide the widget canvas header

# Initial scatter of the samples, one color per species (this call creates the legend).
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=dataset.target_names[y],
    palette=cmap_bold,
    edgecolor="black",
    alpha=1.0,
    ax=ax,
)

def plot_boundary(n_neighbors, weights):
    """Fit a kNN classifier and redraw its decision regions on the shared axes.

    Parameters
    ----------
    n_neighbors
        Number of neighbors passed to ``fit_knn``.
    weights
        Weighting scheme passed to ``fit_knn`` (the widget offers
        ``"uniform"`` and ``"distance"``).

    Returns
    -------
    The classifier returned by ``fit_knn``, fitted on the module-level
    ``X`` and ``y``.
    """
    clf = fit_knn(n_neighbors, weights, X, y)

    # Start from an empty axes on every call. Without this, each widget
    # update stacks another mesh and another scatter layer on top of the
    # previous ones, so the figure grows without bound.
    ax.clear()

    DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        cmap=cmap_light,
        ax=ax,
        response_method="predict",
        plot_method="pcolormesh",
        xlabel=dataset.feature_names[0],
        ylabel=dataset.feature_names[1],
        shading="auto",
    )

    # Draw on `ax` explicitly instead of relying on pyplot's current axes,
    # and let seaborn rebuild the legend that `ax.clear()` removed.
    sns.scatterplot(
        x=X[:, 0],
        y=X[:, 1],
        hue=dataset.target_names[y],
        palette=cmap_bold,
        alpha=1.0,
        edgecolor="black",
        ax=ax,
    )
    display(fig)
    return clf


# Widget specs: a list becomes a dropdown, a (min, max, step) tuple becomes a slider.
widget_specs = {
    "weights": ["uniform", "distance"],
    "n_neighbors": (1, 10, 2),
}
inter = interactive(plot_boundary, **widget_specs)

display(inter)

The decision boundary is drawn using the sepal length and sepal width features. Recall that `n_neighbors` is the number of neighbors consulted for each prediction, and `weights` selects the weight function applied to those neighbors' votes (`uniform`: all neighbors count equally; `distance`: closer neighbors count more).

Using the interactive demo try to answer the following questions:
* What is the impact of the value of `n_neighbors` on the prediction? How does that relate to the bias-variance tradeoff?
* To what extent do the predictions with `uniform` and `distance` weighting differ?

# Part 2 - Train and Test

In the next section you will use the kNN classifier to make predictions on the Iris dataset.

In [None]:
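# 1. Split `X` and `y` into training and test sets (`X_train`, `X_test`, `y_train`, `y_test`). Use a test_size of 33% and fix `random_state` so the split is reproducible.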
# Hint: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# 2. Instantiate a kNN classifier with k=5 and uniform weighting 
# Hint: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
# 3. Use the classifier to predict the test set. Store the predictions in `y_pred`.
# Hint: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier.predict

In [None]:
# 4. Evaluate the classifier error on the test set (also known as hold-out evaluation)
# Hint: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

# Extra - Inspect the misclassification(s)

Run the code below and inspect the model's misclassifications.

In [None]:
# Boolean mask of the misclassified test samples.
# Assumes the exercise cells above defined `X_test`, `y_test` and `y_pred`
# -- TODO confirm these names match your solution.
wrongs = np.asarray(y_pred) != np.asarray(y_test)

# Use dedicated names for this figure: rebinding the global `ax` would make
# the interactive `plot_boundary` demo draw onto this figure instead of its own.
fig_errors, ax_errors = plt.subplots()

# All samples, colored by their true species.
sns.scatterplot(
    x=X[:, 0],
    y=X[:, 1],
    hue=dataset.target_names[y],
    palette=cmap_bold,
    alpha=1.0,
    edgecolor="black",
    ax=ax_errors,
)

# Misclassified test samples, marked with "+" in the color of the predicted class.
ax_errors.scatter(
    X_test[wrongs, 0],
    X_test[wrongs, 1],
    color=[cmap_bold[i] for i in y_pred[wrongs]],
    marker="+",
)
ax_errors.set(
    xlabel=dataset.feature_names[0],
    ylabel=dataset.feature_names[1],
    title="Misclassified test samples (+, colored by predicted class)",
)
plt.show()