In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Alternative datasets -- uncomment one of these (and comment out the
# make_classification call below) to try a different problem:
#X, y = datasets.load_breast_cancer(return_X_y=True)
#X, y = datasets.make_moons(n_samples=100, noise=0.1, random_state=42)
X, y = datasets.make_classification(
    n_samples=200,
    n_features=3,
    n_informative=3,
    n_redundant=0,
    n_classes=2,
    n_clusters_per_class=4,
    random_state=42,
)

# Keep only the first two features so the data can be drawn in a 2-D plane
X = X[:, [0, 1]]

# Hold out 40% of the samples as a test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Fit the scaler on the training split only, then apply that same
# transformation to the training split, the test split and the full data set
sc = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled, X_scaled = (
    sc.transform(part) for part in (X_train, X_test, X)
)

# Fit one k-nearest-neighbors model per k in 1..9 on the scaled training
# split; clfs keeps them in ascending order of k
clfs = []
for k in range(1, 10):
    # fit() returns the estimator itself, so construction and fitting chain
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    clfs += [knn]

# Prepare a 3x3 figure (one subplot per classifier) and the dense grid of
# points on which each classifier is evaluated to draw its decision regions
fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(10, 10))
ax = ax.ravel()

# Grid step size, and plot limits padded by 1 around the scaled data
h = 0.05
x_min, y_min = X_scaled[:, :2].min(axis=0) - 1
x_max, y_max = X_scaled[:, :2].max(axis=0) + 1

x_ticks = np.arange(x_min, x_max, h)
y_ticks = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(x_ticks, y_ticks)

# One row per grid point: (x coordinate, y coordinate)
X_predict = np.column_stack((xx.ravel(), yy.ravel()))

# Draw each classifier's decision regions on its own subplot and record
# its accuracy on the training and test splits.
accuracy = []
for clf, a in zip(clfs, ax):
    # Take k from the fitted model itself so the label cannot drift out of
    # sync with the range used when the classifiers were trained
    name = f"{clf.n_neighbors}-Nearest-Neighbor"

    # Predicted class for every grid point, reshaped back onto the grid
    Z = clf.predict(X_predict).reshape(xx.shape)
    a.contourf(xx, yy, Z)
    a.set_title(f"{name} decision boundary")

    # Fraction of correctly classified samples in each split
    test_accuracy = np.mean(clf.predict(X_test_scaled) == y_test)
    train_accuracy = np.mean(clf.predict(X_train_scaled) == y_train)
    print(
        f"{name} train accuracy: {train_accuracy:.3f}"
        f"  test accuracy: {test_accuracy:.3f}"
    )
    accuracy.append(test_accuracy)

# Output the best k: the n_neighbors of the classifier with the highest test
# accuracy (np.argmax returns the first maximum, so ties go to the smallest k
# given the ascending training order).
# NOTE: selecting k on the test split makes the reported test accuracy an
# optimistic estimate; use a validation split or cross-validation on the
# training data when an unbiased estimate is needed.
best_k = clfs[int(np.argmax(accuracy))].n_neighbors
print(f"Best k: {best_k}")

# Overlay every sample in X_scaled on each subplot, one color/marker per class
colors, markers = ["blue", "limegreen", "gray", "cyan"], "s^oxv<>"

# Look the class labels up once (loop-invariant) and select samples by the
# actual label value, so labels do not have to be exactly 0..n-1
class_labels = np.unique(y)
for a in ax:
    for i, label in enumerate(class_labels):
        mask = y == label
        a.scatter(
            X_scaled[mask, 0],
            X_scaled[mask, 1],
            # Wrap around the palettes so extra classes cannot raise IndexError
            color=colors[i % len(colors)],
            marker=markers[i % len(markers)],
            s=50,
            # Hollow markers keep the decision regions underneath visible
            facecolors="none",
        )

fig.tight_layout()
plt.show()
