# **K-Means Demo**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
from IPython.display import clear_output
from matplotlib.markers import MarkerStyle
from sklearn.cluster import KMeans

# Data

We draw 400 samples from each of two bivariate Gaussian distributions and combine them into a single dataset:

In [None]:
# Draw 400 points from each of the two bivariate Gaussians.
samples1 = np.random.multivariate_normal(
    mean=[2.0, 2.0],
    cov=[[1.5, 0.5], [0.5, 1.5]],
    size=400,
)
samples2 = np.random.multivariate_normal(
    mean=[-2.0, -2.0],
    cov=[[2.0, -0.5], [-0.5, 2.0]],
    size=400,
)

# Stack both draws row-wise into one dataset (first Gaussian on top).
samples = np.vstack((samples1, samples2))

# Show the raw, unlabeled point cloud.
plt.scatter(samples[:, 0], samples[:, 1], alpha=0.5)
plt.show()

# K-Means From `scikit-learn`

We use the _k_-means implementation from `scikit-learn` to group our samples into two clusters:

In [None]:
# Fit a two-cluster k-means model to the samples. random_state=0 pins the
# estimator's random seed; see the scikit-learn KMeans documentation for the
# exact meaning of init="random" and n_init="auto".
kmeans = KMeans(n_clusters=2, init="random", n_init="auto", random_state=0).fit(
    X=samples
)

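By inspecting the labels, we see that the algorithm grouped most samples correctly, i.e., samples drawn from the same Gaussian mostly share a label (the first 400 samples come from the first Gaussian, the last 400 from the second; which cluster gets label 0 or 1 is arbitrary):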

In [None]:
# Cluster label of each fitted sample, in the same order as the rows of `samples`.
kmeans.labels_

We can also inspect the location of the cluster centers:

In [None]:
# Coordinates of the fitted cluster centers.
kmeans.cluster_centers_

Also, we can predict the labels for new data:

In [None]:
# Three hand-picked 2-D points to label with the fitted model.
new_samples = np.array([[5.0, 1.0], [-3.0, -5.0], [0.0, 4.5]])
# Predict a cluster label for each of the new points.
kmeans.predict(X=new_samples)

Lastly, we visualize the data:

In [None]:
def kmeans_plot(
    samples: npt.ArrayLike,
    labels: npt.ArrayLike,
    centroids: npt.ArrayLike,
    title: str = "",
) -> None:
    """Draw a scatter plot of the samples together with the centroids.

    Args:
        samples: 2-D array-like; column 0 is plotted on the x-axis and
            column 1 on the y-axis.
        labels: Per-sample values used to color the points through the
            "Dark2" colormap.
        centroids: 2-D array-like of centroid coordinates (same column
            convention as ``samples``), drawn as large black "x" markers.
        title: Text used as the plot title.
    """
    # Samples, colored by their label.
    sample_points = np.array(samples)
    plt.scatter(
        sample_points[:, 0],
        sample_points[:, 1],
        c=labels,
        cmap="Dark2",
        alpha=0.6,
        label="Samples",
    )

    # Centroids, as big black crosses.
    centroid_points = np.array(centroids)
    plt.scatter(
        centroid_points[:, 0],
        centroid_points[:, 1],
        s=200,
        c="black",
        marker=MarkerStyle("x"),
        label="Centroids",
    )

    plt.title(title)
    plt.legend()
    plt.show()


# Plot the scikit-learn result: samples colored by fitted label, plus the fitted centers.
kmeans_plot(samples=samples, labels=kmeans.labels_, centroids=kmeans.cluster_centers_)

# K-Means From Scratch

## Initialization

The first step of the algorithm is to place $k$ initial cluster centers (centroids/means).
(Hence the name _k_-means.)
This can, for example, be done randomly; however, there are also more sophisticated initialization methods (e.g., k-means++).
Here we choose the Forgy method.
The Forgy method randomly chooses $k$ observations from the dataset and uses these as the initial means. 
The number of clusters $k$ has to be chosen manually (or as part of a hyperparameter optimization).

In [None]:
def kmeans_initialize(
    n_clusters: int, samples: npt.NDArray[np.float64]
) -> npt.NDArray[np.float64]:
    """Pick the starting centroids by drawing distinct samples at random.

    Args:
        n_clusters: Number of centroids to draw.
        samples: Observations, one per row.

    Returns:
        The ``n_clusters`` rows of ``samples`` selected as initial centroids.
    """
    n_samples = samples.shape[0]

    # Drawing row indices without replacement ensures that no row is
    # selected twice.
    chosen_rows = np.random.choice(n_samples, n_clusters, replace=False)

    return samples[chosen_rows]

In [None]:
# Draw two initial centroids from the dataset and print their coordinates.
centroids = kmeans_initialize(n_clusters=2, samples=samples)
print(centroids)

## Assignment Step

Assign each observation to the cluster with the nearest mean, i.e., the mean with the smallest Euclidean distance to the observation.

> Mathematically, this means partitioning the observations according to the Voronoi diagram generated by the means.

In [None]:
def kmeans_assignment_step(
    samples: npt.NDArray[np.float64], centroids: npt.NDArray[np.float64]
) -> npt.NDArray[np.int64]:
    """Label every sample with the index of its nearest centroid.

    Args:
        samples: Observations, one per row.
        centroids: Current centroids, one per row.

    Returns:
        For each sample, the row index in ``centroids`` with the smallest
        Euclidean distance to that sample.
    """
    # The extra axis makes the subtraction broadcast, so every sample is
    # paired with every centroid.
    offsets = samples[:, np.newaxis] - centroids

    # Euclidean length of each sample-to-centroid offset.
    distances = np.linalg.norm(offsets, axis=2)

    # Index of the closest centroid, taken row by row.
    return distances.argmin(axis=1)

In [None]:
# Label every sample with its nearest initial centroid and plot the result.
labels = kmeans_assignment_step(samples, centroids)
kmeans_plot(samples=samples, labels=labels, centroids=centroids)

## Update Step

Recalculate each mean (centroid) as the average of the observations assigned to its cluster.

In [None]:
def kmeans_update_step(
    samples: npt.NDArray[np.float64],
    labels: npt.NDArray[np.int64],
    centroids: npt.NDArray[np.float64],
) -> npt.NDArray[np.float64]:
    """Move each centroid to the mean of the samples assigned to it.

    Args:
        samples: Observations, one per row.
        labels: Cluster index of each sample (same order as ``samples``).
        centroids: Current centroids, one per row. The inputs are not
            modified.

    Returns:
        A new array with one updated centroid per row. A cluster that has no
        samples assigned keeps its previous centroid.
    """
    updated = []
    for cluster, previous in enumerate(centroids):
        members = samples[labels == cluster]
        if members.shape[0] == 0:
            # The mean of an empty selection is NaN, which would make this
            # centroid unusable in every later assignment step, so the
            # centroid simply stays where it was.
            updated.append(previous)
        else:
            updated.append(members.mean(axis=0))
    return np.array(updated)

In [None]:
# Keep a reference to the centroids from before the update and plot them.
old_centroids = centroids
kmeans_plot(
    samples=samples, labels=labels, centroids=old_centroids, title="Old centroids"
)

# Run one update step and plot the moved centroids over the same labeling.
new_centroids = kmeans_update_step(samples=samples, labels=labels, centroids=centroids)
kmeans_plot(
    samples=samples, labels=labels, centroids=new_centroids, title="New centroids"
)

## Iterate

In principle, we now repeat these two steps until the centroids converge, i.e., until they no longer move.
For simplicity, we run a fixed number of iterations here instead of checking for convergence.

In [None]:
# Fixed iteration budget and number of clusters for the from-scratch run.
n_iterations = 20
n_clusters = 2

# Start from randomly chosen samples as centroids.
centroids = kmeans_initialize(n_clusters=n_clusters, samples=samples)

for i in range(n_iterations):
    # One k-means iteration: reassign the samples, then move the centroids.
    labels = kmeans_assignment_step(samples=samples, centroids=centroids)
    centroids = kmeans_update_step(samples=samples, labels=labels, centroids=centroids)
    # Clear the previous cell output and draw the current state.
    clear_output(wait=True)
    kmeans_plot(samples=samples, centroids=centroids, labels=labels, title=f"i = {i}")
    # Pause briefly between iterations.
    plt.pause(interval=0.2)