# K-Means Clustering

K-means clustering is an unsupervised learning algorithm used to group similar data points together. It is commonly used in data science and machine learning. The algorithm partitions a dataset into K clusters, where K is chosen in advance, so that each cluster contains data points that are similar to each other.

The algorithm works by first randomly selecting K initial points as the "centers" (centroids) of the clusters. Then, each data point in the dataset is assigned to its closest center. Each center is then recalculated as the mean of all the data points assigned to its cluster. These two steps are repeated until the centers no longer move significantly or a maximum number of iterations is reached.

The resulting clusters are made up of data points that are similar to each other and different from those in other clusters. K-means clustering can be useful in many applications, such as grouping customers with similar preferences or clustering images with similar features.

## NumPy

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Seed NumPy's global random number generator so that the random choice of
# initial centroids made in KMeans.predict is reproducible from run to run.
np.random.seed(42)


def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between the points ``x1`` and ``x2``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of the same shape). The result is the square root of the sum of the
    squared coordinate differences.
    """
    delta = x1 - x2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)


class KMeans:
    """K-means clustering implemented with NumPy.

    The model picks ``K`` random samples as initial centroids, then alternates
    between assigning every sample to its nearest centroid and moving each
    centroid to the mean of its assigned samples, until the centroids stop
    moving or ``max_iters`` iterations have run.

    Parameters
    ----------
    K : int, default 5
        Number of clusters.
    max_iters : int, default 100
        Upper bound on the number of assign/update iterations.
    plot_steps : bool, default False
        When true, ``plot()`` is called after each assignment step and after
        each centroid update that did not trigger convergence.
    """

    def __init__(self, K=5, max_iters=100, plot_steps=False):
        self.K = K
        self.max_iters = max_iters
        self.plot_steps = plot_steps

        # list of sample indices for each cluster
        self.clusters = [[] for _ in range(self.K)]
        # the centers (mean feature vector) for each cluster
        self.centroids = []

    def predict(self, X):
        """Cluster the rows of ``X`` and return a label for each row.

        Parameters
        ----------
        X : numpy.ndarray of shape (n_samples, n_features)
            Data to cluster. It is stored on the instance as ``self.X``.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Float array where entry ``i`` is the index of the cluster that
            sample ``i`` was assigned to.

        Raises
        ------
        ValueError
            Raised by ``np.random.choice`` when ``K`` exceeds the number of
            samples, since initial centroids are drawn without replacement.
        """
        self.X = X
        self.n_samples, self.n_features = X.shape

        # Initialise the centroids with K distinct, randomly chosen samples.
        random_sample_idxs = np.random.choice(self.n_samples, self.K, replace=False)
        self.centroids = [self.X[idx] for idx in random_sample_idxs]

        # Optimize clusters
        for _ in range(self.max_iters):
            # Assign samples to closest centroids (create clusters)
            self.clusters = self._create_clusters(self.centroids)

            if self.plot_steps:
                self.plot()

            # Calculate new centroids from the clusters. self.centroids still
            # holds the previous centroids while _get_centroids runs, which it
            # relies on for clusters that ended up empty.
            centroids_old = self.centroids
            self.centroids = self._get_centroids(self.clusters)

            # Stop as soon as no centroid has moved.
            if self._is_converged(centroids_old, self.centroids):
                break

            if self.plot_steps:
                self.plot()

        # Classify samples as the index of their clusters
        return self._get_cluster_labels(self.clusters)

    def _get_cluster_labels(self, clusters):
        """Convert per-cluster index lists into a per-sample label array."""
        # Every sample index appears in exactly one cluster when ``clusters``
        # comes from _create_clusters, so each slot of the uninitialised
        # array is overwritten below.
        labels = np.empty(self.n_samples)

        for cluster_idx, cluster in enumerate(clusters):
            for sample_index in cluster:
                labels[sample_index] = cluster_idx
        return labels

    def _create_clusters(self, centroids):
        """Assign each sample of ``self.X`` to its nearest centroid.

        Returns a list of ``K`` lists; list ``j`` holds the indices of the
        samples whose closest centroid is ``centroids[j]``.
        """
        clusters = [[] for _ in range(self.K)]
        for idx, sample in enumerate(self.X):
            centroid_idx = self._closest_centroid(sample, centroids)
            clusters[centroid_idx].append(idx)
        return clusters

    def _closest_centroid(self, sample, centroids):
        """Return the index of the centroid nearest to ``sample``."""
        # distance of the current sample to each centroid
        distances = [euclidean_distance(sample, point) for point in centroids]
        return np.argmin(distances)

    def _get_centroids(self, clusters):
        """Return the new centroids as the mean of each cluster's samples.

        A cluster with no assigned samples keeps its previous centroid (read
        from ``self.centroids``). Taking the mean of an empty selection would
        instead yield NaN, which would then propagate into every later
        distance computation and prevent convergence from being detected.
        """
        centroids = np.zeros((self.K, self.n_features))
        for cluster_idx, cluster in enumerate(clusters):
            if len(cluster) == 0:
                # Nothing to average: leave this centroid where it was.
                centroids[cluster_idx] = self.centroids[cluster_idx]
                continue
            centroids[cluster_idx] = np.mean(self.X[cluster], axis=0)
        return centroids

    def _is_converged(self, centroids_old, centroids):
        """Return True when no centroid moved between two iterations."""
        # distances between each old and new centroid, for all centroids
        distances = [
            euclidean_distance(centroids_old[i], centroids[i]) for i in range(self.K)
        ]
        return sum(distances) == 0

    def plot(self):
        """Scatter-plot the current clusters and mark centroids with a black x.

        Each row of the transposed data is passed to ``ax.scatter`` as one
        positional argument, so this is intended for 2-D data.
        """
        fig, ax = plt.subplots(figsize=(12, 8))

        # One scatter call per cluster so each gets its own colour.
        for index in self.clusters:
            point = self.X[index].T
            ax.scatter(*point)

        for point in self.centroids:
            ax.scatter(*point, marker="x", color="black", linewidth=2)

        plt.show()


# Testing
if __name__ == "__main__":
    from sklearn.datasets import make_blobs

    # Synthetic 2-D data set made of three blobs.
    X, y = make_blobs(
        n_samples=500, n_features=2, centers=3, shuffle=True, random_state=40
    )
    print(X.shape)

    # Use the number of distinct ground-truth labels as K.
    clusters = np.unique(y).size
    print(clusters)

    k = KMeans(K=clusters, plot_steps=True, max_iters=150)
    y_pred = k.predict(X)

    # Show the final clustering.
    k.plot()

## scikit-learn

In [6]:
from sklearn.cluster import KMeans
import numpy as np
from sklearn.datasets import make_blobs

# Generate a synthetic 2-D data set made of three blobs.
X, y = make_blobs(
    n_samples=500, n_features=2, centers=3, shuffle=True, random_state=40
)
print(X.shape)

# One cluster per distinct ground-truth label; fit() returns the estimator
# itself, so fitting in a separate statement is equivalent to chaining.
n_clusters = len(np.unique(y))
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(X)
y_pred = kmeans.predict(X)

print(y_pred)

(500, 2)
[1 0 2 2 0 0 1 1 1 1 0 0 2 1 0 2 0 0 1 2 1 0 0 0 1 1 1 0 1 2 0 1 0 2 2 1 0
 2 1 1 0 1 0 0 2 2 2 2 0 0 2 2 1 2 2 0 2 0 2 0 1 2 0 1 2 1 1 0 0 2 2 1 0 2
 1 0 0 1 0 2 1 2 2 0 2 0 1 2 2 1 1 1 0 1 2 0 2 1 2 0 2 0 2 0 0 1 0 0 0 2 2
 2 0 0 0 1 0 1 1 0 2 2 0 2 0 1 2 0 0 0 0 2 2 2 0 1 0 2 0 0 1 2 0 1 1 0 1 1
 1 1 0 0 0 0 1 2 2 2 1 2 2 1 2 2 0 1 2 2 0 0 1 2 0 2 2 2 2 2 2 1 0 2 0 1 0
 1 2 1 0 0 2 2 1 0 2 2 1 2 2 1 1 2 0 0 0 1 1 1 2 0 0 0 0 1 2 1 0 2 1 1 2 0
 2 2 2 0 0 2 2 1 2 0 1 0 2 2 0 0 1 1 2 2 1 1 1 2 2 1 1 2 1 2 0 0 2 2 1 2 0
 0 0 0 0 0 1 0 2 2 1 0 1 1 1 2 1 2 0 2 0 2 0 1 1 2 2 2 0 1 1 2 2 1 0 0 1 1
 1 2 0 0 2 2 0 2 1 0 0 2 1 1 0 0 0 1 0 2 2 0 1 1 0 1 1 1 1 0 1 2 0 1 1 2 2
 0 1 1 2 2 2 1 2 0 1 0 2 1 2 0 0 1 0 2 2 2 0 2 1 2 1 1 1 1 1 1 1 0 1 1 1 0
 0 2 0 0 2 1 1 0 0 0 2 1 1 2 0 1 0 0 0 0 1 1 0 2 2 0 0 1 2 0 2 2 1 1 0 2 2
 2 0 1 2 2 1 0 2 0 0 1 1 1 2 1 1 1 2 0 2 0 2 0 1 0 1 0 0 0 2 2 2 2 0 0 1 2
 0 1 2 1 2 2 2 2 2 0 1 1 1 1 1 1 0 0 0 2 0 1 1 1 2 2 1 2 1 1 2 0 1 2 1 1 1
 0 1 1 0 1 2 1 1

