# K-Means Clustering From Scratch

K-Means clustering algorithm implemented from scratch using NumPy.

## Features

- K-Means algorithm
- Unsupervised learning
- Centroid calculation
- Cluster assignment
- NumPy implementation

In [13]:
# Third-party libraries used throughout the notebook (one import per line).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import plotly.graph_objects as go

In [14]:
# Load the Iris table from the CSV file in the local sample_data folder.
iris = pd.read_csv(filepath_or_buffer='sample_data/Iris (1).csv')

In [15]:
# Preview the first five rows of the freshly loaded table.
iris.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [16]:
# Remove the 'Id' column so that only the measurements and the species label remain.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError for the
# already-removed column.
iris = iris.drop(columns='Id', errors='ignore')

In [17]:
# Split the table: every column except the last is a feature, the last column is the label.
X = iris.iloc[:, :-1]
y = iris.iloc[:, -1]

In [18]:
# Preview the first five rows again to check the table after the 'Id' column was dropped.
iris.head(n=5)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [19]:
"""
K-Means is an unsupervised learning algorithm used to cluster data into K groups. It works by:

Picking K random centroids

Repeatedly:

Assigning each point to its nearest centroid

Recomputing centroids as the mean of their assigned points

After enough iterations, the centroids and clusters stabilize

"""

'\nK-Means is an unsupervised learning algorithm used to cluster data into K groups. It works by:\n\nPicking K random centroids\n\nRepeatedly:\n\nAssigning each point to its nearest centroid\n\nRecomputing centroids as the mean of their assigned points\n\nAfter enough iterations, the centroids and clusters stabilize\n\n'

In [20]:
import numpy as np

class Kmeans:
    """K-Means clustering implemented with NumPy.

    The model alternates between assigning every sample to its nearest
    centroid and moving every centroid to the mean of its assigned samples.

    Parameters
    ----------
    K : int
        Number of clusters; must be positive.
    random_state : int, numpy.random.Generator or None, optional
        Seed (or generator) used to choose the initial centroids. With the
        default ``None`` the draw comes from NumPy's global random state, so
        ``np.random.seed`` still controls the outcome.

    Attributes
    ----------
    centroids : numpy.ndarray of shape (K, n_features)
        Current centroids; set by ``initialize_centroids`` and refined by ``fit``.
    """

    def __init__(self, K, random_state=None):
        assert K > 0, "K must be a positive integer"
        self.K = K
        self.random_state = random_state

    def initialize_centroids(self, X):
        """Use K distinct, randomly chosen rows of ``X`` as the starting centroids.

        Raises
        ------
        ValueError
            If ``X`` has fewer than K rows (sampling without replacement).
        """
        X = np.asarray(X)
        # getattr keeps instances created without __init__ (e.g. via __new__) working.
        seed = getattr(self, "random_state", None)
        if seed is None:
            indices = np.random.choice(len(X), self.K, replace=False)
        else:
            indices = np.random.default_rng(seed).choice(len(X), self.K, replace=False)
        # Fancy indexing copies the rows, so later updates never touch X itself.
        self.centroids = X[indices]

    def compute_distance(self, x, centroids):
        """Return the Euclidean distance between ``x`` and each centroid.

        The norm is taken over the last axis, so ``x`` may be a single sample
        or any array that broadcasts against ``centroids``.
        """
        return np.linalg.norm(x - centroids, axis=-1)

    def assign_clusters(self, X):
        """Return, for every row of ``X``, the index of its nearest centroid.

        Ties go to the lowest centroid index (``np.argmin`` semantics).
        """
        X = np.asarray(X)
        # Insert an axis so each sample broadcasts against all K centroids:
        # (n, 1, d) - (K, d) -> (n, K, d), reduced to an (n, K) distance table.
        distances = self.compute_distance(X[:, np.newaxis, :], self.centroids)
        return np.argmin(distances, axis=1)

    def update_centroids(self, X, clusters):
        """Return new centroids as the mean of the samples assigned to each cluster.

        A cluster that received no samples keeps its previous centroid instead
        of being reset to the origin. If no centroids have been set yet, such a
        cluster is left at zeros.
        """
        X = np.asarray(X)
        clusters = np.asarray(clusters)
        previous = getattr(self, "centroids", None)
        new_centroids = np.zeros((self.K, X.shape[1]))
        for i in range(self.K):
            points = X[clusters == i]
            if len(points) > 0:
                new_centroids[i] = points.mean(axis=0)
            elif previous is not None:
                new_centroids[i] = previous[i]
        return new_centroids

    def fit(self, X, iterations=10):
        """Cluster ``X`` and return ``(centroids, cluster_assignments)``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric data; NumPy arrays, nested lists and pandas DataFrames are
            accepted and converted to a float array.
        iterations : int, optional
            Maximum number of centroid updates. The loop stops early once an
            update leaves the centroids unchanged, because every further
            iteration would reproduce the same result.

        Returns
        -------
        tuple of (numpy.ndarray, numpy.ndarray)
            The final centroids and, for each sample, the index of the nearest
            final centroid.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or has fewer than K rows.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")

        self.initialize_centroids(X)
        # Assign once up front so a result exists even when iterations == 0.
        clusters = self.assign_clusters(X)
        for _ in range(iterations):
            new_centroids = self.update_centroids(X, clusters)
            converged = np.array_equal(new_centroids, self.centroids)
            self.centroids = new_centroids
            # Re-assign after the update so the returned labels always match
            # the returned centroids.
            clusters = self.assign_clusters(X)
            if converged:
                break
        return self.centroids, clusters

In [21]:
# Convert the feature table to a NumPy array. np.asarray accepts both a pandas
# DataFrame and an existing array, so re-running this cell does not fail the way
# X.values would once X is already an array.
X = np.asarray(X)

# The initial centroids are drawn with NumPy's global random state; seeding it
# makes the run repeatable. Cluster numbering may differ from an unseeded run.
np.random.seed(42)

kmeans = Kmeans(3)  # Create model with K=3
centroids, points = kmeans.fit(X, iterations=1000)

print("Centroids:\n", centroids)
print("Cluster assignments:\n", points)

Centroids:
 [[6.85       3.07368421 5.74210526 2.07105263]
 [5.9016129  2.7483871  4.39354839 1.43387097]
 [5.006      3.418      1.464      0.244     ]]
Cluster assignments:
 [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 1 0 0 0 0
 0 0 1 1 0 0 0 0 1 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0
 0 1]


In [23]:
"""
Sklearn
"""

'\nSklearn\n'

In [25]:
# scikit-learn pieces used for the reference implementation and its evaluation.
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score, silhouette_score

In [28]:
# Rebuild the features and label from the DataFrame: every column except the
# last is a feature, the last column is the label.
X = iris.iloc[:, :-1]
y = iris.iloc[:, -1]

In [29]:
# Standardize the features before clustering so that no single measurement
# dominates the distance computation because of its scale.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(X)

In [30]:
# scikit-learn's KMeans (note the capital M) fitted on the standardized features.
# n_init and random_state are set explicitly so the result is reproducible and
# does not depend on the library version's default number of restarts.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(x_scaled)

In [31]:
# Keep the labels stored on the fitted scikit-learn model (fitted on x_scaled)
# under a descriptive name for the evaluation below.
predicted_clusters = kmeans.labels_

In [32]:
from sklearn.preprocessing import LabelEncoder

# Encode the label column as integer codes so it can be compared with the
# integer cluster labels.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [None]:
from sklearn.metrics import adjusted_rand_score, silhouette_score

# Compare the predicted clusters with the encoded true labels (Adjusted Rand
# Index) and score the clustering on the scaled features (silhouette).
ari = adjusted_rand_score(labels_true=y_encoded, labels_pred=predicted_clusters)
silhouette = silhouette_score(X=x_scaled, labels=predicted_clusters)

print("Adjusted Rand Index:", ari)
print("Silhouette Score:", silhouette)