In [2]:
import numpy as np

In [3]:
# K-means clustering implemented from scratch with NumPy
class KMeans():
    """Minimal k-means clustering using Euclidean distance.

    After ``fit`` the cluster centres are available as ``self.centroids``,
    an array of shape ``(n_clusters, n_features)``.

    Initial centroids are drawn with ``np.random.choice``, i.e. from NumPy's
    global random state, so call ``np.random.seed(...)`` beforehand for
    reproducible results.
    """

    def __init__(self, n_clusters=3, max_iters=100):
        """Store the hyper-parameters.

        Args:
            n_clusters: Number of clusters (and centroids) to fit.
            max_iters: Upper bound on the number of update iterations.
        """
        self.n_clusters = n_clusters
        self.max_iters = max_iters

    def _assign_labels(self, X):
        """Return, for each row of ``X``, the index of the nearest centroid."""
        # Broadcasting (n, 1, d) - (k, d) -> (n, k, d); the norm over the last
        # axis yields an (n, k) matrix of point-to-centroid distances.
        distances = np.linalg.norm(X[:, np.newaxis] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)

    def fit(self, X):
        """Estimate the centroids from ``X``.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: Raised by ``np.random.choice`` when ``n_clusters``
                exceeds the number of samples.
        """
        X = np.asarray(X, dtype=float)

        # Start from n_clusters distinct rows of X (sampling without replacement).
        idx = np.random.choice(len(X), self.n_clusters, replace=False)
        self.centroids = X[idx]

        for _ in range(self.max_iters):
            new_labels = self._assign_labels(X)

            # Begin from the current centroids so that a cluster that received
            # no points keeps its position. Taking the mean of an empty slice
            # would yield NaN, which both poisons later distance computations
            # and prevents the convergence check below from ever succeeding.
            new_centroids = self.centroids.copy()
            for k in range(self.n_clusters):
                members = X[new_labels == k]
                if len(members) > 0:
                    new_centroids[k] = members.mean(axis=0)

            # Converged: centroids no longer move (within np.allclose tolerance).
            if np.allclose(new_centroids, self.centroids):
                break
            self.centroids = new_centroids
        return self

    def predict(self, X):
        """Assign each row of ``X`` to its nearest fitted centroid.

        Args:
            X: Array-like of shape ``(n_samples, n_features)``.

        Returns:
            Integer array of shape ``(n_samples,)`` holding centroid indices.
        """
        return self._assign_labels(np.asarray(X, dtype=float))

In [4]:
# Test the KMeans class
def test_kmeans():
    """Smoke-test KMeans on a small separated grid and on three noisy blobs."""
    # Scenario 1: nine points arranged in three tight, well-separated groups.
    separated = np.array([
        [0, 0], [1, 0], [0, 1],
        [5, 5], [6, 5], [5, 6],
        [10, 10], [11, 10], [10, 11],
    ])

    np.random.seed(42)
    model = KMeans(n_clusters=3)
    model.fit(separated)

    # One centroid per requested cluster, each living in the 2-feature space.
    assert len(model.centroids) == 3, "Should have 3 centroids"
    assert model.centroids.shape == (3, 2), "Centroids should be 2D"

    # Assign previously unseen points (drawn around (4, 4)) to the fitted clusters.
    held_out = np.random.normal(4, 1, (10, 2))
    predictions = model.predict(held_out)
    print("Predictions for test data:", predictions)

    # Scenario 2: Gaussian noise with the first and second thirds shifted to
    # opposite corners, leaving the remainder centred on the origin.
    np.random.seed(42)
    blobs = np.random.randn(100, 2) * 3
    blobs[:33] += np.array([10, 10])
    blobs[33:66] += np.array([-10, -10])

    model = KMeans(n_clusters=3)
    model.fit(blobs)

    # Nearest-centroid assignment of the training points themselves.
    labels = model.predict(blobs)

    # Every cluster should be populated by more than a handful of points.
    unique, counts = np.unique(labels, return_counts=True)
    assert unique.size == 3, "Should have 3 unique clusters"
    assert (counts > 5).all(), "Each cluster should have multiple points"

    print("All tests passed!")
    

# Run the tests when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    test_kmeans()

Predictions for test data: [2 2 2 2 2 2 1 2 2 2]
All tests passed!
