## MNIST Dataset clustering

In [None]:
import numpy as np
from scipy.io import loadmat
from sklearn.cluster import KMeans
from collections import defaultdict

In [8]:
def calculate_purity(cluster_labels, true_labels):
    """Compute the purity of every cluster.

    The purity of a cluster is the share of its members that carry the
    cluster's most frequent true label.

    Parameters
    ----------
    cluster_labels : iterable
        Cluster id assigned to each sample.
    true_labels : iterable
        Ground-truth label of each sample, aligned with ``cluster_labels``.
        The two are paired with ``zip``, so surplus items in the longer
        one are ignored.

    Returns
    -------
    dict
        Maps each cluster id (in order of first appearance) to its purity.
        Empty input yields an empty dict.
    """
    members = defaultdict(list)
    for cluster_id, truth in zip(cluster_labels, true_labels):
        members[cluster_id].append(truth)

    # Only the size of the dominant label group matters, not which label
    # it is, so the count is computed directly.
    return {
        cluster_id: max(truths.count(value) for value in set(truths)) / len(truths)
        for cluster_id, truths in members.items()
    }

def load_mnist_data(file_path):
    """Read a MATLAB file and merge its train and test splits.

    The file must contain the variables ``xtrain``, ``ytrain``, ``xtest``
    and ``ytest``.

    Parameters
    ----------
    file_path : str
        Path handed to ``scipy.io.loadmat``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``xtrain`` stacked on top of ``xtest``
        and ``y`` is the flattened ``ytrain`` followed by the flattened
        ``ytest``.
    """
    mat = loadmat(file_path)

    # Label arrays are flattened to 1-D so they can be joined end to end.
    # NOTE(review): assumes one sample per row in xtrain/xtest - confirm
    # against the actual file layout.
    train_samples, train_targets = mat["xtrain"], mat["ytrain"].flatten()
    test_samples, test_targets = mat["xtest"], mat["ytest"].flatten()

    samples = np.vstack((train_samples, test_samples))
    targets = np.hstack((train_targets, test_targets))
    return samples, targets

def clustering_with_euclidean(X, y, k=10):
    """Cluster the samples with scikit-learn's KMeans and report cluster purity.

    The features are divided by 255 before clustering. Per-cluster purity
    and the unweighted mean of those purities are printed.

    Parameters
    ----------
    X : array-like
        Feature matrix, one sample per row.
        NOTE(review): the 1/255 scaling assumes intensities in 0..255 - confirm.
    y : array-like
        True label of each sample, aligned with the rows of ``X``.
    k : int, default 10
        Number of clusters.

    Returns
    -------
    dict
        Maps each cluster id to its purity (see ``calculate_purity``).
    """
    # Division builds a new array, so the caller's X is left unmodified.
    X = X / 255.0

    # n_init is pinned explicitly: scikit-learn's default moves from 10 to
    # "auto" in 1.4, and leaving it unset raises a FutureWarning on 1.2/1.3
    # and would change the clustering on newer versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
    cluster_labels = kmeans.labels_
    purity_scores = calculate_purity(cluster_labels, y)

    print("Purity scores for Euclidean:")
    for cluster, score in sorted(purity_scores.items()):
        print(f"Cluster {cluster}: {score:.4f}")

    # Plain mean over clusters; clusters are not weighted by their size.
    avg_purity = np.mean(list(purity_scores.values()))
    print(f"Average Purity for Euclidean: {avg_purity:.4f}")
    return purity_scores

def clustering_with_hamming(X, y, k=10):
    """Cluster binarized samples with a Hamming-distance k-means and report purity.

    Features are binarized with ``X > 128`` and clustered by a k-means
    variant that uses Hamming distance and majority-vote centroids.
    Per-cluster purity and the unweighted mean of those purities are
    printed.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one sample per row.
        NOTE(review): the 128 threshold assumes intensities in 0..255 - confirm.
    y : array-like
        True label of each sample, aligned with the rows of ``X``.
    k : int, default 10
        Number of clusters.

    Returns
    -------
    dict
        Maps each cluster id to its purity (see ``calculate_purity``).
    """
    X_binary = (X > 128).astype(int)

    class HammingKMeans:
        """K-means on 0/1 vectors using Hamming distance.

        Centroids are themselves 0/1 vectors: each bit is set when more
        than half of the cluster's members have that bit set. After
        ``fit`` the instance exposes ``centroids`` and ``labels_``.
        """

        def __init__(self, n_clusters, max_iter=300, random_state=42):
            self.n_clusters = n_clusters
            self.max_iter = max_iter
            self.random_state = random_state

        def _assign(self, X):
            """Return the index of the nearest centroid for every row of X."""
            # For 0/1 data the summed absolute difference is the Hamming
            # distance. Ties go to the lowest centroid index (argmin).
            distances = np.array(
                [
                    np.sum(np.abs(X - centroid), axis=1)
                    for centroid in self.centroids
                ]
            )
            return np.argmin(distances, axis=0)

        def fit(self, X):
            """Run the assign/update loop until the centroids stop changing.

            Raises ``ValueError`` (from the random draw) when there are
            fewer samples than clusters.
            """
            # A private RandomState seeded the same way draws the same
            # indices as np.random.seed + np.random.choice would, without
            # reseeding NumPy's global generator as a side effect.
            rng = np.random.RandomState(self.random_state)
            n_samples = X.shape[0]
            self.centroids = X[
                rng.choice(n_samples, self.n_clusters, replace=False)
            ]

            # Assigning before the loop keeps labels defined even when
            # max_iter < 1.
            labels = self._assign(X)
            for _ in range(self.max_iter):
                # Start from the current centroids so that a cluster left
                # without members keeps its position instead of collapsing
                # to the all-zero vector.
                new_centroids = self.centroids.copy()
                for cluster in range(self.n_clusters):
                    cluster_points = X[labels == cluster]
                    if len(cluster_points) > 0:
                        new_centroids[cluster] = (
                            np.mean(cluster_points, axis=0) > 0.5
                        ).astype(int)
                if np.array_equal(self.centroids, new_centroids):
                    break
                self.centroids = new_centroids
                # Re-assign right after the update so labels_ always
                # matches the final centroids, even when max_iter runs out.
                labels = self._assign(X)

            self.labels_ = labels

    hamming_kmeans = HammingKMeans(n_clusters=k)
    hamming_kmeans.fit(X_binary)
    cluster_labels = hamming_kmeans.labels_

    purity_scores = calculate_purity(cluster_labels, y)

    print("Purity scores for Hamming:")
    for cluster, score in sorted(purity_scores.items()):
        print(f"Cluster {cluster}: {score:.4f}")

    # Plain mean over clusters; clusters are not weighted by their size.
    avg_purity = np.mean(list(purity_scores.values()))
    print(f"Average Purity for Hamming: {avg_purity:.4f}")
    return purity_scores

In [9]:
# Load the dataset: X stacks the train and test samples, y holds their labels.
# The path is relative to the directory the notebook is run from.
X, y = load_mnist_data('q5_data/mnist_10digits.mat')

### Q1: K-means with Euclidean distance

In [10]:
# Cluster the 1/255-scaled features with scikit-learn's KMeans (k=10 by default)
# and print the purity of each cluster plus their unweighted mean.
euclidean_purity_scores = clustering_with_euclidean(X, y)

  super()._check_params_vs_input(X, default_n_init=10)


Purity scores for Euclidean:
Cluster 0: 0.5281
Cluster 1: 0.6448
Cluster 2: 0.4410
Cluster 3: 0.8688
Cluster 4: 0.3193
Cluster 5: 0.6077
Cluster 6: 0.9153
Cluster 7: 0.6625
Cluster 8: 0.3569
Cluster 9: 0.9389
Average Purity for Euclidean: 0.6283


### Q2: K-means with Hamming distance on binarized pixels

In [11]:
# Binarize the features at > 128, cluster them with the Hamming-distance k-means
# (k=10 by default), and print the purity of each cluster plus their unweighted mean.
hamming_purity_scores = clustering_with_hamming(X, y)

Purity scores for Hamming:
Cluster 0: 0.4261
Cluster 1: 0.4715
Cluster 2: 0.4868
Cluster 3: 0.5854
Cluster 4: 0.3559
Cluster 5: 0.9175
Cluster 6: 0.7835
Cluster 7: 0.4411
Cluster 8: 0.4418
Cluster 9: 0.3950
Average Purity for Hamming: 0.5305
