# 

In [52]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

class KMeansCluster:
    def __init__(self, k, tol, max_iter):
        self.k = k
        self.tol = tol
        self.max_iter = max_iter
        self.centroids = None
        self.clusters = None
        self.sse = None

    def fit(self, X, init_mu=None):
        if init_mu is None:
            self.centroids = X.sample(self.k).values
        else:
            self.centroids = init_mu
        for i in range(self.max_iter):
            self.clusters = {}
            for j in range(self.k):
                self.clusters[j] = []
            for x in X.values:
                distances = [np.linalg.norm(x - c) for c in self.centroids]
                cluster = distances.index(min(distances))
                self.clusters[cluster].append(x)
            prev_centroids = self.centroids.copy()
            for cluster in self.clusters:
                self.centroids[cluster] = np.average(self.clusters[cluster], axis=0)
            if np.linalg.norm(self.centroids - prev_centroids) < self.tol:
                break
        self.sse = 0
        for cluster in self.clusters:
            for x in self.clusters[cluster]:
                self.sse += np.linalg.norm(x - self.centroids[cluster]) ** 2

    def predict(self, X):
        y_pred = []
        for x in X:
            distances = [np.linalg.norm(x - c) for c in self.centroids]
            cluster = distances.index(min(distances))
            y_pred.append(cluster)
        return y_pred

    def accuracy(self, y_true, y_pred):
        return np.mean(y_true == y_pred)

# Load data
data = pd.read_csv('./Q_4/breast_data.csv')
labels = pd.read_csv('./Q_4/breast_labels.csv')

# Scale data
scaler = StandardScaler()
X = scaler.fit_transform(data)

# Initialize KMeansCluster object
kmeans = KMeansCluster(k=3, tol=0.0001, max_iter=300)

# Fit model to data
kmeans.fit(pd.DataFrame(X))

# Predict cluster labels
y_pred = kmeans.predict(X)

# Calculate accuracy
accuracy = kmeans.accuracy(labels.values.flatten(), y_pred)
print(f"Accuracy: {accuracy}")

# Run code 5 times using different starting points and calculate accuracy
for i in range(5):
    init_mu = np.random.rand(3, X.shape[1])
    kmeans = KMeansCluster(k=3, tol=0.0001, max_iter=300)
    kmeans.fit(pd.DataFrame(X), init_mu=init_mu)
    y_pred = kmeans.predict(pd.DataFrame(X))
    accuracy = kmeans.accuracy(labels.values.flatten(), y_pred)
    print(f"Accuracy {i+1}: {accuracy}")

# Use provided initial centres and report accuracy
# init_mu = pd.read_csv('init_mu.csv', header=None).values

Accuracy: 0.6778169014084507
Accuracy 1: 0.0


  return np.mean(y_true == y_pred)


Accuracy 2: 0.0
Accuracy 3: 0.0
Accuracy 4: 0.0
Accuracy 5: 0.0


In [53]:
import scipy.io
mat = scipy.io.loadmat('./Q_4/init_mu.mat')
kmeans = KMeansCluster(k=3, tol=0.0001, max_iter=300)
kmeans.fit(pd.DataFrame(X), init_mu=init_mu)
y_pred = kmeans.predict(pd.DataFrame(X))
accuracy = kmeans.accuracy(labels.values.flatten(), y_pred)
print(f"Accuracy using provided initial centres: {accuracy}")

Accuracy using provided initial centres: 0.0


  return np.mean(y_true == y_pred)


If you initialize the K-Means clustering algorithm with the true centers obtained after the true clustering, it is likely that the algorithm will converge to the same set of centers, as it considers the initial centers as the optimal centers to minimize the within-cluster variance. This is because the initial centers are already optimized based on the data distribution, so the K-Means algorithm will try to find the same set of centers during the clustering process
2
.
However, it is essential to note that the K-Means algorithm is sensitive to the initial placement of centroids. Different initialization methods can lead to different convergence results
1
. While using the true centers from the previous clustering might seem like a good idea, it may not always result in the best clustering solution, as the algorithm is still dependent on the initial centroids.
In summary, while using the true centers obtained after the true clustering might seem like a straightforward approach, it may not always lead to the best clustering results. The initial placement of centroids plays a crucial role in the convergence of the K-Means algorithm, and different initialization methods can lead to different results
1
2
.