In [319]:
import numpy as np
from scipy.spatial import distance
from sklearn.cluster import KMeans as SklearnKMeans
from sklearn.datasets import fetch_20newsgroups, fetch_openml
from sklearn.feature_extraction.text import TfidfVectorizer


In [320]:

class KMeans:
    """Minimal k-means (Lloyd's algorithm) for dense two-dimensional data.

    Parameters
    ----------
    n_clusters : int
        Number of clusters / centroids.
    max_iterations : int, default 10000
        Upper bound on assign/update rounds performed by ``fit``.
    random_state : int or None, default None
        Seed for choosing the initial centroids. When ``None`` the global
        NumPy RNG is used, so ``np.random.seed(...)`` still controls the run.

    Attributes
    ----------
    cluster_centers_ : ndarray of shape (n_clusters, n_features) or None
        Centroids found by ``fit`` (``None`` before fitting).
    cluster_indices_ : ndarray of shape (n_samples,) or None
        Index of the nearest centroid for each training row.
    """

    def __init__(self, n_clusters, max_iterations=10000, random_state=None):
        self.n_clusters = n_clusters
        self.max_iterations = max_iterations
        self.random_state = random_state
        self.cluster_centers_ = None
        self.cluster_indices_ = None

    @staticmethod
    def _as_float_matrix(X):
        """Return X as a dense 2-D floating-point array, or raise ValueError."""
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                "X must be a dense 2-D array of shape (n_samples, n_features); "
                "got an array with ndim=%d" % X.ndim
            )
        # Integer (or bool) data must be promoted, otherwise centroid means
        # would be truncated when stored back into an integer array.
        if not np.issubdtype(X.dtype, np.floating):
            X = X.astype(float)
        return X

    def fit(self, X):
        """Cluster the rows of X and store centroids and training labels.

        Returns
        -------
        self : KMeans
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If X is not 2-D or ``n_clusters`` is not in ``1..len(X)``.
        """
        X = self._as_float_matrix(X)
        if not 1 <= self.n_clusters <= len(X):
            raise ValueError(
                "n_clusters=%r must be between 1 and the number of samples (%d)"
                % (self.n_clusters, len(X))
            )

        # Initial centroids: n_clusters distinct rows of X chosen at random.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        random_indices = rng.choice(len(X), self.n_clusters, replace=False)
        self.cluster_centers_ = X[random_indices]

        # Iterate until convergence or maximum iterations reached
        for _ in range(self.max_iterations):
            # Assign data points to the closest centroids
            cluster_labels = self._assign_clusters(X)

            # Update cluster centroids
            new_centers = self._update_centers(X, cluster_labels)

            # Check for convergence
            if np.allclose(self.cluster_centers_, new_centers):
                break

            self.cluster_centers_ = new_centers
        else:
            # The loop ended without converging (or never ran because
            # max_iterations == 0): recompute the labels so they match the
            # centroids that are actually stored.
            cluster_labels = self._assign_clusters(X)

        self.cluster_indices_ = cluster_labels
        return self

    def predict(self, X):
        """Return the index of the nearest fitted centroid for each row of X."""
        return self._assign_clusters(self._as_float_matrix(X))

    def _assign_clusters(self, X):
        """Return, for each row of X, the index of its closest centroid."""
        distances = self._compute_distances(X)
        return np.argmin(distances, axis=1)

    def _compute_distances(self, X):
        """Return the (n_samples, n_clusters) Euclidean distance matrix."""
        if self.cluster_centers_ is None:
            raise ValueError("KMeans instance is not fitted yet; call fit(X) first")
        return distance.cdist(X, self.cluster_centers_, metric='euclidean')

    def _update_centers(self, X, cluster_labels):
        """Return new centroids as the mean of the rows assigned to each cluster."""
        # Explicit float dtype: zeros_like would otherwise inherit an integer
        # dtype from the current centroids and truncate the means.
        new_centers = np.zeros_like(self.cluster_centers_, dtype=float)
        for i in range(self.n_clusters):
            mask = cluster_labels == i
            if np.any(mask):
                new_centers[i] = np.mean(X[mask], axis=0)
            else:
                # If a cluster has no data points assigned to it, keep the previous centroid
                new_centers[i] = self.cluster_centers_[i]
        return new_centers


In [321]:
from sklearn.datasets import load_digits
from sklearn.metrics import adjusted_rand_score, homogeneity_score, completeness_score
from sklearn.preprocessing import MinMaxScaler

# Seed NumPy's global RNG before fitting: the custom KMeans picks its initial
# centroids with np.random.choice, so without a seed the scores printed below
# change on every run.
SEED = 42
np.random.seed(SEED)

# Load scikit-learn's bundled handwritten-digits dataset.
# NOTE: load_digits() is the small digits set shipped with scikit-learn, not
# the full MNIST download; the printed label below is kept as originally written.
digits = load_digits()

# Rescale every feature with MinMaxScaler before clustering.
X = digits.data
X = MinMaxScaler().fit_transform(X)

# Create an instance of the KMeans class
kmeans = KMeans(n_clusters=10)

# Fit the KMeans model
kmeans.fit(X)

# Get the predicted cluster labels
predicted_labels = kmeans.predict(X)

# Evaluate the clustering against the ground-truth digit labels
true_labels = digits.target

ari = adjusted_rand_score(true_labels, predicted_labels)
homogeneity = homogeneity_score(true_labels, predicted_labels)
completeness = completeness_score(true_labels, predicted_labels)

print('MNSIST')
print("Adjusted Rand Index:", ari)
print("Homogeneity Score:", homogeneity)
print("Completeness Score:", completeness)


MNSIST
Adjusted Rand Index: 0.6159347691202096
Homogeneity Score: 0.7181580191783904
Completeness Score: 0.7465978292024404


## B: scikit-learn KMeans on Fashion-MNIST

In [322]:
import numpy as np
from tensorflow.keras.datasets import fashion_mnist
# NOTE: this import rebinds the name `KMeans` to scikit-learn's estimator,
# shadowing the custom KMeans class defined earlier in this notebook. Cells
# executed after this one get scikit-learn's implementation when they write
# `KMeans`.
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, homogeneity_score, completeness_score

# Load the Fashion-MNIST dataset; load_data() is unpacked into a train pair
# and a test pair of (inputs, labels).
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

In [323]:


# Flatten each 28x28 image into a 784-long row and divide by 255
# (assumes 8-bit pixel values, giving features in [0, 1]).
# The result is stored under a new name and X_train is left untouched, so
# re-running this cell never divides by 255 a second time.
X_train_flat = X_train.reshape(-1, 28 * 28).astype('float32') / 255.0

# Run KMeans algorithm
kmeans = KMeans(n_clusters=10)
kmeans.fit(X_train_flat)

# Evaluate the clustering performance against the training labels
ari = adjusted_rand_score(y_train, kmeans.labels_)
homogeneity = homogeneity_score(y_train, kmeans.labels_)
completeness = completeness_score(y_train, kmeans.labels_)

# Print the evaluation scores
print('Fashing dataset:')
print("Adjusted Rand Index:", ari)
print("Homogeneity Score:", homogeneity)
print("Completeness Score:", completeness)




Fashing dataset:
Adjusted Rand Index: 0.3837040596466122
Homogeneity Score: 0.5188345432236771
Completeness Score: 0.538646403109721


## C: KMeans on 20 Newsgroups (TF-IDF features)

In [324]:
# Load the 20 Newsgroups text dataset; subset='all' asks for every split
# rather than only the train or test portion.
newsgroups = fetch_20newsgroups(subset='all')

# Convert the raw documents into TF-IDF features.
# NOTE: TfidfVectorizer.fit_transform returns a SciPy sparse matrix, and this
# assignment replaces whatever `X` was bound to by earlier cells.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(newsgroups.data)
# Ground-truth newsgroup ids, used as the true labels when scoring below.
y = newsgroups.target

In [326]:
# Cluster the TF-IDF matrix with scikit-learn's KMeans, referenced through the
# explicit SklearnKMeans alias. The bare name `KMeans` can point at either the
# custom class or scikit-learn's estimator depending on which cells ran last,
# and the custom class cannot handle the sparse matrix held in X.
# NOTE(review): 20 Newsgroups has 20 categories; n_clusters=10 is kept as
# originally written -- confirm that this is intended.
kmeans = SklearnKMeans(n_clusters=10)

# Fit the KMeans model
kmeans.fit(X)

# Get the predicted cluster labels
predicted_labels = kmeans.predict(X)

# Evaluate the clustering against the ground-truth newsgroup ids
true_labels = y

ari = adjusted_rand_score(true_labels, predicted_labels)
homogeneity = homogeneity_score(true_labels, predicted_labels)
completeness = completeness_score(true_labels, predicted_labels)

print('2O NG ')
print("Adjusted Rand Index:", ari)
print("Homogeneity Score:", homogeneity)
print("Completeness Score:", completeness)



2O NG 
Adjusted Rand Index: 0.05732837013111314
Homogeneity Score: 0.16741500707236734
Completeness Score: 0.3058746258613677
