In [1]:
import numpy as np
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_jaccard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=500):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    
    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])
        
        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids
        
        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        if iteration > 0 and sse > previous_sse:
            break
        previous_sse = sse
    
    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)


X = pd.read_csv('kmeans_data/data.csv').values
y = pd.read_csv('kmeans_data/label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'), 
    (cosine_similarity, 'Cosine'), 
    (generalized_jaccard_similarity, 'Jaccard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)

{'Euclidean': {'SSE': 25391591181.100655, 'Accuracy': 0.5954595459545955, 'Iterations': 108}, 'Cosine': {'SSE': 686.113709726036, 'Accuracy': 0.6267626762676267, 'Iterations': 18}, 'Jaccard': {'SSE': 3729.7645035080945, 'Accuracy': 0.6086608660866086, 'Iterations': 24}}


In [2]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_jaccard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        if np.all(centroids == new_centroids):
            break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        # if iteration > 0 and sse > previous_sse:
        #     break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('kmeans_data/data.csv').values
y = pd.read_csv('kmeans_data/label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_jaccard_similarity, 'Jaccard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)

{'Euclidean': {'SSE': 25429883855.464195, 'Accuracy': 0.6051605160516051, 'Iterations': 75}, 'Cosine': {'SSE': 684.2274297550289, 'Accuracy': 0.5440544054405441, 'Iterations': 54}, 'Jaccard': {'SSE': 3664.612330222969, 'Accuracy': 0.5987598759875987, 'Iterations': 26}}


In [3]:
# Q4 - when the SSE value increases in the next iteration
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_jaccard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        # if np.all(centroids == new_centroids):
        #     break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        if iteration > 0 and sse > previous_sse:
            break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('kmeans_data/data.csv').values
y = pd.read_csv('kmeans_data/label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_jaccard_similarity, 'Jaccard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)


{'Euclidean': {'SSE': 25320344040.518368, 'Accuracy': 0.5964596459645964, 'Iterations': 99}, 'Cosine': {'SSE': 682.6670948604199, 'Accuracy': 0.5962596259625963, 'Iterations': 31}, 'Jaccard': {'SSE': 3689.7627184534444, 'Accuracy': 0.5493549354935493, 'Iterations': 21}}


In [4]:
# Q4 - when the maximum preset value (e.g., 100) of iteration is complete
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean, cosine
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Distance computation functions
def euclidean_distance(x, y):
    return euclidean(x, y)

def cosine_similarity(x, y):
    return 1 - np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

def generalized_jaccard_similarity(x, y):
    min_sum = np.minimum(x, y).sum()
    max_sum = np.maximum(x, y).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# K-means algorithm
def kmeans(X, k, distance_func, max_iters=100):
    indices = np.random.choice(X.shape[0], k, replace=False)
    centroids = X[indices]
    for iteration in range(max_iters):
        clusters = np.array([np.argmin([distance_func(x, centroid) for centroid in centroids]) for x in X])
        new_centroids = np.array([X[clusters == i].mean(axis=0) for i in range(k)])

        # if np.all(centroids == new_centroids):
        #     break
        centroids = new_centroids

        _, dist = pairwise_distances_argmin_min(X, centroids, metric=distance_func)
        sse = np.sum(dist ** 2)
        # if iteration > 0 and sse > previous_sse:
        #     break
        previous_sse = sse

    return clusters, centroids, sse, iteration

# Helper functions for cluster labeling and accuracy calculation
def label_clusters(clusters, true_labels):
    cluster_labels = {}
    for cluster in np.unique(clusters):
        labels, counts = np.unique(true_labels[clusters == cluster], return_counts=True)
        cluster_labels[cluster] = labels[np.argmax(counts)]
    return cluster_labels

def calculate_accuracy(clusters, cluster_labels, true_labels):
    correct_predictions = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    return correct_predictions / len(true_labels)

# Load your dataset here
# X = (your data points)
# y = (your labels)
X = pd.read_csv('kmeans_data/data.csv').values
y = pd.read_csv('kmeans_data/label.csv').values.ravel()
# Number of clusters
k = np.unique(y).size

# Run K-means for each distance metric
results = {}
for distance_func, name in [
    (euclidean_distance, 'Euclidean'),
    (cosine_similarity, 'Cosine'),
    (generalized_jaccard_similarity, 'Jaccard')
]:
    clusters, centroids, sse, iterations = kmeans(X, k, distance_func)
    cluster_labels = label_clusters(clusters, y)
    accuracy = calculate_accuracy(clusters, cluster_labels, y)
    results[name] = {'SSE': sse, 'Accuracy': accuracy, 'Iterations': iterations}

print(results)

{'Euclidean': {'SSE': 25534122783.11035, 'Accuracy': 0.5604560456045604, 'Iterations': 99}, 'Cosine': {'SSE': 688.3730144624783, 'Accuracy': 0.6532653265326532, 'Iterations': 99}, 'Jaccard': {'SSE': 3660.860586165666, 'Accuracy': 0.6038603860386038, 'Iterations': 99}}
