In [6]:
#Q1,Q2,Q3
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean as euclidean_dist, cosine as cosine_dist
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Redefined distance computation functions
def calc_euclidean_dist(point_a, point_b):
    return euclidean_dist(point_a, point_b)

def calc_cosine_similarity(point_a, point_b):
    return 1 - np.dot(point_a, point_b) / (np.linalg.norm(point_a) * np.linalg.norm(point_b))

def calc_generalized_jaccard_similarity(point_a, point_b):
    min_sum = np.minimum(point_a, point_b).sum()
    max_sum = np.maximum(point_a, point_b).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# Modified K-means algorithm
def perform_kmeans(data_points, num_clusters, distance_function, max_iterations=500):
    random_indices = np.random.choice(data_points.shape[0], num_clusters, replace=False)
    cluster_centers = data_points[random_indices]
    
    for iter in range(max_iterations):
        assigned_clusters = np.array([np.argmin([distance_function(dp, center) for center in cluster_centers]) for dp in data_points])
        updated_centers = np.array([data_points[assigned_clusters == idx].mean(axis=0) for idx in range(num_clusters)])
        
        if np.all(cluster_centers == updated_centers):
            break
        cluster_centers = updated_centers
        
        _, distances = pairwise_distances_argmin_min(data_points, cluster_centers, metric=distance_function)
        sum_squared_error = np.sum(distances ** 2)
        if iter > 0 and sum_squared_error > prev_sse:
            break
        prev_sse = sum_squared_error
    
    return assigned_clusters, cluster_centers, sum_squared_error, iter

# Load dataset
# Replace these with your actual file paths or data loading logic
data = pd.read_csv('data.csv').values
labels = pd.read_csv('label.csv').values.ravel()

# Determine number of unique clusters
num_clusters = np.unique(labels).size

def calculate_accuracy(clusters, true_labels):
    cluster_labels = {}
    for cluster in set(clusters):
        # Find the most frequent label in each cluster
        labels_in_cluster = true_labels[clusters == cluster]
        most_common_label = np.bincount(labels_in_cluster).argmax()
        cluster_labels[cluster] = most_common_label
    
    # Calculate accuracy
    correct_labels_count = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    accuracy = correct_labels_count / len(true_labels)
    return accuracy

# Execute K-means with each distance metric
kmeans_results = {}
for dist_func, func_name in [(calc_euclidean_dist, 'Euclidean'), (calc_cosine_similarity, 'Cosine'), (calc_generalized_jaccard_similarity, 'Jaccard')]:
    clusters, centers, sse, iters = perform_kmeans(data, num_clusters, dist_func)
    accuracy = calculate_accuracy(clusters, labels)
    kmeans_results[func_name] = {
        'Sum of Squared Errors': sse, 
        'Iterations Completed': iters,
        'Accuracy': accuracy
    }

# Print formatted results with accuracy
for method, result in kmeans_results.items():
    print(f"Method: {method}, SSE: {result['Sum of Squared Errors']}, Iterations: {result['Iterations Completed']}, Accuracy: {result['Accuracy']}")





Method: Euclidean, SSE: 25434764681.941788, Iterations: 89, Accuracy: 0.6027602760276027
Method: Cosine, SSE: 682.1275065419829, Iterations: 22, Accuracy: 0.6096609660966097
Method: Jaccard, SSE: 3663.479751658251, Iterations: 25, Accuracy: 0.6135613561356136


In [11]:
# Q4 - when there is no change in centroid position
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean as euclidean_dist, cosine as cosine_dist
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Redefined distance computation functions
def calc_euclidean_dist(point_a, point_b):
    return euclidean_dist(point_a, point_b)

def calc_cosine_similarity(point_a, point_b):
    return 1 - np.dot(point_a, point_b) / (np.linalg.norm(point_a) * np.linalg.norm(point_b))

def calc_generalized_jaccard_similarity(point_a, point_b):
    min_sum = np.minimum(point_a, point_b).sum()
    max_sum = np.maximum(point_a, point_b).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# Modified K-means algorithm
def perform_kmeans(data_points, num_clusters, distance_function, max_iterations=100):
    random_indices = np.random.choice(data_points.shape[0], num_clusters, replace=False)
    cluster_centers = data_points[random_indices]
    
    for iter in range(max_iterations):
        assigned_clusters = np.array([np.argmin([distance_function(dp, center) for center in cluster_centers]) for dp in data_points])
        updated_centers = np.array([data_points[assigned_clusters == idx].mean(axis=0) for idx in range(num_clusters)])
        
        if np.all(cluster_centers == updated_centers):
            break
        cluster_centers = updated_centers
        
        _, distances = pairwise_distances_argmin_min(data_points, cluster_centers, metric=distance_function)
        sum_squared_error = np.sum(distances ** 2)
        #if iter > 0 and sum_squared_error > prev_sse:
        #    break
        prev_sse = sum_squared_error
    
    return assigned_clusters, cluster_centers, sum_squared_error, iter

# Load dataset
# Replace these with your actual file paths or data loading logic
data = pd.read_csv('data.csv').values
labels = pd.read_csv('label.csv').values.ravel()

# Determine number of unique clusters
num_clusters = np.unique(labels).size

def calculate_accuracy(clusters, true_labels):
    cluster_labels = {}
    for cluster in set(clusters):
        # Find the most frequent label in each cluster
        labels_in_cluster = true_labels[clusters == cluster]
        most_common_label = np.bincount(labels_in_cluster).argmax()
        cluster_labels[cluster] = most_common_label
    
    # Calculate accuracy
    correct_labels_count = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    accuracy = correct_labels_count / len(true_labels)
    return accuracy

# Execute K-means with each distance metric
kmeans_results = {}
for dist_func, func_name in [(calc_euclidean_dist, 'Euclidean'), (calc_cosine_similarity, 'Cosine'), (calc_generalized_jaccard_similarity, 'Jaccard')]:
    clusters, centers, sse, iters = perform_kmeans(data, num_clusters, dist_func)
    accuracy = calculate_accuracy(clusters, labels)
    kmeans_results[func_name] = {
        'Sum of Squared Errors': sse, 
        'Iterations Completed': iters,
        'Accuracy': accuracy
    }

# Print formatted results with accuracy
for method, result in kmeans_results.items():
    print(f"Method: {method}, SSE: {result['Sum of Squared Errors']}, Iterations: {result['Iterations Completed']}, Accuracy: {result['Accuracy']}")



Method: Euclidean, SSE: 25318083736.17222, Iterations: 99, Accuracy: 0.58995899589959
Method: Cosine, SSE: 692.1907174210952, Iterations: 56, Accuracy: 0.6109610961096109
Method: Jaccard, SSE: 3678.396182256969, Iterations: 48, Accuracy: 0.6231623162316232


In [12]:
# Q4 - when the SSE value increases in the next iteration
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean as euclidean_dist, cosine as cosine_dist
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Redefined distance computation functions
def calc_euclidean_dist(point_a, point_b):
    return euclidean_dist(point_a, point_b)

def calc_cosine_similarity(point_a, point_b):
    return 1 - np.dot(point_a, point_b) / (np.linalg.norm(point_a) * np.linalg.norm(point_b))

def calc_generalized_jaccard_similarity(point_a, point_b):
    min_sum = np.minimum(point_a, point_b).sum()
    max_sum = np.maximum(point_a, point_b).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# Modified K-means algorithm
def perform_kmeans(data_points, num_clusters, distance_function, max_iterations=100):
    random_indices = np.random.choice(data_points.shape[0], num_clusters, replace=False)
    cluster_centers = data_points[random_indices]
    
    for iter in range(max_iterations):
        assigned_clusters = np.array([np.argmin([distance_function(dp, center) for center in cluster_centers]) for dp in data_points])
        updated_centers = np.array([data_points[assigned_clusters == idx].mean(axis=0) for idx in range(num_clusters)])
        
        #if np.all(cluster_centers == updated_centers):
        #    break
        cluster_centers = updated_centers
        
        _, distances = pairwise_distances_argmin_min(data_points, cluster_centers, metric=distance_function)
        sum_squared_error = np.sum(distances ** 2)
        if iter > 0 and sum_squared_error > prev_sse:
            break
        prev_sse = sum_squared_error
    
    return assigned_clusters, cluster_centers, sum_squared_error, iter

# Load dataset
# Replace these with your actual file paths or data loading logic
data = pd.read_csv('data.csv').values
labels = pd.read_csv('label.csv').values.ravel()

# Determine number of unique clusters
num_clusters = np.unique(labels).size

def calculate_accuracy(clusters, true_labels):
    cluster_labels = {}
    for cluster in set(clusters):
        # Find the most frequent label in each cluster
        labels_in_cluster = true_labels[clusters == cluster]
        most_common_label = np.bincount(labels_in_cluster).argmax()
        cluster_labels[cluster] = most_common_label
    
    # Calculate accuracy
    correct_labels_count = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    accuracy = correct_labels_count / len(true_labels)
    return accuracy

# Execute K-means with each distance metric
kmeans_results = {}
for dist_func, func_name in [(calc_euclidean_dist, 'Euclidean'), (calc_cosine_similarity, 'Cosine'), (calc_generalized_jaccard_similarity, 'Jaccard')]:
    clusters, centers, sse, iters = perform_kmeans(data, num_clusters, dist_func)
    accuracy = calculate_accuracy(clusters, labels)
    kmeans_results[func_name] = {
        'Sum of Squared Errors': sse, 
        'Iterations Completed': iters,
        'Accuracy': accuracy
    }

# Print formatted results with accuracy
for method, result in kmeans_results.items():
    print(f"Method: {method}, SSE: {result['Sum of Squared Errors']}, Iterations: {result['Iterations Completed']}, Accuracy: {result['Accuracy']}")



Method: Euclidean, SSE: 25400077155.69669, Iterations: 99, Accuracy: 0.6004600460046005
Method: Cosine, SSE: 687.3261854086236, Iterations: 17, Accuracy: 0.5552555255525553
Method: Jaccard, SSE: 3658.673997765767, Iterations: 25, Accuracy: 0.5999599959995999


In [13]:
# Q4 - when the maximum preset value (e.g., 100) of iteration is complete
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean as euclidean_dist, cosine as cosine_dist
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import LabelEncoder

# Redefined distance computation functions
def calc_euclidean_dist(point_a, point_b):
    return euclidean_dist(point_a, point_b)

def calc_cosine_similarity(point_a, point_b):
    return 1 - np.dot(point_a, point_b) / (np.linalg.norm(point_a) * np.linalg.norm(point_b))

def calc_generalized_jaccard_similarity(point_a, point_b):
    min_sum = np.minimum(point_a, point_b).sum()
    max_sum = np.maximum(point_a, point_b).sum()
    return 1 - min_sum / max_sum if max_sum != 0 else 0

# Modified K-means algorithm
def perform_kmeans(data_points, num_clusters, distance_function, max_iterations=100):
    random_indices = np.random.choice(data_points.shape[0], num_clusters, replace=False)
    cluster_centers = data_points[random_indices]
    
    for iter in range(max_iterations):
        assigned_clusters = np.array([np.argmin([distance_function(dp, center) for center in cluster_centers]) for dp in data_points])
        updated_centers = np.array([data_points[assigned_clusters == idx].mean(axis=0) for idx in range(num_clusters)])
        
        #if np.all(cluster_centers == updated_centers):
        #    break
        cluster_centers = updated_centers
        
        _, distances = pairwise_distances_argmin_min(data_points, cluster_centers, metric=distance_function)
        sum_squared_error = np.sum(distances ** 2)
        #if iter > 0 and sum_squared_error > prev_sse:
        #    break
        prev_sse = sum_squared_error
    
    return assigned_clusters, cluster_centers, sum_squared_error, iter

# Load dataset
# Replace these with your actual file paths or data loading logic
data = pd.read_csv('data.csv').values
labels = pd.read_csv('label.csv').values.ravel()

# Determine number of unique clusters
num_clusters = np.unique(labels).size

def calculate_accuracy(clusters, true_labels):
    cluster_labels = {}
    for cluster in set(clusters):
        # Find the most frequent label in each cluster
        labels_in_cluster = true_labels[clusters == cluster]
        most_common_label = np.bincount(labels_in_cluster).argmax()
        cluster_labels[cluster] = most_common_label
    
    # Calculate accuracy
    correct_labels_count = sum(cluster_labels[cluster] == true_label for cluster, true_label in zip(clusters, true_labels))
    accuracy = correct_labels_count / len(true_labels)
    return accuracy

# Execute K-means with each distance metric
kmeans_results = {}
for dist_func, func_name in [(calc_euclidean_dist, 'Euclidean'), (calc_cosine_similarity, 'Cosine'), (calc_generalized_jaccard_similarity, 'Jaccard')]:
    clusters, centers, sse, iters = perform_kmeans(data, num_clusters, dist_func)
    accuracy = calculate_accuracy(clusters, labels)
    kmeans_results[func_name] = {
        'Sum of Squared Errors': sse, 
        'Iterations Completed': iters,
        'Accuracy': accuracy
    }

# Print formatted results with accuracy
for method, result in kmeans_results.items():
    print(f"Method: {method}, SSE: {result['Sum of Squared Errors']}, Iterations: {result['Iterations Completed']}, Accuracy: {result['Accuracy']}")



Method: Euclidean, SSE: 25436632644.588085, Iterations: 99, Accuracy: 0.6062606260626062
Method: Cosine, SSE: 682.0748392570059, Iterations: 99, Accuracy: 0.6131613161316132
Method: Jaccard, SSE: 3678.4366672960273, Iterations: 99, Accuracy: 0.6230623062306231
