In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances

class SimpleKMedoids:
    """Minimal k-medoids clusterer using alternating assign/update steps.

    Medoids are actual rows of the input. Each iteration assigns every sample
    to its nearest medoid (Euclidean distance) and then, inside every
    cluster, promotes the member with the smallest summed distance to the
    other members. Iteration stops when the set of medoids stops changing or
    after ``max_iter`` rounds.

    Parameters
    ----------
    n_clusters : int, default=3
        Number of clusters (and medoids) to form.
    max_iter : int, default=100
        Maximum number of assign/update rounds.
    random_state : int or None, default=None
        Seed for the random choice of the initial medoids.

    Attributes (set by ``fit_predict``)
    -----------------------------------
    medoid_indices_ : ndarray of shape (n_clusters,)
        Row indices of the final medoids in the fitted array.
    labels_ : ndarray of shape (n_samples,)
        Index of the nearest final medoid for every sample.
    inertia_ : float
        Sum of (non-squared) Euclidean distances from each sample to its
        nearest final medoid. Note that this is not the same quantity as
        ``sklearn.cluster.KMeans.inertia_``, which sums *squared* distances.
    """

    def __init__(self, n_clusters=3, max_iter=100, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def fit_predict(self, X):
        """Cluster ``X`` and return the cluster index of every sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to cluster.

        Returns
        -------
        labels : ndarray of shape (n_samples,)
            Index of the nearest final medoid for each row of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or ``n_clusters`` is not in
            ``[1, n_samples]``.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-dimensional (n_samples, n_features); got ndim={X.ndim}"
            )
        n_samples = X.shape[0]
        if not 1 <= self.n_clusters <= n_samples:
            raise ValueError(
                f"n_clusters={self.n_clusters} must be between 1 and "
                f"n_samples={n_samples}"
            )

        # A private generator yields the same draws as seeding the global
        # NumPy RNG with the same seed, but leaves the caller's global
        # random state untouched.
        rng = np.random.RandomState(self.random_state)
        medoid_indices = rng.choice(n_samples, self.n_clusters, replace=False)

        for _ in range(self.max_iter):
            # Assignment step: nearest current medoid for every sample.
            labels = np.argmin(euclidean_distances(X, X[medoid_indices]), axis=1)

            # Update step. Starting from a copy means an empty cluster simply
            # keeps its previous medoid.
            new_medoid_indices = medoid_indices.copy()
            for k in range(self.n_clusters):
                cluster_indices = np.flatnonzero(labels == k)
                if cluster_indices.size == 0:
                    continue
                # One pairwise call per cluster; row sums are the cost of
                # electing each member as medoid. argmin keeps the first
                # minimum, i.e. the lowest sample index on ties.
                member_costs = euclidean_distances(X[cluster_indices]).sum(axis=1)
                new_medoid_indices[k] = cluster_indices[np.argmin(member_costs)]

            # Converged when the *set* of medoids is unchanged.
            if np.array_equal(np.sort(medoid_indices), np.sort(new_medoid_indices)):
                break
            medoid_indices = new_medoid_indices

        # Final assignment against the medoids that are actually stored, so
        # labels_ and inertia_ always describe the same solution (also when
        # max_iter is 0 or the loop ended without converging).
        distances = euclidean_distances(X, X[medoid_indices])
        labels = np.argmin(distances, axis=1)

        self.medoid_indices_ = medoid_indices
        self.labels_ = labels
        self.inertia_ = float(distances[np.arange(n_samples), labels].sum())

        return labels

# Load the wine dataset bundled with scikit-learn and pull out its parts
wine_data = load_wine()
X, y = wine_data.data, wine_data.target
feature_names = wine_data.feature_names

# Quick summary of what was loaded
print("Dataset Shape:", X.shape)
print("Number of classes:", len(np.unique(y)))
print("Class distribution:", np.bincount(y))
print("\nFeature names:")
for i, name in enumerate(feature_names, start=1):
    print(f"{i}. {name}")

# Z-score standardization: fit the scaler on X, then apply it to X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Sanity check on the first five features (means ~0, standard deviations ~1)
print("\nData standardized successfully")
print("Mean after scaling:", X_scaled.mean(axis=0)[:5])
print("Std after scaling:", X_scaled.std(axis=0)[:5])

# K-Means on the standardized features
print("\n" + "="*50, "K-MEANS CLUSTERING", "="*50, sep="\n")

# Fit once, then read the assignments off the fitted estimator
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10).fit(X_scaled)
kmeans_labels = kmeans.labels_

# Silhouette is computed from the data and labels alone; ARI compares the
# labels with the known wine classes
kmeans_silhouette = silhouette_score(X_scaled, kmeans_labels)
kmeans_ari = adjusted_rand_score(y, kmeans_labels)

print(
    f"K-Means Silhouette Score: {kmeans_silhouette:.4f}",
    f"K-Means Adjusted Rand Index: {kmeans_ari:.4f}",
    sep="\n",
)

# K-Medoids (the SimpleKMedoids class from this file) on the same standardized features
print("\n" + "="*50, "K-MEDOIDS CLUSTERING", "="*50, sep="\n")

kmedoids = SimpleKMedoids(n_clusters=3, random_state=42)
kmedoids_labels = kmedoids.fit_predict(X_scaled)

# Same two metrics as for K-Means so the results can be compared directly
kmedoids_silhouette = silhouette_score(X_scaled, kmedoids_labels)
kmedoids_ari = adjusted_rand_score(y, kmedoids_labels)

print(
    f"K-Medoids Silhouette Score: {kmedoids_silhouette:.4f}",
    f"K-Medoids Adjusted Rand Index: {kmedoids_ari:.4f}",
    sep="\n",
)

# Side-by-side table of both metrics for both algorithms
print("\n" + "="*50, "PERFORMANCE COMPARISON", "="*50, sep="\n")
print(f"{'Metric':<25} {'K-Means':<15} {'K-Medoids':<15}")
print("-" * 55)

# One table row per metric: (row title, K-Means value, K-Medoids value)
for metric_name, kmeans_value, kmedoids_value in (
    ('Silhouette Score', kmeans_silhouette, kmedoids_silhouette),
    ('Adjusted Rand Index', kmeans_ari, kmedoids_ari),
):
    print(f"{metric_name:<25} {kmeans_value:<15.4f} {kmedoids_value:<15.4f}")

# Project the standardized data onto two principal components for plotting
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

print(f"\nPCA explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance explained: {sum(pca.explained_variance_ratio_):.4f}")

# 2x2 figure: true classes, K-Means, K-Medoids, and a metric bar chart
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# Shared pieces of the three scatter panels
pc1_values, pc2_values = X_pca[:, 0], X_pca[:, 1]
pc1_label = f'PC1 ({pca.explained_variance_ratio_[0]:.3f})'
pc2_label = f'PC2 ({pca.explained_variance_ratio_[1]:.3f})'

# Panel 1: ground-truth wine classes
scatter1 = ax1.scatter(pc1_values, pc2_values, c=y, cmap='viridis', alpha=0.7)
ax1.set_title('Original Wine Classes')

# Panel 2: K-Means assignments, with the centroids projected into PCA space
scatter2 = ax2.scatter(pc1_values, pc2_values, c=kmeans_labels, cmap='viridis', alpha=0.7)
centroids_pca = pca.transform(kmeans.cluster_centers_)
ax2.scatter(*centroids_pca.T, c='red', marker='x', s=200, linewidths=3, label='Centroids')
ax2.set_title(f'K-Means Clustering\nSilhouette: {kmeans_silhouette:.3f}, ARI: {kmeans_ari:.3f}')

# Panel 3: K-Medoids assignments, with the medoid samples projected into PCA space
scatter3 = ax3.scatter(pc1_values, pc2_values, c=kmedoids_labels, cmap='viridis', alpha=0.7)
medoids_pca = pca.transform(X_scaled[kmedoids.medoid_indices_])
ax3.scatter(*medoids_pca.T, c='red', marker='s', s=200, linewidths=2, label='Medoids')
ax3.set_title(f'K-Medoids Clustering\nSilhouette: {kmedoids_silhouette:.3f}, ARI: {kmedoids_ari:.3f}')

# Axis labels and colorbars are the same for all three scatter panels
for panel_ax, panel_scatter in ((ax1, scatter1), (ax2, scatter2), (ax3, scatter3)):
    panel_ax.set_xlabel(pc1_label)
    panel_ax.set_ylabel(pc2_label)
    plt.colorbar(panel_scatter, ax=panel_ax)

# Only the panels with center markers need a legend
for panel_ax in (ax2, ax3):
    panel_ax.legend()

# Panel 4: grouped bars, silhouette on the left axis and ARI on a twin right axis
x_pos = np.arange(2)
silhouette_scores = [kmeans_silhouette, kmedoids_silhouette]
ari_scores = [kmeans_ari, kmedoids_ari]

ax4_twin = ax4.twinx()
bars1 = ax4.bar(x_pos - 0.2, silhouette_scores, 0.4, label='Silhouette Score', alpha=0.8)
bars2 = ax4_twin.bar(x_pos + 0.2, ari_scores, 0.4, label='Adjusted Rand Index', alpha=0.8, color='orange')

ax4.set_title('Performance Metrics Comparison')
ax4.set_xlabel('Clustering Algorithm')
ax4.set_ylabel('Silhouette Score', color='blue')
ax4_twin.set_ylabel('Adjusted Rand Index', color='orange')
ax4.set_xticks(x_pos)
ax4.set_xticklabels(['K-Means', 'K-Medoids'])

# Write each bar's value just above it, on the axis that owns the bar
for owner_ax, bar_group in ((ax4, bars1), (ax4_twin, bars2)):
    for bar in bar_group:
        bar_height = bar.get_height()
        owner_ax.text(bar.get_x() + bar.get_width()/2., bar_height + 0.01,
                      f'{bar_height:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Detailed analysis: report which algorithm scored higher on each metric,
# followed by a fixed summary of the general trade-offs.
print("\n" + "="*50)
print("DETAILED ANALYSIS")
print("="*50)

# Both algorithms can end up with the same partition, which gives exactly
# equal scores; report that as a tie instead of crediting K-Medoids.
if kmeans_silhouette > kmedoids_silhouette:
    print("K-Means produced better-defined clusters (higher silhouette score)")
elif kmeans_silhouette < kmedoids_silhouette:
    print("K-Medoids produced better-defined clusters (higher silhouette score)")
else:
    print("K-Means and K-Medoids produced equally well-defined clusters (equal silhouette score)")

if kmeans_ari > kmedoids_ari:
    print("K-Means clusters align better with true classes (higher ARI)")
elif kmeans_ari < kmedoids_ari:
    print("K-Medoids clusters align better with true classes (higher ARI)")
else:
    print("K-Means and K-Medoids clusters align equally well with true classes (equal ARI)")

# General properties of the two algorithms (static text, not derived from the run)
print("\nKey Differences Observed:")
print("- K-Means uses centroids (mean of cluster points)")
print("- K-Medoids uses medoids (actual data points as cluster centers)")
print("- K-Means is more sensitive to outliers")
print("- K-Medoids is more robust to outliers and noise")

print("\nWhen to use each algorithm:")
print("K-Means preferable when:")
print("  - Data is spherically distributed")
print("  - Computational efficiency is important")
print("  - Data has minimal outliers")

print("\nK-Medoids preferable when:")
print("  - Data contains outliers")
print("  - Need actual data points as cluster representatives")
print("  - Non-spherical cluster shapes")

# Cluster characteristics: per-cluster sizes and the objective value of each model
print("\n" + "="*50)
print("CLUSTER CHARACTERISTICS")
print("="*50)

# Take the cluster count from the fitted models rather than hard-coding it.
# Cluster ids are assigned independently by each algorithm, so "Cluster i"
# of K-Means is not necessarily the same group as "Cluster i" of K-Medoids.
for i in range(max(kmeans.n_clusters, kmedoids.n_clusters)):
    kmeans_size = np.sum(kmeans_labels == i)
    kmedoids_size = np.sum(kmedoids_labels == i)
    print(f"Cluster {i}:")
    print(f"  K-Means size: {kmeans_size}")
    print(f"  K-Medoids size: {kmedoids_size}")

# The two objectives are measured differently (squared vs. plain distances),
# so label them explicitly; the raw numbers are not directly comparable.
print(f"\nK-Means inertia (sum of squared distances to centroids): {kmeans.inertia_:.2f}")
print(f"K-Medoids inertia (sum of distances to medoids): {kmedoids.inertia_:.2f}")

 MSCS_634_Lab_3
 Roshan Gautam
 University of Cumberlands
