# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans, MeanShift, AgglomerativeClustering, Birch, OPTICS, DBSCAN
from sklearn_extra.cluster import KMedoids
from sklearn.mixture import GaussianMixture
from hdbscan import HDBSCAN
from pyclustering.cluster.kmedians import kmedians
from pyclustering.cluster import cluster_visualizer
from pyclustering.utils import timedcall
from time import time
import memory_profiler

- This script performs a comparative analysis of various clustering algorithms on the California Housing dataset, utilizing PCA for dimension reduction to 2, 3, and 4 components.
- It includes:
  1. Two sets of clustering algorithms: traditional ones like K-Means, Mini Batch K-Means, and K-Medoids, and advanced ones like Mean Shift, Gaussian Mixture, and other clustering methods (Agglomerative Clustering, BIRCH, OPTICS, HDBSCAN, DBSCAN).
  2. Execution of each clustering algorithm on PCA-reduced data, measuring performance metrics such as Silhouette Score, Davies-Bouldin Index, and Calinski-Harabasz Index to evaluate cluster quality.
  3. Calculation of resource consumption for each algorithm, including time and memory usage, to assess efficiency.
- The results are collected in two separate DataFrames for detailed analysis and comparison of the effectiveness and efficiency of each algorithm across different PCA settings.

# Data Loading & Preprocessing

In [2]:
path = 'C:\\Users\\zcindemir\\Desktop\\Data Mining Project\\housing.csv'
housing = pd.read_csv(path)

housing['total_bedrooms'].fillna(housing['total_bedrooms'].median(), inplace=True)

encoder = OneHotEncoder()
ocean_proximity_encoded = encoder.fit_transform(housing[['ocean_proximity']]).toarray()
feature_names = encoder.get_feature_names_out(['ocean_proximity'])
ocean_proximity_df = pd.DataFrame(ocean_proximity_encoded, columns=feature_names)
housing_encoded = pd.concat([housing.drop('ocean_proximity', axis=1), ocean_proximity_df], axis=1)

scaler = StandardScaler()
housing_scaled = scaler.fit_transform(housing_encoded)

## K-means, K-Medoids, K-Medians and Mini Batch K-Means

In [3]:
pca_results = {}
for n_components in [2, 3, 4]:
    pca = PCA(n_components=n_components)
    pca_results[n_components] = pca.fit_transform(housing_scaled)

cluster_algorithms = {
    'K-Means': KMeans(n_clusters=6, n_init='auto'),
    'Mini Batch K-Means': MiniBatchKMeans(n_clusters=6, n_init='auto', batch_size=3072),
    'K-Medoids': KMedoids(n_clusters=6),
    'K-Medians': None
}

results = []
for n_components, data in pca_results.items():
    for algo_name, algo in cluster_algorithms.items():
        start_time = time()
        mem_usage_start = memory_profiler.memory_usage()

        if algo_name == 'K-Medians':
            initial_medians = [data[np.random.randint(0, len(data))].tolist() for _ in range(6)]
            kmedians_instance = kmedians(data, initial_medians)
            (ticks, _) = timedcall(kmedians_instance.process)
            clusters = kmedians_instance.get_clusters()
            labels = np.zeros(len(data))
            for cluster_id, cluster in enumerate(clusters):
                for index in cluster:
                    labels[index] = cluster_id
        else:
            labels = algo.fit_predict(data)

        mem_usage_end = memory_profiler.memory_usage()
        end_time = time()

        if np.unique(labels).size > 1:
            silhouette = metrics.silhouette_score(data, labels)
            db_index = metrics.davies_bouldin_score(data, labels)
            ch_index = metrics.calinski_harabasz_score(data, labels)
        else:
            silhouette, db_index, ch_index = np.nan, np.nan, np.nan
        
        time_consumed = end_time - start_time
        memory_consumed = mem_usage_end[0] - mem_usage_start[0]
        
        results.append({
            'PCA Components': n_components,
            'Algorithm': algo_name,
            'Silhouette Score': silhouette,
            'Davies-Bouldin Index': db_index,
            'Calinski-Harabasz Index': ch_index,
            'Time Consumed (s)': time_consumed,
            'Memory Consumed (MB)': memory_consumed
        })


results_df_1 = pd.DataFrame(results)

In [4]:
indexed_df_1 = results_df_1.set_index(['PCA Components', 'Algorithm']).sort_index()
indexed_df_1

Unnamed: 0_level_0,Unnamed: 1_level_0,Silhouette Score,Davies-Bouldin Index,Calinski-Harabasz Index,Time Consumed (s),Memory Consumed (MB)
PCA Components,Algorithm,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,K-Means,0.394311,0.84235,15899.367999,0.348015,3.550781
2,K-Medians,0.383132,0.801364,13780.585734,0.86787,23.269531
2,K-Medoids,0.384561,0.802897,13500.902674,33.880282,1.082031
2,Mini Batch K-Means,0.3259,0.979884,15309.544182,0.344059,0.511719
3,K-Means,0.389376,1.026925,9428.270264,0.263344,0.929688
3,K-Medians,0.307257,1.221944,8056.220836,0.925411,2.789062
3,K-Medoids,0.277851,1.129313,7270.842536,21.600055,-0.542969
3,Mini Batch K-Means,0.341428,1.02191,9541.037522,0.343117,0.007812
4,K-Means,0.401001,0.887667,8630.736117,0.278317,1.171875
4,K-Medians,0.335333,1.046658,7017.5962,1.148998,0.03125


## Mean Shift, Gaussian Mixture Model, Agglomerative Clustering, BIRCH, OPTICS, HDBSCAN, and DBSCAN

In [5]:
pca_results = {}
for n_components in [2, 3, 4]:
    pca = PCA(n_components=n_components)
    pca_results[n_components] = pca.fit_transform(housing_scaled)

cluster_algorithms = {
    'Mean Shift': MeanShift(bandwidth=4.026624),
    'Gaussian Mixture': GaussianMixture(n_components=2, covariance_type='spherical', random_state=42),
    'Agglomerative Clustering': AgglomerativeClustering(n_clusters=6, linkage='single', metric='euclidean'),
    'BIRCH': Birch(threshold=0.5, branching_factor=100, n_clusters=5),
    'OPTICS': OPTICS(min_samples=15, xi=0.05, min_cluster_size=55),
    'HDBSCAN': HDBSCAN(min_cluster_size=5, min_samples=10),
    'DBSCAN': DBSCAN(eps=0.5, min_samples=10)
}

results = []
for n_components, data in pca_results.items():
    for algo_name, algo in cluster_algorithms.items():
        start_time = time()
        mem_usage_start = memory_profiler.memory_usage()

        labels = algo.fit_predict(data)

        mem_usage_end = memory_profiler.memory_usage()
        end_time = time()

        if np.unique(labels).size > 1:
            silhouette = metrics.silhouette_score(data, labels)
            db_index = metrics.davies_bouldin_score(data, labels)
            ch_index = metrics.calinski_harabasz_score(data, labels)
        else:
            silhouette, db_index, ch_index = np.nan, np.nan, np.nan
        
        time_consumed = end_time - start_time
        memory_consumed = mem_usage_end[0] - mem_usage_start[0]
        
        results.append({
            'PCA Components': n_components,
            'Algorithm': algo_name,
            'Silhouette Score': silhouette,
            'Davies-Bouldin Index': db_index,
            'Calinski-Harabasz Index': ch_index,
            'Time Consumed (s)': time_consumed,
            'Memory Consumed (MB)': memory_consumed
        })


results_df_2 = pd.DataFrame(results)

In [6]:
indexed_df_2 = results_df_2.set_index(['PCA Components', 'Algorithm']).sort_index()
indexed_df_2

Unnamed: 0_level_0,Unnamed: 1_level_0,Silhouette Score,Davies-Bouldin Index,Calinski-Harabasz Index,Time Consumed (s),Memory Consumed (MB)
PCA Components,Algorithm,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,Agglomerative Clustering,0.856173,0.184165,140.744925,2.778912,0.707031
2,BIRCH,0.553121,0.620969,2786.190475,0.661422,1.84375
2,DBSCAN,0.652755,1.110853,1215.766418,0.824777,-0.4375
2,Gaussian Mixture,0.306959,0.928817,6612.322513,0.338819,0.023438
2,HDBSCAN,0.655634,1.821081,967.965316,2.491049,1.878906
2,Mean Shift,0.840483,0.29796,1989.632787,100.202932,7.4375
2,OPTICS,0.843555,0.307048,1805.292548,22.904103,0.25
3,Agglomerative Clustering,0.826745,0.198335,106.149913,2.832567,0.671875
3,BIRCH,0.194931,1.244309,3419.909668,0.883332,-7.957031
3,DBSCAN,0.568741,1.00222,3582.298645,0.859624,-4.433594
