# Import Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

- This script performs PCA dimension reduction and clustering using HDBSCAN on the California Housing dataset.
- It tests various PCA dimensions and explores different configurations of HDBSCAN, adjusting parameters like minimum cluster size and samples.
- Clustering performance is evaluated using metrics such as Silhouette Score, Davies-Bouldin Index, and Calinski-Harabasz Index.
- The script captures the clustering metrics for different parameter settings in a DataFrame, providing a comprehensive analysis of clustering effectiveness.
- Special attention is given to handling noise as identified by HDBSCAN, with metrics computed only for valid clusters.

# Data Loading & Preprocessing

In [2]:
path = 'C:\\Users\\zcindemir\\Desktop\\Data Mining Project\\housing.csv'
housing = pd.read_csv(path)

housing['total_bedrooms'].fillna(housing['total_bedrooms'].median(), inplace=True)

encoder = OneHotEncoder()
ocean_proximity_encoded = encoder.fit_transform(housing[['ocean_proximity']]).toarray()
feature_names = encoder.get_feature_names_out(['ocean_proximity'])
ocean_proximity_df = pd.DataFrame(ocean_proximity_encoded, columns=feature_names)
housing_encoded = pd.concat([housing.drop('ocean_proximity', axis=1), ocean_proximity_df], axis=1)

scaler = StandardScaler()
housing_scaled = scaler.fit_transform(housing_encoded)

In [3]:
all_scores_hdbscan = []

pca_dimensions = [2, 3, 4]

min_cluster_size_values = [5, 10, 15]
min_samples_values = [5, 10, 15]

for dim in pca_dimensions:
    pca = PCA(n_components=dim)
    housing_pca = pca.fit_transform(housing_scaled)

    for min_cluster_size in min_cluster_size_values:
        for min_samples in min_samples_values:
            hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples)
            cluster_labels = hdbscan_model.fit_predict(housing_pca)

            if np.unique(cluster_labels).size > 1:
                silhouette = silhouette_score(housing_pca, cluster_labels)
                davies = davies_bouldin_score(housing_pca, cluster_labels)
                calinski = calinski_harabasz_score(housing_pca, cluster_labels)

                all_scores_hdbscan.append({
                    'PCA Dimensions': dim,
                    'Min Cluster Size': min_cluster_size,
                    'Min Samples': min_samples,
                    'Silhouette Score': silhouette,
                    'Davies-Bouldin Index': davies,
                    'Calinski-Harabasz Index': calinski
                })

hdbscan_results_df = pd.DataFrame(all_scores_hdbscan)

In [4]:
best_silhouette_row = hdbscan_results_df.loc[hdbscan_results_df['Silhouette Score'].idxmax()]
best_silhouette_row

PCA Dimensions                3.000000
Min Cluster Size              5.000000
Min Samples                  10.000000
Silhouette Score              0.612225
Davies-Bouldin Index          1.970577
Calinski-Harabasz Index    1043.581384
Name: 10, dtype: float64

In [5]:
best_db_row = hdbscan_results_df.loc[hdbscan_results_df['Davies-Bouldin Index'].idxmin()]
best_db_row

PCA Dimensions                2.000000
Min Cluster Size             15.000000
Min Samples                  15.000000
Silhouette Score              0.567482
Davies-Bouldin Index          1.115248
Calinski-Harabasz Index    2634.406730
Name: 8, dtype: float64

In [6]:
best_ch_row = hdbscan_results_df.loc[hdbscan_results_df['Calinski-Harabasz Index'].idxmax()]
best_ch_row

PCA Dimensions                4.000000
Min Cluster Size             15.000000
Min Samples                  10.000000
Silhouette Score              0.304936
Davies-Bouldin Index          1.162807
Calinski-Harabasz Index    3778.307170
Name: 25, dtype: float64