# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib.pyplot as plt
from sklearn.cluster import MeanShift, estimate_bandwidth

- This notebook applies PCA dimensionality reduction and MeanShift clustering to the California Housing dataset.
- It explores PCA projections with 2, 3, and 4 components and, for each projection, tests MeanShift bandwidths at 0.75×, 1×, and 1.25× of the baseline returned by `estimate_bandwidth`.
- The performance of each MeanShift configuration is evaluated using metrics such as Silhouette Score, Davies-Bouldin Index, and Calinski-Harabasz Index.
- Results, including the number of clusters formed and performance metrics, are systematically collected in a DataFrame, providing a detailed analysis of how different bandwidth settings influence clustering outcomes.

# Data Loading & Preprocessing

In [2]:
# Load the housing data, impute missing values, one-hot encode the categorical
# column, and standardize every resulting column for PCA / clustering.

# NOTE(review): absolute, machine-specific path; this cell only runs where the
# file exists at this exact location. Adjust it for your environment.
path = 'C:\\Users\\zcindemir\\Desktop\\Data Mining Project\\housing.csv'
housing = pd.read_csv(path)

# Impute missing bedroom counts with the column median. The result is assigned
# back rather than using fillna(inplace=True) on the column selection: that
# chained in-place call is deprecated and does not update `housing` when pandas
# Copy-on-Write is active. Assigning back also keeps the cell idempotent.
bedrooms_median = housing['total_bedrooms'].median()
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(bedrooms_median)

# One-hot encode 'ocean_proximity'; .toarray() densifies the encoder's sparse output.
encoder = OneHotEncoder()
ocean_proximity_encoded = encoder.fit_transform(housing[['ocean_proximity']]).toarray()
feature_names = encoder.get_feature_names_out(['ocean_proximity'])
# Reuse housing's index so the concat below aligns row-for-row even if the
# frame's index is ever not the default 0..n-1 range.
ocean_proximity_df = pd.DataFrame(
    ocean_proximity_encoded, columns=feature_names, index=housing.index
)
housing_encoded = pd.concat([housing.drop('ocean_proximity', axis=1), ocean_proximity_df], axis=1)

# Standardize all columns of the encoded frame (numeric and one-hot alike).
scaler = StandardScaler()
housing_scaled = scaler.fit_transform(housing_encoded)

In [3]:
# Sweep PCA dimensionality x MeanShift bandwidth and collect one metrics record
# per configuration; the records are turned into a DataFrame once at the end.
all_scores_meanshift = []

pca_dimensions = [2, 3, 4]

for dim in pca_dimensions:
    pca = PCA(n_components=dim)
    housing_pca = pca.fit_transform(housing_scaled)

    # Baseline bandwidth estimated from the projected data, then bracketed
    # at 75%, 100% and 125% of that value.
    bandwidth_base = estimate_bandwidth(housing_pca)
    bandwidths = [bandwidth_base * 0.75, bandwidth_base, bandwidth_base * 1.25]

    for bandwidth in bandwidths:
        meanshift = MeanShift(bandwidth=bandwidth)
        cluster_labels = meanshift.fit_predict(housing_pca)

        n_clusters_ = len(np.unique(cluster_labels))

        # The three metrics are only defined for 2 <= n_clusters <= n_samples - 1
        # and raise ValueError otherwise. A wide bandwidth can collapse the data
        # into a single cluster, so record NaN for such runs instead of aborting
        # the whole sweep and losing the results collected so far.
        if 2 <= n_clusters_ < len(housing_pca):
            silhouette = silhouette_score(housing_pca, cluster_labels)
            davies = davies_bouldin_score(housing_pca, cluster_labels)
            calinski = calinski_harabasz_score(housing_pca, cluster_labels)
        else:
            silhouette = davies = calinski = np.nan

        all_scores_meanshift.append({
            'PCA Dimensions': dim,
            'Bandwidth': bandwidth,
            'Number of Clusters': n_clusters_,
            'Silhouette Score': silhouette,
            'Davies-Bouldin Index': davies,
            'Calinski-Harabasz Index': calinski
        })

# One row per (PCA dimension, bandwidth) configuration.
meanshift_results_df = pd.DataFrame(all_scores_meanshift)

In [4]:
# Configuration with the highest Silhouette Score (higher is better).
best_silhouette_idx = meanshift_results_df['Silhouette Score'].idxmax()
best_silhouette_row = meanshift_results_df.loc[best_silhouette_idx]
best_silhouette_row

PCA Dimensions               4.000000
Bandwidth                    4.026624
Number of Clusters           3.000000
Silhouette Score             0.773110
Davies-Bouldin Index         0.366508
Calinski-Harabasz Index    721.350155
Name: 8, dtype: float64

In [5]:
# Configuration with the lowest Davies-Bouldin Index (lower is better).
best_db_idx = meanshift_results_df['Davies-Bouldin Index'].idxmin()
best_db_row = meanshift_results_df.loc[best_db_idx]
best_db_row

PCA Dimensions               2.000000
Bandwidth                    2.516025
Number of Clusters           7.000000
Silhouette Score             0.746202
Davies-Bouldin Index         0.323255
Calinski-Harabasz Index    884.785780
Name: 2, dtype: float64

In [6]:
# Configuration with the highest Calinski-Harabasz Index (higher is better).
best_ch_idx = meanshift_results_df['Calinski-Harabasz Index'].idxmax()
best_ch_row = meanshift_results_df.loc[best_ch_idx]
best_ch_row

PCA Dimensions                2.000000
Bandwidth                     2.012820
Number of Clusters           11.000000
Silhouette Score              0.450536
Davies-Bouldin Index          0.440093
Calinski-Harabasz Index    3559.740033
Name: 1, dtype: float64