# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN

- This script performs PCA dimension reduction and DBSCAN clustering on the California Housing dataset.
- It includes hyperparameter tuning for DBSCAN with various epsilon and min_samples values across different PCA reduced dimensions.
- The script evaluates clustering performance using metrics like Silhouette Score, Davies-Bouldin Index, and Calinski-Harabasz Index, and aims to identify optimal clustering configurations. Results are collected in a DataFrame for analysis, with considerations for cases where DBSCAN might detect less than two clusters.

# Data Loading & Preprocessing

In [2]:
path = 'C:\\Users\\zcindemir\\Desktop\\Data Mining Project\\housing.csv'
housing = pd.read_csv(path)

housing['total_bedrooms'].fillna(housing['total_bedrooms'].median(), inplace=True)

encoder = OneHotEncoder()
ocean_proximity_encoded = encoder.fit_transform(housing[['ocean_proximity']]).toarray()
feature_names = encoder.get_feature_names_out(['ocean_proximity'])
ocean_proximity_df = pd.DataFrame(ocean_proximity_encoded, columns=feature_names)
housing_encoded = pd.concat([housing.drop('ocean_proximity', axis=1), ocean_proximity_df], axis=1)

scaler = StandardScaler()
housing_scaled = scaler.fit_transform(housing_encoded)

In [3]:
all_scores_dbscan = []

pca_dimensions = [2, 3, 4]

eps_values = [0.1, 0.3, 0.5]
min_samples_values = [5, 10, 20]

for dim in pca_dimensions:
    pca = PCA(n_components=dim)
    housing_pca = pca.fit_transform(housing_scaled)

    for eps in eps_values:
        for min_samples in min_samples_values:
            dbscan = DBSCAN(eps=eps, min_samples=min_samples)
            cluster_labels = dbscan.fit_predict(housing_pca)

            if len(np.unique(cluster_labels)) < 2:
                continue

            silhouette = silhouette_score(housing_pca, cluster_labels) if len(set(cluster_labels)) > 1 else -1
            davies = davies_bouldin_score(housing_pca, cluster_labels) if len(set(cluster_labels)) > 1 else float('inf')
            calinski = calinski_harabasz_score(housing_pca, cluster_labels) if len(set(cluster_labels)) > 1 else 0

            all_scores_dbscan.append({
                'PCA Dimensions': dim,
                'Epsilon': eps,
                'Min Samples': min_samples,
                'Number of Clusters': len(np.unique(cluster_labels)),
                'Silhouette Score': silhouette,
                'Davies-Bouldin Index': davies,
                'Calinski-Harabasz Index': calinski
            })

dbscan_results_df = pd.DataFrame(all_scores_dbscan)

In [4]:
best_silhouette_row = dbscan_results_df.loc[dbscan_results_df['Silhouette Score'].idxmax()]
best_silhouette_row

PCA Dimensions                2.000000
Epsilon                       0.500000
Min Samples                  10.000000
Number of Clusters            5.000000
Silhouette Score              0.652755
Davies-Bouldin Index          1.110853
Calinski-Harabasz Index    1215.766418
Name: 7, dtype: float64

In [5]:
best_db_row = dbscan_results_df.loc[dbscan_results_df['Davies-Bouldin Index'].idxmin()]
best_db_row

PCA Dimensions                2.000000
Epsilon                       0.500000
Min Samples                  20.000000
Number of Clusters            3.000000
Silhouette Score              0.566215
Davies-Bouldin Index          0.704011
Calinski-Harabasz Index    2802.916801
Name: 8, dtype: float64

In [6]:
best_ch_row = dbscan_results_df.loc[dbscan_results_df['Calinski-Harabasz Index'].idxmax()]
best_ch_row

PCA Dimensions                4.000000
Epsilon                       0.500000
Min Samples                  20.000000
Number of Clusters            7.000000
Silhouette Score              0.305811
Davies-Bouldin Index          1.348639
Calinski-Harabasz Index    3587.521084
Name: 26, dtype: float64