# Import Libraries

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.cluster import OPTICS
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

- This script performs PCA dimension reduction and clustering using the OPTICS algorithm on the California Housing dataset.
- It tests various PCA dimensions and iterates over different configurations of OPTICS, adjusting parameters like minimum samples, xi, and minimum cluster size.
- The performance of each OPTICS configuration is evaluated using metrics such as Silhouette Score, Davies-Bouldin Index, and Calinski-Harabasz Index.
- Results, including the number of clusters formed and performance metrics, are systematically collected in a DataFrame, providing a comprehensive analysis of how different OPTICS settings influence clustering effectiveness.
- Special consideration is given to handle cases where OPTICS may detect less than two clusters, signifying potential issues with parameter settings or data suitability.

# Data Loading & Preprocessing

In [8]:
path = 'C:\\Users\\zcindemir\\Desktop\\Data Mining Project\\housing.csv'
housing = pd.read_csv(path)

housing['total_bedrooms'].fillna(housing['total_bedrooms'].median(), inplace=True)

encoder = OneHotEncoder()
ocean_proximity_encoded = encoder.fit_transform(housing[['ocean_proximity']]).toarray()
feature_names = encoder.get_feature_names_out(['ocean_proximity'])
ocean_proximity_df = pd.DataFrame(ocean_proximity_encoded, columns=feature_names)
housing_encoded = pd.concat([housing.drop('ocean_proximity', axis=1), ocean_proximity_df], axis=1)

scaler = StandardScaler()
housing_scaled = scaler.fit_transform(housing_encoded)

In [9]:
all_scores_optics = []

pca_dimensions = [2, 3, 4]

min_samples_values = range(5, 56, 10)
xi_values = [0.05, 0.1, 0.15]
min_cluster_size_values = range(5, 56, 10)

for dim in pca_dimensions:
    pca = PCA(n_components=dim)
    housing_pca = pca.fit_transform(housing_scaled)

    for min_samples in min_samples_values:
        for xi in xi_values:
            for min_cluster_size in min_cluster_size_values:
                optics = OPTICS(min_samples=min_samples, xi=xi, min_cluster_size=min_cluster_size)
                cluster_labels = optics.fit_predict(housing_pca)

                if len(np.unique(cluster_labels)) < 2:
                    continue

                silhouette = silhouette_score(housing_pca, cluster_labels) if len(set(cluster_labels)) > 1 else -1
                davies = davies_bouldin_score(housing_pca, cluster_labels) if len(set(cluster_labels)) > 1 else float('inf')
                calinski = calinski_harabasz_score(housing_pca, cluster_labels) if len(set(cluster_labels)) > 1 else 0

                all_scores_optics.append({
                    'PCA Dimensions': dim,
                    'Min Samples': min_samples,
                    'Xi': xi,
                    'Min Cluster Size': min_cluster_size,
                    'Number of Clusters': len(np.unique(cluster_labels)),
                    'Silhouette Score': silhouette,
                    'Davies-Bouldin Index': davies,
                    'Calinski-Harabasz Index': calinski
                })

optics_results_df = pd.DataFrame(all_scores_optics)

In [10]:
best_silhouette_row = optics_results_df.loc[optics_results_df['Silhouette Score'].idxmax()]
best_silhouette_row

PCA Dimensions                2.000000
Min Samples                  15.000000
Xi                            0.050000
Min Cluster Size             55.000000
Number of Clusters            2.000000
Silhouette Score              0.843555
Davies-Bouldin Index          0.307048
Calinski-Harabasz Index    1805.292548
Name: 23, dtype: float64

In [11]:
best_db_row = optics_results_df.loc[optics_results_df['Davies-Bouldin Index'].idxmin()]
best_db_row

PCA Dimensions                2.000000
Min Samples                  15.000000
Xi                            0.100000
Min Cluster Size             25.000000
Number of Clusters            2.000000
Silhouette Score              0.837356
Davies-Bouldin Index          0.306395
Calinski-Harabasz Index    2090.089207
Name: 26, dtype: float64

In [12]:
best_ch_row = optics_results_df.loc[optics_results_df['Calinski-Harabasz Index'].idxmax()]
best_ch_row

PCA Dimensions                2.000000
Min Samples                  45.000000
Xi                            0.050000
Min Cluster Size             35.000000
Number of Clusters            2.000000
Silhouette Score              0.759991
Davies-Bouldin Index          0.451217
Calinski-Harabasz Index    4876.698861
Name: 51, dtype: float64