# Import Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib.pyplot as plt
from sklearn.cluster import Birch

# Data Loading & Preprocessing

In [2]:
# Location of the California Housing CSV.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# local copy of housing.csv before running on another machine.
path = 'C:\\Users\\zcindemir\\Desktop\\Data Mining Project\\housing.csv'
housing = pd.read_csv(path)

# Impute missing bedroom counts with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: the in-place form is
# chained assignment, which pandas warns about and which no longer updates
# `housing` under Copy-on-Write, leaving NaNs for the later steps.
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(
    housing['total_bedrooms'].median()
)

# One-hot encode the categorical `ocean_proximity` column.
encoder = OneHotEncoder()
ocean_proximity_encoded = encoder.fit_transform(housing[['ocean_proximity']]).toarray()
feature_names = encoder.get_feature_names_out(['ocean_proximity'])
# Reuse housing's index so the column-wise concat below aligns row for row
# regardless of how the frame is indexed.
ocean_proximity_df = pd.DataFrame(
    ocean_proximity_encoded, columns=feature_names, index=housing.index
)
housing_encoded = pd.concat(
    [housing.drop('ocean_proximity', axis=1), ocean_proximity_df], axis=1
)

# Standardise every column (numeric features and one-hot indicators alike)
# before handing the matrix to PCA.
scaler = StandardScaler()
housing_scaled = scaler.fit_transform(housing_encoded)

- This script extends the analysis to include BIRCH clustering with hyperparameter tuning on the California Housing dataset.
- It handles data loading, preprocessing, PCA for dimension reduction, and experiments with various configurations of BIRCH clustering.
- The number of PCA components, the BIRCH threshold, the branching factor, and the number of clusters are tested exhaustively: every combination in the grid is fitted and scored.
- Clustering performance is evaluated using the Silhouette Score (higher is better), the Davies-Bouldin Index (lower is better), and the Calinski-Harabasz Index (higher is better).
- The script aims to provide insights on how different configurations impact the clustering results, capturing these in a DataFrame for further analysis.

In [3]:
# Exhaustive grid search over PCA dimensionality and BIRCH hyperparameters.
# One record (dict) per configuration is appended to `all_scores_birch`; the
# records are converted to `birch_results_df` once, after the loops finish.
all_scores_birch = []

pca_dimensions = [2, 3, 4]
threshold_values = [0.1, 0.3, 0.5]
branching_factors = [20, 50, 100]
cluster_counts = [5, 6, 7, 8]

# Seed for PCA so repeated runs give the same projection when scikit-learn
# selects a randomised SVD solver.
random_seed = 42

for dim in pca_dimensions:
    # The projection depends only on `dim`, so it is computed once per
    # dimensionality and shared by every BIRCH configuration below.
    pca = PCA(n_components=dim, random_state=random_seed)
    housing_pca = pca.fit_transform(housing_scaled)

    for threshold in threshold_values:
        for branching_factor in branching_factors:
            for clusters in cluster_counts:
                birch = Birch(threshold=threshold, branching_factor=branching_factor, n_clusters=clusters)
                cluster_labels = birch.fit_predict(housing_pca)

                if len(set(cluster_labels)) < 2:
                    # The three metrics need at least two distinct labels and
                    # raise otherwise. Record NaN so the search continues;
                    # idxmax/idxmin skip NaN when picking the best rows.
                    silhouette = davies = calinski = float('nan')
                else:
                    silhouette = silhouette_score(housing_pca, cluster_labels)
                    davies = davies_bouldin_score(housing_pca, cluster_labels)
                    calinski = calinski_harabasz_score(housing_pca, cluster_labels)

                all_scores_birch.append({
                    'PCA Dimensions': dim,
                    'Threshold': threshold,
                    'Branching Factor': branching_factor,
                    'Number of Clusters': clusters,
                    'Silhouette Score': silhouette,
                    'Davies-Bouldin Index': davies,
                    'Calinski-Harabasz Index': calinski
                })

birch_results_df = pd.DataFrame(all_scores_birch)

In [4]:
# Configuration with the largest Silhouette Score in the grid-search results.
best_silhouette_idx = birch_results_df['Silhouette Score'].idxmax()
best_silhouette_row = birch_results_df.loc[best_silhouette_idx]
best_silhouette_row

PCA Dimensions                2.000000
Threshold                     0.500000
Branching Factor            100.000000
Number of Clusters            5.000000
Silhouette Score              0.553121
Davies-Bouldin Index          0.620969
Calinski-Harabasz Index    2786.190475
Name: 32, dtype: float64

In [5]:
# Configuration with the smallest Davies-Bouldin Index in the grid-search results.
best_db_idx = birch_results_df['Davies-Bouldin Index'].idxmin()
best_db_row = birch_results_df.loc[best_db_idx]
best_db_row

PCA Dimensions                2.000000
Threshold                     0.500000
Branching Factor            100.000000
Number of Clusters            5.000000
Silhouette Score              0.553121
Davies-Bouldin Index          0.620969
Calinski-Harabasz Index    2786.190475
Name: 32, dtype: float64

In [6]:
# Configuration with the largest Calinski-Harabasz Index in the grid-search results.
best_ch_idx = birch_results_df['Calinski-Harabasz Index'].idxmax()
best_ch_row = birch_results_df.loc[best_ch_idx]
best_ch_row

PCA Dimensions                 2.000000
Threshold                      0.100000
Branching Factor              20.000000
Number of Clusters             5.000000
Silhouette Score               0.427710
Davies-Bouldin Index           0.739195
Calinski-Harabasz Index    11824.582345
Name: 0, dtype: float64