# **Import Libraries**

In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

- This notebook loads, preprocesses, scales, and clusters the California Housing dataset.
- It handles missing values, one-hot encodes the categorical feature, applies PCA for dimensionality reduction, and runs Agglomerative Clustering over several configurations (PCA dimensions, cluster counts, distance metrics, and linkage methods), evaluating each with the Silhouette Score, Davies-Bouldin Index, and Calinski-Harabasz Index.

# Data Loading & Preprocessing

In [2]:
# Location of the housing CSV. Set the HOUSING_CSV_PATH environment variable to
# run the notebook on another machine without editing this cell; when it is not
# set, the original local path is used.
path = os.environ.get(
    'HOUSING_CSV_PATH',
    'C:\\Users\\zcindemir\\Desktop\\Data Mining Project\\housing.csv',
)
housing = pd.read_csv(path)

# Fill missing 'total_bedrooms' values with the column median. The result is
# assigned back to the column instead of calling fillna(..., inplace=True) on
# the column selection: the chained inplace form is deprecated and does not
# update `housing` when pandas Copy-on-Write is active.
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(housing['total_bedrooms'].median())

# One-hot encode 'ocean_proximity' into one indicator column per category.
encoder = OneHotEncoder()
ocean_proximity_encoded = encoder.fit_transform(housing[['ocean_proximity']]).toarray()
feature_names = encoder.get_feature_names_out(['ocean_proximity'])
# Reuse housing's index so the column-wise concat below aligns row for row.
ocean_proximity_df = pd.DataFrame(ocean_proximity_encoded, columns=feature_names, index=housing.index)
housing_encoded = pd.concat([housing.drop('ocean_proximity', axis=1), ocean_proximity_df], axis=1)

# Standardize every column (numeric features and one-hot indicators alike)
# before PCA and clustering.
scaler = StandardScaler()
housing_scaled = scaler.fit_transform(housing_encoded)

In [3]:
# Grid search over PCA dimensionality, cluster count, distance metric, and
# linkage method for AgglomerativeClustering. One record of evaluation scores
# is collected per configuration and assembled into `agg_results_df`.
all_scores_agg = []

pca_dimensions = [2, 3, 4]
cluster_counts = [5, 6, 7, 8]
metrics = ['euclidean', 'manhattan']
linkages = ['ward', 'complete', 'average', 'single']


for dim in pca_dimensions:
    # The projection depends only on `dim`, so it is fitted once per dimension.
    pca = PCA(n_components=dim)
    housing_pca = pca.fit_transform(housing_scaled)

    for clusters in cluster_counts:
        for metric in metrics:
            # The loop variable is deliberately not called `linkage`: that name
            # is the function imported from scipy.cluster.hierarchy, and
            # rebinding it to a string here would break any later call to it.
            for linkage_method in linkages:
                # The 'ward' configuration is only evaluated with the
                # euclidean metric; other metric pairings are skipped.
                if linkage_method == 'ward' and metric != 'euclidean':
                    continue
                agg_clustering = AgglomerativeClustering(
                    n_clusters=clusters, linkage=linkage_method, metric=metric
                )
                cluster_labels = agg_clustering.fit_predict(housing_pca)

                # All three scores are computed on the PCA-projected data.
                silhouette = silhouette_score(housing_pca, cluster_labels)
                davies = davies_bouldin_score(housing_pca, cluster_labels)
                calinski = calinski_harabasz_score(housing_pca, cluster_labels)

                all_scores_agg.append({
                    'PCA Dimensions': dim,
                    'Number of Clusters': clusters,
                    'Metric': metric,
                    'Linkage': linkage_method,
                    'Silhouette Score': silhouette,
                    'Davies-Bouldin Index': davies,
                    'Calinski-Harabasz Index': calinski
                })


# Build the results table once from the list of records.
agg_results_df = pd.DataFrame(all_scores_agg)

In [4]:
# Pick the configuration with the largest Silhouette Score from the results table.
best_silhouette_idx = agg_results_df['Silhouette Score'].idxmax()
best_silhouette_row = agg_results_df.loc[best_silhouette_idx]
best_silhouette_row

PCA Dimensions                      2
Number of Clusters                  6
Metric                      euclidean
Linkage                        single
Silhouette Score             0.856173
Davies-Bouldin Index         0.184165
Calinski-Harabasz Index    140.744925
Name: 10, dtype: object

In [5]:
# Pick the configuration with the smallest Davies-Bouldin Index from the results table.
best_db_idx = agg_results_df['Davies-Bouldin Index'].idxmin()
best_db_row = agg_results_df.loc[best_db_idx]
best_db_row

PCA Dimensions                     2
Number of Clusters                 5
Metric                     euclidean
Linkage                       single
Silhouette Score            0.855944
Davies-Bouldin Index        0.082388
Calinski-Harabasz Index    124.09645
Name: 3, dtype: object

In [6]:
# Pick the configuration with the largest Calinski-Harabasz Index from the results table.
best_ch_idx = agg_results_df['Calinski-Harabasz Index'].idxmax()
best_ch_row = agg_results_df.loc[best_ch_idx]
best_ch_row

PCA Dimensions                        2
Number of Clusters                    7
Metric                        euclidean
Linkage                            ward
Silhouette Score               0.359905
Davies-Bouldin Index           0.897696
Calinski-Harabasz Index    14696.573486
Name: 14, dtype: object