# Import Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

- This notebook applies PCA dimensionality reduction followed by Gaussian Mixture Model (GMM) clustering to the California Housing dataset.
- It explores several PCA dimensionalities (2, 3, and 4 components) and, for each, tests every combination of GMM component count (2–5) and covariance type (`full`, `tied`, `diag`, `spherical`).
- Clustering performance is evaluated using metrics such as Silhouette Score, Davies-Bouldin Index, and Calinski-Harabasz Index.
- The script systematically collects results for different parameter settings in a DataFrame for detailed analysis and comparison of clustering effectiveness.

# Data Loading & Preprocessing

In [2]:
# Location of the California Housing CSV.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs
# where this exact file exists. Consider a path relative to a configurable data directory.
path = 'C:\\Users\\zcindemir\\Desktop\\Data Mining Project\\housing.csv'
housing = pd.read_csv(path)

# Impute missing bedroom counts with the column median.
# The result is assigned back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment, which recent pandas versions deprecate and
# which leaves `housing` unchanged under Copy-on-Write, letting NaNs reach the PCA step.
housing['total_bedrooms'] = housing['total_bedrooms'].fillna(housing['total_bedrooms'].median())

# One-hot encode the categorical 'ocean_proximity' column.
encoder = OneHotEncoder()
ocean_proximity_encoded = encoder.fit_transform(housing[['ocean_proximity']]).toarray()
feature_names = encoder.get_feature_names_out(['ocean_proximity'])
# Reuse housing's index so the column-wise concat below aligns row-for-row even if
# `housing` does not carry the default 0..n-1 index.
ocean_proximity_df = pd.DataFrame(ocean_proximity_encoded, columns=feature_names, index=housing.index)
housing_encoded = pd.concat([housing.drop('ocean_proximity', axis=1), ocean_proximity_df], axis=1)

# Standardise every column (original features plus the one-hot indicators).
# Assumes all remaining columns are numeric — TODO confirm against the CSV schema.
scaler = StandardScaler()
housing_scaled = scaler.fit_transform(housing_encoded)

In [3]:
# Grid search: for each PCA dimensionality, fit a GMM for every
# (number of components, covariance type) pair and record three clustering metrics.
all_scores_gmm = []

pca_dimensions = [2, 3, 4]

n_components_values = [2, 3, 4, 5]
covariance_type_options = ['full', 'tied', 'diag', 'spherical']

for dim in pca_dimensions:
    # Seed PCA as well as the GMM: depending on the scikit-learn version,
    # svd_solver='auto' may select the randomized solver, which would otherwise make
    # the projection (and every downstream score) vary between runs.
    pca = PCA(n_components=dim, random_state=42)
    housing_pca = pca.fit_transform(housing_scaled)

    for n_components in n_components_values:
        for covariance_type in covariance_type_options:
            gmm = GaussianMixture(n_components=n_components, covariance_type=covariance_type, random_state=42)
            # fit_predict fits the mixture and returns the most probable component per
            # sample in one call, avoiding a second pass over the data.
            cluster_labels = gmm.fit_predict(housing_pca)

            if np.unique(cluster_labels).size < 2:
                # All points were assigned to a single component. The three metrics
                # are undefined for fewer than two clusters and would raise, so
                # record NaN and keep the rest of the grid running.
                silhouette = davies = calinski = np.nan
            else:
                silhouette = silhouette_score(housing_pca, cluster_labels)
                davies = davies_bouldin_score(housing_pca, cluster_labels)
                calinski = calinski_harabasz_score(housing_pca, cluster_labels)

            all_scores_gmm.append({
                'PCA Dimensions': dim,
                'Number of Components': n_components,
                'Covariance Type': covariance_type,
                'Silhouette Score': silhouette,
                'Davies-Bouldin Index': davies,
                'Calinski-Harabasz Index': calinski
            })

# One row per (PCA dimension, component count, covariance type) configuration.
gmm_results_df = pd.DataFrame(all_scores_gmm)

In [4]:
# Pick the configuration with the maximum Silhouette Score and display it.
best_silhouette_idx = gmm_results_df['Silhouette Score'].idxmax()
best_silhouette_row = gmm_results_df.loc[best_silhouette_idx]
best_silhouette_row

PCA Dimensions                       3
Number of Components                 2
Covariance Type              spherical
Silhouette Score               0.56945
Davies-Bouldin Index          0.797889
Calinski-Harabasz Index    5644.241739
Name: 19, dtype: object

In [5]:
# Pick the configuration with the minimum Davies-Bouldin Index and display it.
best_db_idx = gmm_results_df['Davies-Bouldin Index'].idxmin()
best_db_row = gmm_results_df.loc[best_db_idx]
best_db_row

PCA Dimensions                        2
Number of Components                  4
Covariance Type                    tied
Silhouette Score               0.465717
Davies-Bouldin Index           0.731839
Calinski-Harabasz Index    14531.871617
Name: 9, dtype: object

In [6]:
# Pick the configuration with the maximum Calinski-Harabasz Index and display it.
best_ch_idx = gmm_results_df['Calinski-Harabasz Index'].idxmax()
best_ch_row = gmm_results_df.loc[best_ch_idx]
best_ch_row

PCA Dimensions                        2
Number of Components                  4
Covariance Type               spherical
Silhouette Score               0.474693
Davies-Bouldin Index           0.769723
Calinski-Harabasz Index    17110.732514
Name: 11, dtype: object