<a href="https://colab.research.google.com/github/vaibhavmishra03/-datasciencecoursera/blob/main/clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [3]:
import warnings

# Silence every warning category for the rest of the session so library
# notices do not clutter the cell outputs.
# NOTE(review): this hides *all* warnings, including any raised by the
# clustering code below — consider narrowing the filter to a category.
warnings.simplefilter('ignore')

# Iris dataset: the full bundle stays in `iris`; its feature matrix `X` is the
# input handed to each preprocessing pipeline further down.
iris = load_iris()
X = iris.data


In [4]:
def evaluate_clustering(X, labels):
    """Score a clustering of ``X`` with three internal validity metrics.

    Parameters
    ----------
    X : array-like
        The samples that were clustered; passed unchanged to each scorer.
    labels : sequence
        One cluster label per sample in ``X``.

    Returns
    -------
    list
        ``[silhouette, calinski_harabasz, davies_bouldin]`` rounded to 3, 0
        and 2 decimals respectively. When the labelling has at most one
        distinct label, or at least as many distinct labels as ``len(X)``,
        the list is three ``np.nan`` values and no scorer is called.
    """
    n_clusters = len(set(labels))

    # Degenerate labellings (a single cluster, or one cluster per sample) are
    # reported as NaN instead of being passed to the scikit-learn scorers.
    if not 1 < n_clusters < len(X):
        return [np.nan] * 3

    silhouette = round(silhouette_score(X, labels), 3)
    calinski_harabasz = round(calinski_harabasz_score(X, labels), 0)
    davies_bouldin = round(davies_bouldin_score(X, labels), 2)
    return [silhouette, calinski_harabasz, davies_bouldin]


In [5]:
# Preprocessing configurations
#
# Each step below builds a fresh estimator with the settings shown and fits it
# on the array it is given, so no fitted state is shared between pipelines.
def _power_transform(X):
    """Return ``X`` after a freshly fitted ``PowerTransformer``."""
    return PowerTransformer().fit_transform(X)


def _min_max_scale(X):
    """Return ``X`` after a freshly fitted ``MinMaxScaler``."""
    return MinMaxScaler().fit_transform(X)


def _pca_2d(X):
    """Return ``X`` projected onto 2 components by a freshly fitted ``PCA``."""
    return PCA(n_components=2).fit_transform(X)


# Display name -> callable taking the feature matrix and returning the
# preprocessed array. In the names, "T" is the power transform and "N" the
# min-max normalization; combined pipelines apply the steps in that order.
preprocessing_methods = {
    'No Processing': lambda X: X,
    'Normalization': _min_max_scale,
    'Transform': _power_transform,
    'PCA': _pca_2d,
    'T+N': lambda X: _min_max_scale(_power_transform(X)),
    'T+N+PCA': lambda X: _pca_2d(_min_max_scale(_power_transform(X))),
}

# Cluster counts tried for the algorithms that take a fixed number of clusters.
cluster_range = [3, 4, 5]

In [6]:
# Result containers. K-Means and hierarchical scores are keyed by
# (preprocessing name, cluster count); Mean Shift scores by preprocessing name.
results_kmeans, results_hierarchical, results_meanshift = {}, {}, {}

for method, preprocess in preprocessing_methods.items():
    # Preprocess once per method; every model below is fitted and scored on
    # this same array.
    X_proc = preprocess(X)

    for c in cluster_range:
        key = (method, c)

        # K-Means with a fixed seed
        kmeans = KMeans(n_clusters=c, random_state=42)
        kmeans_labels = kmeans.fit_predict(X_proc)
        results_kmeans[key] = evaluate_clustering(X_proc, kmeans_labels)

        # Hierarchical (agglomerative) clustering; only the cluster count is set
        hier = AgglomerativeClustering(n_clusters=c)
        hier_labels = hier.fit_predict(X_proc)
        results_hierarchical[key] = evaluate_clustering(X_proc, hier_labels)

    # Mean Shift is given an estimated bandwidth rather than a cluster count,
    # so it runs once per preprocessing method, outside the loop over `c`.
    bandwidth = estimate_bandwidth(X_proc, quantile=0.2)
    meanshift = MeanShift(bandwidth=bandwidth)
    ms_labels = meanshift.fit_predict(X_proc)
    results_meanshift[method] = evaluate_clustering(X_proc, ms_labels)

In [7]:
# Tabulate the scores: one row per run, one column per metric.
metric_columns = ['Silhouette', 'Calinski-Harabasz', 'Davies-Bouldin']
grid_levels = ['Preprocessing', 'Clusters']

# K-Means and hierarchical results are keyed by (preprocessing, clusters)
# tuples, which become the two levels of a MultiIndex.
index = pd.MultiIndex.from_tuples(list(results_kmeans), names=grid_levels)
kmeans_df = pd.DataFrame(list(results_kmeans.values()), index=index, columns=metric_columns)

index = pd.MultiIndex.from_tuples(list(results_hierarchical), names=grid_levels)
hierarchical_df = pd.DataFrame(list(results_hierarchical.values()), index=index, columns=metric_columns)

# Mean Shift has a single entry per preprocessing name, so a flat index is enough.
meanshift_df = pd.DataFrame.from_dict(results_meanshift, orient='index', columns=metric_columns)

# Render the three tables in order: K-Means, hierarchical, Mean Shift.
display(kmeans_df, hierarchical_df, meanshift_df)

# Write each table to a CSV file (relative paths, so they land in the
# current working directory). Existing files are overwritten on re-run.
for frame, csv_name in (
    (kmeans_df, "kmeans_results.csv"),
    (hierarchical_df, "hierarchical_results.csv"),
    (meanshift_df, "meanshift_results.csv"),
):
    frame.to_csv(csv_name)

Unnamed: 0_level_0,Unnamed: 1_level_0,Silhouette,Calinski-Harabasz,Davies-Bouldin
Preprocessing,Clusters,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No Processing,3,0.551,562.0,0.67
No Processing,4,0.498,530.0,0.75
No Processing,5,0.493,495.0,0.82
Normalization,3,0.483,351.0,0.79
Normalization,4,0.444,314.0,0.91
Normalization,5,0.423,263.0,0.99
Transform,3,0.49,162.0,0.82
Transform,4,0.386,209.0,0.88
Transform,5,0.369,170.0,0.89
PCA,3,0.598,694.0,0.56


Unnamed: 0_level_0,Unnamed: 1_level_0,Silhouette,Calinski-Harabasz,Davies-Bouldin
Preprocessing,Clusters,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No Processing,3,0.554,558.0,0.66
No Processing,4,0.489,515.0,0.8
No Processing,5,0.484,488.0,0.82
Normalization,3,0.505,349.0,0.75
Normalization,4,0.433,301.0,0.85
Normalization,5,0.349,272.0,0.91
Transform,3,0.478,225.0,0.74
Transform,4,0.427,214.0,0.9
Transform,5,0.357,202.0,0.92
PCA,3,0.598,689.0,0.56


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin
No Processing,0.686,510.0,0.39
Normalization,0.477,290.0,0.76
Transform,0.342,140.0,0.78
PCA,0.562,615.0,0.56
T+N,0.399,222.0,0.86
T+N+PCA,0.404,227.0,0.72
