In [None]:
import pandas as pd
import numpy as np
import requests
import zipfile
import io
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from kmodes.kmodes import KModes
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer, LabelEncoder
from sklearn import metrics


url = "https://archive.ics.uci.edu/static/public/602/dry+bean+dataset.zip"
# Bound the wait and fail loudly on an HTTP error; otherwise an error page
# would only surface later as an opaque BadZipFile exception.
r = requests.get(url, timeout=60)
r.raise_for_status()
with zipfile.ZipFile(io.BytesIO(r.content)) as z:

    with z.open('DryBeanDataset/Dry_Bean_Dataset.xlsx') as f:
        df = pd.read_excel(f)

# Shuffle the rows once with a fixed seed. The hierarchical model below is
# fitted on the first 5000 rows only, and the evaluation aligns labels with
# the same leading rows, so that prefix must be a random sample.
# NOTE(review): the UCI file appears to list rows grouped by class, which
# would make an unshuffled prefix cover only some of the classes - confirm.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Integer-encode the bean variety; used only as ground truth for ARI / AMI.
le = LabelEncoder()
y_true = le.fit_transform(df['Class'])
X = df.drop('Class', axis=1)


# Standardised features for the distance-based algorithms.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# K-Modes works on categorical data, so bin every feature into 10
# equal-width ordinal buckets.
discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
X_categorical = discretizer.fit_transform(X)

results = []

kmeans_labels = KMeans(n_clusters=7, random_state=42, n_init=10).fit_predict(X_scaled)

# Agglomerative clustering is fitted on a 5000-row subset (the leading rows
# of the shuffled data) rather than on the full data set.
agglo_labels = AgglomerativeClustering(n_clusters=7).fit_predict(X_scaled[:5000])

# DBSCAN marks noise points with the label -1.
dbscan_labels = DBSCAN(eps=0.6, min_samples=30).fit_predict(X_scaled)

# Seeded so the K-Modes row of the comparison is reproducible between runs.
km_mode = KModes(n_clusters=7, init='Huang', n_init=2, random_state=42).fit_predict(X_categorical)

def calculate_metrics(name, labels, true_labels, features=None):
    """Score one clustering with internal and external validation metrics.

    Args:
        name: Display name stored under the "Algorithm" key.
        labels: Cluster label per sample. May cover only the leading
            ``len(labels)`` samples of the data (e.g. a model fitted on a
            prefix of the feature matrix).
        true_labels: Ground-truth class per sample; only the first
            ``len(labels)`` entries are used.
        features: Feature matrix for the internal metrics; only the first
            ``len(labels)`` rows are used. Defaults to the module-level
            ``X_scaled``.

    Returns:
        A dict with the keys "Algorithm", "Silhouette", "Davies-Bouldin",
        "Calinski-Harabasz", "ARI" and "MI". "MI" holds the *adjusted*
        mutual information. The three internal metrics are NaN when they
        are undefined (fewer than two clusters, or one cluster per sample).
    """
    if features is None:
        features = X_scaled

    # Labels may describe only a prefix of the data; compare like with like.
    n_samples = len(labels)
    y_compare = true_labels[:n_samples]
    X_compare = features[:n_samples]

    # NOTE: labels are passed through unchanged, so a noise label such as
    # DBSCAN's -1 is scored as if it were an ordinary cluster.
    n_clusters = len(set(labels))

    # The internal metrics need 2 <= n_clusters <= n_samples - 1. Outside
    # that range report NaN rather than 0: a Davies-Bouldin score of 0 would
    # read as a perfect clustering.
    if 1 < n_clusters < n_samples:
        silhouette = metrics.silhouette_score(X_compare, labels)
        davies_bouldin = metrics.davies_bouldin_score(X_compare, labels)
        calinski_harabasz = metrics.calinski_harabasz_score(X_compare, labels)
    else:
        silhouette = davies_bouldin = calinski_harabasz = float("nan")

    return {
        "Algorithm": name,
        "Silhouette": silhouette,
        "Davies-Bouldin": davies_bouldin,
        "Calinski-Harabasz": calinski_harabasz,
        "ARI": metrics.adjusted_rand_score(y_compare, labels),
        "MI": metrics.adjusted_mutual_info_score(y_compare, labels),
    }

# Evaluate each clustering against the ground-truth classes, keeping the
# row order of the final report fixed.
labelled_runs = (
    ("K-Means", kmeans_labels),
    ("Hierarchical", agglo_labels),
    ("DBSCAN", dbscan_labels),
    ("K-Mode", km_mode),
)
for algo_name, cluster_labels in labelled_runs:
    results.append(calculate_metrics(algo_name, cluster_labels, y_true))


# One row per algorithm, indexed by its display name.
perf_df = pd.DataFrame(results).set_index("Algorithm")
print(perf_df.round(3))

              Silhouette  Davies-Bouldin  Calinski-Harabasz    ARI     MI
Algorithm                                                                
K-Means            0.309           1.099           7848.442  0.669  0.714
Hierarchical       0.263           1.288           4704.469  0.491  0.661
DBSCAN             0.248           1.395           4735.514  0.146  0.271
K-Mode             0.151           2.005           2561.819  0.444  0.562
