<a href="https://colab.research.google.com/github/trish-r/102203584_Trish_Rustagi_Clustering/blob/main/102203584_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import warnings

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

In [28]:
# Load scikit-learn's bundled Iris dataset; only the feature matrix is kept
# here, and it is the input passed to every run_clustering call below.
iris = datasets.load_iris()
X = iris.data

In [29]:
def normalize(X):
    """Fit a fresh MinMaxScaler (default settings) on X and return the scaled data."""
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(X)
    return scaled

def log_transform(X):
    """Return X with np.log1p applied to it through a FunctionTransformer."""
    log1p_step = FunctionTransformer(np.log1p)
    transformed = log1p_step.fit_transform(X)
    return transformed

def apply_pca(X, n_components=2):
    """Fit a PCA with ``n_components`` components on X and return the projected data."""
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(X)
    return projected

In [30]:
# Maps each result-table column label to a callable that turns the raw feature
# matrix into the matrix handed to the clusterer. In the labels, "T" is the
# log transform and "N" is normalization; combined pipelines apply the log
# transform first, then normalization, then PCA.
preprocessing_options = {
    "No Data Processing": lambda features: features,
    "Using Normalization": normalize,
    "Using Transform": log_transform,
    "Using PCA": lambda features: apply_pca(features),
    "Using T+N": lambda features: normalize(log_transform(features)),
    "T+N+PCA": lambda features: apply_pca(normalize(log_transform(features))),
}

In [31]:
def init_result_table():
    """Return an empty result table: one independent dict per evaluation metric.

    Each inner dict is later filled with ``(preprocessing name, c) -> score``
    entries by the clustering run.
    """
    metric_names = ("Silhouette", "Calinski-Harabasz", "Davies-Bouldin")
    # A comprehension (not dict.fromkeys) so every metric gets its own dict.
    return {metric: {} for metric in metric_names}

In [32]:
def evaluate_clustering(X, labels):
    """Score a cluster assignment with three internal validity metrics.

    Returns a ``(silhouette, calinski_harabasz, davies_bouldin)`` tuple computed
    from the data ``X`` and the cluster ``labels`` assigned to its rows.
    """
    silhouette = silhouette_score(X, labels)
    calinski_harabasz = calinski_harabasz_score(X, labels)
    davies_bouldin = davies_bouldin_score(X, labels)
    return (silhouette, calinski_harabasz, davies_bouldin)

In [33]:
def _fit_and_score(make_model, X, context):
    """Build a model, cluster ``X`` with it and return the scores to record.

    Returns ``(silhouette, calinski_harabasz, davies_bouldin)`` with the
    silhouette and Davies-Bouldin values rounded to two decimals and
    Calinski-Harabasz truncated to an int. If building, fitting or scoring
    raises, a RuntimeWarning naming ``context`` is emitted and
    ``("NA", "NA", "NA")`` is returned so the sweep can continue.
    """
    try:
        model = make_model()
        labels = model.fit_predict(X)
        sil, ch, db = evaluate_clustering(X, labels)
        return round(sil, 2), int(ch), round(db, 2)
    except Exception as exc:  # best-effort sweep: report the failure, keep going
        warnings.warn(
            f"Clustering failed for {context}: {exc!r}; recording 'NA'.",
            RuntimeWarning,
            stacklevel=3,
        )
        return ("NA", "NA", "NA")


def run_clustering(X_original, clusterer, cluster_range=(3, 6), fixed=False):
    """Cluster the data under every preprocessing option and collect scores.

    Parameters
    ----------
    X_original : raw feature matrix; each entry of ``preprocessing_options``
        is applied to it independently.
    clusterer : callable returning an estimator with ``fit_predict``. It is
        called as ``clusterer(n_clusters=c)`` when ``fixed`` is true and as
        ``clusterer()`` otherwise.
    cluster_range : arguments unpacked into ``range(...)`` to produce the
        cluster counts ``c``.
    fixed : whether the clusterer takes the number of clusters as a parameter.

    Returns
    -------
    dict shaped like ``init_result_table()``, where every metric maps
    ``(preprocessing name, c)`` to a score, or to ``"NA"`` when that
    configuration raised an exception (a RuntimeWarning explains why).

    Notes
    -----
    When ``fixed`` is false the model does not depend on ``c``, so it is
    fitted once per preprocessing option and the same scores are stored under
    every ``c`` instead of re-fitting an identical model for each column.
    """
    results = init_result_table()
    cluster_counts = range(*cluster_range)
    for name, preprocess in preprocessing_options.items():
        X = preprocess(X_original)

        shared_scores = None
        if not fixed and len(cluster_counts) > 0:
            shared_scores = _fit_and_score(clusterer, X, f"preprocessing {name!r}")

        for c in cluster_counts:
            if fixed:
                scores = _fit_and_score(
                    lambda: clusterer(n_clusters=c),
                    X,
                    f"preprocessing {name!r} with c={c}",
                )
            else:
                scores = shared_scores
            sil, ch, db = scores
            results["Silhouette"][(name, c)] = sil
            results["Calinski-Harabasz"][(name, c)] = ch
            results["Davies-Bouldin"][(name, c)] = db
    return results


In [34]:
# KMeans starts from randomly chosen centroids; fixing the seed keeps the
# reported scores identical across kernel restarts.
KMEANS_SEED = 42
kmeans_results = run_clustering(
    X,
    lambda n_clusters: KMeans(n_clusters=n_clusters, random_state=KMEANS_SEED),
    (3, 6),
    fixed=True,
)

In [35]:
# Agglomerative (hierarchical) clustering, scored for each c in range(3, 6).
hierarchical_results = run_clustering(
    X,
    AgglomerativeClustering,
    cluster_range=(3, 6),
    fixed=True,
)


In [36]:
# With fixed=False the MeanShift model is built without n_clusters, so the
# cluster range only decides which c columns the scores are filed under.
meanshift_results = run_clustering(
    X,
    MeanShift,
    cluster_range=(3, 6),
    fixed=False,
)

In [38]:
def _format_result_cell(metric, value):
    """Render one score as a left-aligned cell that is 10 characters wide.

    Numeric scores are shown with two decimals, except Calinski-Harabasz
    values, which are printed as stored; non-numeric placeholders such as
    "NA" are printed verbatim.
    """
    if isinstance(value, (int, float)) and metric != "Calinski-Harabasz":
        return "{:<10.2f}".format(float(value))
    return "{:<10}".format(value)


def print_formatted_results_table(title, results):
    """Print a fixed-width comparison table for one clustering algorithm.

    Parameters
    ----------
    title : heading printed above the table.
    results : dict mapping a metric name ("Silhouette", "Calinski-Harabasz",
        "Davies-Bouldin") to a dict of ``(preprocessing name, c) -> score``.

    The column groups (preprocessing names) and the ``c`` columns inside each
    group are taken from the keys present in ``results``, in first-seen order,
    so tables built with any cluster range or set of preprocessing options are
    shown in full. If ``results`` holds no such keys, the six standard
    preprocessing names with c = 3, 4, 5 are used. Missing entries print "NA".
    """
    metrics = ["Silhouette", "Calinski-Harabasz", "Davies-Bouldin"]
    label_width = 20
    cell_width = 10

    # Discover the columns from the data instead of assuming a fixed layout.
    preprocessing_methods = []
    cluster_counts = []
    for metric in metrics:
        for key in results.get(metric, {}):
            if not (isinstance(key, tuple) and len(key) == 2):
                continue  # not a (method, c) entry; nothing to tabulate
            method, c = key
            if method not in preprocessing_methods:
                preprocessing_methods.append(method)
            if c not in cluster_counts:
                cluster_counts.append(c)

    if not preprocessing_methods:
        # Nothing recorded: fall back to the standard layout filled with "NA".
        preprocessing_methods = [
            "No Data Processing",
            "Using Normalization",
            "Using Transform",
            "Using PCA",
            "Using T+N",
            "T+N+PCA",
        ]
        cluster_counts = [3, 4, 5]

    # Each group is wide enough for its c columns and for the longest header.
    group_width = max(
        cell_width * len(cluster_counts),
        max(len(str(method)) for method in preprocessing_methods) + 1,
    )
    rule = "-" * (label_width + group_width * len(preprocessing_methods))

    print(f"\n{title}")
    print(rule)

    header = "{:<{w}}".format("Parameters", w=label_width)
    for method in preprocessing_methods:
        header += "{:<{w}}".format(str(method), w=group_width)
    print(header)

    c_cells = "".join(
        "{:<{w}}".format(f"c={c}", w=cell_width) for c in cluster_counts
    ).ljust(group_width)
    print("{:<{w}}".format("", w=label_width) + c_cells * len(preprocessing_methods))
    print(rule)

    for metric in metrics:
        scores = results.get(metric, {})
        row = "{:<{w}}".format(metric, w=label_width)
        for method in preprocessing_methods:
            cells = "".join(
                _format_result_cell(metric, scores.get((method, c), "NA"))
                for c in cluster_counts
            )
            row += cells.ljust(group_width)
        print(row)

    print()

# Print one comparison table per clustering algorithm, in the order the
# results were computed above.
print("\nPerformance using different clustering techniques on various parameters")
print("=" * 100)
for table_title, table_results in (
    ("Using K-Mean Clustering", kmeans_results),
    ("Using Hierarchical Clustering", hierarchical_results),
    # These results come from MeanShift, which is not a k-means variant,
    # so the heading names the algorithm that was actually run.
    ("Using Mean Shift Clustering", meanshift_results),
):
    print_formatted_results_table(table_title, table_results)


Performance using different clustering techniques on various parameters

Using K-Mean Clustering
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Parameters          No Data Processing            Using Normalization           Using Transform               Using PCA                     Using T+N                     T+N+PCA                       
                    c=3       c=4       c=5       c=3       c=4       c=5       c=3       c=4       c=5       c=3       c=4       c=5       c=3       c=4       c=5       c=3       c=4       c=5       
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Silhouette          0.55      0.50      0.37      0.50      0.44      0.44      0.57      0.50    