In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, DBSCAN, Birch
from sklearn.mixture import GaussianMixture
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'hdbscan'

In [None]:
def _fit_and_score(estimator, X, metric_func):
    """Fit ``estimator`` on ``X`` and score its ``labels_`` with ``metric_func``.

    Returns ``(fitted_estimator, score)``, or ``None`` when the fit produced
    fewer than two distinct labels and therefore is not scored.
    """
    fitted = estimator.fit(X)
    labels = fitted.labels_
    # Every distinct value in labels_ counts here, including a noise label
    # such as -1 if the estimator emits one.
    if len(set(labels)) < 2:
        return None
    return fitted, metric_func(X, labels)


def evaluate_clustering(X, model, param_grid, metric_func, greater_is_better=True):
    """Sweep a clustering model over a parameter grid and keep the best fit.

    The type of ``model`` selects which parameters are swept:

    * ``KMeans`` / ``Birch``: every value in ``param_grid['n_clusters']``.
      KMeans is built with ``random_state=42``; Birch is built without a
      seed because its constructor does not accept ``random_state``.
    * ``DBSCAN``: every ``(eps, min_samples)`` pair from ``param_grid['eps']``
      and ``param_grid['min_samples']``.
    * ``HDBSCAN``: no sweep; ``model`` itself is fitted once, in place, and
      ``param_grid`` is ignored.

    For the swept models a fresh instance of ``model.__class__`` is created
    for each candidate with only the swept parameters set; any other
    hyperparameters configured on ``model`` are not carried over.

    Parameters
    ----------
    X : data passed unchanged to ``fit`` and to ``metric_func``.
    model : estimator instance whose type picks the branch above.
    param_grid : mapping from parameter name to an iterable of values.
    metric_func : callable ``metric_func(X, labels) -> score``.
    greater_is_better : bool, default True
        Direction of ``metric_func``. Pass ``False`` for metrics where a lower
        value is better (for example ``davies_bouldin_score``) so the
        candidate with the smallest score is selected. Not used on the
        HDBSCAN path, which evaluates a single fit.

    Returns
    -------
    (best_model, best_score, results)
        ``results`` holds one tuple per scored candidate:
        ``(n_clusters, score)``, ``(eps, min_samples, score)`` or
        ``("HDBSCAN", score)``. Candidates with fewer than two distinct
        labels are skipped. If nothing was scored, or ``model`` is of an
        unsupported type, the result is ``(None, -inf, [])`` (``+inf`` when
        ``greater_is_better`` is False). The HDBSCAN path instead returns
        ``(None, None, [])`` when its single fit is skipped.
    """
    # Sentinel that any real score beats in the chosen direction.
    best_score = -np.inf if greater_is_better else np.inf
    best_model = None
    results = []

    if isinstance(model, (KMeans, Birch)):
        # Passing random_state to Birch raises TypeError, so seed KMeans only.
        seed_kwargs = {"random_state": 42} if isinstance(model, KMeans) else {}
        candidates = (
            ((n_clusters,), model.__class__(n_clusters=n_clusters, **seed_kwargs))
            for n_clusters in param_grid['n_clusters']
        )
    elif isinstance(model, DBSCAN):
        candidates = (
            ((eps, min_samples), model.__class__(eps=eps, min_samples=min_samples))
            for eps in param_grid['eps']
            for min_samples in param_grid['min_samples']
        )
    elif isinstance(model, HDBSCAN):
        # Single evaluation of the caller's own instance (fitted in place).
        outcome = _fit_and_score(model, X, metric_func)
        if outcome is None:
            return None, None, []
        fitted, score = outcome
        results.append(("HDBSCAN", score))
        return fitted, score, results
    else:
        # Unsupported estimator type: nothing to evaluate.
        candidates = ()

    for params, estimator in candidates:
        outcome = _fit_and_score(estimator, X, metric_func)
        if outcome is None:
            continue  # Skip degenerate clusterings
        fitted, score = outcome
        results.append(params + (score,))
        improved = score > best_score if greater_is_better else score < best_score
        if improved:
            best_score = score
            best_model = fitted

    return best_model, best_score, results