|
17 | 17 | - Homogeneity, Completeness, and V-measure |
18 | 18 |
|
19 | 19 | Typical usage: |
20 | | - from clustering_toolkit.evaluation import evaluate_clustering, silhouette_score |
| 20 | + from clustering_toolkit.evaluation import evaluate_clusters, evaluate_clustering |
21 | 21 | |
| 22 | + # Algorithm-aware evaluation (recommended) |
| 23 | + metrics = evaluate_clusters(data, labels, algorithm='kmeans', model=kmeans_model) |
| 24 | + |
| 25 | + # General evaluation |
22 | 26 | metrics = evaluate_clustering(data, labels) |
23 | | - score = silhouette_score(data, labels) |
24 | 27 | """ |
25 | 28 |
|
26 | 29 | import pandas as pd |
@@ -155,6 +158,178 @@ def evaluate_clustering( |
155 | 158 | return results |
156 | 159 |
|
157 | 160 |
|
def evaluate_clusters(
    data: pd.DataFrame,
    labels: np.ndarray,
    algorithm: str,
    model=None
) -> dict:
    """
    Evaluate clustering results with algorithm-aware metric calculation.

    Calculates multiple clustering quality metrics based on the algorithm
    used. Handles algorithm-specific metrics (like inertia for K-means)
    and edge cases such as DBSCAN noise points and single-cluster results.

    Metrics Calculated:
    -------------------
    - **silhouette_score**: Cluster cohesion and separation.
      Range [-1, 1]; higher is better. Near +1: well-separated, cohesive
      clusters; near 0: overlapping clusters; near -1: misclassified
      samples. Requires 2+ clusters.
    - **davies_bouldin**: Average similarity ratio of clusters.
      Range [0, inf); lower is better. Near 0 means well-separated
      clusters. Requires 2+ clusters.
    - **calinski_harabasz**: Ratio of between/within cluster dispersion.
      Range [0, inf); higher is better (denser, better-defined clusters).
      Requires 2+ clusters.
    - **inertia**: Within-cluster sum of squares (K-means only).
      Range [0, inf); lower is better. Only included when
      algorithm='kmeans' and a fitted model with ``inertia_`` is given.

    Args:
        data: Feature data used for clustering (pandas DataFrame or numpy
            array). Must have the same number of rows as ``labels``.
        labels: Cluster labels (array-like; coerced to ndarray). For
            DBSCAN, -1 marks noise points, which are excluded from all
            metric calculations.
        algorithm: Name of the clustering algorithm ('kmeans', 'dbscan',
            or 'hierarchical'). Case-insensitive; used only to decide
            whether to report algorithm-specific metrics (inertia).
        model: Optional fitted clustering model. Required to extract
            inertia for K-means (must expose an ``inertia_`` attribute).

    Returns:
        Dict mapping metric names to float values. Possible keys:
        'silhouette_score', 'davies_bouldin', 'calinski_harabasz', and
        'inertia' (K-means with model only). Returns an empty or partial
        dict when metrics cannot be computed (e.g. a single cluster, or
        all points classified as noise).

    Raises:
        ValueError: If data and labels have different lengths.
        TypeError: If data is neither a DataFrame nor a numpy array.

    Examples:
        >>> import pandas as pd
        >>> from sklearn.cluster import KMeans
        >>> data = pd.DataFrame({'x': [1, 2, 10, 11], 'y': [1, 2, 10, 11]})
        >>> model = KMeans(n_clusters=2, random_state=42)
        >>> labels = model.fit_predict(data)
        >>> metrics = evaluate_clusters(data, labels, 'kmeans', model)
        >>> sorted(metrics)
        ['calinski_harabasz', 'davies_bouldin', 'inertia', 'silhouette_score']

    Notes:
        - All metrics except inertia apply to any algorithm with 2+ clusters.
        - Noise labels (-1) are filtered for every algorithm; they only
          occur in practice with DBSCAN.
        - If fewer than 2 clusters remain after filtering, returns {}.
    """
    # Normalize data to a raw array; reject unsupported containers early.
    if isinstance(data, pd.DataFrame):
        data_array = data.values
    elif isinstance(data, np.ndarray):
        data_array = data
    else:
        raise TypeError(
            f"Data must be pandas DataFrame or numpy array, got {type(data).__name__}"
        )

    # Coerce labels to ndarray so boolean masking works even when callers
    # pass a plain Python list (labels != -1 on a list is a scalar, which
    # would silently break the mask below).
    labels = np.asarray(labels)

    if len(data_array) != len(labels):
        raise ValueError(
            f"Data and labels must have same length. "
            f"Got data: {len(data_array)}, labels: {len(labels)}"
        )

    results = {}

    # Filter DBSCAN noise points (-1) before computing any metric.
    valid_mask = labels != -1
    valid_labels = labels[valid_mask]
    valid_data = data_array[valid_mask]

    # All three general metrics require at least 2 distinct clusters.
    n_clusters = len(np.unique(valid_labels))
    if n_clusters < 2:
        # Single cluster or all noise: no meaningful metrics possible.
        return results

    # sklearn metric functions raise ValueError for degenerate inputs
    # (e.g. n_samples <= n_clusters); treat those as "metric unavailable"
    # rather than failing the whole evaluation.
    try:
        results['silhouette_score'] = sklearn_silhouette_score(valid_data, valid_labels)
    except ValueError:
        pass

    try:
        results['davies_bouldin'] = davies_bouldin_score(valid_data, valid_labels)
    except ValueError:
        pass

    try:
        results['calinski_harabasz'] = calinski_harabasz_score(valid_data, valid_labels)
    except ValueError:
        pass

    # Inertia is only meaningful for K-means and needs the fitted model.
    if algorithm.lower() == 'kmeans' and model is not None:
        inertia = getattr(model, 'inertia_', None)
        if inertia is not None:
            results['inertia'] = float(inertia)

    return results
| 331 | + |
| 332 | + |
158 | 333 | def print_evaluation_report(metrics: Dict[str, Any], title: str = "Clustering Evaluation"): |
159 | 334 | """ |
160 | 335 | Print a formatted evaluation report. |
|
0 commit comments