|
17 | 17 | - Homogeneity, Completeness, and V-measure |
18 | 18 |
|
19 | 19 | Typical usage: |
20 | | - from clustering_toolkit.evaluation import evaluate_clustering, silhouette_score |
| 20 | + from clustering_toolkit.evaluation import evaluate_clusters, evaluate_clustering |
21 | 21 | |
| 22 | + # Algorithm-aware evaluation (recommended) |
| 23 | + metrics = evaluate_clusters(data, labels, algorithm='kmeans', model=kmeans_model) |
| 24 | + |
| 25 | + # General evaluation |
22 | 26 | metrics = evaluate_clustering(data, labels) |
23 | | - score = silhouette_score(data, labels) |
24 | 27 | """ |
25 | 28 |
|
26 | 29 | import pandas as pd |
@@ -155,6 +158,178 @@ def evaluate_clustering( |
155 | 158 | return results |
156 | 159 |
|
157 | 160 |
|
def evaluate_clusters(
    data: pd.DataFrame,
    labels: np.ndarray,
    algorithm: str,
    model=None
) -> dict:
    """
    Evaluate clustering results with algorithm-aware metric calculation.

    Calculates multiple clustering quality metrics based on the algorithm
    used. Handles algorithm-specific metrics (like inertia for K-means)
    and edge cases such as DBSCAN noise points and single-cluster results.

    Metrics Calculated:
    -------------------
    - **silhouette_score**: Cluster cohesion and separation.
      Range [-1, 1]; higher is better. Near +1: well-separated, cohesive
      clusters; near 0: overlapping clusters; near -1: misclassified
      samples. Requires 2+ clusters.
    - **davies_bouldin**: Average similarity ratio of clusters.
      Range [0, inf); lower is better. Near 0 means well-separated
      clusters. Requires 2+ clusters.
    - **calinski_harabasz**: Ratio of between/within cluster dispersion.
      Range [0, inf); higher is better (denser, better-defined clusters).
      Requires 2+ clusters.
    - **inertia**: Within-cluster sum of squares (K-means only).
      Range [0, inf); lower is better. Only included when
      algorithm='kmeans' and a fitted model with ``inertia_`` is given.

    Args:
        data: Feature data used for clustering (pandas DataFrame or numpy
            array). Must have the same number of rows as ``labels``.
        labels: Cluster labels (array-like; coerced to ndarray). For
            DBSCAN, -1 marks noise points, which are excluded from all
            metric calculations.
        algorithm: Name of the clustering algorithm ('kmeans', 'dbscan',
            or 'hierarchical'). Case-insensitive; used only to decide
            whether to report algorithm-specific metrics (inertia).
        model: Optional fitted clustering model. Required to extract
            inertia for K-means (must expose an ``inertia_`` attribute).

    Returns:
        Dict mapping metric names to float values. Possible keys:
        'silhouette_score', 'davies_bouldin', 'calinski_harabasz', and
        'inertia' (K-means with model only). Returns an empty or partial
        dict when metrics cannot be computed (e.g. a single cluster, or
        all points classified as noise).

    Raises:
        ValueError: If data and labels have different lengths.
        TypeError: If data is neither a DataFrame nor a numpy array.

    Examples:
        >>> import pandas as pd
        >>> from sklearn.cluster import KMeans
        >>> data = pd.DataFrame({'x': [1, 2, 10, 11], 'y': [1, 2, 10, 11]})
        >>> model = KMeans(n_clusters=2, random_state=42)
        >>> labels = model.fit_predict(data)
        >>> metrics = evaluate_clusters(data, labels, 'kmeans', model)
        >>> sorted(metrics)
        ['calinski_harabasz', 'davies_bouldin', 'inertia', 'silhouette_score']

    Notes:
        - All metrics except inertia apply to any algorithm with 2+ clusters.
        - Noise labels (-1) are filtered for every algorithm; they only
          occur in practice with DBSCAN.
        - If fewer than 2 clusters remain after filtering, returns {}.
    """
    # Normalize data to a raw array; reject unsupported containers early.
    if isinstance(data, pd.DataFrame):
        data_array = data.values
    elif isinstance(data, np.ndarray):
        data_array = data
    else:
        raise TypeError(
            f"Data must be pandas DataFrame or numpy array, got {type(data).__name__}"
        )

    # Coerce labels to ndarray so boolean masking works even when callers
    # pass a plain Python list (labels != -1 on a list is a scalar, which
    # would silently break the mask below).
    labels = np.asarray(labels)

    if len(data_array) != len(labels):
        raise ValueError(
            f"Data and labels must have same length. "
            f"Got data: {len(data_array)}, labels: {len(labels)}"
        )

    results = {}

    # Filter DBSCAN noise points (-1) before computing any metric.
    valid_mask = labels != -1
    valid_labels = labels[valid_mask]
    valid_data = data_array[valid_mask]

    # All three general metrics require at least 2 distinct clusters.
    n_clusters = len(np.unique(valid_labels))
    if n_clusters < 2:
        # Single cluster or all noise: no meaningful metrics possible.
        return results

    # sklearn metric functions raise ValueError for degenerate inputs
    # (e.g. n_samples <= n_clusters); treat those as "metric unavailable"
    # rather than failing the whole evaluation.
    try:
        results['silhouette_score'] = sklearn_silhouette_score(valid_data, valid_labels)
    except ValueError:
        pass

    try:
        results['davies_bouldin'] = davies_bouldin_score(valid_data, valid_labels)
    except ValueError:
        pass

    try:
        results['calinski_harabasz'] = calinski_harabasz_score(valid_data, valid_labels)
    except ValueError:
        pass

    # Inertia is only meaningful for K-means and needs the fitted model.
    if algorithm.lower() == 'kmeans' and model is not None:
        inertia = getattr(model, 'inertia_', None)
        if inertia is not None:
            results['inertia'] = float(inertia)

    return results
| 331 | + |
| 332 | + |
158 | 333 | def print_evaluation_report(metrics: Dict[str, Any], title: str = "Clustering Evaluation"): |
159 | 334 | """ |
160 | 335 | Print a formatted evaluation report. |
|
0 commit comments