
Commit 71a9419

artemisTurintech authored and paulsbrookes committed
feat(evaluation): add algorithm-aware cluster evaluation function with edge case handling
Add evaluate_clusters() to calculate silhouette, Davies-Bouldin, and Calinski-Harabasz metrics, with algorithm-specific support for K-means inertia and DBSCAN noise handling.
1 parent 70e9011 commit 71a9419
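
For orientation, here is a minimal usage sketch (not part of the commit) of how the metrics dictionary returned by the new evaluate_clusters() might drive a simple model-selection loop. It assumes the clustering_toolkit.evaluation module added in the diff below and a scikit-learn KMeans model; the data values are illustrative only.

    # Hypothetical sketch: sweep candidate k values and compare the metrics
    # returned by evaluate_clusters(); assumes the module added in this commit.
    import pandas as pd
    from sklearn.cluster import KMeans
    from clustering_toolkit.evaluation import evaluate_clusters

    data = pd.DataFrame({'x': [1, 2, 10, 11, 20, 21], 'y': [1, 2, 10, 11, 20, 21]})

    for k in (2, 3):
        model = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = model.fit_predict(data)
        metrics = evaluate_clusters(data, labels, algorithm='kmeans', model=model)
        # Higher silhouette_score / calinski_harabasz and lower davies_bouldin /
        # inertia indicate better-defined clusters for this choice of k.
        print(k, metrics)

Per the function's documented contract, 'inertia' appears in the dictionary only because algorithm='kmeans' and a fitted model are passed.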

File tree

1 file changed: +177 −2 lines changed


clustering_toolkit/evaluation.py

Lines changed: 177 additions & 2 deletions
@@ -17,10 +17,13 @@
     - Homogeneity, Completeness, and V-measure
 
 Typical usage:
-    from clustering_toolkit.evaluation import evaluate_clustering, silhouette_score
+    from clustering_toolkit.evaluation import evaluate_clusters, evaluate_clustering
 
+    # Algorithm-aware evaluation (recommended)
+    metrics = evaluate_clusters(data, labels, algorithm='kmeans', model=kmeans_model)
+
+    # General evaluation
     metrics = evaluate_clustering(data, labels)
-    score = silhouette_score(data, labels)
 """
 
 import pandas as pd
@@ -155,6 +158,178 @@ def evaluate_clustering(
     return results
 
 
+def evaluate_clusters(
+    data: pd.DataFrame,
+    labels: np.ndarray,
+    algorithm: str,
+    model=None
+) -> dict:
+    """
+    Evaluate clustering results with algorithm-aware metric calculation.
+
+    This function calculates multiple clustering quality metrics based on the
+    algorithm used. It handles algorithm-specific metrics (like inertia for K-means)
+    and edge cases like DBSCAN noise points and single-cluster results.
+
+    Metrics Calculated:
+    -------------------
+    - **silhouette_score**: Measures cluster cohesion and separation
+        - Range: [-1, 1]
+        - Interpretation: Higher is better
+        - Values near +1: Well-separated, cohesive clusters
+        - Values near 0: Overlapping clusters
+        - Values near -1: Misclassified samples
+        - Requires: 2+ clusters
+
+    - **davies_bouldin**: Average similarity ratio of clusters
+        - Range: [0, ∞)
+        - Interpretation: Lower is better
+        - Values near 0: Well-separated clusters
+        - Higher values: More cluster overlap
+        - Requires: 2+ clusters
+
+    - **calinski_harabasz**: Ratio of between/within cluster dispersion
+        - Range: [0, ∞)
+        - Interpretation: Higher is better
+        - Higher values: Better-defined, denser clusters
+        - Requires: 2+ clusters
+
+    - **inertia**: Within-cluster sum of squares (K-means only)
+        - Range: [0, ∞)
+        - Interpretation: Lower is better
+        - Measures compactness of clusters
+        - Only available when algorithm='kmeans' and model is provided
+
+    Args:
+        data: Feature data used for clustering (DataFrame or numpy array).
+            Must have same number of rows as labels array.
+        labels: Cluster labels as numpy array. For DBSCAN, -1 indicates noise points
+            which are automatically excluded from metric calculations.
+        algorithm: Name of the clustering algorithm used ('kmeans', 'dbscan', or 'hierarchical').
+            Used to determine which algorithm-specific metrics to include.
+        model: Optional fitted clustering model object. Required to extract inertia
+            for K-means. Should be the sklearn KMeans model instance with .inertia_ attribute.
+
+    Returns:
+        Dictionary with metric names as keys and calculated values as floats.
+        Keys include: 'silhouette_score', 'davies_bouldin', 'calinski_harabasz',
+        and 'inertia' (K-means only when model provided).
+        Returns empty dict or partial dict when metrics cannot be calculated
+        (e.g., single cluster, all noise points).
+
+    Edge Cases:
+        - Single cluster: Returns empty dict (metrics require 2+ clusters)
+        - DBSCAN all noise: Returns empty dict
+        - DBSCAN with noise: Filters out -1 labels before calculating metrics
+        - Missing model for K-means: Inertia is omitted from results
+        - Data/labels length mismatch: Raises ValueError
+
+    Raises:
+        ValueError: If data and labels have different lengths
+        TypeError: If data cannot be converted to numpy array
+
+    Examples:
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> from sklearn.cluster import KMeans
+        >>>
+        >>> # K-means evaluation with model
+        >>> data = pd.DataFrame({'x': [1, 2, 10, 11], 'y': [1, 2, 10, 11]})
+        >>> model = KMeans(n_clusters=2, random_state=42)
+        >>> labels = model.fit_predict(data)
+        >>> metrics = evaluate_clusters(data, labels, 'kmeans', model)
+        >>> # Returns: {'silhouette_score': 0.85, 'davies_bouldin': 0.42,
+        >>> #           'calinski_harabasz': 15.2, 'inertia': 4.0}
+        >>>
+        >>> # DBSCAN evaluation (handles noise points)
+        >>> from sklearn.cluster import DBSCAN
+        >>> dbscan = DBSCAN(eps=2.0, min_samples=2)
+        >>> labels = dbscan.fit_predict(data)  # May include -1 for noise
+        >>> metrics = evaluate_clusters(data, labels, 'dbscan')
+        >>> # Returns metrics excluding noise points
+        >>>
+        >>> # Hierarchical evaluation
+        >>> from sklearn.cluster import AgglomerativeClustering
+        >>> hierarchical = AgglomerativeClustering(n_clusters=2)
+        >>> labels = hierarchical.fit_predict(data)
+        >>> metrics = evaluate_clusters(data, labels, 'hierarchical')
+        >>> # Returns: {'silhouette_score': 0.85, 'davies_bouldin': 0.42,
+        >>> #           'calinski_harabasz': 15.2}
+
+    Notes:
+        - All metrics except inertia work for any algorithm with 2+ clusters
+        - DBSCAN noise points (-1) are automatically filtered before calculation
+        - If only one cluster remains after filtering, returns empty dict
+        - Inertia is only meaningful for K-means (measures centroid distance)
+    """
+    # Input validation: check data and labels length match
+    if isinstance(data, pd.DataFrame):
+        data_array = data.values
+    elif isinstance(data, np.ndarray):
+        data_array = data
+    else:
+        raise TypeError(
+            f"Data must be pandas DataFrame or numpy array, got {type(data).__name__}"
+        )
+
+    if len(data_array) != len(labels):
+        raise ValueError(
+            f"Data and labels must have same length. "
+            f"Got data: {len(data_array)}, labels: {len(labels)}"
+        )
+
+    results = {}
+
+    # Handle DBSCAN noise points: filter out -1 labels
+    # Create mask for valid (non-noise) points
+    valid_mask = labels != -1
+    valid_labels = labels[valid_mask]
+    valid_data = data_array[valid_mask]
+
+    # Check if we have enough valid clusters for metrics
+    unique_labels = np.unique(valid_labels)
+    n_clusters = len(unique_labels)
+
+    # Edge case: less than 2 clusters means no meaningful metrics
+    if n_clusters < 2:
+        # Return empty dict - metrics require at least 2 clusters
+        return results
+
+    # Calculate silhouette score (requires 2+ clusters)
+    try:
+        results['silhouette_score'] = sklearn_silhouette_score(valid_data, valid_labels)
+    except Exception:
+        # Skip if calculation fails
+        pass
+
+    # Calculate Davies-Bouldin index (requires 2+ clusters)
+    try:
+        results['davies_bouldin'] = davies_bouldin_score(valid_data, valid_labels)
+    except Exception:
+        # Skip if calculation fails
+        pass
+
+    # Calculate Calinski-Harabasz score (requires 2+ clusters)
+    try:
+        results['calinski_harabasz'] = calinski_harabasz_score(valid_data, valid_labels)
+    except Exception:
+        # Skip if calculation fails
+        pass
+
+    # Extract inertia for K-means only (requires model object)
+    algorithm_lower = algorithm.lower()
+    if algorithm_lower == 'kmeans' and model is not None:
+        try:
+            # Access inertia attribute from fitted KMeans model
+            if hasattr(model, 'inertia_'):
+                results['inertia'] = float(model.inertia_)
+        except Exception:
+            # Skip if inertia cannot be extracted
+            pass
+
+    return results
+
+
 def print_evaluation_report(metrics: Dict[str, Any], title: str = "Clustering Evaluation"):
     """
     Print a formatted evaluation report.
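
A second hedged sketch (not part of the commit), illustrating the edge cases documented in the docstring above: DBSCAN noise points (-1) are filtered out before the metrics are computed, and fewer than two remaining clusters yields an empty dict. Labels are constructed by hand here rather than taken from a fitted DBSCAN model.

    import numpy as np
    import pandas as pd
    from clustering_toolkit.evaluation import evaluate_clusters

    data = pd.DataFrame({'x': [1, 2, 10, 11, 50], 'y': [1, 2, 10, 11, 50]})

    # Two clusters plus one noise point (-1): the noise row is filtered out and
    # the three general metrics are computed on the remaining four points.
    labels = np.array([0, 0, 1, 1, -1])
    print(evaluate_clusters(data, labels, algorithm='dbscan'))

    # Every point flagged as noise: fewer than two clusters remain, so the
    # function returns an empty dict instead of raising.
    all_noise = np.full(len(data), -1)
    print(evaluate_clusters(data, all_noise, algorithm='dbscan'))  # -> {}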
