# Exploring HDBSCANClusterer
Interactive notebook to understand clusterer.py

## Cell 1: Basic Imports

In [2]:
import numpy as np
from src.sentrylens.clustering.clusterer import HDBSCANClusterer
from src.sentrylens.core.models import ErrorEmbedding
from configs.settings import settings

# Sanity check: the project imports resolved and the configured embedding size is readable.
print("Imports successful!")
configured_dim = settings.EMBEDDING_DIMENSION
print(f"Embedding dimension: {configured_dim}")

Imports successful!
Embedding dimension: 384


## Cell 2: Load Actual Embeddings from Data

In [3]:
# Synthetic standard-normal vector with the configured embedding dimensionality.
# A fixed seed keeps the values identical across kernel restarts, so any output
# derived from this vector is reproducible under Restart & Run All.
embedding_vector = np.random.default_rng(42).standard_normal(settings.EMBEDDING_DIMENSION)

In [None]:
# Load actual embeddings from data file
from src.sentrylens.data.loader import AERIDataLoader
from pathlib import Path

# The embeddings snapshot is pinned by file name; point this at a newer file
# after regenerating embeddings. The path is relative to the working directory.
embeddings_file = Path("data/embeddings/embeddings_20260120_132919.json")

# Fail fast: every later cell needs test_embeddings / test_errors, so continuing
# without the file would only surface as a NameError (or stale kernel data) later.
if not embeddings_file.exists():
    raise FileNotFoundError(
        f"File not found: {embeddings_file}\n"
        "Run Step 2 first: python -m scripts.generate_embeddings --input <processed_dataset.json>"
    )

loader = AERIDataLoader()
dataset = loader.load_processed_dataset(embeddings_file)
test_embeddings = dataset.embeddings
test_errors = dataset.errors

print(f"Loaded {len(test_embeddings)} embeddings from {embeddings_file.name}")
print(f"Also loaded {len(test_errors)} errors")

# Preview the first record only when there is one; an empty dataset would
# otherwise raise IndexError on test_embeddings[0].
if test_embeddings:
    first_embedding = test_embeddings[0]
    print("\nFirst embedding:")
    print(f"  error_id: {first_embedding.error_id}")
    print(f"  embedding dimension: {len(first_embedding.embedding)}")
    print(f"  model: {first_embedding.model_name}")
else:
    print("\nDataset contains no embeddings; nothing to preview.")

## Cell 3: Explore - Initialize HDBSCANClusterer
What does the `__init__` method do?

In [43]:
# Build a clusterer with a small minimum cluster size, then list everything it exposes.
clusterer = HDBSCANClusterer(min_cluster_size=3)

# TODO: Explore the clusterer object
# - Which attributes does it carry?
# - Print min_cluster_size, min_samples, metric

# dir() lists every attribute and method name on the instance, dunders included.
clusterer_members = dir(clusterer)
print(clusterer_members)

{"text": "2026-01-20 21:19:36.128 | INFO     | src.sentrylens.clustering.clusterer:__init__:93 - Initialized HDBSCANClusterer\n", "record": {"elapsed": {"repr": "0:58:29.982367", "seconds": 3509.982367}, "exception": null, "extra": {"min_cluster_size": 3, "min_samples": 3, "cluster_selection_epsilon": 0.0, "algorithm": "best", "metric": "euclidean"}, "file": {"name": "clusterer.py", "path": "/Users/vamsi.uppala/Documents/personal/sentrylens/src/sentrylens/clustering/clusterer.py"}, "function": "__init__", "level": {"icon": "ℹ️", "name": "INFO", "no": 20}, "line": 93, "message": "Initialized HDBSCANClusterer", "module": "clusterer", "name": "src.sentrylens.clustering.clusterer", "process": {"id": 67926, "name": "MainProcess"}, "thread": {"id": 8387567744, "name": "MainThread"}, "time": {"repr": "2026-01-20 21:19:36.128624-08:00", "timestamp": 1768972776.128624}}}


['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'algorithm', 'cluster_embeddings', 'cluster_selection_epsilon', 'clusterer', 'embeddings', 'fit', 'get_cluster_center', 'get_cluster_members', 'get_stats', 'labels', 'metric', 'min_cluster_size', 'min_samples', 'predict']


In [None]:
# Stack the per-error embedding vectors into a single numpy array (one row per embedding).
embedding_vectors = np.array([e.embedding for e in test_embeddings])
print(f"Embedding vectors shape: {embedding_vectors.shape}")

# Derive the expected shape from the loaded data and the configured dimension
# instead of hard-coding it, then enforce it so a mismatch (empty dataset,
# ragged vectors, wrong embedding model) stops the notebook here rather than
# inside the clusterer.
expected_shape = (len(test_embeddings), settings.EMBEDDING_DIMENSION)
print(
    f"Expected: {expected_shape} - one row per loaded error embedding, "
    f"{settings.EMBEDDING_DIMENSION}-dimensional embeddings"
)
assert embedding_vectors.shape == expected_shape, (
    f"Unexpected embedding matrix shape {embedding_vectors.shape}; expected {expected_shape}"
)

## Cell 4: Explore - fit() method
What happens when we fit?

In [None]:
# Create a fresh clusterer for real data (rebinds `clusterer` from the Cell 3 exploration)
clusterer = HDBSCANClusterer(min_cluster_size=5)

# Fit on the real embedding matrix. Without this call the get_stats() cell that
# follows only works when a fit was performed earlier in the same kernel session,
# which breaks Restart & Run All.
clusterer.fit(embedding_vectors)

# Inspect the result:
# - expect one label per embedding row
# - each label is a cluster ID (0, 1, 2, ...) or -1 for noise
print(f"Number of labels: {len(clusterer.labels)}")
print(f"Unique labels: {np.unique(clusterer.labels).tolist()}")

# TODO: Explore further
# - What does fit() return?
# - What's the shape of labels?
# - Try: print(clusterer.labels)

## Cell 5: Explore - get_stats() method
What statistics are available?

In [55]:
# Display the statistics object for whatever clustering `clusterer` currently holds.
# NOTE(review): the saved output for this cell reports total_points=20 with every point
# as noise, which matches no dataset built in the visible cells — presumably stale
# kernel state from an earlier experiment; re-run after fitting on embedding_vectors.
clusterer.get_stats()

ClusterStats(num_clusters=0, num_noise_points=20, total_points=20, cluster_sizes={}, avg_cluster_size=0.0, largest_cluster_size=0, smallest_cluster_size=0, noise_fraction=1.0, silhouette_score=None)

In [None]:
# TODO: Call clusterer.get_stats() on real data
# - What does it return?
# - Print all attributes:
#   - stats.num_clusters: How many error groups?
#   - stats.num_noise_points: How many rare/unique errors?
#   - stats.noise_fraction: Fraction of outliers (0.0-1.0; format with :.1% to show a percentage)
#   - stats.cluster_sizes: Dict of cluster sizes
#   - stats.avg_cluster_size: Average errors per cluster
#   - stats.largest_cluster_size: Biggest cluster
#   - stats.smallest_cluster_size: Smallest cluster
# 
# You should see real error groupings like NullPointerException, OutOfMemoryError, etc.

## Cell 6: Explore - cluster_embeddings() method
End-to-end clustering

In [None]:
# cluster_embeddings() does fit() + creates ClusterAssignment objects
# It's a convenient wrapper that works with Pydantic models

# TODO: Create new clusterer and call cluster_embeddings(test_embeddings, test_errors)
# - What does it return?
# - How is it different from fit()?
# - What's in each ClusterAssignment object?
# 
# Try:
#   clusterer2 = HDBSCANClusterer(min_cluster_size=5)
#   assignments = clusterer2.cluster_embeddings(test_embeddings, test_errors)
#   print(f"Assignments: {len(assignments)}")
#   print(f"First assignment: {assignments[0]}")
#   print(f"  error_id: {assignments[0].error_id}")
#   print(f"  cluster_id: {assignments[0].cluster_id}")
#   print(f"  is_noise: {assignments[0].is_noise}")

## Cell 7: Explore - Parameters
How does min_cluster_size affect results?

In [None]:
# TODO: Try different min_cluster_size values on REAL data
# - This is the KEY parameter that affects clustering
# 
# Try:
#   for min_size in [3, 5, 10, 15]:
#       c = HDBSCANClusterer(min_cluster_size=min_size)
#       labels = c.fit(embedding_vectors)
#       stats = c.get_stats()
#       
#       print(f"\nmin_cluster_size={min_size}:")
#       print(f"  Clusters: {stats.num_clusters}")
#       print(f"  Noise: {stats.num_noise_points} ({stats.noise_fraction:.1%})")
#       print(f"  Avg size: {stats.avg_cluster_size:.1f}")
# 
# Questions:
# - Smaller min_cluster_size = more or fewer clusters?
# - How does noise fraction change?
# - Which value seems best for error clustering?