# Tutorial 02: Clustering Deep Dive

Goal: Show how the system groups related information.

We will take chunks, generate embeddings, and run UMAP + GMM clustering.

In [None]:
import logging
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from matome.engines.cluster import GMMClusterer
from domain_models.config import ProcessingConfig
from domain_models.manifest import Chunk

# Configure the root logger so INFO-level (and higher) records are shown.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module / notebook namespace.
logger = logging.getLogger(__name__)

# Switch for the tutorial's data source:
#   True  -> the data-preparation cell builds synthetic embeddings with
#            make_blobs, and the clustering cell asks for exactly 3 clusters.
#   False -> no synthetic data is generated and n_clusters is passed as None.
USE_MOCK = True

## 1. Prepare Data
We need embeddings to cluster. In a real scenario, these come from the Embedding Service.
For this tutorial, if `USE_MOCK` is `True`, we generate synthetic embeddings drawn from 3 clusters to demonstrate the algorithm.

In [None]:
# Build `embeddings` (a list of float vectors) for the clustering and
# visualization cells.
if USE_MOCK:
    print("Generating synthetic embeddings (3 clusters)...")
    # 50 samples, 128 dimensions, 3 centers; the fixed seed keeps runs reproducible.
    embeddings_array, labels = make_blobs(n_samples=50, n_features=128, centers=3, random_state=42)
    # Convert the NumPy array to plain nested lists before handing it on.
    embeddings = embeddings_array.tolist()
else:
    print("Loading chunks and generating real embeddings (not implemented in this snippet for brevity, assuming existing embeddings)")
    # In real usage, you would do:
    # chunker = JapaneseSemanticChunker(embedder)
    # chunks = list(chunker.split_text(text, config))
    # embeddings = [c.embedding for c in chunks]
    #
    # This branch does not create `embeddings` itself. Fail here with an
    # actionable message instead of letting a later cell hit a bare NameError.
    if "embeddings" not in globals():
        raise NameError(
            "USE_MOCK is False but `embeddings` is not defined. "
            "Define `embeddings` (a list of embedding vectors) before running "
            "this cell, or set USE_MOCK = True."
        )

## 2. Run Clustering
We use `GMMClusterer`, which performs UMAP dimensionality reduction followed by GMM (Gaussian Mixture Model) clustering.

In [None]:
# The synthetic data has a known number of groups, so pin n_clusters to 3 in
# mock mode; otherwise pass None.
config = ProcessingConfig(
    clustering_algorithm="gmm",
    n_clusters=None if not USE_MOCK else 3,
)
clusterer = GMMClusterer()

# Cluster the embeddings prepared in the previous cell.
clusters = clusterer.cluster_nodes(embeddings, config)

# Summary: total cluster count, then one line per cluster with its size.
print(f"Found {len(clusters)} clusters.")
for c in clusters:
    print(
        f"Cluster {c.id}: {len(c.node_indices)} nodes"
    )

## 3. Visualization (UMAP Projection)
To visualize high-dimensional data, we project it to 2D using UMAP.

In [None]:
from umap import UMAP

# Project the embeddings down to two components so they can be drawn on a
# plane; the fixed random_state keeps the layout repeatable.
reducer = UMAP(n_components=2, random_state=42)
reduced = reducer.fit_transform(embeddings)

# Scatter plot of the two UMAP components, semi-transparent so overlapping
# points stay visible.
plt.figure(figsize=(10, 6))
plt.scatter(reduced[:, 0], reduced[:, 1], alpha=0.5)
plt.gca().set(
    title="UMAP Projection of Embeddings",
    xlabel="UMAP 1",
    ylabel="UMAP 2",
)
plt.show()