# Tutorial 2: Clustering Deep Dive
This tutorial covers generating embeddings for text chunks and grouping them with GMM clustering.


In [None]:
import sys
from pathlib import Path
# Make the project's ``src`` directory importable. It is located relative to
# the parent of the current working directory and is appended to sys.path
# only when it is not already listed there.
project_root = Path.cwd().parent
_src_dir = str(project_root / 'src')
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

import logging
# Configure the root logger so that records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)
from matome.engines.embedder import EmbeddingService
from matome.engines.cluster import GMMClusterer
from domain_models.config import ProcessingConfig
from domain_models.manifest import Chunk


In [None]:
# Settings object used by the embedding step and by the clustering call later
# in the notebook: n_clusters=2 and umap_n_neighbors=2.
# NOTE(review): the values are presumably kept this small because the demo
# data consists of only four chunks — confirm against ProcessingConfig.
config = ProcessingConfig(n_clusters=2, umap_n_neighbors=2)
# The embedder receives the config at construction time.
embedder = EmbeddingService(config)
# The clusterer is built without arguments; the config is passed to it later,
# when cluster_nodes() is called.
clusterer = GMMClusterer()


In [None]:
# Create dummy chunks (or load real ones): four toy sentences, each paired
# with its (start_char_idx, end_char_idx) values.
# NOTE(review): the offsets look like placeholders — each span covers 19-20
# characters while the sentences are 23-27 characters long. Confirm that Chunk
# does not validate offsets against the text.
_SAMPLE_ROWS = (
    ('The cat sat on the mat.', 0, 20),
    ('Dogs are loyal animals.', 21, 40),
    ('Felines are independent.', 41, 60),
    ('Canines like to play fetch.', 61, 80),
)
chunks = [
    Chunk(index=position, text=sentence, start_char_idx=begin, end_char_idx=finish)
    for position, (sentence, begin, finish) in enumerate(_SAMPLE_ROWS)
]


In [None]:
# Embed the chunks. The result of embed_chunks() is kept under its own name
# and then drained into a concrete list, so the embedded chunks can be read
# again by the clustering step.
embeddings_iter = embedder.embed_chunks(chunks)
embedded_chunks = [*embeddings_iter]
print('Chunks embedded.')


In [None]:
# Cluster
def embedding_generator(chunks=None):
    """Yield ``(node_id, embedding)`` pairs for every chunk that has an embedding.

    Args:
        chunks: Optional iterable of objects exposing ``index`` and
            ``embedding`` attributes. When omitted, the notebook-level
            ``embedded_chunks`` list is used, so ``embedding_generator()``
            keeps working unchanged.

    Yields:
        Tuples of ``(str(chunk.index), chunk.embedding)``. Chunks whose
        embedding is ``None`` or empty are skipped.
    """
    source = embedded_chunks if chunks is None else chunks
    for chunk in source:
        vector = chunk.embedding
        # Check for None/empty explicitly instead of relying on truthiness:
        # array-like embeddings (e.g. numpy arrays with more than one element)
        # raise when evaluated in a boolean context.
        if vector is not None and len(vector) > 0:
            yield (str(chunk.index), vector)

# Hand the (id, embedding) pairs and the shared config to the clusterer.
clusters = clusterer.cluster_nodes(embedding_generator(), config)
# Report how many clusters were returned, then list each cluster's id and
# the node indices assigned to it.
print(f'Formed {len(clusters)} clusters.')
for cluster in clusters:
    print(f'Cluster {cluster.id}: Nodes {cluster.node_indices}')
