# Tutorial 01: Quickstart

Goal: Demonstrate text ingestion and semantic chunking.

We will load a sample text, split it with the Japanese-optimized semantic chunker (`JapaneseSemanticChunker`), and print the resulting chunks.

In [None]:
import logging
from collections.abc import Iterable, Iterator

import numpy as np

from domain_models.config import ProcessingConfig
from matome.engines.embedder import EmbeddingService
from matome.engines.semantic_chunker import JapaneseSemanticChunker

# Notebook-wide switch:
#   True  -> use the mock embedding service (quick, nothing to download)
#   False -> use real embeddings (requires downloading the model)
USE_MOCK = True

# Logging: configure the root logger at INFO level and create a logger
# named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


## 1. Load Data
We load a sample UTF-8 text file from `test_data/sample.txt` (the path is relative to the directory the notebook is run from).

In [None]:
filepath = "test_data/sample.txt"

# Read the whole sample in one go. The encoding is pinned so the result does
# not depend on the platform's default encoding.
with open(filepath, encoding="utf-8") as f:
    text = f.read()

# Report the total size and show only the first 100 characters as a preview.
char_count = len(text)
preview = text[:100]
print(f"Loaded text ({char_count} chars): {preview}...")

## 2. Setup Embedding Service
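We need an embedding service for semantic chunking. For quick testing we can use a mock service that returns random unit vectors instead of running a model. Because these vectors carry no meaning, the resulting chunk boundaries are not semantically meaningful; set `USE_MOCK = False` in the first cell to use real embeddings.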

In [None]:
class MockEmbeddingService(EmbeddingService):
    """Stand-in embedder that yields random unit vectors instead of running a model.

    The vectors carry no semantic meaning; the class exists so the rest of the
    tutorial can run without a real embedding model. Pass ``seed`` to make the
    generated vectors (and therefore the tutorial output) reproducible.
    """

    def __init__(self, config, *, dim: int = 1024, seed: "int | None" = None):
        """Initialise the base service and this instance's random source.

        Args:
            config: Forwarded unchanged to ``EmbeddingService.__init__``.
            dim: Length of every generated vector. Defaults to 1024
                (the e5-large dimension).
            seed: Optional seed for the random generator. ``None`` (the
                default) draws fresh entropy, so every run differs.

        Raises:
            ValueError: If ``dim`` is smaller than 1.
        """
        super().__init__(config)
        if dim < 1:
            raise ValueError(f"dim must be a positive integer, got {dim}")
        self.dim = dim
        # A per-instance Generator keeps the mock independent of (and from
        # disturbing) NumPy's global random state.
        self._rng = np.random.default_rng(seed)

    def embed_strings(self, texts: Iterable[str]) -> Iterator[list[float]]:
        """Lazily yield one random unit-length vector per input string.

        Args:
            texts: Strings to "embed"; their content is ignored.

        Yields:
            A list of ``self.dim`` floats with Euclidean norm 1.
        """
        for _ in texts:
            # Components are uniform in [0, 1); dividing by the norm gives a
            # unit vector, simulating embeddings compared by cosine similarity.
            vec = self._rng.random(self.dim)
            yield (vec / np.linalg.norm(vec)).tolist()

# Pick the embedding backend according to the USE_MOCK switch. Both branches
# leave `config` and `embedder` defined for the cells that follow.
if not USE_MOCK:
    print("Using Real Embedding Service (this may download a large model)")
    # Default configuration, real embedding service.
    config = ProcessingConfig()
    embedder = EmbeddingService(config)
else:
    print("Using Mock Embedding Service")
    # A model name is still supplied when mocking; per the original note this
    # is meant to satisfy config validation.
    # NOTE(review): confirm that "mock-model" is accepted by ProcessingConfig.
    config = ProcessingConfig(embedding_model="mock-model")
    embedder = MockEmbeddingService(config)

## 3. Semantic Chunking
Now we split the text into chunks and print the first five.

In [None]:
# Build the chunker around the embedder selected in the previous cell.
chunker = JapaneseSemanticChunker(embedder)

# Collect the split result into a list so it can be counted and sliced.
chunks = list(chunker.split_text(text, config))
print(f"Generated {len(chunks)} chunks.")

# Show only the first five chunks, each with a header, its text and its length.
preview_chunks = chunks[:5]
for i, chunk in enumerate(preview_chunks):
    body = chunk.text
    print(f"\n--- Chunk {i} ---", body, f"Length: {len(body)} chars", sep="\n")