# Tutorial 03: Full RAPTOR Pipeline

Goal: Execute the full recursive summarization process.

We will load the text, run the RAPTOR engine (`RaptorEngine`), and generate a Markdown summary.

In [None]:
import logging
import os
from collections.abc import Iterator
from pathlib import Path

import numpy as np

from domain_models.config import ProcessingConfig
from matome.agents.summarizer import SummarizationAgent
from matome.engines.cluster import GMMClusterer
from matome.engines.embedder import EmbeddingService
from matome.engines.raptor import RaptorEngine
from matome.engines.semantic_chunker import JapaneseSemanticChunker
from matome.exporters.markdown import export_to_markdown
from matome.utils.store import DiskChunkStore

# Configure root logging at INFO level for the rest of the notebook.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module (__name__).
logger = logging.getLogger(__name__)

# Mode switch read by the setup and configuration cells: True selects the mock
# API key, mock model names and MockEmbeddingService.
USE_MOCK = True  # Set to False to use real API and Models

## 1. Setup Environment
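We set up the environment. In mock mode, we use a mock API key and a mock embedding service. If mock mode is disabled but `OPENROUTER_API_KEY` is not set, the notebook falls back to mock mode.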

In [None]:
# Real mode was requested but no API key is available: warn and fall back to
# mock mode. The environment is only consulted when USE_MOCK is False.
if not USE_MOCK and not os.environ.get("OPENROUTER_API_KEY"):
    print("WARNING: OPENROUTER_API_KEY not found. Switching to Mock mode.")
    USE_MOCK = True
elif USE_MOCK:
    print("Mock Mode: Using mock API key and Embeddings.")

# In mock mode (requested or fallback) a placeholder key is installed.
if USE_MOCK:
    os.environ["OPENROUTER_API_KEY"] = "mock"

In [None]:
# Mock Embedding Service
class MockEmbeddingService(EmbeddingService):
    """Embedding service for mock mode that produces random unit vectors.

    ``_process_batch`` is overridden so that batches are answered with random
    vectors instead of real embeddings (intended to bypass model loading).
    """

    def __init__(self, config) -> None:
        super().__init__(config)
        # Length of every generated vector.
        # NOTE(review): presumably chosen to match the real model's output
        # size — confirm.
        self.dim = 1024

    def _process_batch(self, batch_texts: list[str] | tuple[str, ...]) -> Iterator[list[float]]:
        """Yield one random vector per entry of ``batch_texts``.

        The text content is ignored. Each vector has ``self.dim`` components
        drawn from ``np.random.rand`` and is scaled to unit Euclidean length
        before being converted to a plain list.
        """
        for _ in batch_texts:
            raw = np.random.rand(self.dim)
            length = np.linalg.norm(raw)
            yield (raw / length).tolist()

# Config: the model names and max_tokens depend on the active mode.
if USE_MOCK:
    config_kwargs = {
        "embedding_model": "mock-model",
        "summarization_model": "mock-model",
        "max_tokens": 200,  # Smaller chunks for faster mock
    }
else:
    config_kwargs = {
        "embedding_model": "intfloat/multilingual-e5-large",
        "summarization_model": "google/gemini-1.5-flash",
        "max_tokens": 500,
    }
config = ProcessingConfig(**config_kwargs)

# Components: the embedder class is the only piece that differs between modes.
embedder_cls = MockEmbeddingService if USE_MOCK else EmbeddingService
embedder = embedder_cls(config)

# The chunker is built on top of the selected embedder; the summarizer shares
# the same config object.
chunker = JapaneseSemanticChunker(embedder)
clusterer = GMMClusterer()
summarizer = SummarizationAgent(config)

## 2. Load Data
Load the target text file.

In [None]:
filepath = "test_data/エミン流「会社四季報」最強の読み方.txt"
# Read the whole input document into memory as UTF-8 text.
text = Path(filepath).read_text(encoding="utf-8")

print(f"Loaded {len(text)} chars.")

## 3. Run RAPTOR Pipeline
We initialize the engine and run it.

In [None]:
# Use a persistent store file for inspection, or temporary if preferred.
# Any database file left over from a previous run is removed first so each
# run starts from a fresh store.
db_path = Path("chunks.db")
# missing_ok=True deletes the file if present and is a no-op otherwise,
# without a separate exists() check.
db_path.unlink(missing_ok=True)

store = DiskChunkStore(db_path=db_path)

engine = RaptorEngine(chunker, embedder, clusterer, summarizer, config)

try:
    tree = engine.run(text, store=store)
except Exception as e:
    import traceback

    print(f"Pipeline failed: {e}")
    traceback.print_exc()
    # The exception is re-raised, so the notebook's final store.close() cell
    # will not be reached by a top-to-bottom run; close the store here so a
    # failed run does not leave it open.
    # NOTE(review): assumes DiskChunkStore.close() tolerates being called
    # again if the final cell is run manually afterwards — confirm.
    store.close()
    raise
else:
    print("RAPTOR Pipeline Completed!")

## 4. Export to Markdown
Generate the final summary markdown.

In [None]:
# Render the tree (with chunk contents looked up through the store) to
# Markdown text.
summary_md = export_to_markdown(tree, store)

# Persist the full summary next to the notebook as UTF-8.
output_file = "summary_all.md"
with Path(output_file).open("w", encoding="utf-8") as out:
    out.write(summary_md)

print(f"Summary saved to {output_file}")
print("--- Preview ---")
# Show only the first 500 characters, followed by an ellipsis marker.
print(summary_md[:500], "...", sep="")

In [None]:
# Close the chunk store created for the pipeline run; nothing after the export
# step in this notebook uses it.
store.close()