# Document processing

In [1]:
#| default_exp docs

In [2]:
#| hide
import nbdev; nbdev.nbdev_export()

In [3]:
#| exporti

from pathlib import Path
from typing import Literal
import logging

from pydantic import Field, BaseModel
from jinja2 import Environment, FileSystemLoader


import llm
from llm.models import Usage
import anthropic
import langsmith as ls

from lovely_docs.settings import Settings, settings
import asyncio

from tenacity import AsyncRetrying, stop_after_attempt, wait_exponential


In [4]:
#| export
# Module-level logger, named after this module so records can be filtered by origin.
logger = logging.getLogger(__name__)
# Sets this logger's own threshold to DEBUG, so the "Processing ..." / "Added ..."
# debug messages emitted below are not filtered out at the logger level.
logger.setLevel(logging.DEBUG)

In [5]:
#| export

# Token counts for the three text variants kept for every page / directory.
# Every count defaults to 0 until it is filled in after LLM processing.
class TokenCounts(BaseModel):
    fulltext: int = 0      # tokens in the full text
    digest: int = 0        # tokens in the digest
    short_digest: int = 0  # tokens in the short digest

# Fields shared by pages and directories in the documentation tree.
class DocItem(BaseModel):
    path: Path                 # location of the item; the tree builder stores it relative to the docs root
    name: str = ""             # LLM-generated name ("better_name" in the reply schemas); empty until processed
    digest: str = ""           # LLM-generated digest (markdown)
    short_digest: str = ""     # LLM-generated short digest (markdown)
    essence: str = ""          # LLM-generated essence (plain text)
    relevant: bool = True      # relevance flag; overwritten with the value reported by the LLM
    # LLM usage for producing this item's summary; defaults to an empty `Usage()`, may be None.
    usage: Usage|None = Field(default_factory=Usage)
    # Discriminator; no default here, the subclasses provide one.
    type: Literal["page", "dir"]
    token_counts: TokenCounts = Field(default_factory=TokenCounts)


class DocPage(DocItem):
    """Represents a single documentation page"""
    # Raw text of the source file; required.
    fulltext: str
    # Narrowed to the single literal "page". The base class declares
    # Literal["page", "dir"]; annotating this override as plain `str` widened the
    # field and let any string through validation.
    type: Literal["page"] = "page"


class DocDirectory(DocItem):
    """Represents a directory in the documentation structure"""
    # Pages located directly in this directory.
    pages: list[DocPage] = Field(default_factory=list)
    # Immediate sub-directories (recursive structure).
    subdirs: list['DocDirectory'] = Field(default_factory=list)
    # Generated text for the directory; empty until the directory has been processed.
    fulltext: str = ""
    # Narrowed to the single literal "dir". The base class declares
    # Literal["page", "dir"]; annotating this override as plain `str` widened the
    # field and let any string through validation.
    type: Literal["dir"] = "dir"

#### Let's load all markdown files from a directory

In [6]:
#| export

def build_markdown_doc_tree(root:Path, path:Path = Path()) -> DocDirectory:
    """Recursively build a documentation tree from markdown files.

    Only `*.md` files become pages; every sub-directory is descended into.
    Entries are visited in sorted order, so the resulting tree is deterministic.

    Args:
        root: Root directory containing the documentation
        path: Relative path from root to process (default: root itself)

    Returns:
        DocDirectory containing pages and subdirectories. All stored paths are
        relative to `root`.

    Raises:
        AssertionError: if `root` or `root/path` is not an existing directory.
    """
    assert root.exists() and root.is_dir(), f"Not a directory: {root}"
    assert (root/path).exists() and (root/path).is_dir(), f"Not a directory: {root/path}"

    doc_dir = DocDirectory(path=path)

    # Get immediate children only; recursion handles the deeper levels.
    for item in sorted((root/path).iterdir()):
        if item.is_file() and item.suffix == '.md':
            # Read explicitly as UTF-8: the platform default encoding is not
            # guaranteed to be UTF-8 and would garble or reject non-ASCII docs.
            doc_dir.pages.append(DocPage(path=item.relative_to(root), fulltext=item.read_text(encoding="utf-8")))
        elif item.is_dir():
            doc_dir.subdirs.append(build_markdown_doc_tree(root, item.relative_to(root)))

    return doc_dir

In [7]:
import pprint

In [8]:
# Build the tree from the bundled test data and show the root's own fields
# (children are excluded to keep the output short).
tree = build_markdown_doc_tree(Path("test_data"))
pprint.pprint(tree.model_dump(exclude=["pages", "subdirs"]))

# List every first-level directory with a one-line preview of each page.
for d in tree.subdirs:
    print(str(d.path) + "/")
    for pp in d.pages:
        # Computed outside the f-string: reusing double quotes and a backslash
        # inside a replacement field is a SyntaxError before Python 3.12.
        preview = pp.fulltext[:50].replace("\n", "")
        print(f"{str(pp.path)} -> {preview}...")


{'digest': '',
 'essence': '',
 'fulltext': '',
 'name': '',
 'path': Path('.'),
 'relevant': True,
 'short_digest': '',
 'token_counts': {'digest': 0, 'fulltext': 0, 'short_digest': 0},
 'type': 'dir',
 'usage': {'details': None, 'input': None, 'output': None}}
01-introduction/
01-introduction/01-overview.md -> ---title: Overview---Svelte is a framework for...
01-introduction/02-getting-started.md -> ---title: Getting started---We recommend using...
01-introduction/03-svelte-files.md -> ---title: .svelte files---Components are the b...
01-introduction/04-svelte-js-files.md -> ---title: .svelte.js and .svelte.ts files---Be...
01-introduction/index.md -> ---title: Introduction---...
02-runes/
02-runes/01-what-are-runes.md -> ---title: What are runes?---> [!NOTE] **rune**...
02-runes/02-$state.md -> ---title: $state---The `$state` rune allows yo...
02-runes/03-$derived.md -> ---title: $derived---Derived state is declared...
02-runes/04-$effect.md -> ---title: $effect---Effects are functi

#### Let's process one page

In [None]:
#| export

# Structured reply requested from the LLM for a single page
# (passed as `schema=` to `model.prompt` in `llm_process_page`).
class PageReplySchema(BaseModel):
    better_name: str = Field(description="")  # normalised by the caller before use
    digest: str = Field(title="Digest, format: markdown", )
    short_digest: str = Field(title="Short digest, format:markdown")
    essence: str = Field(title="Essence, format:txt")
    relevant: bool  # copied onto the resulting DocPage.relevant


async def anthropic_count_tokens(client: anthropic.AsyncAnthropic, model: str, text: str):
    """Return the number of input tokens `text` occupies for `model`.

    The text is submitted as a single user message to the client's
    `messages.count_tokens` endpoint and the reported `input_tokens` is returned.
    """
    user_message = {"role": "user", "content": text}
    counted = await client.messages.count_tokens(model=model, messages=[user_message])
    return counted.input_tokens

async def llm_process_page(settings: Settings, page: DocPage, libname: str) -> DocPage:
    """Summarise one documentation page with the configured LLM.

    Steps, each wrapped in its own LangSmith trace:
      1. render the `process_page.j2` prompt from the page text and location,
      2. call the model (up to 5 attempts with exponential back-off),
      3. parse the reply into `PageReplySchema` and normalise the proposed name,
      4. count tokens of the full text and both digests.

    Returns a new `DocPage`; the `page` argument is not modified.
    """
    with ls.trace(name=f"Process page: {page.path}", run_type="chain") as trace:
        logger.debug(f"Processing {page.path}")

        model = llm.get_async_model(settings.model)
        model.key = settings.api_key

        # Token counting is done with the Anthropic client directly.
        anthropic_client = anthropic.AsyncAnthropic(api_key=settings.api_key)

        jinja_env = Environment(loader=FileSystemLoader(settings.templates_dir))
        template = jinja_env.get_template("process_page.j2")
        render_args = dict(
            text=page.fulltext,
            filename=page.path.name,
            path=page.path.parent.name + "/",
            libname=libname,
        )
        with ls.trace("Template", "prompt", inputs=render_args) as template_trace:
            prompt = template.render(**render_args)
            template_trace.end(outputs=prompt)

        retry_policy = AsyncRetrying(
            stop=stop_after_attempt(5),
            wait=wait_exponential(multiplier=1, min=3, max=60),
        )
        with ls.trace("LLM call", "llm", inputs={"prompt": prompt}) as llm_trace:
            async for attempt in retry_policy:
                with attempt:
                    try:
                        res = await model.prompt(prompt=prompt, schema=PageReplySchema, max_tokens=32768, temperature=0)
                        llm_trace.end(outputs=await res.text())
                    except Exception as e:
                        # Log, then re-raise so the retry policy decides what happens next.
                        logger.warning(f"Retry {attempt.retry_state.attempt_number}: {str(e)}")
                        raise

        with ls.trace("Parse", "parser", inputs={"input": await res.text()}) as parse_trace:
            reply = PageReplySchema.model_validate_json(await res.text())
            # Turn the proposed name into something usable as a file name:
            # lower-case, hyphens for spaces, no ".md" suffix, no "/".
            cleaned_name = reply.better_name.lower().replace(' ', '-')
            cleaned_name = cleaned_name.removesuffix('.md')
            reply.better_name = cleaned_name.replace("/", "∕")
            parse_trace.end(outputs=reply)
            usage = await res.usage()

        # The three token counts are independent, so request them concurrently.
        texts_to_count = (page.fulltext, reply.digest, reply.short_digest)
        fulltext_n, digest_n, short_digest_n = await asyncio.gather(*[
            anthropic_count_tokens(anthropic_client, "claude-haiku-4-5", text)
            for text in texts_to_count
        ])

        result = DocPage(
            path=page.path,
            fulltext=page.fulltext,
            name=reply.better_name,
            digest=reply.digest,
            short_digest=reply.short_digest,
            essence=reply.essence,
            relevant=reply.relevant,
            token_counts=TokenCounts(fulltext=fulltext_n, digest=digest_n, short_digest=short_digest_n),
            usage=usage,
        )

        trace.end(outputs=result)
        return result

In [10]:
from lovely_docs.settings import settings
from IPython.display import Markdown, display

In [11]:
# Smoke test: run the page pipeline on one page (third page of the second sub-directory).
res = await llm_process_page(
    settings=settings,
    page=tree.subdirs[1].pages[2],
    libname="test")

+0.222s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/03-$derived.md[0m


In [12]:
# Render the processed page: original path, generated name and essence first,
# then each digest under its own banner.
display(Markdown(f"`{res.path} -> {res.name}:`\n\n{res.essence}"))
for banner, body in (
    ("# **==== Short digest ====**", res.short_digest),
    ("# **==== Full digest ====**", res.digest),
):
    display(Markdown(banner))
    display(Markdown(body))

`02-runes/03-$derived.md -> $derived:`

$derived creates reactive computed values that automatically update when their dependencies change, with support for complex derivations via $derived.by and temporary value overrides.

# **==== Short digest ====**

## $derived

Declares reactive derived state that recalculates when dependencies change:

```svelte
let doubled = $derived(count * 2);
let total = $derived.by(() => { /* complex logic */ });
```

Expressions must be side-effect free. Derived values can be temporarily reassigned for optimistic UI. Uses push-pull reactivity: state changes notify dependents immediately, but derived values only re-evaluate when read.

# **==== Full digest ====**

## $derived

Declares derived state that automatically updates when dependencies change:

```svelte
let count = $state(0);
let doubled = $derived(count * 2);
```

Expressions must be side-effect free. Svelte prevents state mutations inside derived expressions.

### $derived.by

For complex derivations, use `$derived.by` with a function:

```svelte
let numbers = $state([1, 2, 3]);
let total = $derived.by(() => {
	let total = 0;
	for (const n of numbers) {
		total += n;
	}
	return total;
});
```

`$derived(expression)` is equivalent to `$derived.by(() => expression)`.

### Dependencies

Anything read synchronously inside the derived expression is a dependency. When dependencies change, the derived is marked dirty and recalculated on next read. Use `untrack` to exempt state from being treated as a dependency.

### Overriding derived values

Derived values can be temporarily reassigned (unless declared with `const`), useful for optimistic UI:

```svelte
let likes = $derived(post.likes);
async function onclick() {
	likes += 1;
	try {
		await like();
	} catch {
		likes -= 1;
	}
}
```

### Reactivity behavior

Unlike `$state`, `$derived` values are not converted to deeply reactive proxies. However, if a derived value is an object/array from state, mutating its properties affects the underlying state.

Svelte uses push-pull reactivity: state changes immediately notify dependents (push), but derived values only re-evaluate when read (pull). If a derived's new value is referentially identical to its previous value, downstream updates are skipped.

#### Looks decent, let's process all pages in a sub-directory.

In [13]:
import asyncio

pages = []
tasks = []
for page in tree.subdirs[1].pages:
    tasks.append(asyncio.create_task(llm_process_page(settings, page, "svelte")))

pages = await asyncio.gather(*tasks)

pages.sort(key=lambda x: x.path)

tree.subdirs[1].pages = pages

+8.689s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/01-what-are-runes.md[0m
+0.017s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/02-$state.md[0m
+0.017s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/03-$derived.md[0m
+0.015s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/04-$effect.md[0m
+0.016s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/05-$props.md[0m
+0.015s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/06-$bindable.md[0m
+0.015s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/07-$inspect.md[0m
+0.015s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/08-$host.md[0m
+0.014s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/index.md[0m


In [14]:
# Inspect one of the freshly processed pages (fourth page of the second sub-directory).
page = tree.subdirs[1].pages[3]
display(Markdown(f"`{page.path} -> {page.name}:`\n\n{page.essence}"))
display(Markdown("# **==== Short digest ====**"))
display(Markdown(page.short_digest))
# The full digest is left out to keep the output short; uncomment to show it.
# display(Markdown("# **==== Full digest ====**"))
# display(Markdown(page.digest))

`02-runes/04-$effect.md -> $effect:`

The $effect rune runs functions when state updates for side effects, with automatic dependency tracking and lifecycle management.

# **==== Short digest ====**

## $effect

Effects run when state updates for side effects (API calls, canvas drawing, etc). They track synchronously-read reactive values and re-run when dependencies change. Asynchronously-read values are not tracked.

```svelte
$effect(() => {
	context.fillStyle = color;
	context.fillRect(0, 0, size, size);
});
```

Effects can return teardown functions that run before re-runs or on component destroy.

**$effect.pre**: Runs before DOM updates.
**$effect.tracking()**: Returns whether code runs in a tracking context.
**$effect.root()**: Creates non-tracked scope for manual control.

Don't use effects for state synchronization—use `$derived` instead.

#### Let's process a directory. The input is the digests of all its pages (plus the digests of its sub-directories)

In [None]:
#| export

# Structured reply requested from the LLM for a directory
# (passed as `schema=` to `model.prompt` in `llm_process_directory`).
class DirReplySchema(BaseModel):
    better_name: str  # normalised by the caller before use
    digest: str = Field(title="Directory digest, fmt:markdown")
    short_digest: str = Field(title="Short digest, format:markdown")
    essence: str
    relevant: bool  # copied onto the resulting DocDirectory.relevant

async def llm_process_directory(settings: Settings, directory: DocDirectory,  libname: str) -> DocDirectory:
    """Summarise a directory from its relevant pages and sub-directories.

    Only children flagged `relevant` are handed to the templates. Besides the
    LLM-written digests, a "fulltext" for the directory is rendered from the
    `directory_fulltext.j2` template, and token counts are collected for all
    three texts.

    Returns a deep copy of `directory` with the summary fields filled in; the
    argument itself is left untouched.

    Raises:
        AssertionError: if the directory has no relevant page or sub-directory.
    """

    with ls.trace(name=f"Process directory: {directory.path}", run_type="chain") as trace:
        logger.debug(f"Processing {directory.path}")

        # Callers are expected to skip directories that contain nothing relevant.
        assert any(child.relevant for child in directory.pages+directory.subdirs)

        model = llm.get_async_model(settings.model)
        model.key = settings.api_key

        # Token counting is done with the Anthropic client directly.
        anthropic_client = anthropic.AsyncAnthropic(api_key=settings.api_key)

        jinja_env = Environment(loader=FileSystemLoader(settings.templates_dir))
        template = jinja_env.get_template("process_directory.j2")

        render_args = dict(
            dirname=directory.path.name,
            path=directory.path.parent.name,
            pages=[p for p in directory.pages if p.relevant],
            subdirs=[s for s in directory.subdirs if s.relevant],
            libname=libname,
        )
        with ls.trace("Template", "prompt", inputs=render_args) as template_trace:
            prompt = template.render(**render_args)
            template_trace.end(outputs=prompt)

        retry_policy = AsyncRetrying(
            stop=stop_after_attempt(5),
            wait=wait_exponential(multiplier=1, min=3, max=60),
        )
        with ls.trace("LLM call", "llm", inputs={"prompt": prompt}) as llm_trace:
            async for attempt in retry_policy:
                with attempt:
                    try:
                        res = await model.prompt(prompt=prompt, schema=DirReplySchema, max_tokens=32768, temperature=0)
                        llm_trace.end(outputs=await res.text())
                        usage = await res.usage()
                    except Exception as e:
                        # Log, then re-raise so the retry policy decides what happens next.
                        logger.warning(f"Retry {attempt.retry_state.attempt_number}: {str(e)}")
                        raise

        with ls.trace("Parse", "parser", inputs={"input": await res.text()}) as parse_trace:
            reply = DirReplySchema.model_validate_json(await res.text())
            # Same normalisation as for pages: lower-case, hyphens for spaces,
            # no ".md" suffix, no "/" (it would be read as a path separator).
            cleaned_name = reply.better_name.lower().replace(' ', '-')
            cleaned_name = cleaned_name.removesuffix('.md')
            reply.better_name = cleaned_name.replace("/", "∕")
            parse_trace.end(outputs=reply)

        # The directory's stored fulltext is generated from the same template
        # inputs (the relevant pages and sub-directories) rather than read from disk.
        fulltext = jinja_env.get_template("directory_fulltext.j2").render(**render_args)

        # The three token counts are independent, so request them concurrently.
        fulltext_n, digest_n, short_digest_n = await asyncio.gather(*[
            anthropic_count_tokens(anthropic_client, "claude-haiku-4-5", text)
            for text in (fulltext, reply.digest, reply.short_digest)
        ])

        result = directory.model_copy(deep=True)
        summary_fields = {
            "name": reply.better_name,
            "digest": reply.digest,
            "short_digest": reply.short_digest,
            "essence": reply.essence,
            "relevant": reply.relevant,
            "fulltext": fulltext,
            "token_counts": TokenCounts(fulltext=fulltext_n, digest=digest_n, short_digest=short_digest_n),
            "usage": usage,
        }
        for field_name, value in summary_fields.items():
            setattr(result, field_name, value)

        trace.end(outputs=result)
        return result

In [16]:
# Summarise the second sub-directory (its pages were processed above) and store the result back on the tree.
tree.subdirs[1] = await llm_process_directory(settings, tree.subdirs[1], "svelte")

+12.714s [36mDEBUG[0m [34m<ipykernel>:14[0m llm_process_directory Processing 02-runes[0m


In [17]:
# Show the generated name, essence and short digest of the processed directory.
subdir = tree.subdirs[1]
display(Markdown(f"`{subdir.path} -> {subdir.name}:`\n\n{subdir.essence}"))
display(Markdown("# **==== Short digest ====**"))
display(Markdown(subdir.short_digest))

`02-runes -> runes:`

Runes are $-prefixed compiler keywords that control reactivity, state management, side effects, and component communication in Svelte.

# **==== Short digest ====**

## Runes

`$`-prefixed compiler keywords for reactivity and component behavior.

**$state**: Reactive state with deep reactivity for objects/arrays. `$state.raw` for non-reactive, `$state.snapshot` to convert proxy.

**$derived**: Reactive computed values. `$derived.by()` for complex logic. Auto-tracks dependencies.

**$effect**: Runs on state updates for side effects. Returns teardown functions. `$effect.pre` runs before DOM updates.

**$props**: Destructure component props with defaults and renaming. `$props.id()` generates unique IDs.

**$bindable**: Enables two-way prop binding with `bind:` directive.

**$inspect**: Dev-only logging of reactive changes. `$inspect.trace()` traces state changes.

**$host**: Access host element in custom elements for event dispatch.

#### Great! Now let's do the whole tree recursively.
When processing a directory:
- process all pages
- process all sub-directories
- Generate digests for the whole directory

In [None]:
#| export
async def process_tree_depth_first(settings: Settings, doc_dir: DocDirectory, libname: str) -> DocDirectory:
    """
    Process a documentation tree depth-first with parallel processing.

    Sub-directories are processed recursively (in parallel with each other),
    then the pages of this directory (also in parallel), and finally the
    directory itself is summarised from its processed children.

    `doc_dir` is not modified: a new DocDirectory built from the processed
    pages and sub-directories is returned.

    Args:
        settings: Model / API / template settings passed through to the LLM helpers.
        doc_dir: Unprocessed directory, as produced by `build_markdown_doc_tree`.
        libname: Library name passed through to the prompt templates.

    Returns:
        The processed directory. When none of its children is relevant, no LLM
        call is made for the directory and it is returned with `relevant=False`.
    """

    with ls.trace(name=f"Process tree: {libname}/{doc_dir.path}", run_type="chain") as trace:
        # First, recursively process all subdirectories in parallel
        subdirs: list[DocDirectory] = await asyncio.gather(*[
            process_tree_depth_first(settings, subdir, libname) for subdir in doc_dir.subdirs
        ])
        subdirs = sorted(subdirs, key=lambda s: s.path)

        # Then process all pages in this directory in parallel
        pages = await asyncio.gather(*[
            llm_process_page(settings, page, libname) for page in doc_dir.pages
        ])
        pages = sorted(pages, key=lambda s: s.path)

        # .name is llm-generated and might be not unique. Make it unique by
        # appending _2, _3, ... to the ORIGINAL name. (Previously the counter
        # was never incremented and the suffix was appended to the already
        # suffixed name, yielding "foo_2_2" instead of "foo_3".)
        names: set[str] = set()
        for x in subdirs + pages:
            base, i = x.name, 2
            name = base
            while name in names:
                name = f"{base}_{i}"
                i += 1
            x.name = name
            names.add(name)

        if not any(x.relevant for x in subdirs + pages):
            # Nothing relevant below: skip the LLM call for this directory.
            # - Keep the processed sub-directories as well as the pages, so their
            #   usage and content are not silently lost.
            # - Name the directory after its source path: an empty name would make
            #   it resolve to its parent's location when the tree is saved.
            result = DocDirectory(
                path=doc_dir.path,
                name=doc_dir.path.name,
                pages=pages,
                subdirs=subdirs,
                relevant=False,
            )
            trace.end(outputs=result)
            return result

        result = await llm_process_directory(settings, DocDirectory(path=doc_dir.path, pages=pages, subdirs=subdirs), libname)
        trace.end(outputs=result)
        return result

In [19]:
# Clean tree
# Start from a freshly built tree: the `tree` used above was partially processed in place.
tree = build_markdown_doc_tree(Path("test_data"))
processed_tree = await process_tree_depth_first(settings, tree, "svelte")

+10.923s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 01-introduction/01-overview.md[0m
+0.018s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 01-introduction/02-getting-started.md[0m
+0.017s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 01-introduction/03-svelte-files.md[0m
+0.018s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 01-introduction/04-svelte-js-files.md[0m
+0.018s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 01-introduction/index.md[0m
+0.022s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/01-what-are-runes.md[0m
+0.021s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/02-$state.md[0m
+0.023s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/03-$derived.md[0m
+0.016s [36mDEBUG[0m [34m<ipykernel>:23[0m llm_process_page Processing 02-runes/04-$effect.md[0m
+0.020s [36mDEBUG[0m

In [20]:
# Show the generated name, essence and full digest of the root directory.
display(Markdown(f"`{processed_tree.path} -> {processed_tree.name}:`\n\n{processed_tree.essence}"))
display(Markdown("# **==== Digest ====**"))
display(Markdown(processed_tree.digest))

`. -> runes:`

Compiler keywords prefixed with `$` that manage reactivity, state, effects, props, and debugging in Svelte components.

# **==== Digest ====**

## Runes

`$`-prefixed compiler keywords that control reactivity, state management, side effects, and component communication. They are not functions and cannot be imported, assigned to variables, or passed as arguments.

### $state
Creates reactive state with automatic deep reactivity for objects and arrays:
```js
let count = $state(0);
let todos = $state([{ done: false, text: 'add' }]);
todos[0].done = true; // triggers updates
```
Use `$state.raw` for non-reactive state (reassign only) and `$state.snapshot(value)` to convert proxies to plain objects.

### $derived
Creates reactive computed values that automatically update when dependencies change:
```js
let doubled = $derived(count * 2);
let result = $derived.by(() => { /* complex logic */ });
```

### $effect
Runs functions when state updates for side effects:
```js
$effect(() => {
	context.fillStyle = color;
	context.fillRect(0, 0, size, size);
});
```
Return teardown functions that run before re-runs or on destroy. Use `$effect.pre` to run before DOM updates.

### $props
Receives component props with destructuring and defaults:
```js
let { adjective = 'happy', ...others } = $props();
```
Generate unique component instance IDs with `$props.id()`.

### $bindable
Enables two-way prop binding:
```js
let { value = $bindable() } = $props();
```

### $inspect
Development-only debugging rune that logs reactive state changes:
```js
$inspect(count, message);
$inspect(count).with((type, value) => { /* custom handler */ });
$inspect.trace(); // traces what caused effect to re-run
```

### $host
Accesses the host element in custom element components:
```js
$host().dispatchEvent(new CustomEvent(type));
```

#### Looks decent. Keep in mind, this is the digest for the whole first two sections of the docs.

#### How much did it cost us?

In [21]:
#| export

def calculate_total_usage(doc_dir: DocDirectory) -> Usage:
    """Calculate total usage for a directory tree including all pages, subdirs, and summaries.

    Missing usage is treated as zero: both a `usage` of None and individual
    `input` / `output` counters of None contribute nothing to the totals.

    Args:
        doc_dir: Root of the (sub)tree to total up.

    Returns:
        A `Usage` whose `input` and `output` are the summed token counts.
    """
    total_input = 0
    total_output = 0

    # Add usage from all pages in this directory.
    # `usage` is declared as `Usage|None`, so guard it exactly like the
    # directory's own usage below instead of dereferencing it blindly.
    for page in doc_dir.pages:
        if page.usage:
            total_input += page.usage.input or 0
            total_output += page.usage.output or 0

    # Add usage from directory summarization
    if doc_dir.usage:
        total_input += doc_dir.usage.input or 0
        total_output += doc_dir.usage.output or 0

    # Recursively add usage from subdirectories
    for subdir in doc_dir.subdirs:
        subdir_usage = calculate_total_usage(subdir)
        total_input += subdir_usage.input or 0
        total_output += subdir_usage.output or 0

    return Usage(input=total_input, output=total_output)


def calculate_cost(usage: Usage, input_cost: float, output_cost: float) -> tuple[float, float, float]:
    """Convert token usage into money.

    Args:
        usage: Token counters; `input` / `output` of None count as 0.
        input_cost: Price per one million input tokens.
        output_cost: Price per one million output tokens.

    Returns:
        `(total, input_part, output_part)` in the currency of the given prices.
    """
    tokens_per_unit = 1_000_000
    spent_on_input = ((usage.input or 0) / tokens_per_unit) * input_cost
    spent_on_output = ((usage.output or 0) / tokens_per_unit) * output_cost
    return spent_on_input + spent_on_output, spent_on_input, spent_on_output

In [22]:
# Total the token usage of the whole processed tree and report it.
usage = calculate_total_usage(processed_tree)
print("\n".join([
    "\nTotal Usage:",
    f"  Input tokens: {usage.input:,}",
    f"  Output tokens: {usage.output:,}",
    f"  Total tokens: {(usage.input + usage.output):,}",
]))

# Prices are passed as (input, output) cost per one million tokens.
cost, input_cost, output_cost = calculate_cost(usage, 1, 5)

print("\n".join([
    "\nCost:",
    f"  Total: ${cost:.2f}",
    f"  Input: ${input_cost:.2f}",
    f"  Output: ${output_cost:.2f}",
]))


Total Usage:
  Input tokens: 37,958
  Output tokens: 9,138
  Total tokens: 47,096

Cost:
  Total: $0.08
  Input: $0.04
  Output: $0.05


#### Let's save the results. First the markdown files.

In [23]:
#| export
import shutil

In [24]:
#| export

def save_doc_files(path: Path, doc: DocDirectory):
    """Save a DocDirectory structure to disk at the specified path.

    Layout: the directory's own `digest.md`, `short_digest.md`, `essence.md` and
    `fulltext.md` are written into `path`; every page gets a folder of its own
    holding the same four files; every sub-directory is written recursively
    into a folder named after it.

    Anything already present at `path` is deleted first. The wipe happens once,
    for the root only: children live inside `path`, and wiping per child would
    destroy the parent's freshly written files whenever a child's folder
    resolves to the parent itself (e.g. a child with an empty name).

    Args:
        path: Directory path where the documentation will be saved
        doc: DocDirectory object containing the documentation structure to save

    Raises:
        ValueError: if a page or sub-directory has neither a generated name nor
            a usable source path name to derive its folder from.
    """
    if path.exists(): shutil.rmtree(path)
    _write_doc_dir(path, doc)


def _child_folder_name(item) -> str:
    """Folder name for a page / sub-directory: its generated name, else its source path name."""
    folder = item.name or item.path.name
    if folder in ("", ".", ".."):
        # Such a name would resolve to the parent (or beyond) and overwrite its files.
        raise ValueError(f"Cannot derive an output folder name for {item.path}")
    return folder


def _write_item_files(folder: Path, item) -> None:
    """Write the four text files of one page / directory into `folder` (as UTF-8)."""
    # Explicit encoding: the content is LLM / documentation text and must not
    # depend on the platform's default encoding.
    (folder/"digest.md").write_text(item.digest, encoding="utf-8")
    (folder/"short_digest.md").write_text(item.short_digest, encoding="utf-8")
    (folder/"essence.md").write_text(item.essence, encoding="utf-8")
    (folder/"fulltext.md").write_text(item.fulltext, encoding="utf-8")


def _write_doc_dir(path: Path, doc) -> None:
    """Recursively write `doc` into `path` without deleting anything."""
    path.mkdir(parents=True, exist_ok=True)
    _write_item_files(path, doc)

    for page in doc.pages:
        page_dir = path/_child_folder_name(page)
        page_dir.mkdir(parents=True, exist_ok=True)
        _write_item_files(page_dir, page)

    for subdir in doc.subdirs:
        _write_doc_dir(path/_child_folder_name(subdir), subdir)

In [25]:
# Write the processed tree to ./test-out and look at what was produced.
save_doc_files(Path("test-out"), processed_tree)
# Root level, then the first sub-directory, then the first page of that sub-directory.
!ls test-out
!ls "test-out/{processed_tree.subdirs[0].name}"
!ls "test-out/{processed_tree.subdirs[0].name}/{processed_tree.subdirs[0].pages[0].name}"
# Render the saved digest of that first page.
Markdown((Path("test-out")/processed_tree.subdirs[0].name/processed_tree.subdirs[0].pages[0].name / "digest.md").read_text())

digest.md  essence.md  fulltext.md  introduction  runes  short_digest.md
digest.md   fulltext.md      introduction  short_digest.md
essence.md  getting-started  overview	   svelte-files
digest.md  essence.md  fulltext.md  short_digest.md


Svelte is a compiler-based framework for building web user interfaces. It transforms declarative components written in HTML, CSS, and JavaScript into optimized JavaScript code.

Example component:
```svelte
<script>
	function greet() {
		alert('Welcome to Svelte!');
	}
</script>

<button onclick={greet}>click me</button>

<style>
	button {
		font-size: 2em;
	}
</style>
```

Use cases range from standalone components to full-stack applications using SvelteKit. For learning, the interactive tutorial is recommended as a starting point, with this documentation serving as reference material. Online environments are available via the playground or StackBlitz.

#### Next the metadata

In [26]:
#| export
import git
import json
from datetime import datetime, timezone
from lovely_docs.settings import Source, WebSource, GitSource

In [27]:
# Identify the git revision of the repository enclosing the working directory
# (searched upwards through parent directories).
repo = git.Repo(search_parent_directories=True)
commit = repo.head.commit.hexsha
# Check if repo is dirty (untracked files count too) and mark the hash accordingly
if repo.is_dirty(untracked_files=True):
    commit += "-dirty"

commit

'f5802e3f465dd546722723e1a413294fd81f9d26-dirty'

In [28]:
#| export

def file_map(doc: DocDirectory, prefix: Path):
    """Flatten a processed tree into a `{generated path: metadata}` dict.

    Keys are POSIX paths built from the generated `.name` of each item, nested
    under `prefix`. Each value carries the item's original `path`, `relevant`,
    `usage` and `token_counts`, plus a `"type"` of `"directory"` or `"page"`.
    `doc` itself gets no entry; only its descendants do.

    Within one directory the entries appear as: each sub-directory immediately
    followed by everything below it, then the directory's own pages.
    """
    exported_fields = ["path", "relevant", "usage", "token_counts"]
    base = prefix / doc.name

    logger.debug(f"Added {base.as_posix()} -> {doc.name} ({doc.path.as_posix()})")

    entries = {}
    for child_dir in doc.subdirs:
        dir_key = (base/child_dir.name).as_posix()
        dir_meta = child_dir.model_dump(mode="json", include=exported_fields)
        entries[dir_key] = dir_meta | {"type": "directory" }
        entries.update(file_map(child_dir, base))

    for page in doc.pages:
        page_meta = page.model_dump(mode="json", include=exported_fields)
        entries[(base / page.name).as_posix()] = page_meta | {"type": "page"}
        logger.debug(f"Added page {(base / page.name).as_posix()} -> {page.name} ({page.path.as_posix()})")
    return entries


def build_metadata(source: Source, doc: DocDirectory):
    """Assemble the dict that is stored as `index.json` next to the saved docs.

    It contains the file map of `doc`, the kind and details of `source`, a UTC
    timestamp, the model name taken from the module-level `settings`, and the
    git commit of the repository enclosing the working directory (suffixed
    with "-dirty" when that repository has uncommitted or untracked changes).

    Raises:
        TypeError: if `source` is neither a `GitSource` nor a `WebSource`.
    """
    # GitSource is tested first, WebSource second; anything else is rejected.
    for source_cls, label in ((GitSource, "git"), (WebSource, "web")):
        if isinstance(source, source_cls):
            source_type = label
            break
    else:
        raise TypeError(f"Unknown source type: {type(source)}")

    # Revision of the repository found by searching upwards from the working directory.
    repo = git.Repo(search_parent_directories=True)
    head_sha = repo.head.commit.hexsha
    commit = head_sha + "-dirty" if repo.is_dirty(untracked_files=True) else head_sha

    metadata = {
        "map": file_map(doc, Path("")),
        "source_type": source_type,
        "source": source.model_dump(mode="json"),
        "date": datetime.now(timezone.utc).isoformat(),
        "model": settings.model,
        "commit": commit,
    }
    return metadata


In [29]:
# A made-up source description, just to exercise the metadata builder.
fake_source = GitSource(
    commit="123456",
    doc_dir="path/to/nowhere",
    name="svelte",
    repo="https://github.com/sveltejs/svelte"
)
processed_tree.name = "" # The top-level directory does not need a name.
# Write the metadata as pretty-printed JSON; the cell result is the number of characters written.
Path("test-out/index.json").write_text(json.dumps(build_metadata(fake_source, processed_tree), indent=2))

+6.559s [36mDEBUG[0m [34m<ipykernel>:9[0m file_map Added . ->  (.)[0m
+0.001s [36mDEBUG[0m [34m<ipykernel>:9[0m file_map Added introduction -> introduction (01-introduction)[0m
+0.000s [36mDEBUG[0m [34m<ipykernel>:17[0m file_map Added page introduction/overview -> overview (01-introduction/01-overview.md)[0m
+0.000s [36mDEBUG[0m [34m<ipykernel>:17[0m file_map Added page introduction/getting-started -> getting-started (01-introduction/02-getting-started.md)[0m
+0.001s [36mDEBUG[0m [34m<ipykernel>:17[0m file_map Added page introduction/svelte-files -> svelte-files (01-introduction/03-svelte-files.md)[0m
+0.000s [36mDEBUG[0m [34m<ipykernel>:17[0m file_map Added page introduction/.svelte.js-and-.svelte.ts-files -> .svelte.js-and-.svelte.ts-files (01-introduction/04-svelte-js-files.md)[0m
+0.000s [36mDEBUG[0m [34m<ipykernel>:17[0m file_map Added page introduction/introduction -> introduction (01-introduction/index.md)[0m
+0.000s [36mDEBUG[0m [34m<ipyker

5562

In [30]:
# Peek at the beginning and the end of the generated index.json.
!cat test-out/index.json | head -n 20
!echo "[  .....  ]"
!cat test-out/index.json | tail -n 10

{
  "map": {
    "introduction": {
      "path": "01-introduction",
      "relevant": false,
      "usage": {
        "input": 1419,
        "output": 458,
        "details": null
      },
      "token_counts": {
        "fulltext": 329,
        "digest": 246,
        "short_digest": 110
      },
      "type": "directory"
    },
    "introduction/overview": {
      "path": "01-introduction/01-overview.md",
      "relevant": false,
[  .....  ]
  "source": {
    "name": "svelte",
    "doc_dir": "path/to/nowhere",
    "repo": "https://github.com/sveltejs/svelte",
    "commit": "123456"
  },
  "date": "2025-11-05T06:14:05.369769+00:00",
  "model": "claude-haiku-4.5",
  "commit": "f5802e3f465dd546722723e1a413294fd81f9d26-dirty"
}

In [31]:
#| export
def save_processed_documents(source: Source, path: Path, tree: DocDirectory):
    """Write a processed tree and its `index.json` metadata to `path`.

    Works on a deep copy whose name is blanked, so the caller's `tree` is left
    unchanged.
    """
    root = tree.model_copy(deep=True)
    root.name = "" # The top-level directory does not need a name.
    save_doc_files(path, root)
    index_json = json.dumps(build_metadata(source, root), indent=2)
    (path/"index.json").write_text(index_json)

In [32]:
# End-to-end check: save the documents and the index in one call.
save_processed_documents(fake_source, Path("test-out"), processed_tree)

+0.884s [36mDEBUG[0m [34m<ipykernel>:9[0m file_map Added . ->  (.)[0m
+0.001s [36mDEBUG[0m [34m<ipykernel>:9[0m file_map Added introduction -> introduction (01-introduction)[0m
+0.000s [36mDEBUG[0m [34m<ipykernel>:17[0m file_map Added page introduction/overview -> overview (01-introduction/01-overview.md)[0m
+0.000s [36mDEBUG[0m [34m<ipykernel>:17[0m file_map Added page introduction/getting-started -> getting-started (01-introduction/02-getting-started.md)[0m
+0.000s [36mDEBUG[0m [34m<ipykernel>:17[0m file_map Added page introduction/svelte-files -> svelte-files (01-introduction/03-svelte-files.md)[0m
+0.000s [36mDEBUG[0m [34m<ipykernel>:17[0m file_map Added page introduction/.svelte.js-and-.svelte.ts-files -> .svelte.js-and-.svelte.ts-files (01-introduction/04-svelte-js-files.md)[0m
+0.000s [36mDEBUG[0m [34m<ipykernel>:17[0m file_map Added page introduction/introduction -> introduction (01-introduction/index.md)[0m
+0.000s [36mDEBUG[0m [34m<ipyker