# Document processing


In [None]:
#| default_exp docs

In [None]:
#| hide
import nbdev

nbdev.nbdev_export()

In [None]:
#| exporti

from pathlib import Path
import logging

from pydantic import Field, BaseModel, field_validator
from jinja2 import Environment, FileSystemLoader

import llm
from llm.models import Usage
import anthropic
import langsmith as ls

from lovely_docs.settings import Source, WebSource, GitSource, LLMTxtSource, Settings, settings
import asyncio

from tenacity import AsyncRetrying, stop_after_attempt, wait_exponential


In [None]:
#| export
# Module-level logger shared by every function in this module.
logger = logging.getLogger(__name__)
# The logger's own threshold is lowered to DEBUG so the debug messages emitted
# below are not filtered out at this level.
logger.setLevel(logging.DEBUG)

In [None]:
#| export

# Token counts for the three text variants stored on a DocItem.
# All counts default to 0, i.e. "not counted yet".
class TokenCounts(BaseModel):
    fulltext: int = 0  # tokens in DocItem.fulltext
    digest: int = 0  # tokens in DocItem.digest
    short_digest: int = 0  # tokens in DocItem.short_digest

# One node of the documentation tree.
# A node with an empty `children` list is a page; a node with children is a directory.
class DocItem(BaseModel):
    origPath: Path  # source path of the file/directory this node was built from
    name: str  # identifier; must not contain '/' or spaces (enforced by validate_name)

    @field_validator('name')
    @classmethod
    def validate_name(cls, v):
        # Reject any name that contains a path separator or a space.
        if '/' in v or ' ' in v:
            raise ValueError(f"name must not contain '/' or spaces, got: {v!r}")
        return v

    displayName: str  # human-readable name (file name initially, LLM-proposed name after processing)
    digest: str = ""  # long LLM-written summary (markdown)
    short_digest: str = ""  # short LLM-written summary (markdown)
    essence: str = ""  # brief plain-text summary from the LLM reply
    relevant: bool = True  # relevance flag; set from the LLM reply, or False for blank pages
    usage: Usage = Field(default_factory=lambda: Usage(0, 0))  # LLM usage for this node; starts at Usage(0, 0)
    token_counts: TokenCounts = Field(default_factory=TokenCounts)  # token counts of fulltext/digest/short_digest
    fulltext: str = ""  # page text for pages; a rendered template of child digests for processed directories
    children: list['DocItem'] = Field(default_factory=list)  # sub-items; empty for pages

DocItem.model_rebuild()  # Resolve forward refs.

#### Let's load all markdown files from a directory


In [None]:
#| export

from pathlib import Path
from typing import Iterable

def build_markdown_doc_tree(
    root: Path,
    path: Path = Path(),
    include: list | None = None,  # Relative prefixes, don't start with '/'
    exclude: list | None = None,  # Glob patterns
    extensions: list[str] | None = None,  # File suffixes to load, defaults to [".md"]
) -> DocItem | None:
    """Recursively build a documentation tree from markdown files.

    Args:
        root: Root directory containing the documentation
        path: Relative path from root to process (default: root itself)
        include: List of paths to include (string prefixes of the path relative to
            `root`). If None or empty, everything is included.
        exclude: List of glob patterns to exclude, matched with `Path.match`
            against the path relative to `root`.
        extensions: List of file extensions to consider. None means `[".md"]`.

    Returns:
        DocItem containing pages and subdirectories in children, if any.
        A directory that ends up with exactly one child is folded: that child
        is returned in place of the directory.
        None if there are no non-empty pages or subdirectories.
    """
    assert root.exists() and root.is_dir()
    assert (root / path).exists() and (root / path).is_dir()

    # Avoid a shared mutable default argument; None stands for the default list.
    if extensions is None:
        extensions = [".md"]

    include_paths: set[str] = set(include or [])
    exclude_patterns: set[str] = set(exclude or [])

    def _is_included(rel: Path) -> bool:
        rel_str = rel.as_posix()
        if not include_paths:
            return True
        # First clause: `rel` lies at or below an included prefix.
        # Second clause: `rel` is a prefix of an included path, so that the
        # directories leading down to an included entry are still traversed.
        return any(rel_str.startswith(inc) or inc.startswith(rel_str) for inc in include_paths)

    def _is_excluded(rel: Path) -> bool:
        return any(rel.match(exc) for exc in exclude_patterns)

    def _safe_name(raw: str) -> str:
        # DocItem rejects names containing spaces; use the same " " -> "_"
        # substitution that is applied to LLM-generated names.
        return raw.replace(" ", "_")

    children: list[DocItem] = []

    # Get immediate children only; sorted for a deterministic order.
    for item in sorted((root / path).iterdir()):
        rel_path = item.relative_to(root)
        if not _is_included(rel_path) or _is_excluded(rel_path):
            continue

        if item.is_file():
            if item.suffix not in extensions:
                continue
            # Decode explicitly as UTF-8 instead of relying on the locale default.
            fulltext = item.read_text(encoding="utf-8")
            if fulltext:  # Empty files are skipped.
                children.append(
                    DocItem(
                        origPath=rel_path,
                        name=_safe_name(rel_path.name),
                        displayName=rel_path.name,
                        fulltext=fulltext,
                    )
                )
        elif item.is_dir():
            subtree = build_markdown_doc_tree(root, rel_path, include, exclude, extensions)
            if subtree:
                children.append(subtree)

    # Special case - directories with 1 child get folded.
    if len(children) == 1:
        return children[0]

    if children:
        return DocItem(
            origPath=path,
            name=_safe_name(path.name),
            displayName=path.name,
            children=children,
        )

    return None

In [None]:
# pprint.pprint(tree.model_dump(exclude=["children"]))
tree = build_markdown_doc_tree(Path("test_data"))

def display_tree(root: DocItem):
    """Print one line per node, depth-first: path, name and a 50-character text preview."""
    # Preview: the first 50 characters of the text with newlines removed.
    preview = root.fulltext[:50].replace("\n", "")
    print(f"{root.origPath!s} -> {root.name}: {preview}...")

    # A node without children simply has nothing to recurse into.
    for node in root.children or []:
        display_tree(node)

display_tree(tree)

. -> : ...
01-introduction -> 01-introduction: ...
01-introduction/01-overview.md -> 01-overview.md: ---title: Overview---Svelte is a framework for...
01-introduction/02-getting-started.md -> 02-getting-started.md: ---title: Getting started---We recommend using...
01-introduction/03-svelte-files.md -> 03-svelte-files.md: ---title: .svelte files---Components are the b...
01-introduction/04-svelte-js-files.md -> 04-svelte-js-files.md: ---title: .svelte.js and .svelte.ts files---Be...
01-introduction/index.md -> index.md: ---title: Introduction---...
02-runes -> 02-runes: ...
02-runes/01-what-are-runes.md -> 01-what-are-runes.md: ---title: What are runes?---> [!NOTE] **rune**...
02-runes/02-$state.md -> 02-$state.md: ---title: $state---The `$state` rune allows yo...
02-runes/03-$derived.md -> 03-$derived.md: ---title: $derived---Derived state is declared...
02-runes/04-$effect.md -> 04-$effect.md: ---title: $effect---Effects are functions that...
02-runes/05-$props.md -> 05-$props.md: ---

In [None]:
small_tree = build_markdown_doc_tree(Path("test_data"), include=["02-runes"])
display_tree(small_tree)

02-runes -> 02-runes: ...
02-runes/01-what-are-runes.md -> 01-what-are-runes.md: ---title: What are runes?---> [!NOTE] **rune**...
02-runes/02-$state.md -> 02-$state.md: ---title: $state---The `$state` rune allows yo...
02-runes/03-$derived.md -> 03-$derived.md: ---title: $derived---Derived state is declared...
02-runes/04-$effect.md -> 04-$effect.md: ---title: $effect---Effects are functions that...
02-runes/05-$props.md -> 05-$props.md: ---title: $props---The inputs to a component a...
02-runes/06-$bindable.md -> 06-$bindable.md: ---title: $bindable---Ordinarily, props go one...
02-runes/07-$inspect.md -> 07-$inspect.md: ---title: $inspect---> [!NOTE] `$inspect` only...
02-runes/08-$host.md -> 08-$host.md: ---title: $host---When compiling a component a...
02-runes/index.md -> index.md: ---title: Runes---...


In [None]:
small_tree = build_markdown_doc_tree(Path("test_data"), include=["02-runes/04-$effect.md"])
display_tree(small_tree)

02-runes/04-$effect.md -> 04-$effect.md: ---title: $effect---Effects are functions that...


#### Let's process one page


In [None]:
#| export

# Structured reply requested from the LLM for a single page; passed as `schema=`
# to `model.prompt` in llm_process_page and used to parse the JSON answer.
class PageReplySchema(BaseModel):
    better_name: str = Field(description="")  # proposed page name; a trailing ".md" is stripped after parsing
    digest: str = Field(title="Digest, format: markdown", )  # long summary
    short_digest: str = Field(title="Short digest, format:markdown")  # short summary
    essence: str = Field(title="Essence, format:txt")  # plain-text summary
    relevant: bool  # copied to DocItem.relevant

async def anthropic_count_tokens(client: anthropic.AsyncAnthropic, model: str, text: str):
    """Return the input-token count reported for `text` sent as one user message to `model`."""
    user_message = {"role": "user", "content": text}
    response = await client.messages.count_tokens(model=model, messages=[user_message])
    return response.input_tokens

async def llm_process_page(
    settings: Settings, page: DocItem, libname: str, extra_prompt: str | None = None
) -> DocItem:
    """Digest one leaf page with the configured LLM and return a new `DocItem`.

    The page text is rendered into the `process_page.j2` prompt and sent to the
    model with `PageReplySchema` as the reply schema. The parsed reply is then
    combined with token counts for the full text and both digests.

    Args:
        settings: Supplies `model`, `api_key` and `templates_dir`.
        page: Leaf node to process; it must not have children.
        libname: Library name handed to the prompt template.
        extra_prompt: Optional extra text handed to the template as `extra`.

    Returns:
        A new `DocItem` built from the reply. A page whose text is blank is not
        sent to the model; a copy with `relevant=False` is returned instead.
    """
    count_model = "claude-haiku-4-5"  # model id used for token counting only

    def _new_retry() -> AsyncRetrying:
        # Up to 5 attempts with exponential back-off between 3 and 60 seconds.
        return AsyncRetrying(
            stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=3, max=60)
        )

    with ls.trace(
        name=f"Process page: {page.origPath}",
        run_type="chain",
        inputs={"input": page.fulltext},
    ) as trace:
        logger.debug(f"Processing {page.origPath}")
        assert not page.children, "A page should be a leaf node, no children allowed"

        # Blank pages never reach the model.
        if page.fulltext.strip() == "":
            logger.debug(f"Got an empry page {page.origPath}")
            return page.model_copy(update={"relevant": False})

        model = llm.get_async_model(settings.model)
        model.key = settings.api_key

        # Token counting goes through the Anthropic client directly.
        anthropic_client = anthropic.AsyncAnthropic(api_key=settings.api_key)

        jinja_env = Environment(loader=FileSystemLoader(settings.templates_dir))
        template = jinja_env.get_template("process_page.j2")
        template_vars = dict(
            text=page.fulltext,
            filename=str(page.origPath),
            path=str(page.origPath.parent) + "/",
            libname=libname,
            extra=extra_prompt,
        )
        with ls.trace("Template", "prompt", inputs=template_vars) as template_trace:
            prompt = template.render(**template_vars)
            template_trace.end(outputs=prompt)

        with ls.trace("LLM call", "llm", inputs={"prompt": prompt}) as llm_trace:
            async for attempt in _new_retry():
                with attempt:
                    try:
                        response = await model.prompt(
                            prompt=prompt,
                            schema=PageReplySchema,
                            max_tokens=32768,
                            temperature=0,
                        )
                        llm_trace.end(outputs=await response.text())
                    except Exception as exc:
                        # Log the failed attempt, then let the retry loop decide.
                        logger.warning(
                            f"{page.origPath}: retry {attempt.retry_state.attempt_number}: {str(exc)}"
                        )
                        raise

        with ls.trace("Parse", "parser", inputs={"input": await response.text()}) as parse_trace:
            reply = PageReplySchema.model_validate_json(await response.text())
            # The model sometimes keeps the file extension in the name; drop it.
            reply.better_name = reply.better_name.removesuffix('.md')
            parse_trace.end(outputs=reply)
            usage = await response.usage()

        # Count tokens for the three texts concurrently.
        texts_to_count = (page.fulltext, reply.digest, reply.short_digest)
        async for attempt in _new_retry():
            with attempt:
                try:
                    counts = await asyncio.gather(
                        *(
                            anthropic_count_tokens(anthropic_client, count_model, text)
                            for text in texts_to_count
                        )
                    )
                    n_fulltext, n_digest, n_short = counts
                    token_counts = TokenCounts(
                        fulltext=n_fulltext, digest=n_digest, short_digest=n_short
                    )
                except Exception as exc:
                    logger.warning(
                        f"{page.origPath}: retry token count {attempt.retry_state.attempt_number}: {str(exc)}"
                    )
                    raise

        # `name` must not contain spaces or '/', so both are mapped to '_'.
        safe_name = reply.better_name.lower().translate(str.maketrans(" /", "__"))
        result = DocItem(
            origPath=page.origPath,
            name=safe_name,
            displayName=reply.better_name,
            fulltext=page.fulltext,
            digest=reply.digest,
            short_digest=reply.short_digest,
            essence=reply.essence,
            relevant=reply.relevant,
            usage=usage,
            token_counts=token_counts,
        )

        # The trace output omits the full text.
        trace.end(outputs=result.model_copy(update={"fulltext": ""}))
        return result

In [None]:
from lovely_docs.settings import settings
from IPython.display import Markdown, display

In [None]:
tree.children[1].children[2]

DocItem(origPath=Path('02-runes/03-$derived.md'), name='03-$derived.md', displayName='03-$derived.md', digest='', short_digest='', essence='', relevant=True, usage=Usage(input=0, output=0, details=None), token_counts=TokenCounts(fulltext=0, digest=0, short_digest=0), fulltext="---\ntitle: $derived\n---\n\nDerived state is declared with the `$derived` rune:\n\n```svelte\n<script>\n\tlet count = $state(0);\n\tlet doubled = $derived(count * 2);\n</script>\n\n<button onclick={() => count++}>\n\t{doubled}\n</button>\n\n<p>{count} doubled is {doubled}</p>\n```\n\nThe expression inside `$derived(...)` should be free of side-effects. Svelte will disallow state changes (e.g. `count++`) inside derived expressions.\n\nAs with `$state`, you can mark class fields as `$derived`.\n\n> [!NOTE] Code in Svelte components is only executed once at creation. Without the `$derived` rune, `doubled` would maintain its original value even when `count` changes.\n\n## `$derived.by`\n\nSometimes you need to creat

In [None]:
res = await llm_process_page(
    settings=settings,
    page=tree.children[1].children[2],
    libname="svelte")

+0.222s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/03-$derived.md[0m


In [None]:
display(Markdown(f"`{res.origPath} -> {res.displayName}:`\n\n{res.essence}"))
display(Markdown("# **==== Short digest ====**"))
display(Markdown(res.short_digest))
display(Markdown("# **==== Full digest ====**"))
display(Markdown(res.digest))

`02-runes/03-$derived.md -> $derived:`

$derived creates reactive computed values from state dependencies; $derived.by for complex logic; supports temporary overrides; uses push-pull reactivity with referential equality optimization.

# **==== Short digest ====**

Declares reactive derived state that auto-updates when dependencies change. Use `$derived(expr)` for simple expressions or `$derived.by(() => {...})` for complex logic. Can be temporarily overridden for optimistic UI. Uses push-pull reactivity—dependents notified immediately but re-evaluated only when read. Skips downstream updates if new value is referentially identical to previous.

# **==== Full digest ====**

## $derived

Declares derived state that automatically updates when dependencies change. The expression must be side-effect free.

```svelte
let count = $state(0);
let doubled = $derived(count * 2);
```

Can be used on class fields. Without `$derived`, values don't reactively update when dependencies change.

### $derived.by

For complex derivations, use `$derived.by` with a function body:

```svelte
let numbers = $state([1, 2, 3]);
let total = $derived.by(() => {
	let total = 0;
	for (const n of numbers) total += n;
	return total;
});
```

`$derived(expr)` is equivalent to `$derived.by(() => expr)`.

### Dependencies

Anything read synchronously inside the expression is a dependency. When dependencies change, the derived is marked dirty and recalculated on next read. Use `untrack` to exempt state from being a dependency.

### Overriding derived values

Can temporarily reassign derived values (unless declared with `const`) for optimistic UI:

```svelte
let { post, like } = $props();
let likes = $derived(post.likes);

async function onclick() {
	likes += 1;  // immediate feedback
	try {
		await like();
	} catch {
		likes -= 1;  // rollback
	}
}
```

Prior to Svelte 5.25, deriveds were read-only.

### Reactivity behavior

Unlike `$state`, `$derived` values are not converted to deeply reactive proxies. However, if a derived returns an object/array from a deeply reactive source, mutations to that object still affect the source:

```svelte
let items = $state([...]);
let index = $state(0);
let selected = $derived(items[index]);
// mutating selected affects items
```

### Update propagation

Uses push-pull reactivity: state changes immediately notify dependents (push), but derived values only re-evaluate when read (pull). If a derived's new value is referentially identical to the previous value, downstream updates are skipped:

```svelte
let count = $state(0);
let large = $derived(count > 10);
// button only updates when large changes, not when count changes
```

#### Looks decent, let's process all pages in a sub-directory.


In [None]:
import asyncio

pages = []
tasks = []
# Schedule one task per page of the second top-level entry so they run concurrently.
for page in tree.children[1].children:
    tasks.append(asyncio.create_task(llm_process_page(settings, page, "svelte")))

# Wait for all pages to finish.
pages: list[DocItem] = await asyncio.gather(*tasks)

# Keep the processed pages ordered by their source path.
pages.sort(key=lambda x: x.origPath)

# Update the children with processed pages
tree.children[1].children = pages

# and some more
tree.children[2] = await llm_process_page(settings, tree.children[2], "svelte")
tree.children[3] = await llm_process_page(settings, tree.children[3], "svelte")

+8.812s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/01-what-are-runes.md[0m
+0.017s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/02-$state.md[0m
+0.016s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/03-$derived.md[0m
+0.016s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/04-$effect.md[0m
+0.015s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/05-$props.md[0m
+0.016s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/06-$bindable.md[0m
+0.016s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/07-$inspect.md[0m
+0.016s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/08-$host.md[0m
+0.016s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/index.md[0m
+25.081s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 04-t

In [None]:
tree.children[2].model_dump()

{'origPath': Path('04-test-compact2/ttt/ttt.md'),
 'name': 'test-compact2-ttt',
 'displayName': 'test-compact2-ttt',
 'digest': 'This is a test file and not relevant documentation.',
 'short_digest': 'Test file, not relevant.',
 'essence': 'Test file placeholder with no actual content.',
 'relevant': False,
 'usage': {'input': 1269, 'output': 129, 'details': None},
 'token_counts': {'fulltext': 21, 'digest': 17, 'short_digest': 13},
 'fulltext': "This is a test file, it's not relevant, ignore it.",
 'children': []}

In [None]:
tree.children[3].model_dump()

{'origPath': Path('05-test-compact3/asd/test.md'),
 'name': 'effect',
 'displayName': 'effect',
 'digest': '## $effect\n\nEffects are functions that run when state updates. They only run in the browser, not during server-side rendering. Generally avoid updating state inside effects as it leads to convoluted code and infinite loops.\n\n### Basic Usage\n\n```svelte\n<script>\n    let size = $state(50);\n    let color = $state(\'#ff3e00\');\n    let canvas;\n\n    $effect(() => {\n        const context = canvas.getContext(\'2d\');\n        context.clearRect(0, 0, canvas.width, canvas.height);\n        context.fillStyle = color;\n        context.fillRect(0, 0, size, size);\n    });\n</script>\n\n<canvas bind:this={canvas} width="100" height="100"></canvas>\n```\n\nSvelte tracks which state is accessed in the effect and re-runs it when that state changes.\n\n### Lifecycle\n\nEffects run after component mount and in a microtask after state changes. Re-runs are batched. Effects can return a t

In [None]:
page = tree.children[1].children[3]
display(Markdown(f"`{page.origPath} -> {page.displayName}:`\n\n{page.essence}"))
display(Markdown("# **==== Short digest ====**"))
display(Markdown(page.short_digest))
# display(Markdown("# **==== Full digest ====**"))
# display(Markdown(page.digest))

`02-runes/04-$effect.md -> $effect:`

$effect rune runs side effects when state updates, automatically tracking synchronously-read reactive values; supports teardown functions, conditional dependency tracking, and variants ($effect.pre, $effect.tracking, $effect.root); avoid for state synchronization—use $derived instead.

# **==== Short digest ====**

## $effect
Runs side effects when state updates. Automatically tracks synchronously-read reactive values and reruns when they change. Runs after DOM updates in microtasks, with reruns batched.

```svelte
<script>
	let size = $state(50), color = $state('#ff3e00'), canvas;

	$effect(() => {
		const ctx = canvas.getContext('2d');
		ctx.clearRect(0, 0, canvas.width, canvas.height);
		ctx.fillStyle = color;
		ctx.fillRect(0, 0, size, size);
	});
</script>
<canvas bind:this={canvas} width="100" height="100"></canvas>
```

Can return teardown function (runs before rerun or on destroy). Only tracks **synchronous** reads; async reads (setTimeout, await) aren't tracked. Only reruns when object itself changes, not properties inside it.

Conditional code affects dependencies—only values read in last run are tracked:
```ts
$effect(() => {
	if (condition) confetti({ colors: [color] }); // color is dependency
	else confetti(); // color NOT a dependency
});
```

**Variants:**
- `$effect.pre()` - runs before DOM updates
- `$effect.tracking()` - returns true if in tracking context
- `$effect.root()` - non-tracked scope with manual cleanup

**Don't use for state sync.** Use `$derived` instead of `$effect(() => { derived = count * 2 })`. Use function bindings instead of effects to link values. If updating state in effect causes loops, use `untrack`.

In [None]:
display_tree(tree)

. -> : ...
01-introduction -> 01-introduction: ...
01-introduction/01-overview.md -> 01-overview.md: ---title: Overview---Svelte is a framework for...
01-introduction/02-getting-started.md -> 02-getting-started.md: ---title: Getting started---We recommend using...
01-introduction/03-svelte-files.md -> 03-svelte-files.md: ---title: .svelte files---Components are the b...
01-introduction/04-svelte-js-files.md -> 04-svelte-js-files.md: ---title: .svelte.js and .svelte.ts files---Be...
01-introduction/index.md -> index.md: ---title: Introduction---...
02-runes -> 02-runes: ...
02-runes/01-what-are-runes.md -> what-are-runes: ---title: What are runes?---> [!NOTE] **rune**...
02-runes/02-$state.md -> $state: ---title: $state---The `$state` rune allows yo...
02-runes/03-$derived.md -> $derived: ---title: $derived---Derived state is declared...
02-runes/04-$effect.md -> $effect: ---title: $effect---Effects are functions that...
02-runes/05-$props.md -> $props: ---title: $props---The inputs to 

#### Let's process a directory. The input is the digests of all its pages (plus the digests of its sub-directories)


In [None]:
#| export

# Structured reply requested from the LLM for a directory; passed as `schema=`
# to `model.prompt` in llm_process_directory and used to parse the JSON answer.
class DirReplySchema(BaseModel):
    better_name: str  # proposed directory name; a trailing ".md" is stripped after parsing
    digest: str = Field(title="Directory digest, fmt:markdown")  # long summary
    short_digest: str = Field(title="Short digest, format:markdown")  # short summary
    essence: str  # copied to DocItem.essence
    relevant: bool  # copied to DocItem.relevant

async def llm_process_directory(
    settings: Settings, directory: DocItem, libname: str, extra: str | None = None
) -> DocItem:
    """Summarise a directory node from its relevant pages and sub-directories.

    Args:
        settings: Supplies `model`, `api_key` and `templates_dir`.
        directory: Node with children; at least one child must be relevant.
        libname: Library name handed to the prompt template.
        extra: Optional extra text handed to the templates as `extra`.

    Returns:
        A deep copy of `directory` carrying the LLM reply, a generated full text
        (rendered from `directory_fulltext.j2`), token counts and usage. When
        exactly one child is relevant, a deep copy of that child is returned
        instead and no LLM call is made.
    """
    count_model = "claude-haiku-4-5"  # model id used for token counting only

    def _new_retry() -> AsyncRetrying:
        # Up to 5 attempts with exponential back-off between 3 and 60 seconds.
        return AsyncRetrying(
            stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=3, max=60)
        )

    with ls.trace(name=f"Process directory: {directory.origPath}", run_type="chain") as trace:
        logger.debug(f"Processing {directory.origPath}")

        assert directory.children, "Expected a directory, got a single page"
        # NOTE(review): this only re-checks that there is at least one child; the
        # message suggests `> 1` may have been intended - confirm.
        assert len(directory.children), "1-child directories are supposed to be folded as pages"
        # If the directory did not have any relevant pages / subdirs, we should not be called.
        assert any(
            child.relevant for child in directory.children
        ), "Expected relevant children, got none"

        # Split the relevant children into leaf pages and sub-directories.
        pages = [c for c in directory.children if c.relevant and not c.children]
        subdirs = [c for c in directory.children if c.relevant and c.children]

        # Special case - a single relevant child replaces the directory.
        relevant_children = pages + subdirs
        if len(relevant_children) == 1:
            return relevant_children[0].model_copy(deep=True)

        model = llm.get_async_model(settings.model)
        model.key = settings.api_key

        # Token counting goes through the Anthropic client directly.
        anthropic_client = anthropic.AsyncAnthropic(api_key=settings.api_key)

        jinja_env = Environment(loader=FileSystemLoader(settings.templates_dir))
        template = jinja_env.get_template("process_directory.j2")

        template_vars = dict(
            dirname=directory.origPath.name + "/",
            path=directory.origPath.parent.name + "/",
            pages=pages,
            subdirs=subdirs,
            libname=libname,
            extra=extra,
        )
        with ls.trace("Template", "prompt", inputs=template_vars) as template_trace:
            prompt = template.render(**template_vars)
            template_trace.end(outputs=prompt)

        with ls.trace("LLM call", "llm", inputs={"prompt": prompt}) as llm_trace:
            async for attempt in _new_retry():
                with attempt:
                    try:
                        response = await model.prompt(
                            prompt=prompt,
                            schema=DirReplySchema,
                            max_tokens=32768,
                            temperature=0,
                        )
                        llm_trace.end(outputs=await response.text())
                        usage = await response.usage()
                    except Exception as exc:
                        # Log the failed attempt, then let the retry loop decide.
                        logger.warning(
                            f"{directory.origPath}: retry {attempt.retry_state.attempt_number}: {str(exc)}"
                        )
                        raise

        with ls.trace("Parse", "parser", inputs={"input": await response.text()}) as parse_trace:
            reply = DirReplySchema.model_validate_json(await response.text())
            reply.better_name = reply.better_name.removesuffix('.md')

            parse_trace.end(outputs=reply)

        # The directory's stored full text is generated from the digests of the
        # pages and sub-directories it contains.
        fulltext = jinja_env.get_template("directory_fulltext.j2").render(**template_vars)

        # Count tokens for the three texts concurrently.
        texts_to_count = (fulltext, reply.digest, reply.short_digest)
        async for attempt in _new_retry():
            with attempt:
                try:
                    counts = await asyncio.gather(
                        *(
                            anthropic_count_tokens(anthropic_client, count_model, text)
                            for text in texts_to_count
                        )
                    )
                    n_fulltext, n_digest, n_short = counts
                    token_counts = TokenCounts(
                        fulltext=n_fulltext, digest=n_digest, short_digest=n_short
                    )
                except Exception as exc:
                    logger.warning(
                        f"{directory.origPath}: retry token count {attempt.retry_state.attempt_number}: {str(exc)}"
                    )
                    raise

        # Work on a deep copy so the caller's node is left untouched.
        result = directory.model_copy(deep=True)
        result.displayName = reply.better_name
        # `name` must not contain spaces or '/', so both are mapped to '_'.
        result.name = reply.better_name.lower().translate(str.maketrans(" /", "__"))
        result.digest = reply.digest
        result.short_digest = reply.short_digest
        result.essence = reply.essence
        result.relevant = reply.relevant
        result.fulltext = fulltext
        result.token_counts = token_counts
        result.usage = usage

        # The trace output omits the full text.
        trace.end(outputs=result.model_copy(update={"fulltext": ""}))
        return result

In [None]:
# tree.children[1] = await llm_process_directory(settings, tree.children[1], "svelte")
# tree.children[3] = await llm_process_directory(settings, tree.children[3], "svelte")

In [None]:
subdir = tree.children[1]
display(Markdown(f"`{subdir.origPath} -> {subdir.displayName}:`\n\n{subdir.essence}"))
display(Markdown("**==== Short digest ====**"))
display(Markdown(subdir.short_digest))

`02-runes -> 02-runes:`



**==== Short digest ====**



#### Great! Now let's do the whole tree recursively.

When processing a directory:

- process all pages
- process all sub-directories
- Generate digests for the whole directory


In [None]:
#| export
async def process_tree_depth_first(
    settings: Settings,
    tree: DocItem,
    libname: str,
    extra_dir: str | None = None,
    extra_page: str | None = None
) -> DocItem:
    """
    Process documentation tree depth-first with parallel processing.

    Sub-directories are processed first (recursively, concurrently), then the
    pages directly under `tree` (concurrently), and finally `tree` itself is
    summarised from its processed children.

    Mutates `tree`: when at least one child is relevant, `tree.children` is
    replaced with the processed children.

    Args:
        settings: Passed through to the page/directory processors.
        tree: Root of the (sub)tree; either a single page or a directory.
        libname: Library name passed through to the prompts.
        extra_dir: Optional extra prompt text for directory processing.
        extra_page: Optional extra prompt text for page processing.

    Returns:
        The processed node. If `tree` has no children it is processed as a page.
        If none of its children is relevant, a node flagged `relevant=False` is
        returned without calling the directory processor.
    """

    with ls.trace(name=f"Process tree: {libname}/{tree.origPath}", run_type="chain") as trace:
        # The whole tree is just 1 page. This must test `tree.children`, not the
        # list of child pages: a directory holding only sub-directories has no
        # child pages but is still a directory.
        if not tree.children:
            result = await llm_process_page(
                page=tree, extra_prompt=extra_page, libname=libname, settings=settings
            )
            trace.end(outputs=result.model_copy(update={"fulltext": ""}))
            return result

        # First, recursively process all subdirectories in parallel
        subdirs = await asyncio.gather(
            *[
                process_tree_depth_first(settings, subdir, libname, extra_dir, extra_page)
                for subdir in tree.children
                if subdir.children
            ]
        )
        subdirs = sorted(subdirs, key=lambda s: s.origPath)

        # Then process all pages in this directory in parallel
        pages = await asyncio.gather(
            *[
                llm_process_page(settings, page, libname, extra_page)
                for page in tree.children
                if not page.children
            ]
        )
        pages = sorted(pages, key=lambda s: s.origPath)

        processed = subdirs + pages

        # Display names are llm-generated and might be not unique. Make them unique
        # among siblings by appending _2, _3, ...
        # NOTE(review): only `displayName` is de-duplicated; `.name` can still
        # collide between siblings - confirm whether it needs the same treatment.
        names: set[str] = set()
        for x in processed:
            name, i = x.displayName, 2
            while name in names:
                name = f"{x.displayName}_{str(i)}"
                i += 1
            x.displayName = name
            names.add(name)

        if not any(x.relevant for x in processed):
            # Nothing relevant below this node: skip the directory LLM call.
            # `name` is a required DocItem field, and the processed
            # sub-directories are kept alongside the pages.
            result = DocItem(
                origPath=tree.origPath,
                name=tree.name,
                displayName=tree.displayName,
                children=processed,
                relevant=False
            )
            trace.end(outputs=result)
            return result

        # Update children with processed items
        tree.children = processed
        result = await llm_process_directory(settings, tree, libname, extra_dir)
        trace.end(outputs=result.model_copy(update={"fulltext": ""}))
        return result

In [None]:
# Clean tree
tree = build_markdown_doc_tree(Path("test_data"))
processed_tree = await process_tree_depth_first(settings, tree, "svelte")

+15.901s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 01-introduction/01-overview.md[0m
+0.015s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 01-introduction/02-getting-started.md[0m
+0.015s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 01-introduction/03-svelte-files.md[0m
+0.016s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 01-introduction/04-svelte-js-files.md[0m
+0.017s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 01-introduction/index.md[0m
+0.017s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/01-what-are-runes.md[0m
+0.016s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/02-$state.md[0m
+0.017s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/03-$derived.md[0m
+0.016s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/04-$effect.md[0m
+0.017s [36mDEBUG[0m

In [None]:
display(
    Markdown(
        f"`{processed_tree.origPath} -> {processed_tree.displayName}:`\n\n{processed_tree.essence}"
    )
)
display(Markdown("**==== Digest ====**"))
display(Markdown(processed_tree.digest))

`. -> Fundamentals:`

Project setup, component file structure, and rune-based reactivity system ($state, $derived, $effect, $props, $bindable, $inspect, $host)

**==== Digest ====**

## Project Setup

**SvelteKit** (recommended): `npx sv create myapp && npm run dev`

**Vite**: `npm create vite@latest` with svelte option, generates to `dist/`, requires separate routing.

**Other tools**: Rollup/Webpack plugins available but Vite preferred. Both support standalone SPA mode.

**Editor support**: VS Code extension by Svelte team, command-line checking via `sv check`.

## .svelte File Structure

Components use HTML superset with optional sections:

```svelte
<script module>
	// runs once at module evaluation
	let total = 0;
</script>

<script>
	// runs per component instance
	total += 1;
</script>

<!-- markup -->

<style>
	/* scoped CSS only */
	p { color: burlywood; }
</style>
```

**`<script>`**: JS/TS (add `lang="ts"`), runs on instance creation. Top-level variables accessible in markup. Use runes for props/reactivity.

**`<script module>`**: Runs once, not per instance. Variables accessible in component. Can export bindings (not `export default`). Legacy Svelte 4 used `context="module"`.

**`<style>`**: Component-scoped CSS only.

## .svelte.js/.svelte.ts Files

Module files supporting runes for reusable reactive logic and shared state. Function like standard modules with Svelte reactivity. Restriction: reassigned state cannot be exported across modules. Introduced in Svelte 5.

## Runes Overview

`$`-prefixed compiler keywords (not functions) controlling reactivity in `.svelte` and `.svelte.js`/`.svelte.ts` files. Cannot be imported, assigned, or passed as arguments.

### $state - Reactive State

```js
let count = $state(0);
let todos = $state([{ done: false, text: 'add todos' }]);
todos[0].done = !todos[0].done; // triggers updates
```

Objects/arrays become deeply reactive proxies. Destructuring breaks reactivity. Class instances not proxied; use `$state` in class fields.

`$state.raw`: non-mutating state, only reassignable (better performance for large objects):
```js
let person = $state.raw({ name: 'Heraclitus', age: 49 });
person = { name: 'Heraclitus', age: 50 }; // works
person.age += 1; // no effect
```

`$state.snapshot`: static snapshot of reactive proxy for external libraries.

Pass-by-value: passing `$state` values passes current values, not reactive references. Pass objects and mutate properties, or pass functions for reactive updates. State in `.svelte.js`/`.svelte.ts` cannot be exported if reassigned; export objects with mutable properties or functions accessing state instead.

### $derived - Computed Reactive Values

```js
let count = $state(0);
let doubled = $derived(count * 2);

let numbers = $state([1, 2, 3]);
let total = $derived.by(() => {
	let sum = 0;
	for (const n of numbers) sum += n;
	return sum;
});
```

Automatically updates when dependencies change. Expression must be side-effect free. Anything read synchronously is a dependency. Marked dirty on dependency change, recalculated on next read. Use `untrack` to exempt state from being a dependency.

Can be temporarily reassigned (unless `const`) for optimistic UI. Unlike `$state`, not deeply reactive proxies, but if returning object/array from reactive source, mutating properties affects underlying source.

Push-pull reactivity: state changes immediately notify dependents (push), derived values only re-evaluate when read (pull). If new value referentially identical to previous, downstream updates skipped.

### $effect - Side Effects

```js
let size = $state(50);
let color = $state('#ff3e00');
let canvas;

$effect(() => {
	const context = canvas.getContext('2d');
	context.clearRect(0, 0, canvas.width, canvas.height);
	context.fillStyle = color;
	context.fillRect(0, 0, size, size);
});
```

Runs when reactive state changes, automatically tracking synchronously-read values. Browser-only, not during SSR. Generally avoid updating state inside effects (infinite loops).

Run after component mount and in microtask after state changes. Reruns batched. Can return teardown function running before reruns, on destruction, or when parent effect reruns.

Automatically picks up reactive values read synchronously. Values read asynchronously (after await, setTimeout) NOT tracked. Only reruns when object itself changes, not properties inside. Dependencies conditional based on code paths in last run.

`$effect.pre`: runs before DOM updates.

`$effect.tracking()`: returns true if running in tracking context (effect or template), false otherwise.

`$effect.root`: creates non-tracked scope with manual cleanup for nested effects outside component initialization.

Don't use effects to synchronize state—use `$derived`. Don't use effects to link values—use `$derived` with function bindings.

### $props - Component Input

```js
let { adjective = 'happy', super: trouper = 'lights', ...others } = $props();
```

Supports fallback values, renaming (for keywords), rest capture. Props update reactively. Child can temporarily reassign but shouldn't mutate regular object props. Use callback props or `$bindable` for two-way communication.

Type annotations for IDE support:
```ts
let { adjective }: { adjective: string } = $props();
```

`$props.id()`: generates unique ID per instance (consistent during hydration), useful for element linking.

### $bindable - Bidirectional Props

```svelte
// Child
let { value = $bindable() } = $props();
<input bind:value={value} />

// Parent
let message = $state('hello');
<FancyInput bind:value={message} />
```

Marks prop as bindable for bidirectional data flow. Allows state proxies to be mutated in child. Parent doesn't require `bind:`—can pass normal prop. Fallback values supported.

### $inspect - Development Logging

```js
let count = $state(0);
$inspect(count).with((type, count) => {
	if (type === 'update') debugger;
});
```

Development-only, logs values on change, tracking reactive state deeply. Re-fires on nested object/array property updates. `$inspect(...).with()` accepts callback receiving `type` ("init" or "update"). Pass `console.trace` to find change origin.

`$inspect.trace()` (5.14+): traces surrounding function in development, printing which reactive state caused effect/derived to re-run. Must be first statement in function body.

### $host - Custom Element Host

```svelte
<svelte:options customElement="my-stepper" />

<script>
	function dispatch(type) {
		$host().dispatchEvent(new CustomEvent(type));
	}
</script>

<button onclick={() => dispatch('increment')}>increment</button>
```

Provides access to host element when compiling as custom element, allowing custom event dispatch.

#### Looks decent. Keep in mind, this is the digest for the whole first 2 sections of the docs.


#### How much did it cost us?


In [None]:
#| export

def calculate_total_usage(doc_dir: DocItem) -> Usage:
    """Sum the token usage recorded on `doc_dir` and on every node beneath it.

    Args:
        doc_dir: Root of the (sub)tree to total. Its own ``usage`` is counted
            when set, and each entry in ``children`` is totalled recursively.

    Returns:
        A new ``Usage`` whose ``input`` and ``output`` are the summed counts.
        Counts that are ``None`` are treated as 0.
    """
    # This node's own usage.
    own = doc_dir.usage
    input_tokens = (own.input or 0) if own else 0
    output_tokens = (own.output or 0) if own else 0

    # Fold in the recursive subtotal of every child.
    for subtotal in map(calculate_total_usage, doc_dir.children):
        input_tokens += subtotal.input or 0
        output_tokens += subtotal.output or 0

    return Usage(input=input_tokens, output=output_tokens)

def calculate_cost(usage: Usage, input_cost: float,
                   output_cost: float) -> tuple[float, float, float]:
    """Convert token usage into a cost using per-million-token rates.

    Args:
        usage: Token counts; a ``None`` count is treated as 0.
        input_cost: Price per 1,000,000 input tokens.
        output_cost: Price per 1,000,000 output tokens.

    Returns:
        ``(total, input_part, output_part)`` in the same unit as the rates.
    """
    tokens_per_rate_unit = 1_000_000
    input_millions = (usage.input or 0) / tokens_per_rate_unit
    output_millions = (usage.output or 0) / tokens_per_rate_unit

    input_spend = input_millions * input_cost
    output_spend = output_millions * output_cost
    return input_spend + output_spend, input_spend, output_spend

In [None]:
# Total the token usage over the whole processed tree and print it.
usage = calculate_total_usage(processed_tree)
print(
    "\nTotal Usage:",
    f"  Input tokens: {usage.input:,}",
    f"  Output tokens: {usage.output:,}",
    f"  Total tokens: {(usage.input + usage.output):,}",
    sep="\n",
)

# Rates used here: 1 per million input tokens, 5 per million output tokens.
cost, input_cost, output_cost = calculate_cost(usage, 1, 5)

print(
    "\nCost:",
    f"  Total: ${cost:.2f}",
    f"  Input: ${input_cost:.2f}",
    f"  Output: ${output_cost:.2f}",
    sep="\n",
)


Total Usage:
  Input tokens: 57,178
  Output tokens: 15,873
  Total tokens: 73,051

Cost:
  Total: $0.14
  Input: $0.06
  Output: $0.08


#### Let's save the results. First the markdown files.


In [None]:
#| export
import shutil

In [None]:
#| export

def save_doc_files(path: Path, doc: DocItem) -> None:
    """Save a DocItem structure to disk at the specified path.

    Anything already present at `path` is removed first, so the directory
    ends up containing only the files for `doc`. Four files are written into
    `path` (``digest.md``, ``short_digest.md``, ``essence.md`` and
    ``fulltext.md``), and every child is saved recursively into a
    subdirectory named after ``child.name``.

    Args:
        path: Directory path where the documentation will be saved
        doc: DocItem object containing the documentation structure to save
    """
    # Start from a clean directory so stale files from a previous run don't linger.
    if path.exists():
        shutil.rmtree(path)
    path.mkdir(parents=True, exist_ok=True)

    contents = {
        "digest.md": doc.digest,
        "short_digest.md": doc.short_digest,
        "essence.md": doc.essence,
        "fulltext.md": doc.fulltext,
    }
    for filename, text in contents.items():
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent non-ASCII characters in the text.
        (path / filename).write_text(text, encoding="utf-8")

    for child in doc.children:
        save_doc_files(path / child.name, child)


In [None]:
# Write the processed tree to ./test-out, then inspect what ended up on disk.
save_doc_files(Path("test-out"), processed_tree)
# Listing of the output directory itself.
!ls -l test-out
# Listing of the subdirectory written for the first child of the tree.
!ls -l "test-out/{processed_tree.children[0].name}"
# Listing for the first grandchild (under that child) that has no children of its own.
!ls -l "test-out/{processed_tree.children[0].name}/{[c for c in processed_tree.children[0].children if not c.children][0].name}"
# Render that same grandchild's digest.md as the cell output.
Markdown((Path("test-out")/processed_tree.children[0].name/[c for c in processed_tree.children[0].children if not c.children][0].name / "digest.md").read_text())

total 48
-rw-rw-r--  1 xl0 xl0  6519 Dec 10 11:55 digest.md
drwxrwxr-x  2 xl0 xl0  4096 Dec 10 11:55 effect
-rw-rw-r--  1 xl0 xl0   137 Dec 10 11:55 essence.md
-rw-rw-r--  1 xl0 xl0 14057 Dec 10 11:55 fulltext.md
drwxrwxr-x  7 xl0 xl0  4096 Dec 10 11:55 introduction
drwxrwxr-x 11 xl0 xl0  4096 Dec 10 11:55 runes
-rw-rw-r--  1 xl0 xl0   748 Dec 10 11:55 short_digest.md
drwxrwxr-x  2 xl0 xl0  4096 Dec 10 11:55 test-compact2-ttt
total 36
-rw-rw-r-- 1 xl0 xl0 1970 Dec 10 11:55 digest.md
-rw-rw-r-- 1 xl0 xl0  139 Dec 10 11:55 essence.md
-rw-rw-r-- 1 xl0 xl0 3218 Dec 10 11:55 fulltext.md
drwxrwxr-x 2 xl0 xl0 4096 Dec 10 11:55 getting-started
drwxrwxr-x 2 xl0 xl0 4096 Dec 10 11:55 introduction
drwxrwxr-x 2 xl0 xl0 4096 Dec 10 11:55 overview
drwxrwxr-x 2 xl0 xl0 4096 Dec 10 11:55 reactive_logic_files
-rw-rw-r-- 1 xl0 xl0  415 Dec 10 11:55 short_digest.md
drwxrwxr-x 2 xl0 xl0 4096 Dec 10 11:55 svelte_files
total 16
-rw-rw-r-- 1 xl0 xl0 614 Dec 10 11:55 digest.md
-rw-rw-r-- 1 xl0 xl0 167 Dec 10 

Svelte is a compiler-based framework for building web user interfaces. It transforms declarative components written in HTML, CSS, and JavaScript into optimized JavaScript code.

Example component:
```svelte
<script>
	function greet() {
		alert('Welcome to Svelte!');
	}
</script>

<button onclick={greet}>click me</button>

<style>
	button {
		font-size: 2em;
	}
</style>
```

Use cases range from standalone components to full-stack applications (via SvelteKit companion framework). Resources: interactive tutorial, playground for online experimentation, and StackBlitz for fully-featured development environment.

#### Next the metadata


In [None]:
#| export
import git
import json
from datetime import datetime, timezone

In [None]:
# Open the enclosing git repository, searching parent directories for it.
repo = git.Repo(search_parent_directories=True)
head_sha = repo.head.commit.hexsha
# Mark the hash when the working tree has modifications or untracked files.
commit = f"{head_sha}-dirty" if repo.is_dirty(untracked_files=True) else head_sha

commit

'3a5e8793fd9b283eba80c88bdd3cb6f23cb0a2ee-dirty'

In [None]:
#| export

def file_map(doc: DocItem):
    """Build a nested, JSON-friendly map of `doc` and all of its descendants.

    Each node becomes a dict with its ``origPath``, ``displayName``,
    ``relevant``, ``usage`` and ``token_counts`` fields (dumped in JSON mode)
    plus a ``children`` dict that maps each child's ``name`` to that child's
    own map. A node without children gets an empty ``children`` dict.
    """
    nested = {child.name: file_map(child) for child in doc.children}

    own_fields = doc.model_dump(
        mode="json", include=["origPath", "displayName", "relevant", "usage", "token_counts"]
    )
    return own_fields | {"children": nested}

def build_metadata(source: Source, doc: DocItem):
    """Assemble the metadata dict describing a processed documentation tree.

    Args:
        source: The source the documentation came from. Must be a
            ``GitSource``, ``WebSource`` or ``LLMTxtSource``.
        doc: Root of the processed tree; its structure is embedded via
            ``file_map``.

    Returns:
        A dict with the tree map, the source's name, ecosystems, type label
        and JSON dump, the current UTC timestamp, the configured model, and
        the commit hash of the enclosing git repository (suffixed with
        ``-dirty`` when the working tree has changes or untracked files).

    Raises:
        TypeError: If `source` is none of the known source classes.
    """
    # Checked in this order; the first matching class determines the label.
    known_sources = (
        (GitSource, "git"),
        (WebSource, "web"),
        (LLMTxtSource, "llms.txt"),
    )
    source_type = next(
        (label for source_cls, label in known_sources if isinstance(source, source_cls)),
        None,
    )
    if source_type is None:
        raise TypeError(f"Unknown source type: {type(source)}")

    # Commit of the enclosing git repository, flagged when it is not clean.
    repo = git.Repo(search_parent_directories=True)
    head_sha = repo.head.commit.hexsha
    commit = f"{head_sha}-dirty" if repo.is_dirty(untracked_files=True) else head_sha

    return {
        "map": file_map(doc),
        "name": source.name,
        "ecosystems": source.ecosystems,
        "source_type": source_type,
        "source": source.model_dump(mode="json"),
        "date": datetime.now(timezone.utc).isoformat(),
        "model": settings.model,
        "commit": commit,
    }

In [None]:
# Placeholder source description used only to exercise build_metadata.
fake_source = GitSource(
    commit="123456",
    doc_dir="path/to/nowhere",
    name="sveltejs/svelte",
    repo="https://github.com/sveltejs/svelte",
)
processed_tree.displayName = ""  # The top-level directory does not need a name.

index_json = json.dumps(build_metadata(fake_source, processed_tree), indent=2)
Path("test-out/index.json").write_text(index_json)

8677

In [None]:
# Show only the first 20 and last 10 lines of the generated index.json,
# with a marker line in between standing in for the omitted middle.
!cat test-out/index.json | head -n 20
!echo "[  .....  ]"
!cat test-out/index.json | tail -n 10

{
  "map": {
    "origPath": ".",
    "displayName": "",
    "relevant": true,
    "usage": {
      "input": 5006,
      "output": 2081,
      "details": null
    },
    "token_counts": {
      "fulltext": 3739,
      "digest": 1738,
      "short_digest": 219
    },
    "children": {
      "introduction": {
        "origPath": "01-introduction",
        "displayName": "Introduction",
        "relevant": true,
[  .....  ]
    "include": null,
    "exclude": null,
    "doc_dir": "path/to/nowhere",
    "repo": "https://github.com/sveltejs/svelte",
    "commit": "123456"
  },
  "date": "2025-12-10T16:55:39.126719+00:00",
  "model": "claude-haiku-4.5",
  "commit": "3a5e8793fd9b283eba80c88bdd3cb6f23cb0a2ee-dirty"
}

In [None]:
#| export
def save_processed_documents(source: Source, path: Path, tree: DocItem):
    """Write a processed tree and its ``index.json`` metadata under `path`.

    The tree is deep-copied first, so blanking the root's display name does
    not affect the caller's object. The markdown files are written with
    ``save_doc_files`` and the metadata from ``build_metadata`` is stored as
    indented JSON in ``path / "index.json"``.
    """
    root = tree.model_copy(deep=True)
    root.displayName = ""  # The top-level directory does not need a name.
    save_doc_files(path, root)

    index_text = json.dumps(build_metadata(source, root), indent=2)
    (path / "index.json").write_text(index_text)

In [None]:
# Run the full save (markdown files plus index.json) into ./test-out.
save_processed_documents(fake_source, Path("test-out"), processed_tree)