# Document processing


In [43]:
#| default_exp docs

In [44]:
#| hide
import nbdev

nbdev.nbdev_export()

In [45]:
#| exporti

from pathlib import Path
from typing import Literal
import logging

from pydantic import Field, BaseModel, field_validator
from jinja2 import Environment, FileSystemLoader

import llm
from llm.models import Usage
import anthropic
import langsmith as ls

from lovely_docs.settings import Settings, settings
import asyncio

from tenacity import AsyncRetrying, stop_after_attempt, wait_exponential


In [46]:
#| export
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

In [None]:
#| export

# Token counts for the three text variants kept on a DocItem.
# The values are filled from anthropic_count_tokens() results by the page and
# directory processors; all default to 0 for items that were not processed yet.
class TokenCounts(BaseModel):
    fulltext: int = 0  # tokens in DocItem.fulltext
    digest: int = 0  # tokens in DocItem.digest
    short_digest: int = 0  # tokens in DocItem.short_digest

# A node of the documentation tree: a leaf is a page, a node with `children`
# is a directory. Freshly built nodes only carry origPath/name/displayName
# (and fulltext for pages); the digest fields are filled in by LLM processing.
class DocItem(BaseModel):
    # --- identity -----------------------------------------------------
    origPath: Path  # path relative to the documentation root
    name: str  # identifier; validated below ('/' and ' ' are rejected)
    displayName: str  # human-readable name

    # --- generated summaries ------------------------------------------
    digest: str = ""
    short_digest: str = ""
    essence: str = ""
    relevant: bool = True

    # --- bookkeeping --------------------------------------------------
    usage: Usage = Field(default_factory=lambda: Usage(0, 0))
    token_counts: TokenCounts = Field(default_factory=TokenCounts)

    # --- content ------------------------------------------------------
    fulltext: str = ""
    children: list['DocItem'] = Field(default_factory=list)

    @field_validator('name')
    @classmethod
    def validate_name(cls, v):
        """Reject names that contain a path separator or a space."""
        forbidden_chars = ('/', ' ')
        if any(ch in v for ch in forbidden_chars):
            raise ValueError(f"name must not contain '/' or spaces, got: {v!r}")
        return v


DocItem.model_rebuild()  # Resolve the self-referencing 'DocItem' annotation on `children`.

#### Let's load all markdown files from a directory


In [48]:
#| export

def build_markdown_doc_tree(root: Path, path: Path = Path()) -> DocItem | None:
    """Recursively build a documentation tree from markdown files.

    Only ``*.md`` files with non-empty content become pages; every
    sub-directory is processed recursively. Entries are visited in sorted
    order, so the resulting ``children`` lists are deterministic.

    Args:
        root: Root directory containing the documentation.
        path: Path relative to ``root`` to process (default: ``root`` itself).

    Returns:
        A ``DocItem`` whose ``children`` hold the pages and sub-directories
        found under ``root / path``. A directory that ends up with exactly one
        child is folded: that child is returned in its place.
        ``None`` if there are no non-empty pages or sub-directories.

    Raises:
        AssertionError: if ``root`` or ``root / path`` is not an existing
            directory.
    """
    assert root.exists() and root.is_dir()
    current_dir = root / path
    assert current_dir.exists() and current_dir.is_dir()

    children: list[DocItem] = []

    # Immediate children only; recursion handles the deeper levels.
    for item in sorted(current_dir.iterdir()):
        rel_path = item.relative_to(root)

        if item.is_dir():
            subtree = build_markdown_doc_tree(root, rel_path)
            if subtree is not None:
                children.append(subtree)
        elif item.is_file() and item.suffix == '.md':
            # Read explicitly as UTF-8 so the result does not depend on the
            # locale's default encoding.
            fulltext = item.read_text(encoding="utf-8")
            if fulltext:
                children.append(
                    DocItem(
                        origPath=rel_path,
                        # DocItem.name rejects spaces; keep the untouched file
                        # name in displayName and sanitise only the identifier.
                        name=item.name.replace(" ", "_"),
                        displayName=item.name,
                        fulltext=fulltext,
                    )
                )

    # Special case - directories with 1 child get folded.
    if len(children) == 1:
        return children[0]

    if children:
        # Same sanitising as for pages: a directory name with spaces would
        # otherwise fail DocItem validation.
        return DocItem(
            origPath=path,
            name=path.name.replace(" ", "_"),
            displayName=path.name,
            children=children,
        )

    return None

In [49]:
tree = build_markdown_doc_tree(Path("test_data"))
# pprint.pprint(tree.model_dump(exclude=["children"]))

def display_tree(root: DocItem):
    """Print one line per descendant of ``root``, depth-first.

    Each line shows the node's original path, its name and the first 50
    characters of its fulltext with newlines removed. ``root`` itself is not
    printed.
    """
    for node in root.children or ():
        preview = node.fulltext[:50].replace(chr(10), '')
        print(f"{str(node.origPath)} -> {node.name}: {preview}...")
        display_tree(node)

display_tree(tree)

01-introduction -> 01-introduction: ...
01-introduction/01-overview.md -> 01-overview.md: ---title: Overview---Svelte is a framework for...
01-introduction/02-getting-started.md -> 02-getting-started.md: ---title: Getting started---We recommend using...
01-introduction/03-svelte-files.md -> 03-svelte-files.md: ---title: .svelte files---Components are the b...
01-introduction/04-svelte-js-files.md -> 04-svelte-js-files.md: ---title: .svelte.js and .svelte.ts files---Be...
01-introduction/index.md -> index.md: ---title: Introduction---...
02-runes -> 02-runes: ...
02-runes/01-what-are-runes.md -> 01-what-are-runes.md: ---title: What are runes?---> [!NOTE] **rune**...
02-runes/02-$state.md -> 02-$state.md: ---title: $state---The `$state` rune allows yo...
02-runes/03-$derived.md -> 03-$derived.md: ---title: $derived---Derived state is declared...
02-runes/04-$effect.md -> 04-$effect.md: ---title: $effect---Effects are functions that...
02-runes/05-$props.md -> 05-$props.md: ---title: $pro

#### Let's process one page


In [None]:
#| export

# Structured reply requested from the LLM for a single page: llm_process_page
# passes this class as `schema=` to the model prompt and validates the returned
# JSON against it.
# NOTE: the class is handed to the LLM call as-is, so it is documented with
# `#` comments only.
class PageReplySchema(BaseModel):
    # Name proposed by the LLM; llm_process_page strips a trailing ".md" and
    # derives DocItem.displayName / DocItem.name from it.
    better_name: str = Field(description="")
    digest: str = Field(title="Digest, format: markdown", )
    short_digest: str = Field(title="Short digest, format:markdown")
    essence: str = Field(title="Essence, format:txt")
    # Copied verbatim to DocItem.relevant.
    relevant: bool

async def anthropic_count_tokens(client: anthropic.AsyncAnthropic, model: str, text: str):
    """Return the input-token count reported for ``text`` sent as one user message.

    Args:
        client: Anthropic async client used for the request.
        model: Model name passed to the token-counting endpoint.
        text: Content of the single user message to be counted.

    Returns:
        The ``input_tokens`` value of the count-tokens response.
    """
    user_message = {
        "role": "user",
        "content": text
    }
    counted = await client.messages.count_tokens(model=model, messages=[user_message])
    return counted.input_tokens

async def llm_process_page(
    settings: Settings, page: DocItem, libname: str, extra_prompt: str | None = None
) -> DocItem:
    """Summarise a single documentation page with the configured LLM.

    The page is rendered into the ``process_page.j2`` prompt template, sent to
    the model with ``PageReplySchema`` as the reply schema, and the reply is
    turned into a new ``DocItem``. Token counts for the fulltext and both
    digests are then obtained from the Anthropic token-counting endpoint.
    Both the LLM call and the token counting are retried up to 5 times with
    exponential back-off.

    Args:
        settings: Provides ``model``, ``api_key`` and ``templates_dir``.
        page: Leaf ``DocItem`` (no children) to process.
        libname: Library name, passed to the prompt template.
        extra_prompt: Optional extra text, passed to the template as ``extra``.

    Returns:
        A new ``DocItem`` with the LLM-generated fields filled in. A page whose
        fulltext is blank is returned as a copy marked ``relevant=False``
        without any LLM call.

    Raises:
        AssertionError: if ``page`` has children.
    """
    # Model name used only for the token-counting requests.
    token_count_model = "claude-haiku-4-5"

    with ls.trace(
        name=f"Process page: {page.origPath}", run_type="chain", inputs={"input": page.fulltext}
    ) as trace:
        logger.debug(f"Processing {page.origPath}")
        assert not page.children, "A page should be a leaf node, no children allowed"

        if not page.fulltext.strip():
            logger.debug(f"Got an empry page {page.origPath}")
            return page.model_copy(update={"relevant": False})

        model = llm.get_async_model(settings.model)
        model.key = settings.api_key

        template = Environment(loader=FileSystemLoader(settings.templates_dir)
                               ).get_template("process_page.j2")
        inputs = {
            "text": page.fulltext,
            "filename": str(page.origPath),
            "path": str(page.origPath.parent) + "/",
            "libname": libname,
            "extra": extra_prompt
        }
        with ls.trace("Template", "prompt", inputs=inputs) as template_trace:
            prompt = template.render(**inputs)
            template_trace.end(outputs=prompt)

        with ls.trace("LLM call", "llm", inputs={"prompt": prompt}) as llm_trace:
            async for attempt in AsyncRetrying(
                stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=3, max=60)
            ):
                with attempt:
                    try:
                        res = await model.prompt(
                            prompt=prompt, schema=PageReplySchema, max_tokens=32768, temperature=0
                        )
                        # Awaiting the text inside the retry block keeps
                        # failures raised while reading the reply retryable.
                        llm_trace.end(outputs=await res.text())
                    except Exception as e:
                        logger.warning(f"{page.origPath}: retry {attempt.retry_state.attempt_number}: {str(e)}")
                        raise

        with ls.trace("Parse", "parser", inputs={"input": await res.text()}) as parse_trace:
            reply = PageReplySchema.model_validate_json(await res.text())
            reply.better_name = reply.better_name.removesuffix('.md')
            parse_trace.end(outputs=reply)
            usage = await res.usage()

        # Count tokens for fulltext, digest, and short_digest in parallel.
        # We need to use the anthropic client directly to count tokens; it is
        # opened as a context manager so its HTTP connections are closed once
        # counting is done instead of lingering until garbage collection.
        async with anthropic.AsyncAnthropic(api_key=settings.api_key) as anthropic_client:
            async for attempt in AsyncRetrying(
                stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=3, max=60)
            ):
                with attempt:
                    try:
                        fulltext_tokens, digest_tokens, short_digest_tokens = await asyncio.gather(
                            anthropic_count_tokens(
                                anthropic_client, token_count_model, page.fulltext
                            ),
                            anthropic_count_tokens(
                                anthropic_client, token_count_model, reply.digest
                            ),
                            anthropic_count_tokens(
                                anthropic_client, token_count_model, reply.short_digest
                            )
                        )
                    except Exception as e:
                        logger.warning(
                            f"{page.origPath}: retry token count {attempt.retry_state.attempt_number}: {str(e)}"
                        )
                        raise

        token_counts = TokenCounts(
            fulltext=fulltext_tokens,
            digest=digest_tokens,
            short_digest=short_digest_tokens
        )

        result = DocItem(
            origPath=page.origPath,
            fulltext=page.fulltext,
            displayName=reply.better_name,
            # DocItem.name must not contain spaces or '/'.
            name=reply.better_name.lower().replace(" ", "_").replace("/", "_"),
            digest=reply.digest,
            short_digest=reply.short_digest,
            essence=reply.essence,
            relevant=reply.relevant,
            token_counts=token_counts,
            usage=usage
        )

        trace.end(outputs=result)
        return result

In [51]:
from lovely_docs.settings import settings
from IPython.display import Markdown, display

In [52]:
tree.children[1].children[2]

DocItem(origPath=Path('02-runes/03-$derived.md'), name='03-$derived.md', displayName='03-$derived.md', digest='', short_digest='', essence='', relevant=True, usage=Usage(input=None, output=None, details=None), token_counts=TokenCounts(fulltext=0, digest=0, short_digest=0), fulltext="---\ntitle: $derived\n---\n\nDerived state is declared with the `$derived` rune:\n\n```svelte\n<script>\n\tlet count = $state(0);\n\tlet doubled = $derived(count * 2);\n</script>\n\n<button onclick={() => count++}>\n\t{doubled}\n</button>\n\n<p>{count} doubled is {doubled}</p>\n```\n\nThe expression inside `$derived(...)` should be free of side-effects. Svelte will disallow state changes (e.g. `count++`) inside derived expressions.\n\nAs with `$state`, you can mark class fields as `$derived`.\n\n> [!NOTE] Code in Svelte components is only executed once at creation. Without the `$derived` rune, `doubled` would maintain its original value even when `count` changes.\n\n## `$derived.by`\n\nSometimes you need to

In [53]:
res = await llm_process_page(
    settings=settings,
    page=tree.children[1].children[2],
    libname="svelte")

+291.972s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/03-$derived.md[0m


In [54]:
display(Markdown(f"`{res.origPath} -> {res.displayName}:`\n\n{res.essence}"))
display(Markdown("# **==== Short digest ====**"))
display(Markdown(res.short_digest))
display(Markdown("# **==== Full digest ====**"))
display(Markdown(res.digest))

`02-runes/03-$derived.md -> $derived:`

Derived state automatically updates when its dependencies change, with support for complex derivations and temporary overrides.

# **==== Short digest ====**

## $derived

Declare derived state that automatically updates when dependencies change:

```svelte
let count = $state(0);
let doubled = $derived(count * 2);
```

For complex derivations, use `$derived.by(() => { ... })`. Expressions must be side-effect free.

You can temporarily override derived values for optimistic UI. Derived values are not deeply reactive proxies like `$state`. Svelte uses push-pull reactivity: updates notify dependents immediately but derived values only re-evaluate when read.

# **==== Full digest ====**

## $derived

Declare derived state that automatically updates when dependencies change:

```svelte
<script>
	let count = $state(0);
	let doubled = $derived(count * 2);
</script>

<button onclick={() => count++}>{doubled}</button>
```

Expressions must be side-effect free. Svelte prevents state mutations inside derived expressions.

### $derived.by

For complex derivations, use `$derived.by` with a function:

```svelte
<script>
	let numbers = $state([1, 2, 3]);
	let total = $derived.by(() => {
		let sum = 0;
		for (const n of numbers) sum += n;
		return sum;
	});
</script>
```

`$derived(expression)` is equivalent to `$derived.by(() => expression)`.

### Dependencies

Anything read synchronously inside the derived expression is a dependency. When dependencies change, the derived is marked dirty and recalculated on next read. Use `untrack` to exempt state from being treated as a dependency.

### Overriding derived values

You can temporarily reassign derived values (unless declared with `const`) for optimistic UI:

```svelte
<script>
	let { post, like } = $props();
	let likes = $derived(post.likes);

	async function onclick() {
		likes += 1;
		try {
			await like();
		} catch {
			likes -= 1;
		}
	}
</script>

<button {onclick}>🧡 {likes}</button>
```

### Reactivity

Unlike `$state`, `$derived` values are not converted to deeply reactive proxies. However, if a derived returns an object/array from reactive state, mutations to that object still affect the source.

### Update propagation

Svelte uses push-pull reactivity: state changes immediately notify dependents (push), but derived values only re-evaluate when read (pull). If a derived's new value is referentially identical to its previous value, downstream updates are skipped.

#### Looks decent, let's process all pages in a sub-directory.


In [55]:
# Demo: process every page of one sub-directory concurrently.
# tree.children[1] is the "02-runes" directory of the test data.
import asyncio

pages = []
tasks = []
# One task per page, so all LLM requests are in flight at the same time.
for page in tree.children[1].children:
    tasks.append(asyncio.create_task(llm_process_page(settings, page, "svelte")))

pages: list[DocItem] = await asyncio.gather(*tasks)

# Keep the processed pages ordered by their original path.
pages.sort(key=lambda x: x.origPath)

# Update the children with processed pages
tree.children[1].children = pages

# and some more
# (children[2] and children[3] are single pages: one-child directories were
# folded when the tree was built)
tree.children[2] = await llm_process_page(settings, tree.children[2], "svelte")
tree.children[3] = await llm_process_page(settings, tree.children[3], "svelte")

+8.831s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/01-what-are-runes.md[0m
+0.014s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/02-$state.md[0m
+0.014s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/03-$derived.md[0m
+0.013s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/04-$effect.md[0m
+0.013s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/05-$props.md[0m
+0.013s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/06-$bindable.md[0m
+0.013s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/07-$inspect.md[0m
+0.013s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/08-$host.md[0m
+0.013s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/index.md[0m
+11.266s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 04-t

In [56]:
tree.children[2].model_dump()

{'origPath': Path('04-test-compact2/ttt/ttt.md'),
 'name': 'ttt',
 'displayName': 'ttt',
 'digest': 'This is a test file and not relevant documentation.',
 'short_digest': 'Test file - not relevant.',
 'essence': 'Test file with no actual content.',
 'relevant': False,
 'usage': {'input': 1096, 'output': 123, 'details': None},
 'token_counts': {'fulltext': 21, 'digest': 17, 'short_digest': 13},
 'fulltext': "This is a test file, it's not relevant, ignore it.",
 'children': []}

In [57]:
tree.children[3].model_dump()

{'origPath': Path('05-test-compact3/asd/test.md'),
 'name': '$effect',
 'displayName': '$effect',
 'digest': '## $effect\n\nEffects run when state updates and are browser-only (not during SSR). Use them for third-party libraries, canvas drawing, or network requests. Avoid updating state inside effects as it causes convoluted code and infinite loops.\n\n### Basic Usage\n\nEffects automatically track reactive values (`$state`, `$derived`, `$props`) accessed synchronously and re-run when dependencies change:\n\n```svelte\n<script>\n    let size = $state(50);\n    let color = $state(\'#ff3e00\');\n    let canvas;\n\n    $effect(() => {\n        const context = canvas.getContext(\'2d\');\n        context.clearRect(0, 0, canvas.width, canvas.height);\n        context.fillStyle = color;\n        context.fillRect(0, 0, size, size);\n    });\n</script>\n\n<canvas bind:this={canvas} width="100" height="100"></canvas>\n```\n\n### Lifecycle\n\nEffects run after component mount in a microtask after

In [58]:
page = tree.children[1].children[3]
display(Markdown(f"`{page.origPath} -> {page.displayName}:`\n\n{page.essence}"))
display(Markdown("# **==== Short digest ====**"))
display(Markdown(page.short_digest))
# display(Markdown("# **==== Full digest ====**"))
# display(Markdown(page.digest))

`02-runes/04-$effect.md -> $effect:`

The $effect rune runs side effects when reactive state changes, with automatic dependency tracking and optional teardown functions.

# **==== Short digest ====**

## $effect

Runs side effects when state updates. Automatically tracks reactive values and re-runs when they change. Only runs in browser, not SSR.

```svelte
$effect(() => {
  context.fillStyle = color;
  context.fillRect(0, 0, size, size);
});
```

Can return teardown function. Tracks only synchronously-read values. Use `$effect.pre` to run before DOM updates. Avoid for state synchronization—use `$derived` instead.

**Variants:** `$effect.pre`, `$effect.tracking()`, `$effect.root()`

In [59]:
display_tree(tree)

01-introduction -> 01-introduction: ...
01-introduction/01-overview.md -> 01-overview.md: ---title: Overview---Svelte is a framework for...
01-introduction/02-getting-started.md -> 02-getting-started.md: ---title: Getting started---We recommend using...
01-introduction/03-svelte-files.md -> 03-svelte-files.md: ---title: .svelte files---Components are the b...
01-introduction/04-svelte-js-files.md -> 04-svelte-js-files.md: ---title: .svelte.js and .svelte.ts files---Be...
01-introduction/index.md -> index.md: ---title: Introduction---...
02-runes -> 02-runes: ...
02-runes/01-what-are-runes.md -> what_are_runes: ---title: What are runes?---> [!NOTE] **rune**...
02-runes/02-$state.md -> $state: ---title: $state---The `$state` rune allows yo...
02-runes/03-$derived.md -> $derived: ---title: $derived---Derived state is declared...
02-runes/04-$effect.md -> $effect: ---title: $effect---Effects are functions that...
02-runes/05-$props.md -> $props: ---title: $props---The inputs to a component

#### Let's process a directory. The input is all pages digests (+sub-directory digests)


In [None]:
#| export

# Structured reply requested from the LLM for a directory: llm_process_directory
# passes this class as `schema=` to the model prompt and validates the returned
# JSON against it.
# NOTE: the class is handed to the LLM call as-is, so it is documented with
# `#` comments only.
class DirReplySchema(BaseModel):
    # Name proposed by the LLM; llm_process_directory strips a trailing ".md"
    # and derives displayName / name from it.
    better_name: str
    digest: str = Field(title="Directory digest, fmt:markdown")
    short_digest: str = Field(title="Short digest, format:markdown")
    essence: str
    # Copied verbatim to the resulting DocItem.relevant.
    relevant: bool

async def llm_process_directory(
    settings: Settings, directory: DocItem, libname: str, extra: str | None = None
) -> DocItem:
    """Create a summary for a directory based on its relevant pages and subdirectories.

    The relevant children are rendered into the ``process_directory.j2``
    prompt, the model replies according to ``DirReplySchema``, and a generated
    fulltext for the directory is rendered from ``directory_fulltext.j2``.
    Both the LLM call and the token counting are retried up to 5 times with
    exponential back-off.

    Args:
        settings: Provides ``model``, ``api_key`` and ``templates_dir``.
        directory: ``DocItem`` with already-processed children.
        libname: Library name, passed to the prompt templates.
        extra: Optional extra text, passed to the templates as ``extra``.

    Returns:
        A deep copy of ``directory`` with the LLM-generated fields, the
        generated fulltext, token counts and usage filled in. When exactly one
        child is relevant, a deep copy of that child is returned instead and
        no LLM call is made.

    Raises:
        AssertionError: if ``directory`` has no children or none of them is
            relevant.
    """
    # Model name used only for the token-counting requests.
    token_count_model = "claude-haiku-4-5"

    with ls.trace(name=f"Process directory: {directory.origPath}", run_type="chain") as trace:
        logger.debug(f"Processing {directory.origPath}")

        assert directory.children, "Expected a directory, got a single page"
        # NOTE(review): this check only repeats the one above; it does not
        # enforce "more than one child" as its message suggests. It is left
        # unchanged because the single-relevant-child fold below already
        # handles that situation without failing.
        assert len(directory.children), "1-child directories are supposed to be folded as pages"
        # If the directory did not have any relevant pages / subdirs, we should not be called.
        assert any(x.relevant for x in directory.children), "Expected relevant children, got none"

        pages = [p for p in directory.children if not p.children and p.relevant]
        subdirs = [s for s in directory.children if s.children and s.relevant]

        # Special case - if a directory has only 1 relevant child, fold the directory.
        relevant_children = pages + subdirs
        if len(relevant_children) == 1:
            return relevant_children[0].model_copy(deep=True)

        model = llm.get_async_model(settings.model)
        model.key = settings.api_key

        template = Environment(loader=FileSystemLoader(settings.templates_dir)
                               ).get_template("process_directory.j2")

        # Named `inputs` rather than `input` to avoid shadowing the builtin.
        inputs = {
            "dirname": directory.origPath.name + "/",
            "path": directory.origPath.parent.name + "/",
            "pages": pages,
            "subdirs": subdirs,
            "libname": libname,
            "extra": extra
        }
        with ls.trace("Template", "prompt", inputs=inputs) as template_trace:
            prompt = template.render(**inputs)
            template_trace.end(outputs=prompt)

        with ls.trace("LLM call", "llm", inputs={"prompt": prompt}) as llm_trace:
            async for attempt in AsyncRetrying(
                stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=3, max=60)
            ):
                with attempt:
                    try:
                        res = await model.prompt(
                            prompt=prompt, schema=DirReplySchema, max_tokens=32768, temperature=0
                        )
                        llm_trace.end(outputs=await res.text())
                        usage = await res.usage()
                    except Exception as e:
                        logger.warning(
                            f"{directory.origPath}: retry {attempt.retry_state.attempt_number}: {str(e)}"
                        )
                        raise

        with ls.trace("Parse", "parser", inputs={"input": await res.text()}) as parse_trace:
            reply = DirReplySchema.model_validate_json(await res.text())
            reply.better_name = reply.better_name.removesuffix('.md')

            parse_trace.end(outputs=reply)

        # We save a generated fulltext for a directory which is the sum of digests of all the pages and subdirs within.
        fulltext_template = Environment(loader=FileSystemLoader(settings.templates_dir)
                                        ).get_template("directory_fulltext.j2")
        fulltext = fulltext_template.render(**inputs)

        # Count tokens for fulltext, digest, and short_digest in parallel.
        # We need to use the anthropic client directly to count tokens; it is
        # opened as a context manager so its HTTP connections are closed once
        # counting is done instead of lingering until garbage collection.
        async with anthropic.AsyncAnthropic(api_key=settings.api_key) as anthropic_client:
            async for attempt in AsyncRetrying(
                stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=3, max=60)
            ):
                with attempt:
                    try:
                        fulltext_tokens, digest_tokens, short_digest_tokens = await asyncio.gather(
                            anthropic_count_tokens(anthropic_client, token_count_model, fulltext),
                            anthropic_count_tokens(
                                anthropic_client, token_count_model, reply.digest
                            ),
                            anthropic_count_tokens(
                                anthropic_client, token_count_model, reply.short_digest
                            )
                        )
                    except Exception as e:
                        logger.warning(
                            f"{directory.origPath}: retry token count {attempt.retry_state.attempt_number}: {str(e)}"
                        )
                        raise

        token_counts = TokenCounts(
            fulltext=fulltext_tokens,
            digest=digest_tokens,
            short_digest=short_digest_tokens
        )

        result = directory.model_copy(deep=True)
        result.displayName = reply.better_name
        # DocItem.name must not contain spaces or '/'.
        result.name = reply.better_name.lower().replace(" ", "_").replace("/", "_")
        result.digest = reply.digest
        result.short_digest = reply.short_digest
        result.essence = reply.essence
        result.relevant = reply.relevant
        result.fulltext = fulltext
        result.token_counts = token_counts
        result.usage = usage

        trace.end(outputs=result)
        return result

In [61]:
# tree.children[1] = await llm_process_directory(settings, tree.children[1], "svelte")
# tree.children[3] = await llm_process_directory(settings, tree.children[3], "svelte")

In [62]:
subdir = tree.children[1]
display(Markdown(f"`{subdir.origPath} -> {subdir.displayName}:`\n\n{subdir.essence}"))
display(Markdown("**==== Short digest ====**"))
display(Markdown(subdir.short_digest))

`02-runes -> 02-runes:`



**==== Short digest ====**



#### Great! Now let's do the whole tree recursively.

When processing a directory:

- process all pages
- process all sub-directories
- Generate digests for the whole directory


In [63]:
#| export
async def process_tree_depth_first(
    settings: Settings,
    doc_dir: DocItem,
    libname: str,
    extra_dir: str | None = None,
    extra_page: str | None = None
) -> DocItem:
    """Process a documentation tree depth-first with parallel processing.

    All sub-directories of ``doc_dir`` are processed concurrently first, then
    all of its pages concurrently, and finally the directory itself is
    summarised from the processed children.

    Args:
        settings: Passed through to the page / directory processors.
        doc_dir: Directory node to process.
        libname: Library name, passed through to the processors.
        extra_dir: Optional extra prompt text for directory summaries.
        extra_page: Optional extra prompt text for page summaries.

    Returns:
        The processed directory as returned by ``llm_process_directory``. If
        none of the processed children is relevant, a new ``DocItem`` marked
        ``relevant=False`` that holds the processed children is returned
        without a directory-level LLM call.

    Note:
        Mutates ``doc_dir``: its ``children`` are replaced by the processed
        items before the directory is summarised.
    """

    def make_unique(base: str, taken: set[str]) -> str:
        """Return ``base``, or ``base_2``, ``base_3``, ... if already taken."""
        candidate, suffix = base, 2
        while candidate in taken:
            candidate = f"{base}_{suffix}"
            suffix += 1
        taken.add(candidate)
        return candidate

    with ls.trace(name=f"Process tree: {libname}/{doc_dir.origPath}", run_type="chain") as trace:
        # First, recursively process all subdirectories in parallel
        subdirs = [c for c in doc_dir.children if c.children]
        subdirs = await asyncio.gather(
            *[
                process_tree_depth_first(settings, subdir, libname, extra_dir, extra_page)
                for subdir in subdirs
            ]
        )
        subdirs = sorted(subdirs, key=lambda s: s.origPath)

        # Then process all pages in this directory in parallel
        pages = [c for c in doc_dir.children if not c.children]
        pages = await asyncio.gather(
            *[llm_process_page(settings, page, libname, extra_page) for page in pages]
        )
        pages = sorted(pages, key=lambda s: s.origPath)

        processed = subdirs + pages

        # Both .name and .displayName are llm-generated and might not be
        # unique among siblings. Make each of them unique: distinct display
        # names can still collapse to the same lower-cased name.
        taken_names: set[str] = set()
        taken_display_names: set[str] = set()
        for item in processed:
            item.name = make_unique(item.name, taken_names)
            item.displayName = make_unique(item.displayName, taken_display_names)

        if not any(item.relevant for item in processed):
            result = DocItem(
                origPath=doc_dir.origPath,
                # `name` is a required DocItem field; omitting it fails validation.
                name=doc_dir.name,
                displayName=doc_dir.displayName,
                # Keep every processed child (subdirs included) so their
                # recorded usage is not lost from the tree.
                children=processed,
                relevant=False
            )
            trace.end(outputs=result)
            return result

        # Update children with processed items
        doc_dir.children = processed
        result = await llm_process_directory(settings, doc_dir, libname, extra_dir)
        trace.end(outputs=result)
        return result

In [64]:
# Clean tree
tree = build_markdown_doc_tree(Path("test_data"))
processed_tree = await process_tree_depth_first(settings, tree, "svelte")

+12.865s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 01-introduction/01-overview.md[0m
+0.015s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 01-introduction/02-getting-started.md[0m
+0.014s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 01-introduction/03-svelte-files.md[0m
+0.013s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 01-introduction/04-svelte-js-files.md[0m
+0.017s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 01-introduction/index.md[0m
+0.014s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/01-what-are-runes.md[0m
+0.015s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/02-$state.md[0m
+0.014s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/03-$derived.md[0m
+0.013s [36mDEBUG[0m [34m<ipykernel>:26[0m llm_process_page Processing 02-runes/04-$effect.md[0m
+0.014s [36mDEBUG[0m

In [65]:
display(
    Markdown(
        f"`{processed_tree.origPath} -> {processed_tree.displayName}:`\n\n{processed_tree.essence}"
    )
)
display(Markdown("**==== Digest ====**"))
display(Markdown(processed_tree.digest))

`. -> Reactivity & Setup:`

Project setup, component structure, and Svelte's rune-based reactivity system for state management, derived values, effects, and props.

**==== Digest ====**

## Project Setup

Create projects with SvelteKit and Vite:
```bash
npx sv create myapp
cd myapp
npm run dev
```

Alternative: `npm create vite@latest` with svelte option. Vite is recommended; plugins exist for Rollup and Webpack.

Development tools: VS Code extension, `sv check` CLI, community integrations via Svelte Society.

## Component Files

Components use `.svelte` files with optional `<script>`, `<style>`, and markup sections.

**`<script>`** runs per instance. Top-level variables accessible in markup. Use runes for props and reactivity.

**`<script module>`** runs once at module load. Variables accessible in component but not vice versa. Export bindings become module exports (no `export default`).

**`<style>`** scoped to component only.

## Reactive Logic Files

`.svelte.js` and `.svelte.ts` files support runes for reusable reactive logic and shared state. Behave like standard modules with Svelte's reactivity system.

## Runes

`$`-prefixed compiler keywords forming Svelte's reactivity system. Built-in, valid only in specific positions.

**$state** - Creates reactive state. Arrays and objects become deeply reactive proxies:
```js
let count = $state(0);
let todos = $state([{ done: false, text: 'add more todos' }]);
todos[0].done = !todos[0].done; // triggers reactivity
```
Use `$state.raw` for non-reactive objects. Use `$state.snapshot()` to convert proxies to plain objects.

**$derived** - Auto-updates when dependencies change:
```js
let count = $state(0);
let doubled = $derived(count * 2);
let total = $derived.by(() => {
  let sum = 0;
  for (const n of numbers) sum += n;
  return sum;
});
```

**$effect** - Runs side effects when reactive state changes, automatically tracking dependencies:
```js
$effect(() => {
  const interval = setInterval(() => count += 1, milliseconds);
  return () => clearInterval(interval);
});
```
Effects run after mount and in microtasks after state changes. Return a teardown function for cleanup. Use `$effect.pre()` to run before DOM updates. Use `$effect.root()` for manually controlled nested effects.

**$props** - Receives component inputs with destructuring, defaults, and rest syntax:
```js
let { adjective = 'happy', ...others } = $props();
```

**$bindable** - Enables two-way data binding for component props:
```js
// Child
let { value = $bindable() } = $props();
<input bind:value={value} />

// Parent
<FancyInput bind:value={message} />
```

**$inspect** - Development-only rune that reactively logs value changes:
```js
$inspect(count, message);
$inspect(count).with((type, value) => { /* custom handler */ });
$inspect.trace(); // traces which state caused effect/derived to re-run
```

**$host** - Access host element in custom element components:
```js
$host().dispatchEvent(new CustomEvent('increment'));
```

## $effect Details

Effects are browser-only (not during SSR). Use for third-party libraries, canvas drawing, or network requests. Avoid updating state inside effects.

Values read asynchronously (after `await` or in `setTimeout`) are not tracked. Effects only depend on values read in the last run, so conditional code affects which dependencies are tracked:
```ts
let condition = $state(true);
let color = $state('#ff3e00');

$effect(() => {
    if (condition) {
        confetti({ colors: [color] });
    } else {
        confetti();
    }
});
```

**$effect.tracking** - Returns whether code is running in a tracking context (effect or template).

Don't use effects to synchronize state—use `$derived` instead. For complex derived values, use `$derived.by`.

#### Looks decent. Keep in mind, this is the digest for the whole of the first two sections of the docs.


#### How much did it cost us?


In [66]:
#| export

def calculate_total_usage(doc_dir: DocItem) -> Usage:
    """Sum the token usage recorded on a node and on everything below it.

    Args:
        doc_dir: Root of the (sub)tree whose usage should be totalled.

    Returns:
        A new ``Usage`` whose ``input`` and ``output`` are the totals over
        ``doc_dir`` and all of its descendants. Missing (``None``) counts
        are treated as zero.
    """
    # Totals of every child subtree, computed recursively.
    contributions = [calculate_total_usage(child) for child in doc_dir.children]

    # The node's own usage is counted as well, when it has one.
    if doc_dir.usage:
        contributions.append(doc_dir.usage)

    return Usage(
        input=sum(part.input or 0 for part in contributions),
        output=sum(part.output or 0 for part in contributions),
    )

def calculate_cost(usage: Usage, input_cost: float,
                   output_cost: float) -> tuple[float, float, float]:
    """Price a token usage given per-million-token rates.

    Args:
        usage: Token counts; ``None`` counts are treated as zero.
        input_cost: Price of one million input tokens.
        output_cost: Price of one million output tokens.

    Returns:
        ``(total, input_part, output_part)`` where ``total`` is the sum of
        the two parts.
    """
    tokens_per_rate_unit = 1_000_000
    input_part = ((usage.input or 0) / tokens_per_rate_unit) * input_cost
    output_part = ((usage.output or 0) / tokens_per_rate_unit) * output_cost
    return input_part + output_part, input_part, output_part

In [67]:
# Total the token usage recorded across the whole processed tree.
usage = calculate_total_usage(processed_tree)
print(f"\nTotal Usage:")
print(f"  Input tokens: {usage.input:,}")
print(f"  Output tokens: {usage.output:,}")
print(f"  Total tokens: {(usage.input + usage.output):,}")

# Rates are per million tokens (calculate_cost divides the counts by
# 1_000_000): 1 for input and 5 for output.
cost, input_cost, output_cost = calculate_cost(usage, 1, 5)

print(f"\nCost:")
print(f"  Total: ${cost:.2f}")
print(f"  Input: ${input_cost:.2f}")
print(f"  Output: ${output_cost:.2f}")


Total Usage:
  Input tokens: 50,174
  Output tokens: 11,300
  Total tokens: 61,474

Cost:
  Total: $0.11
  Input: $0.05
  Output: $0.06


#### Let's save the results. First the markdown files.


In [68]:
#| export
import shutil

In [69]:
#| export

def save_doc_files(path: Path, doc: DocItem):
    """Save a DocItem structure to disk at the specified path.

    Any existing directory at ``path`` is removed first, so the result
    contains only files that belong to ``doc``. Each node is written as
    four markdown files (``digest.md``, ``short_digest.md``, ``essence.md``,
    ``fulltext.md``) and each child is saved recursively into a
    subdirectory named after ``child.name``.

    Args:
        path: Directory path where the documentation will be saved
        doc: DocItem object containing the documentation structure to save
    """
    # Start from a clean directory so stale files from earlier runs vanish.
    if path.exists():
        shutil.rmtree(path)
    path.mkdir(parents=True, exist_ok=True)

    contents = {
        "digest.md": doc.digest,
        "short_digest.md": doc.short_digest,
        "essence.md": doc.essence,
        "fulltext.md": doc.fulltext,
    }
    for filename, text in contents.items():
        # Always write UTF-8: the texts routinely contain non-ASCII
        # characters and the locale default encoding is not portable.
        (path / filename).write_text(text, encoding="utf-8")

    for child in doc.children:
        save_doc_files(path / child.name, child)


In [71]:
save_doc_files(Path("test-out"), processed_tree)
!ls -l test-out
!ls -l "test-out/{processed_tree.children[0].name}"
!ls -l "test-out/{processed_tree.children[0].name}/{[c for c in processed_tree.children[0].children if not c.children][0].name}"
Markdown((Path("test-out")/processed_tree.children[0].name/[c for c in processed_tree.children[0].children if not c.children][0].name / "digest.md").read_text())

total 36
drwxrwxr-x  2 xl0 xl0 4096 Nov 13 02:56 '$effect'
-rw-rw-r--  1 xl0 xl0 3562 Nov 13 02:56  digest.md
-rw-rw-r--  1 xl0 xl0  135 Nov 13 02:56  essence.md
-rw-rw-r--  1 xl0 xl0 5150 Nov 13 02:56  fulltext.md
drwxrwxr-x  7 xl0 xl0 4096 Nov 13 02:56  getting_started
drwxrwxr-x 11 xl0 xl0 4096 Nov 13 02:56  runes
-rw-rw-r--  1 xl0 xl0  803 Nov 13 02:56  short_digest.md
drwxrwxr-x  2 xl0 xl0 4096 Nov 13 02:56  ttt
total 32
-rw-rw-r-- 1 xl0 xl0 1389 Nov 13 02:56 digest.md
-rw-rw-r-- 1 xl0 xl0  116 Nov 13 02:56 essence.md
-rw-rw-r-- 1 xl0 xl0 1326 Nov 13 02:56 fulltext.md
drwxrwxr-x 2 xl0 xl0 4096 Nov 13 02:56 getting_started
drwxrwxr-x 2 xl0 xl0 4096 Nov 13 02:56 introduction
drwxrwxr-x 2 xl0 xl0 4096 Nov 13 02:56 overview
-rw-rw-r-- 1 xl0 xl0  549 Nov 13 02:56 short_digest.md
drwxrwxr-x 2 xl0 xl0 4096 Nov 13 02:56 svelte_files
total 16
-rw-rw-r-- 1 xl0 xl0 678 Nov 13 02:56 digest.md
-rw-rw-r-- 1 xl0 xl0 123 Nov 13 02:56 essence.md
-rw-rw-r-- 1 xl0 xl0 968 Nov 13 02:56 fulltext.md
-r

Svelte is a compiler-based framework for building web user interfaces. It transforms declarative components written in HTML, CSS, and JavaScript into optimized JavaScript code.

Example component:
```svelte
<script>
	function greet() {
		alert('Welcome to Svelte!');
	}
</script>

<button onclick={greet}>click me</button>

<style>
	button {
		font-size: 2em;
	}
</style>
```

Use cases range from standalone components to full-stack applications using SvelteKit. For learning, the interactive tutorial is recommended as a starting point, with reference documentation available for specific questions. Online environments include a playground and StackBlitz for experimentation.

#### Next the metadata


In [72]:
#| export
import git
import json
from datetime import datetime, timezone
from lovely_docs.settings import Source, WebSource, GitSource, LLMTxtSource

In [73]:
# Find the enclosing git repository (searching parent directories) and take
# the hash of its current HEAD commit.
repo = git.Repo(search_parent_directories=True)
commit = repo.head.commit.hexsha
# Check if repo is dirty (untracked files count as dirty too)
if repo.is_dirty(untracked_files=True):
    commit += "-dirty"

# Bare expression so the notebook displays the resulting identifier.
commit

'0528ad5fe5486be5b3691edc99daacfaf782e426-dirty'

In [None]:
#| export

def file_map(doc: DocItem):
    """Build the nested metadata map for a documentation tree.

    Every node, page or directory alike, is serialized the same way: a
    JSON-mode dump of its ``origPath``, ``displayName``, ``relevant``,
    ``usage`` and ``token_counts`` fields, plus a ``children`` dict keyed by
    each child's ``name`` (empty for nodes without children).

    Args:
        doc: Root of the (sub)tree to describe.

    Returns:
        A dict with the selected fields of ``doc`` and the recursively
        built ``children`` mapping.
    """
    children_map = {child.name: file_map(child) for child in doc.children}

    # pydantic documents `include` as a set (or dict) of field names, so a
    # set is passed here rather than a list.
    node = doc.model_dump(
        mode="json",
        include={"origPath", "displayName", "relevant", "usage", "token_counts"},
    )
    return node | {"children": children_map}

def build_metadata(source: Source, doc: DocItem):
    """Assemble the index metadata describing a processed documentation set.

    Args:
        source: The source the documentation came from; must be a
            ``GitSource``, ``WebSource`` or ``LLMTxtSource``.
        doc: Root of the processed documentation tree.

    Returns:
        A dict with the file map of ``doc``, the source's name, ecosystems,
        type label and JSON dump, the current UTC timestamp, the configured
        model and the commit of the git repository found by searching
        parent directories (suffixed with ``-dirty`` when that repository
        has uncommitted or untracked changes).

    Raises:
        TypeError: If ``source`` is none of the known source classes.
    """
    # Checked in order; the first matching class decides the label.
    known_kinds = (
        (GitSource, "git"),
        (WebSource, "web"),
        (LLMTxtSource, "llms.txt"),
    )
    source_type = next(
        (label for kind, label in known_kinds if isinstance(source, kind)),
        None,
    )
    if source_type is None:
        raise TypeError(f"Unknown source type: {type(source)}")

    # Identify the commit of the enclosing repository.
    repo = git.Repo(search_parent_directories=True)
    head_sha = repo.head.commit.hexsha
    commit = f"{head_sha}-dirty" if repo.is_dirty(untracked_files=True) else head_sha

    return {
        "map": file_map(doc),
        "name": source.name,
        "ecosystems": source.ecosystems,
        "source_type": source_type,
        "source": source.model_dump(mode="json"),
        "date": datetime.now(timezone.utc).isoformat(),
        "model": settings.model,
        "commit": commit,
    }

In [75]:
# Stand-in source used to exercise build_metadata; the field values look like
# placeholders rather than a real checkout.
fake_source = GitSource(
    commit="123456",
    doc_dir="path/to/nowhere",
    name="sveltejs/svelte",
    repo="https://github.com/sveltejs/svelte"
)
processed_tree.displayName = ""  # The top-level directory does not need a name.
# write_text returns the number of characters written, which the notebook
# shows as the cell output.
Path("test-out/index.json").write_text(
    json.dumps(build_metadata(fake_source, processed_tree), indent=2)
)

8594

In [76]:
!cat test-out/index.json | head -n 20
!echo "[  .....  ]"
!cat test-out/index.json | tail -n 10

{
  "map": {
    "origPath": ".",
    "displayName": "",
    "relevant": true,
    "usage": {
      "input": 3189,
      "output": 1324,
      "details": null
    },
    "token_counts": {
      "fulltext": 1338,
      "digest": 979,
      "short_digest": 225
    },
    "children": {
      "getting_started": {
        "origPath": "01-introduction",
        "displayName": "Getting Started",
        "relevant": true,
[  .....  ]
    "name": "sveltejs/svelte",
    "comment": null,
    "doc_dir": "path/to/nowhere",
    "repo": "https://github.com/sveltejs/svelte",
    "commit": "123456"
  },
  "date": "2025-11-13T07:56:50.500384+00:00",
  "model": "claude-haiku-4.5",
  "commit": "0528ad5fe5486be5b3691edc99daacfaf782e426-dirty"
}

In [77]:
#| export
def save_processed_documents(source: Source, path: Path, tree: DocItem):
    """Write a processed tree and its ``index.json`` metadata under ``path``.

    The caller's ``tree`` is left untouched: a deep copy is made and its
    ``displayName`` is blanked before saving.

    Args:
        source: Source description recorded in the metadata.
        path: Output directory; it is recreated by ``save_doc_files``.
        tree: Root of the processed documentation tree.
    """
    root = tree.model_copy(deep=True)
    # The top-level directory does not need a name.
    root.displayName = ""

    # The markdown files go first because saving them recreates `path`.
    save_doc_files(path, root)

    index_file = path / "index.json"
    index_payload = json.dumps(build_metadata(source, root), indent=2)
    index_file.write_text(index_payload)

In [78]:
# Run the full save path: the markdown files plus index.json under test-out.
save_processed_documents(fake_source, Path("test-out"), processed_tree)