# Document Analysis Agent

This notebook implements a LangChain v1.x agent (tool-calling, LangGraph-based) that autonomously analyzes a text file and extracts its structure as an AST.

In [37]:
# (Optional) Install required packages for LangChain v1.x
# Skip this cell if your environment already has them installed.
# %pip install -q "langchain>=1.0,<2" "langchain-core>=1.0,<2" "langchain-openai" "langgraph" "pydantic>=2,<3" "python-dotenv" "pypdf>=3.0.0" "pymupdf>=1.23.0"

In [38]:
import os
import re

import dotenv

# Load environment variables from .env (if present)
dotenv.load_dotenv()

from typing import List, Optional
from pydantic import BaseModel, Field

from langchain_core.tools import tool
from langchain_openai import AzureChatOpenAI, ChatOpenAI
from langchain.agents import create_agent
from langchain.agents.middleware import SummarizationMiddleware, TodoListMiddleware, ContextEditingMiddleware, ClearToolUsesEdit

def build_llm():
    """Create an OpenAI / Azure OpenAI chat model from environment variables.

    Environment variables read:
        LLM_PROVIDER: "openai" (default) or one of "azure", "azureopenai",
            "azure_openai" (compared case-insensitively).
        OPENAI_MODEL / MODEL: model name, checked in that order; falls back to
            "gpt-5.2" when neither is set.
        TEMPERATURE: parsed with float(); defaults to 0 when unset or empty.
        OPENAI_API_KEY: API key for the OpenAI provider.
        AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_KEY: Azure endpoint and key.
        AZURE_OPENAI_DEPLOYMENT_NAME / AZURE_OPENAI_DEPLOYMENT: Azure deployment,
            checked in that order; falls back to the model name.
        AZURE_OPENAI_API_VERSION / OPENAI_API_VERSION: Azure API version.

    Returns:
        An AzureChatOpenAI instance for the Azure providers, otherwise a ChatOpenAI instance.

    Raises:
        ValueError: if TEMPERATURE is set to something float() cannot parse.
    """
    provider = (os.getenv("LLM_PROVIDER") or "openai").lower()
    # Honour the optional OPENAI_MODEL / MODEL overrides advertised in the setup
    # notes; the previous hard-coded default is kept as the fallback.
    model = os.getenv("OPENAI_MODEL") or os.getenv("MODEL") or "gpt-5.2"
    temperature = float(os.getenv("TEMPERATURE") or "0")

    if provider in {"azure", "azureopenai", "azure_openai"}:
        return AzureChatOpenAI(
            azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
            azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
            or os.getenv("AZURE_OPENAI_DEPLOYMENT")
            or model,
            api_version=os.getenv("AZURE_OPENAI_API_VERSION")
            or os.getenv("OPENAI_API_VERSION"),
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            temperature=temperature,
        )

    return ChatOpenAI(
        model=model,
        api_key=os.getenv("OPENAI_API_KEY"),
        temperature=temperature,
    )


# NOTE:
# - OpenAI: set OPENAI_API_KEY (and optionally OPENAI_MODEL / MODEL)
# - Azure OpenAI: set LLM_PROVIDER=azure and AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY /
#   AZURE_OPENAI_API_VERSION (or OPENAI_API_VERSION) / AZURE_OPENAI_DEPLOYMENT_NAME

## 1. Custom Tools Setup

In [None]:
@tool
def read_text_segment(file_path: str, start: int, length: int) -> str:
    """
    Reads a specific segment of a text file.
    
    Args:
        file_path: The path to the text file.
        start: The starting character index (0-based).
        length: The number of characters to read. Must be >= 0.
        
    Returns:
        The substring starting at 'start' with the given 'length', or an
        error message if the arguments are invalid or the file cannot be read.
    """
    try:
        # A negative size makes TextIO.read() read until EOF, which would silently
        # return the whole remainder of the file instead of a bounded segment.
        if length < 0:
            return f"Error reading file: length must be >= 0 (got {length})"
        # NOTE: "start" is a character index, not a byte offset.
        # In text mode, using seek(start) can land in the middle of a UTF-8 multibyte sequence,
        # so the prefix is consumed with read() instead.
        with open(file_path, 'r', encoding='utf-8') as f:
            if start > 0:
                f.read(start)
            return f.read(length)
    except Exception as e:
        # Errors are returned as text so the calling agent can see and react to them.
        return f"Error reading file: {str(e)}"

@tool
def extract_regex_matches(file_path: str, regex_pattern: str) -> str:
    """
    Extracts all text matches for a given regex pattern from the file.
    
    Args:
        file_path: The path to the text file.
        regex_pattern: The python-style regex pattern to search for.
        
    Returns:
        A string representation of the list of found matches.
    """
    try:
        # Load the whole file as UTF-8 text, then run the pattern over it once.
        with open(file_path, mode='r', encoding='utf-8') as handle:
            text = handle.read()

        found = re.findall(regex_pattern, text)
        return str(found)
    except Exception as exc:
        # I/O and regex-compilation failures are both reported back as text.
        return f"Error extracting matches: {str(exc)}"

@tool
def get_file_length(file_path: str) -> str:
    """
    Returns the total character length of the file.
    """
    try:
        # Length is counted in decoded characters (UTF-8), not bytes.
        with open(file_path, mode='r', encoding='utf-8') as handle:
            char_count = len(handle.read())
        return str(char_count)
    except Exception as exc:
        return f"Error getting file length: {str(exc)}"

class ReadTextFileArgs(BaseModel):
    """Arguments for read_text_file tool."""
    # Required: path of the file to read.
    file_path: str = Field(..., description="The path to the text file.")
    # Optional 0-based character offset; defaults to None (read from the beginning).
    start: Optional[int] = Field(None, description="The starting character index (0-based). Omit this parameter to read from the beginning.")
    # Optional character count; defaults to None, in which case read_text_file reads 100 characters.
    length: Optional[int] = Field(None, description="The number of characters to read. Omit this parameter to read 100 characters from the start position (or from the beginning if start is also omitted).")

@tool(args_schema=ReadTextFileArgs)
def read_text_file(file_path: str, start: Optional[int] = None, length: Optional[int] = None) -> str:
    """
    Reads a text file as UTF-8. Can read a specific segment of the file.
    
    Default behavior: If start and length are both omitted, reads the first 100 characters.
    This helps avoid exceeding context window limits when reading large files.
    
    Examples:
    - read_text_file("file.txt") -> reads first 100 characters
    - read_text_file("file.txt", start=0, length=1000) -> reads first 1000 characters
    - read_text_file("file.txt", start=500, length=2000) -> reads 2000 characters starting from position 500
    - read_text_file("file.txt", start=1000) -> reads 100 characters starting from position 1000
    
    Args:
        file_path: The path to the text file.
        start: The starting character index (0-based). Omit to read from the beginning.
        length: The number of characters to read (must be >= 0). Omit to read 100 characters from the start position.
        
    Returns:
        The file content segment (default: first 100 characters if both start and length are omitted),
        or an error message if the arguments are invalid or the file cannot be read.
    """
    try:
        # Apply the documented 100-character default when length is omitted.
        read_len = 100 if length is None else length
        # A negative size makes TextIO.read() read until EOF, which would defeat the
        # purpose of bounded reads and could flood the agent's context window.
        if read_len < 0:
            return f"Error reading file: length must be >= 0 (got {length})"
        # NOTE: "start" is a character index, not a byte offset.
        # In text mode, using seek(start) can land in the middle of a UTF-8 multibyte sequence,
        # so the prefix is consumed with read() instead.
        with open(file_path, 'r', encoding='utf-8') as f:
            if start is not None and start > 0:
                f.read(start)
            return f.read(read_len)
    except Exception as e:
        # Errors are returned as text so the calling agent can see and react to them.
        return f"Error reading file: {str(e)}"


# --- Persisted AST Store Tool (self-contained in notebook) ---
import json
import re
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Literal, Optional, Tuple
from uuid import uuid4


def _utc_now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()


def _ensure_parent_dir(path: str) -> None:
    parent = os.path.dirname(os.path.abspath(path))
    if parent:
        os.makedirs(parent, exist_ok=True)


def _atomic_write_text(path: str, text: str, encoding: str = "utf-8") -> None:
    """Write ``text`` to ``path`` atomically (best-effort) using os.replace.

    The text is first written to a sibling ``<path>.tmp`` file and then moved
    over ``path`` with os.replace, so ``path`` is only replaced after the
    temporary file has been written in full.

    Args:
        path: Destination file; its parent directory is created if missing.
        text: Content to write.
        encoding: Text encoding for the file (default "utf-8").

    Raises:
        Any exception raised while writing or replacing is re-raised after the
        temporary file has been removed.
    """
    _ensure_parent_dir(path)
    tmp_path = f"{path}.tmp"
    try:
        # newline="\n" keeps line endings identical across platforms.
        with open(tmp_path, "w", encoding=encoding, newline="\n") as f:
            f.write(text)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a partial temp file behind when the write or the replace
        # fails (e.g. an encoding error or an interrupted kernel); cleanup is best-effort.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise


def _load_json(path: str) -> Dict[str, Any]:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _dump_json(data: Any) -> str:
    return json.dumps(data, ensure_ascii=False, indent=2)


def _get_meta(ast: Dict[str, Any]) -> Dict[str, Any]:
    meta = ast.get("__meta__")
    if isinstance(meta, dict):
        if "rev" not in meta:
            meta["rev"] = 0
        return meta
    meta = {"rev": 0, "updated_at": None}
    ast["__meta__"] = meta
    return meta


def _bump_meta(ast: Dict[str, Any]) -> Dict[str, Any]:
    """Increment the AST revision counter, stamp ``updated_at``, and return the meta dict."""
    meta = _get_meta(ast)
    # A missing or falsy rev counts as 0 before incrementing.
    previous_rev = int(meta.get("rev") or 0)
    meta.update(rev=previous_rev + 1, updated_at=_utc_now_iso())
    return meta


# One-time edit tokens issued by load_meta (held in the notebook kernel's memory).
# token -> {ast_path, scope_kind, scope_value, issued_rev, expires_at}
_EDIT_TOKENS: Dict[str, Dict[str, Any]] = {}
# Lifetime added to the issue time to compute each token's "expires_at".
_EDIT_TOKEN_TTL = timedelta(minutes=10)


def _issue_edit_token(*, ast_path: str, scope_kind: str, scope_value: Any, issued_rev: int) -> str:
    """Register a new edit token in ``_EDIT_TOKENS`` and return it.

    The stored record holds the absolute AST path, the scope kind and value, the
    revision the token was issued against, and an expiry of now + ``_EDIT_TOKEN_TTL``.
    """
    new_token = uuid4().hex
    record: Dict[str, Any] = dict(
        ast_path=os.path.abspath(ast_path),
        scope_kind=scope_kind,
        scope_value=scope_value,
        issued_rev=int(issued_rev),
        expires_at=datetime.now(timezone.utc) + _EDIT_TOKEN_TTL,
    )
    _EDIT_TOKENS[new_token] = record
    return new_token


def _consume_edit_token(*, token: str, ast_path: str, scope_kind: str, scope_value: Any, current_rev: int) -> Optional[str]:
    """Validate an edit token and, if it is valid, remove it from ``_EDIT_TOKENS``.

    Checks run in order: token known, not expired, same AST path, issued against
    the current revision, same scope kind, same scope value.

    Returns:
        None when the token is accepted (and consumed), otherwise an error message.
        An expired token is also removed; any other rejected token stays registered.
    """
    record = _EDIT_TOKENS.get(token)
    if not record:
        return "edit_token is missing or invalid. Call ast_store(action='load_meta', ...) to obtain a fresh token."

    expiry = record.get("expires_at")
    if expiry and datetime.now(timezone.utc) > expiry:
        _EDIT_TOKENS.pop(token, None)
        return "edit_token expired. Call load_meta again."

    # Paths are compared in absolute form, matching how they are stored at issue time.
    if os.path.abspath(ast_path) != record.get("ast_path"):
        return "edit_token does not match ast_path. Call load_meta again."

    token_rev = record.get("issued_rev")
    rev_matches = token_rev is not None and int(token_rev) == int(current_rev)
    if not rev_matches:
        return f"stale token (issued_rev={token_rev}, current_rev={current_rev}). Call load_meta again."

    scope_checks = (
        ("scope_kind", scope_kind, "edit_token scope mismatch. Call load_meta again."),
        ("scope_value", scope_value, "edit_token scope value mismatch. Call load_meta again."),
    )
    for key, expected, message in scope_checks:
        if record.get(key) != expected:
            return message

    # All checks passed: the token is single-use, so drop it now.
    _EDIT_TOKENS.pop(token, None)
    return None


def _normalize_path_indices(path_indices: Optional[List[int]]) -> List[int]:
    if path_indices is None:
        return []
    return list(path_indices)


@dataclass(frozen=True)
class _NodeRef:
    """Frozen record describing a node located in the AST, as returned by _traverse."""
    # The located node itself.
    node: Dict[str, Any]
    # The node whose children list contains `node`; None when the path was empty (root).
    parent: Optional[Dict[str, Any]]
    # Index of `node` within the parent's children; None when the path was empty (root).
    index_in_parent: Optional[int]


def _get_children_list(node: Dict[str, Any]) -> List[Dict[str, Any]]:
    children = node.get("children")
    if children is None:
        children = []
        node["children"] = children
    if not isinstance(children, list):
        raise ValueError("Invalid AST: 'children' must be a list.")
    return children  # type: ignore[return-value]


def _traverse(ast: Dict[str, Any], node_path: List[int]) -> _NodeRef:
    """Follow ``node_path`` down from the root and return the node it reaches.

    Path semantics: ``[]`` is the root, ``[0]`` the first child, ``[0, 2]`` the
    third child of the first child.

    Raises:
        ValueError: if the AST has no ``root`` object or a visited node is not a dict.
        IndexError: if an index is outside the current node's children.
    """
    has_root = "root" in ast and isinstance(ast["root"], dict)
    if not has_root:
        raise ValueError("Invalid AST: missing 'root' object.")

    node = ast["root"]
    holder: Optional[Dict[str, Any]] = None
    slot: Optional[int] = None

    for step in node_path:
        siblings = _get_children_list(node)
        if step < 0 or step >= len(siblings):
            raise IndexError(f"Invalid path index {step}; children length is {len(siblings)}.")
        # Remember where we came from before descending.
        holder, slot = node, step
        node = siblings[step]
        if not isinstance(node, dict):
            raise ValueError("Invalid AST: node must be an object.")

    return _NodeRef(node=node, parent=holder, index_in_parent=slot)


def _make_node(section_title: Optional[str], content_summary: str) -> Dict[str, Any]:
    return {
        "section_title": section_title,
        "content_summary": content_summary,
        "children": [],
    }


def _normalize_title(title: Optional[str]) -> str:
    if title is None:
        return ""
    t = str(title).replace("\u3000", " ")
    t = re.sub(r"\s+", " ", t).strip()
    return t


def _titles_equal(a: Optional[str], b: Optional[str]) -> bool:
    """Return True when both titles are identical after ``_normalize_title``."""
    left = _normalize_title(a)
    right = _normalize_title(b)
    return left == right


def _ensure_titles_path(
    ast: Dict[str, Any],
    titles: List[str],
    *,
    create_missing: bool,
    created_default_summary: str = "",
) -> Tuple[List[int], bool]:
    """Resolve a node by a list of section titles and return ``(node_path, created_any)``.

    Blank titles are ignored. If the first remaining title matches the root's own
    title (after normalisation) it is taken to mean the root and is skipped. Each
    further title is matched against the current node's children by normalised
    title; the first match wins when duplicates exist.

    Args:
        ast: The AST document; must contain a dict under ``root``.
        titles: Section titles from the root downwards.
        create_missing: When True, a missing title is appended as a new child
            instead of raising.
        created_default_summary: ``content_summary`` given to auto-created nodes.

    Raises:
        ValueError: if ``root`` is missing, a title is not found while
            ``create_missing`` is False, or a matched child is not a dict.
    """
    if not ("root" in ast and isinstance(ast["root"], dict)):
        raise ValueError("Invalid AST: missing 'root' object.")

    node = ast["root"]

    # Drop blank entries, then treat a leading root title as "start at root".
    remaining: List[str] = [str(t) for t in titles if str(t).strip() != ""]
    if remaining and _titles_equal(node.get("section_title"), remaining[0]):
        remaining = remaining[1:]

    index_path: List[int] = []
    created_any = False

    for wanted in remaining:
        wanted_norm = _normalize_title(wanted)
        siblings = _get_children_list(node)

        # First child whose normalised title matches, or None when there is no match.
        hit = next(
            (
                pos
                for pos, child in enumerate(siblings)
                if isinstance(child, dict) and _normalize_title(child.get("section_title")) == wanted_norm
            ),
            None,
        )

        if hit is None:
            if not create_missing:
                raise ValueError(f"Node not found for title '{wanted}'.")
            siblings.append(_make_node(wanted, created_default_summary))
            hit = len(siblings) - 1
            created_any = True

        index_path.append(hit)
        node = siblings[hit]
        if not isinstance(node, dict):
            raise ValueError("Invalid AST: node must be an object.")

    return index_path, created_any


def _titles_for_path(ast: Dict[str, Any], node_path: List[int]) -> List[str]:
    """Return the section titles of the nodes visited along ``node_path``.

    The root's own title is not included; a node whose title is None
    contributes an empty string.

    Raises:
        ValueError: if ``root`` is missing or a visited node is not a dict.
        IndexError: if an index is outside the current node's children.
    """
    if not ("root" in ast and isinstance(ast["root"], dict)):
        raise ValueError("Invalid AST: missing 'root' object.")

    collected: List[str] = []
    node = ast["root"]
    for step in node_path:
        siblings = _get_children_list(node)
        if step < 0 or step >= len(siblings):
            raise IndexError(f"Invalid path index {step}; children length is {len(siblings)}.")
        node = siblings[step]
        if not isinstance(node, dict):
            raise ValueError("Invalid AST: node must be an object.")
        raw_title = node.get("section_title")
        collected.append("" if raw_title is None else str(raw_title))
    return collected


def _find_nodes_by_title(
    ast: Dict[str, Any],
    title_query: str,
    *,
    max_results: int,
    case_sensitive: bool,
) -> List[Dict[str, Any]]:
    """Find nodes whose ``section_title`` contains ``title_query`` as a substring.

    The tree is walked depth-first from the root, parents before children, and
    the search stops once ``max_results`` matches have been collected.

    Args:
        ast: The AST document; nothing is searched unless ``root`` is a dict.
        title_query: Substring to look for; an empty query returns no matches.
        max_results: Upper bound on the number of matches returned.
        case_sensitive: When False, both sides are lower-cased before comparing.

    Returns:
        A list of ``{"path": [...], "section_title": ...}`` dicts, where ``path``
        is the index path from the root and ``section_title`` is the stored value.
    """
    if not title_query:
        return []

    needle = title_query if case_sensitive else title_query.lower()
    results: List[Dict[str, Any]] = []

    def walk(node: Dict[str, Any], path: List[int]) -> None:
        if len(results) >= max_results:
            return
        # Titles come from JSON and may not be strings; coerce in BOTH modes so a
        # case-sensitive search cannot raise TypeError on e.g. a numeric title.
        haystack = str(node.get("section_title") or "")
        if not case_sensitive:
            haystack = haystack.lower()
        if needle in haystack:
            results.append({"path": path, "section_title": node.get("section_title")})
            if len(results) >= max_results:
                return

        for i, child in enumerate(_get_children_list(node)):
            if len(results) >= max_results:
                # Cap reached inside a sibling subtree; no need to visit the rest.
                return
            if not isinstance(child, dict):
                continue
            walk(child, path + [i])

    root = ast.get("root")
    if isinstance(root, dict):
        walk(root, [])
    return results


class ASTStoreArgs(BaseModel):
    # Argument schema for the ast_store tool (it is passed as args_schema to @tool).
    # One flat model covers every action; only `action` is required, and fields that
    # a given action does not use keep their defaults.
    action: Literal[
        # read-only
        "load",
        "load_subtree",
        "load_meta",
        "find_by_title",
        "list_children",
        "resolve_path",
        # write
        "init",
        "ensure_path",
        "append_child",
        "append_child_by_titles",
        "upsert_child_by_title",
        "upsert_child_by_titles",
        "update_node",
        "update_node_by_titles",
        "append_to_summary",
        "append_to_summary_by_titles",
    ] = Field(..., description="Operation to perform on the persisted AST.")

    # Location of the JSON file that holds the AST; defaults to "ast_state.json".
    ast_path: str = Field(
        "ast_state.json",
        description="Path to the AST JSON file (will be created/updated).",
    )

    # init
    file_name: Optional[str] = Field(None, description="Document name for the AST (required for init).")
    root_title: Optional[str] = Field(None, description="Optional title for the root node.")
    root_summary: Optional[str] = Field("", description="Root node content_summary.")

    # navigation (index)
    node_path: Optional[List[int]] = Field(None, description="Target node path (index list from root).")
    parent_path: Optional[List[int]] = Field(None, description="Parent node path (index list from root).")

    # navigation (titles)
    node_titles: Optional[List[str]] = Field(None, description="Target node titles path (from root).")
    parent_titles: Optional[List[str]] = Field(None, description="Parent node titles path (from root).")

    # node data
    section_title: Optional[str] = Field(None, description="Section title for new node or updated node.")
    content_summary: Optional[str] = Field(None, description="Section content summary for new node or updated node.")
    append_text: Optional[str] = Field(None, description="Text to append to content_summary.")

    # append options
    position: Optional[int] = Field(None, description="Insert position under parent children. Omit to append at the end.")

    # ensure/resolve options
    create_missing: bool = Field(False, description="If true, create missing nodes for ensure_path.")
    created_default_summary: str = Field("", description="Default summary for auto-created nodes.")

    # find options
    title_query: Optional[str] = Field(None, description="Substring to find in section_title.")
    max_results: int = Field(20, description="Max number of matches for find operations.")
    case_sensitive: bool = Field(False, description="Whether title matching is case-sensitive.")

    # edit guard
    # `purpose` names the write the caller intends to perform next; load_meta records it
    # as the scope kind of the edit token it issues.
    purpose: Optional[Literal[
        "append_child",
        "upsert_child",
        "update_node",
        "append_to_summary",
        "ensure_path",
    ]] = Field(None, description="Required for action=load_meta. Indicates what write you intend next.")

    # Single-use token obtained from a prior load_meta call.
    edit_token: Optional[str] = Field(None, description="One-time token returned by load_meta; required for write actions.")

    # When False, load_meta/list_children return an empty children listing.
    include_children: bool = Field(True, description="For load_meta/list_children: include current children titles and indices.")


@tool(args_schema=ASTStoreArgs)
def ast_store(
    action: str,
    ast_path: str = "ast_state.json",
    file_name: Optional[str] = None,
    root_title: Optional[str] = None,
    root_summary: str = "",
    # navigation (index)
    node_path: Optional[List[int]] = None,
    parent_path: Optional[List[int]] = None,
    # navigation (titles)
    node_titles: Optional[List[str]] = None,
    parent_titles: Optional[List[str]] = None,
    # node data
    section_title: Optional[str] = None,
    content_summary: Optional[str] = None,
    append_text: Optional[str] = None,
    # append options
    position: Optional[int] = None,
    # ensure/resolve options
    create_missing: bool = False,
    created_default_summary: str = "",
    # find options
    title_query: Optional[str] = None,
    max_results: int = 20,
    case_sensitive: bool = False,
    # edit guard
    purpose: Optional[str] = None,
    edit_token: Optional[str] = None,
    include_children: bool = True,
) -> str:
    """Persisted AST editor. Always reads current AST from disk and writes back immediately."""
    try:
        node_path_n = _normalize_path_indices(node_path)
        parent_path_n = _normalize_path_indices(parent_path)

        # init
        if action == "init":
            if not file_name:
                return _dump_json({"ok": False, "error": "file_name is required for action=init"})

            now = _utc_now_iso()
            ast: Dict[str, Any] = {
                "file_name": file_name,
                "__meta__": {"rev": 0, "updated_at": now},
                "root": _make_node(root_title or file_name, root_summary or ""),
            }
            _atomic_write_text(ast_path, _dump_json(ast))
            return _dump_json(
                {
                    "ok": True,
                    "action": "init",
                    "ast_path": ast_path,
                    "file_name": file_name,
                    "rev": 0,
                    "updated_at": now,
                }
            )

        # other actions require existing file
        if not os.path.exists(ast_path):
            return _dump_json(
                {
                    "ok": False,
                    "error": f"AST file not found: {ast_path}. Call action=init first.",
                }
            )

        ast = _load_json(ast_path)
        meta = _get_meta(ast)
        current_rev = int(meta.get("rev") or 0)

        if action == "load":
            return _dump_json({"ok": True, "action": "load", "rev": current_rev, "updated_at": meta.get("updated_at"), "ast": ast})

        if action == "load_subtree":
            ref = _traverse(ast, node_path_n)
            return _dump_json(
                {
                    "ok": True,
                    "action": "load_subtree",
                    "rev": current_rev,
                    "updated_at": meta.get("updated_at"),
                    "node_path": node_path_n,
                    "node": ref.node,
                }
            )

        if action == "resolve_path":
            if not node_titles:
                return _dump_json({"ok": False, "error": "node_titles is required for action=resolve_path"})
            path, _created = _ensure_titles_path(
                ast,
                node_titles,
                create_missing=False,
                created_default_summary=created_default_summary,
            )
            return _dump_json(
                {
                    "ok": True,
                    "action": "resolve_path",
                    "rev": current_rev,
                    "updated_at": meta.get("updated_at"),
                    "node_titles": node_titles,
                    "node_path": path,
                }
            )

        if action == "list_children":
            if node_titles:
                path, _created = _ensure_titles_path(
                    ast,
                    node_titles,
                    create_missing=False,
                    created_default_summary=created_default_summary,
                )
            else:
                path = node_path_n
            ref = _traverse(ast, path)
            children = _get_children_list(ref.node)
            children_info = []
            if include_children:
                for i, ch in enumerate(children):
                    if isinstance(ch, dict):
                        children_info.append({"index": i, "section_title": ch.get("section_title")})
            return _dump_json(
                {
                    "ok": True,
                    "action": "list_children",
                    "rev": current_rev,
                    "updated_at": meta.get("updated_at"),
                    "node_path": path,
                    "node_titles": _titles_for_path(ast, path),
                    "children": children_info,
                }
            )

        if action == "load_meta":
            if not purpose:
                return _dump_json({"ok": False, "error": "purpose is required for action=load_meta"})

            # resolve scope node (the node you intend to edit)
            if node_titles:
                scope_path, _created = _ensure_titles_path(
                    ast,
                    node_titles,
                    create_missing=False,
                    created_default_summary=created_default_summary,
                )
            else:
                scope_path = node_path_n

            scope_ref = _traverse(ast, scope_path)
            children_info = []
            if include_children:
                for i, ch in enumerate(_get_children_list(scope_ref.node)):
                    if isinstance(ch, dict):
                        children_info.append({"index": i, "section_title": ch.get("section_title")})

            token = _issue_edit_token(
                ast_path=ast_path,
                scope_kind=str(purpose),
                scope_value={"node_path": scope_path},
                issued_rev=current_rev,
            )

            return _dump_json(
                {
                    "ok": True,
                    "action": "load_meta",
                    "rev": current_rev,
                    "updated_at": meta.get("updated_at"),
                    "purpose": purpose,
                    "node_path": scope_path,
                    "node_titles": _titles_for_path(ast, scope_path),
                    "children": children_info,
                    "edit_token": token,
                }
            )

        if action == "find_by_title":
            q = title_query or ""
            matches = _find_nodes_by_title(
                ast,
                q,
                max_results=max(1, int(max_results)),
                case_sensitive=bool(case_sensitive),
            )
            return _dump_json(
                {
                    "ok": True,
                    "action": "find_by_title",
                    "rev": current_rev,
                    "updated_at": meta.get("updated_at"),
                    "title_query": q,
                    "matches": matches,
                }
            )

        # --- write actions (require edit_token issued by load_meta) ---
        if action == "ensure_path":
            if not node_titles:
                return _dump_json({"ok": False, "error": "node_titles is required for action=ensure_path", "rev": current_rev})

            token_err = _consume_edit_token(
                token=edit_token or "",
                ast_path=ast_path,
                scope_kind="ensure_path",
                scope_value={"node_path": []},
                current_rev=current_rev,
            )
            if token_err:
                return _dump_json({"ok": False, "error": token_err, "rev": current_rev, "updated_at": meta.get("updated_at")})

            path, created_any = _ensure_titles_path(
                ast,
                node_titles,
                create_missing=bool(create_missing),
                created_default_summary=created_default_summary,
            )

            if created_any:
                new_meta = _bump_meta(ast)
                _atomic_write_text(ast_path, _dump_json(ast))
                return _dump_json(
                    {
                        "ok": True,
                        "action": "ensure_path",
                        "rev": int(new_meta.get("rev") or 0),
                        "updated_at": new_meta.get("updated_at"),
                        "node_titles": node_titles,
                        "node_path": path,
                        "created": True,
                    }
                )

            # no change
            return _dump_json(
                {
                    "ok": True,
                    "action": "ensure_path",
                    "rev": current_rev,
                    "updated_at": meta.get("updated_at"),
                    "node_titles": node_titles,
                    "node_path": path,
                    "created": False,
                }
            )

        if action == "append_child_by_titles":
            if content_summary is None:
                return _dump_json({"ok": False, "error": "content_summary is required for action=append_child_by_titles", "rev": current_rev})
            if not parent_titles:
                return _dump_json({"ok": False, "error": "parent_titles is required for action=append_child_by_titles", "rev": current_rev})

            parent_path_resolved, _created = _ensure_titles_path(
                ast,
                parent_titles,
                create_missing=False,
                created_default_summary=created_default_summary,
            )

            token_err = _consume_edit_token(
                token=edit_token or "",
                ast_path=ast_path,
                scope_kind="append_child",
                scope_value={"node_path": parent_path_resolved},
                current_rev=current_rev,
            )
            if token_err:
                return _dump_json({"ok": False, "error": token_err, "rev": current_rev, "updated_at": meta.get("updated_at")})

            parent_ref = _traverse(ast, parent_path_resolved)
            children = _get_children_list(parent_ref.node)

            new_node = _make_node(section_title, content_summary)
            if position is None:
                children.append(new_node)
                new_index = len(children) - 1
            else:
                pos = int(position)
                if pos < 0 or pos > len(children):
                    return _dump_json({"ok": False, "error": f"position out of range: {pos} (0..{len(children)})", "rev": current_rev})
                children.insert(pos, new_node)
                new_index = pos

            new_meta = _bump_meta(ast)
            _atomic_write_text(ast_path, _dump_json(ast))
            return _dump_json(
                {
                    "ok": True,
                    "action": "append_child_by_titles",
                    "rev": int(new_meta.get("rev") or 0),
                    "updated_at": new_meta.get("updated_at"),
                    "parent_titles": parent_titles,
                    "parent_path": parent_path_resolved,
                    "new_node_path": parent_path_resolved + [new_index],
                }
            )

        if action == "upsert_child_by_titles":
            if not section_title:
                return _dump_json({"ok": False, "error": "section_title is required for action=upsert_child_by_titles", "rev": current_rev})
            if content_summary is None:
                return _dump_json({"ok": False, "error": "content_summary is required for action=upsert_child_by_titles", "rev": current_rev})
            if not parent_titles:
                return _dump_json({"ok": False, "error": "parent_titles is required for action=upsert_child_by_titles", "rev": current_rev})

            parent_path_resolved, _created = _ensure_titles_path(
                ast,
                parent_titles,
                create_missing=False,
                created_default_summary=created_default_summary,
            )

            token_err = _consume_edit_token(
                token=edit_token or "",
                ast_path=ast_path,
                scope_kind="upsert_child",
                scope_value={"node_path": parent_path_resolved},
                current_rev=current_rev,
            )
            if token_err:
                return _dump_json({"ok": False, "error": token_err, "rev": current_rev, "updated_at": meta.get("updated_at")})

            parent_ref = _traverse(ast, parent_path_resolved)
            children = _get_children_list(parent_ref.node)

            found_index: Optional[int] = None
            target_norm = _normalize_title(section_title)
            for i, child in enumerate(children):
                if not isinstance(child, dict):
                    continue
                if _normalize_title(child.get("section_title")) == target_norm:
                    found_index = i
                    break

            if found_index is None:
                children.append(_make_node(section_title, content_summary))
                found_index = len(children) - 1
                op = "created"
            else:
                child = children[found_index]
                existing = str(child.get("content_summary") or "")
                if existing:
                    child["content_summary"] = existing.rstrip() + "\n" + str(content_summary).lstrip()
                else:
                    child["content_summary"] = str(content_summary)
                op = "appended"

            new_meta = _bump_meta(ast)
            _atomic_write_text(ast_path, _dump_json(ast))
            return _dump_json(
                {
                    "ok": True,
                    "action": "upsert_child_by_titles",
                    "rev": int(new_meta.get("rev") or 0),
                    "updated_at": new_meta.get("updated_at"),
                    "parent_titles": parent_titles,
                    "parent_path": parent_path_resolved,
                    "node_path": parent_path_resolved + [found_index],
                    "op": op,
                }
            )

        if action == "update_node_by_titles":
            if not node_titles:
                return _dump_json({"ok": False, "error": "node_titles is required for action=update_node_by_titles", "rev": current_rev})
            if section_title is None and content_summary is None:
                return _dump_json({"ok": False, "error": "section_title and/or content_summary must be provided for action=update_node_by_titles", "rev": current_rev})

            node_path_resolved, _created = _ensure_titles_path(
                ast,
                node_titles,
                create_missing=False,
                created_default_summary=created_default_summary,
            )

            token_err = _consume_edit_token(
                token=edit_token or "",
                ast_path=ast_path,
                scope_kind="update_node",
                scope_value={"node_path": node_path_resolved},
                current_rev=current_rev,
            )
            if token_err:
                return _dump_json({"ok": False, "error": token_err, "rev": current_rev, "updated_at": meta.get("updated_at")})

            ref = _traverse(ast, node_path_resolved)
            if section_title is not None:
                ref.node["section_title"] = section_title
            if content_summary is not None:
                ref.node["content_summary"] = content_summary

            new_meta = _bump_meta(ast)
            _atomic_write_text(ast_path, _dump_json(ast))
            return _dump_json(
                {
                    "ok": True,
                    "action": "update_node_by_titles",
                    "rev": int(new_meta.get("rev") or 0),
                    "updated_at": new_meta.get("updated_at"),
                    "node_titles": node_titles,
                    "node_path": node_path_resolved,
                }
            )

        if action == "append_to_summary_by_titles":
            if not node_titles:
                return _dump_json({"ok": False, "error": "node_titles is required for action=append_to_summary_by_titles", "rev": current_rev})
            if append_text is None:
                return _dump_json({"ok": False, "error": "append_text is required for action=append_to_summary_by_titles", "rev": current_rev})

            node_path_resolved, _created = _ensure_titles_path(
                ast,
                node_titles,
                create_missing=False,
                created_default_summary=created_default_summary,
            )

            token_err = _consume_edit_token(
                token=edit_token or "",
                ast_path=ast_path,
                scope_kind="append_to_summary",
                scope_value={"node_path": node_path_resolved},
                current_rev=current_rev,
            )
            if token_err:
                return _dump_json({"ok": False, "error": token_err, "rev": current_rev, "updated_at": meta.get("updated_at")})

            ref = _traverse(ast, node_path_resolved)
            existing = str(ref.node.get("content_summary") or "")
            if existing:
                ref.node["content_summary"] = existing.rstrip() + "\n" + str(append_text).lstrip()
            else:
                ref.node["content_summary"] = str(append_text)

            new_meta = _bump_meta(ast)
            _atomic_write_text(ast_path, _dump_json(ast))
            return _dump_json(
                {
                    "ok": True,
                    "action": "append_to_summary_by_titles",
                    "rev": int(new_meta.get("rev") or 0),
                    "updated_at": new_meta.get("updated_at"),
                    "node_titles": node_titles,
                    "node_path": node_path_resolved,
                }
            )

        # --- legacy index-based write actions (also token-guarded) ---
        if action == "append_child":
            if content_summary is None:
                return _dump_json({"ok": False, "error": "content_summary is required for action=append_child", "rev": current_rev})

            token_err = _consume_edit_token(
                token=edit_token or "",
                ast_path=ast_path,
                scope_kind="append_child",
                scope_value={"node_path": parent_path_n},
                current_rev=current_rev,
            )
            if token_err:
                return _dump_json({"ok": False, "error": token_err, "rev": current_rev, "updated_at": meta.get("updated_at")})

            parent_ref = _traverse(ast, parent_path_n)
            children = _get_children_list(parent_ref.node)

            new_node = _make_node(section_title, content_summary)
            if position is None:
                children.append(new_node)
                new_index = len(children) - 1
            else:
                pos = int(position)
                if pos < 0 or pos > len(children):
                    return _dump_json({"ok": False, "error": f"position out of range: {pos} (0..{len(children)})", "rev": current_rev})
                children.insert(pos, new_node)
                new_index = pos

            new_meta = _bump_meta(ast)
            _atomic_write_text(ast_path, _dump_json(ast))
            return _dump_json(
                {
                    "ok": True,
                    "action": "append_child",
                    "rev": int(new_meta.get("rev") or 0),
                    "updated_at": new_meta.get("updated_at"),
                    "parent_path": parent_path_n,
                    "new_node_path": parent_path_n + [new_index],
                }
            )

        if action == "upsert_child_by_title":
            if not section_title:
                return _dump_json({"ok": False, "error": "section_title is required for action=upsert_child_by_title", "rev": current_rev})
            if content_summary is None:
                return _dump_json({"ok": False, "error": "content_summary is required for action=upsert_child_by_title", "rev": current_rev})

            token_err = _consume_edit_token(
                token=edit_token or "",
                ast_path=ast_path,
                scope_kind="upsert_child",
                scope_value={"node_path": parent_path_n},
                current_rev=current_rev,
            )
            if token_err:
                return _dump_json({"ok": False, "error": token_err, "rev": current_rev, "updated_at": meta.get("updated_at")})

            parent_ref = _traverse(ast, parent_path_n)
            children = _get_children_list(parent_ref.node)

            found_index: Optional[int] = None
            target_norm = _normalize_title(section_title)
            for i, child in enumerate(children):
                if not isinstance(child, dict):
                    continue
                if _normalize_title(child.get("section_title")) == target_norm:
                    found_index = i
                    break

            if found_index is None:
                children.append(_make_node(section_title, content_summary))
                found_index = len(children) - 1
                op = "created"
            else:
                child = children[found_index]
                existing = str(child.get("content_summary") or "")
                if existing:
                    child["content_summary"] = existing.rstrip() + "\n" + str(content_summary).lstrip()
                else:
                    child["content_summary"] = str(content_summary)
                op = "appended"

            new_meta = _bump_meta(ast)
            _atomic_write_text(ast_path, _dump_json(ast))
            return _dump_json(
                {
                    "ok": True,
                    "action": "upsert_child_by_title",
                    "rev": int(new_meta.get("rev") or 0),
                    "updated_at": new_meta.get("updated_at"),
                    "parent_path": parent_path_n,
                    "node_path": parent_path_n + [found_index],
                    "op": op,
                }
            )

        if action == "update_node":
            if section_title is None and content_summary is None:
                return _dump_json({"ok": False, "error": "section_title and/or content_summary must be provided for action=update_node", "rev": current_rev})

            token_err = _consume_edit_token(
                token=edit_token or "",
                ast_path=ast_path,
                scope_kind="update_node",
                scope_value={"node_path": node_path_n},
                current_rev=current_rev,
            )
            if token_err:
                return _dump_json({"ok": False, "error": token_err, "rev": current_rev, "updated_at": meta.get("updated_at")})

            ref = _traverse(ast, node_path_n)
            if section_title is not None:
                ref.node["section_title"] = section_title
            if content_summary is not None:
                ref.node["content_summary"] = content_summary

            new_meta = _bump_meta(ast)
            _atomic_write_text(ast_path, _dump_json(ast))
            return _dump_json(
                {
                    "ok": True,
                    "action": "update_node",
                    "rev": int(new_meta.get("rev") or 0),
                    "updated_at": new_meta.get("updated_at"),
                    "node_path": node_path_n,
                }
            )

        if action == "append_to_summary":
            if append_text is None:
                return _dump_json({"ok": False, "error": "append_text is required for action=append_to_summary", "rev": current_rev})

            token_err = _consume_edit_token(
                token=edit_token or "",
                ast_path=ast_path,
                scope_kind="append_to_summary",
                scope_value={"node_path": node_path_n},
                current_rev=current_rev,
            )
            if token_err:
                return _dump_json({"ok": False, "error": token_err, "rev": current_rev, "updated_at": meta.get("updated_at")})

            ref = _traverse(ast, node_path_n)
            existing = str(ref.node.get("content_summary") or "")
            if existing:
                ref.node["content_summary"] = existing.rstrip() + "\n" + str(append_text).lstrip()
            else:
                ref.node["content_summary"] = str(append_text)

            new_meta = _bump_meta(ast)
            _atomic_write_text(ast_path, _dump_json(ast))
            return _dump_json(
                {
                    "ok": True,
                    "action": "append_to_summary",
                    "rev": int(new_meta.get("rev") or 0),
                    "updated_at": new_meta.get("updated_at"),
                    "node_path": node_path_n,
                }
            )

        return _dump_json({"ok": False, "error": f"Unknown action: {action}"})

    except Exception as e:
        return _dump_json({"ok": False, "error": str(e)})




## 2. Structured Output Definition

In [40]:
class DocumentNode(BaseModel):
    # One section of the analyzed document. Nodes nest recursively through
    # `children`, so a whole document is a tree of DocumentNode objects.
    # (Plain comments are used instead of a docstring so the generated
    # schema is left untouched.)

    # Heading text of the section; may be absent.
    section_title: Optional[str] = Field(
        default=None,
        description="The title of the section",
    )
    # Required free-text summary of what the section contains.
    content_summary: str = Field(
        default=...,
        description="A brief summary of the content in this section",
    )
    # Nested sections; every instance gets its own fresh list.
    children: List["DocumentNode"] = Field(
        default_factory=list,
        description="Sub-sections or nested content",
    )


class DocumentAST(BaseModel):
    # Top-level shape used to validate the AST JSON after it is read back
    # from disk. The persisted file also carries a `__meta__` entry that this
    # model does not declare.

    # Name of the analyzed file.
    file_name: str = Field(
        default=...,
        description="Name of the file analyzed",
    )
    # Root of the section tree.
    root: DocumentNode = Field(
        default=...,
        description="The root node of the document structure",
    )


class AgentResult(BaseModel):
    # Small final answer the agent returns (it is passed to create_agent as
    # response_format). It only points at the persisted AST file; the AST
    # itself is never carried in the response.

    # Outcome marker such as "ok".
    status: str = Field(
        default=...,
        description="Execution status, e.g. 'ok'",
    )
    # Location of the AST JSON file on disk.
    ast_path: str = Field(
        default=...,
        description="Path to the persisted AST JSON",
    )
    # Optional free-form remark from the agent.
    note: Optional[str] = Field(
        default=None,
        description="Optional message",
    )


# pydantic v2: resolve the forward reference DocumentNode -> "DocumentNode"
# (and rebuild the models that depend on it).
for _model in (DocumentNode, DocumentAST, AgentResult):
    _model.model_rebuild()


# Tools exposed to the agent: file readers plus the on-disk AST store.
tools = [
    read_text_segment,
    read_text_file,
    extract_regex_matches,
    get_file_length,
    ast_store,
]

# Context-editing rule that clears older tool outputs from the conversation.
# See the LangChain middleware reference for the exact meaning of
# `trigger` / `keep`.
_clear_old_tool_outputs = ClearToolUsesEdit(
    trigger=5000,
    keep=3,
)

# Disabled alternative kept for reference:
# SummarizationMiddleware(
#         model="gpt-5.2",
#         trigger=("tokens", 100000),
#         keep=("messages", 5),
#     ),
middleware = [
    ContextEditingMiddleware(edits=[_clear_old_tool_outputs]),
    TodoListMiddleware(),
]

## 3. Agent Setup

In [41]:
# Initialize LLM (OpenAI or Azure OpenAI)
# build_llm() picks the provider and credentials from environment variables.
llm = build_llm()

# System prompt for the agent. It is sent to the model verbatim (after
# .strip()), so any edit to the text below changes agent behavior.
# It covers: todo planning, AST initialization, the token-guarded write
# protocol (load_meta -> exactly one write), title-path operations,
# read-only inspection actions, heading-detection guidelines, tool usage
# and the shape of the final answer.
system_prompt = """
You are an intelligent document analysis agent.
Your goal is to analyze a text file and reconstruct its structure as an Abstract Syntax Tree (AST).

Important:
- Do NOT rely on chat memory to hold the whole AST.
- Use the ast_store tool to persist the AST on disk incrementally.

Workflow (IMPORTANT: write operations are token-guarded):

Planning / Todos (MANDATORY):
- At the start of this run, call write_todos to create a short task plan (2-8 items).
- As you execute, keep the todo list accurate: mark one item in_progress, then completed, and move to the next.
- If you discover new required work, add it to the todo list and continue tracking until all tasks are completed.

AST Initialization:
- Check if AST file exists using ast_store(action="load", ast_path=...).
- If AST is empty or missing, initialize it using ast_store(action="init", file_name=..., root_title=..., root_summary=...).
- Then AUTONOMOUSLY scan the document to detect ALL heading patterns and build the complete AST structure.
- Before ANY write (append/update), you MUST call:
  - ast_store(action="load_meta", ast_path=..., purpose=..., node_titles=[...])
  This returns a one-time edit_token scoped to that exact node and current revision.
- Then perform exactly ONE write using that edit_token.
  (If you need another write, call load_meta again.)

Prefer title-path operations (avoid index miscounts):
- Add a new child under an existing parent:
  - load_meta: ast_store(action="load_meta", purpose="append_child", node_titles=["第一部【企業情報】"])  # parent
  - write: ast_store(action="append_child_by_titles", parent_titles=["第一部【企業情報】"], section_title="第５【…】", content_summary="...", edit_token="...")
- Upsert a child (append to summary if same title exists under parent):
  - load_meta: ast_store(action="load_meta", purpose="upsert_child", node_titles=["第一部【企業情報】","第１【企業の概況】"])  # parent
  - write: ast_store(action="upsert_child_by_titles", parent_titles=[...], section_title="１【…】", content_summary="...", edit_token="...")
- Update an existing node:
  - load_meta: ast_store(action="load_meta", purpose="update_node", node_titles=["第一部【企業情報】","第１【企業の概況】","１【…】"])  # node
  - write: ast_store(action="update_node_by_titles", node_titles=[...], content_summary="...", edit_token="...")
- Append to an existing node's summary:
  - load_meta: ast_store(action="load_meta", purpose="append_to_summary", node_titles=[...])
  - write: ast_store(action="append_to_summary_by_titles", node_titles=[...], append_text="...", edit_token="...")

Inspect current state (read-only):
- ast_store(action="list_children", node_titles=[...])
- ast_store(action="load_subtree", node_path=[...])
- ast_store(action="find_by_title", title_query="...")
- ast_store(action="resolve_path", node_titles=[...])

Guidelines:
- AUTONOMOUSLY detect heading patterns in the document. Do NOT assume any specific format (EDINET, Markdown, etc.).
- Read the file in chunks using read_text_file to identify heading patterns:
  * Common patterns include: Markdown (#, ##, ###), numbered sections (1., 1.1, 1.1.1), bracketed headings (【...】), chapter titles (第X章, Chapter X), etc.
  * Look for visual indicators: lines that stand out, repeated patterns, indentation, etc.
  * Consider document-specific conventions (e.g., "独立監査人の監査報告書" might be a major section even without standard formatting).
- Build the AST hierarchy based on detected headings, maintaining proper parent-child relationships.
- Summarize the content under each heading concisely.
- If no AST exists, initialize it first using ast_store(action="init", file_name=..., root_title=..., root_summary=...).

Tool Usage:
- read_text_file reads only 100 characters by default (when start and length are omitted) to avoid exceeding context window limits.
- Use read_text_file with start and length parameters to read specific segments of large files.
- Only include the start and length parameters when you need to read a specific portion of the file.
- Use get_file_length first to determine file size, then read in chunks as needed.

Final Output:
- Return AgentResult with status="ok" and the provided ast_path.
- Do NOT output the whole AST in the final response.
""".strip()

# Assemble the tool-calling agent from the model, tools, prompt and middleware.
# response_format=AgentResult requests a structured final answer; the
# execution cell reads it from result["structured_response"].
# NOTE(review): debug=True presumably enables the verbose "[values]" state
# trace seen in the execution cell's output -- confirm, and consider turning
# it off before sharing the notebook.
agent = create_agent(
    model=llm,
    tools=tools,
    system_prompt=system_prompt,
    response_format=AgentResult,
    middleware=middleware,
    debug=True,
)

## 4. Execution

In [42]:
import json
from datetime import datetime, timezone

# target_file = "sample.txt"
target_file = "富士フィルム_有価証券報告書.pdf"

# PDF input is converted once into a sibling UTF-8 .txt file; from here on
# `target_file` refers to that text file, which is what the agent's tools read.
if target_file.lower().endswith(".pdf"):
    # PyMuPDF (fitz) is used because it extracts Japanese text correctly.
    import fitz  # PyMuPDF

    # The context manager closes the document even if text extraction raises.
    with fitz.open(target_file) as doc:
        text_content = [page.get_text() for page in doc]

    # Swap only the trailing extension. str.replace(".pdf", ".txt") would also
    # rewrite a ".pdf" occurring earlier in the path and miss ".PDF".
    target_file = os.path.splitext(target_file)[0] + ".txt"
    with open(target_file, "w", encoding="utf-8") as f:
        f.write("\n".join(text_content))

# The AST is persisted to a file on every step (no reliance on LLM memory).
ast_path = f"{target_file}.ast.json"

# If no AST file exists yet, seed an empty one; the agent then detects the
# headings autonomously and builds the tree.
if not os.path.exists(ast_path):
    # Root title = file name without directory and without a trailing
    # .txt/.pdf extension, consistent with `file_name` below.
    _stem, _ext = os.path.splitext(os.path.basename(target_file))
    root_title = _stem if _ext.lower() in {".txt", ".pdf"} else _stem + _ext
    ast_init = {
        "file_name": os.path.basename(target_file),
        "__meta__": {"rev": 0, "updated_at": datetime.now(timezone.utc).isoformat()},
        "root": {
            "section_title": root_title,
            "content_summary": "",
            "children": []
        }
    }
    with open(ast_path, "w", encoding="utf-8") as f:
        json.dump(ast_init, f, ensure_ascii=False, indent=2)

# User message: restates the token-guarded write protocol and names the
# concrete input file and AST path for this run.
query = (
    f"Analyze the file '{target_file}'. "
    f"AUTONOMOUSLY detect ALL heading patterns in the document (do NOT assume any specific format). "
    f"Build and persist the complete AST structure in '{ast_path}' using the ast_store tool. "
    "IMPORTANT: All write operations are token-guarded. Before each write, you MUST call "
    "ast_store(action='load_meta', purpose=..., node_titles=[...]) to obtain a one-time edit_token, "
    "then perform exactly ONE write using that edit_token. "
    "Prefer title-path operations to avoid index miscounts: append_child_by_titles / upsert_child_by_titles / "
    "update_node_by_titles / append_to_summary_by_titles. "
    "Use list_children/find_by_title/resolve_path to confirm current AST state before deciding parents. "
    "Read the file in chunks to identify heading patterns - look for Markdown (#, ##), numbered sections (1., 1.1), "
    "bracketed headings (【...】), chapter titles, and any other visual indicators of document structure. "
    "Do NOT try to output the whole AST in your final response. "
    f"Finally return AgentResult with status='ok' and ast_path='{ast_path}'."
)

inputs = {"messages": [{"role": "user", "content": query}]}
result = agent.invoke(inputs)

# Structured final answer (AgentResult) produced by the agent, if any.
agent_status = result.get("structured_response")

# The actual AST is read back from the file, so a large tree never has to
# travel through the model's response.
with open(ast_path, "r", encoding="utf-8") as f:
    ast_data = json.load(f)

# Prefer the validated, normalized dump; fall back to the raw JSON (best
# effort) but say why, instead of silently hiding a malformed AST.
try:
    ast_obj = DocumentAST.model_validate(ast_data)
    ast_json = json.dumps(ast_obj.model_dump(), ensure_ascii=False, indent=2)
except Exception as exc:
    print(f"AST did not validate against DocumentAST; showing raw JSON instead: {exc}")
    ast_json = json.dumps(ast_data, ensure_ascii=False, indent=2)

[1m[values][0m {'messages': [HumanMessage(content="Analyze the file '富士フィルム_有価証券報告書.txt'. AUTONOMOUSLY detect ALL heading patterns in the document (do NOT assume any specific format). Build and persist the complete AST structure in '富士フィルム_有価証券報告書.txt.ast.json' using the ast_store tool. IMPORTANT: All write operations are token-guarded. Before each write, you MUST call ast_store(action='load_meta', purpose=..., node_titles=[...]) to obtain a one-time edit_token, then perform exactly ONE write using that edit_token. Prefer title-path operations to avoid index miscounts: append_child_by_titles / upsert_child_by_titles / update_node_by_titles / append_to_summary_by_titles. Use list_children/find_by_title/resolve_path to confirm current AST state before deciding parents. Read the file in chunks to identify heading patterns - look for Markdown (#, ##), numbered sections (1., 1.1), bracketed headings (【...】), chapter titles, and any other visual indicators of document structure. Do NOT try

In [43]:
from langchain_core.messages import AIMessage, ToolMessage, HumanMessage

# Index every tool result by the id of the tool call it answers.
_tool_messages = {}
for message in result["messages"]:
    if isinstance(message, ToolMessage):
        _tool_messages[message.tool_call_id] = message

# Flatten the conversation into plain dict records, in message order.
all_logs = []
for position, message in enumerate(result["messages"]):
    if isinstance(message, HumanMessage):
        all_logs.append(
            {"type": "human_message", "index": position, "content": message.content}
        )
        continue

    if isinstance(message, AIMessage):
        if message.tool_calls:
            # One record per requested tool call, paired with its result
            # when one was recorded.
            for call in message.tool_calls:
                tool_msg = _tool_messages.get(call.get("id"))
                if tool_msg:
                    call_output = getattr(tool_msg, "content", None)
                    call_status = getattr(tool_msg, "status", None)
                else:
                    call_output = None
                    call_status = None
                all_logs.append(
                    {
                        "type": "tool_call",
                        "index": position,
                        "tool_name": call.get("name"),
                        "args": call.get("args"),
                        "tool_call_id": call.get("id"),
                        # Keep the AI's accompanying text as well.
                        "ai_content": message.content,
                        "output": call_output,
                        "status": call_status,
                    }
                )
        else:
            # AI message without tool calls: plain model text.
            all_logs.append(
                {"type": "ai_thought", "index": position, "content": message.content}
            )
        continue

    if isinstance(message, ToolMessage):
        # The payload already appears in the matching tool_call record, so
        # only the id and status are logged here.
        all_logs.append(
            {
                "type": "tool_result",
                "index": position,
                "tool_call_id": message.tool_call_id,
                # "content": message.content,
                "status": getattr(message, "status", None),
            }
        )

In [44]:
import json

# Dump each log record as pretty-printed JSON (non-ASCII kept readable),
# followed by a 50-dash separator line.
_separator = "-" * 50
for entry in all_logs:
    rendered = json.dumps(entry, ensure_ascii=False, indent=2)
    print(rendered)
    print(_separator)

{
  "type": "human_message",
  "index": 0,
  "content": "Analyze the file '富士フィルム_有価証券報告書.txt'. AUTONOMOUSLY detect ALL heading patterns in the document (do NOT assume any specific format). Build and persist the complete AST structure in '富士フィルム_有価証券報告書.txt.ast.json' using the ast_store tool. IMPORTANT: All write operations are token-guarded. Before each write, you MUST call ast_store(action='load_meta', purpose=..., node_titles=[...]) to obtain a one-time edit_token, then perform exactly ONE write using that edit_token. Prefer title-path operations to avoid index miscounts: append_child_by_titles / upsert_child_by_titles / update_node_by_titles / append_to_summary_by_titles. Use list_children/find_by_title/resolve_path to confirm current AST state before deciding parents. Read the file in chunks to identify heading patterns - look for Markdown (#, ##), numbered sections (1., 1.1), bracketed headings (【...】), chapter titles, and any other visual indicators of document structure. Do NOT

In [45]:
# Show the AST JSON string prepared in the execution cell (the validated dump
# when validation succeeded, otherwise the raw file contents).
print(ast_json)

{
  "file_name": "富士フィルム_有価証券報告書.txt",
  "root": {
    "section_title": "富士フィルム_有価証券報告書",
    "content_summary": "",
    "children": [
      {
        "section_title": "【表紙】",
        "content_summary": "",
        "children": []
      },
      {
        "section_title": "【提出書類】",
        "content_summary": "",
        "children": []
      },
      {
        "section_title": "【根拠条文】",
        "content_summary": "",
        "children": []
      },
      {
        "section_title": "【提出先】",
        "content_summary": "",
        "children": []
      },
      {
        "section_title": "【提出日】",
        "content_summary": "",
        "children": []
      },
      {
        "section_title": "【事業年度】",
        "content_summary": "",
        "children": []
      },
      {
        "section_title": "【会社名】",
        "content_summary": "",
        "children": []
      },
      {
        "section_title": "【英訳名】",
        "content_summary": "",
        "children": []
      },
      {
        "sectio