# Page Refinement Demo (Plan / Jsonlist)
This notebook lets you pick a doctree file and a page index, then run a small end-to-end test of page-level refinement using the new function APIs.

- Plan path: Model returns a plan `{page_idx, items, merges?, virtual_titles?}`.
- Jsonlist path: Model returns an array `[{node_id, level}]` for reorder+relevel only.

No network calls are included here; to simulate the flow, paste a model response into the `RAW` variable in section 3.

In [1]:
import json, os, glob
from pprint import pprint

from tree import (
    build_page_payload,
    render_plan_prompt, render_jsonlist_prompt,
    validate_and_normalize_plan, validate_and_normalize_jsonlist,
    apply_plan_to_document, apply_nodes_to_document,
)

def load_doctree(path):
    """Read the UTF-8 JSON file at ``path`` and return the parsed object."""
    with open(path, mode='r', encoding='utf-8') as doctree_file:
        parsed = json.load(doctree_file)
    return parsed

def save_doctree(obj, path):
    """Serialize ``obj`` as JSON to ``path`` (UTF-8, 2-space indent, non-ASCII kept as-is)."""
    dump_options = {'ensure_ascii': False, 'indent': 2}
    with open(path, mode='w', encoding='utf-8') as out_file:
        json.dump(obj, out_file, **dump_options)

def page_ids(elements):
    """Return the ``node_id`` of every element, in input order."""
    collected = []
    for element in elements:
        collected.append(element['node_id'])
    return collected

def types_by_id(elements):
    """Map each element's ``node_id`` to its ``type``; a later duplicate id overwrites an earlier one."""
    mapping = {}
    for element in elements:
        # Look up the key before the value, matching dict-comprehension evaluation order.
        node_id = element['node_id']
        mapping[node_id] = element['type']
    return mapping


## 1) Choose doctree file and page index

In [2]:
# Adjust this glob to your dataset root if needed.
# glob returns matches in a filesystem-dependent order, so sort them to make
# candidates[0] pick the same file on every run and every machine.
candidates = sorted(glob.glob('**/doctree.json', recursive=True))
print('Found doctree files:', len(candidates))
for i, p in enumerate(candidates[:10]):
    print(i, p)

# Select one file and a page index here
DOCTREE_PATH = candidates[0] if candidates else ''
PAGE_IDX = 0
print('Using doctree:', DOCTREE_PATH, 'page:', PAGE_IDX)
if not DOCTREE_PATH:
    # An empty path cannot be opened by the later cells; say so up front.
    print('No doctree.json found; set DOCTREE_PATH manually before running the next cells.')


Found doctree files: 0
Using doctree:  page: 0


## 2) Build payload and inspect elements

In [None]:
# Fail early with an actionable message rather than the bare error from open('').
if not DOCTREE_PATH:
    raise FileNotFoundError('DOCTREE_PATH is empty; choose a doctree.json file in section 1 first.')

root = load_doctree(DOCTREE_PATH)
payload = build_page_payload(root, PAGE_IDX, page_image_ref=None)
print('Elements on page:', len(payload['elements']))
# Show the first 8 elements with a 60-character snippet preview.
for e in payload['elements'][:8]:
    # `or ''` also covers a snippet that is present but null.
    print(e['node_id'], e['type'], '|', (e.get('snippet') or '')[:60])

# Render prompts (you will pass these to your LLM)
plan_prompt = render_plan_prompt(payload, level_max=4)
jsonlist_prompt = render_jsonlist_prompt(payload, level_max=4)
# Preview only the first line of each prompt; the [:1] slice avoids an
# IndexError when a prompt comes back empty.
print('Plan prompt preview:\n', *plan_prompt.splitlines()[:1])
print('Jsonlist prompt preview:\n', *jsonlist_prompt.splitlines()[:1])


## 3) Paste your model output and apply
- Paste the LLM output string into `RAW` below.
- If it's a plan object, set `IS_PLAN = True`; if it's a jsonlist array, set `IS_PLAN = False`.

In [None]:
IS_PLAN = True  # set to False if you pasted a jsonlist
# Triple-quoted so the multi-line JSON template is a valid string literal.
# Replace it with your pasted model output.
RAW = '''{
  "page_idx": %d,
  "items": [],
  "merges": [],
  "virtual_titles": []
}''' % PAGE_IDX

elems = payload['elements']
ids = page_ids(elems)
types = types_by_id(elems)

# Drop any new_root left over from an earlier run of this cell, so the
# inspection step never shows a stale result after a failed or skipped apply.
globals().pop('new_root', None)

# Parse once; json.JSONDecodeError is raised here if RAW is not valid JSON.
parsed = json.loads(RAW)

if IS_PLAN:
    ok, norm, errs = validate_and_normalize_plan(parsed,
                                                allowed_ids=ids,
                                                types_by_id=types,
                                                page_idx=PAGE_IDX,
                                                level_max=4)
    print('Plan ok:', ok)
    if not ok:
        pprint(errs)
    else:
        new_root = apply_plan_to_document(root, PAGE_IDX, norm)
else:
    ok, items, errs = validate_and_normalize_jsonlist(parsed, allowed_ids=ids, level_max=4)
    print('Jsonlist ok:', ok)
    if not ok:
        pprint(errs)
    else:
        # Here you can transform items into full nodes or let your model return nodes directly
        # For demo purpose, we will not apply jsonlist directly.
        pass


## 4) Inspect changes (if applied)

In [None]:
def page_slice(doc, page_idx=None):
    """Return the direct children of ``doc`` whose ``page_idx`` equals the target page.

    Args:
        doc: Mapping with an optional ``children`` list of node dicts.
        page_idx: Page to select. When omitted, the notebook-level ``PAGE_IDX`` is used.

    Returns:
        A new list of the matching child nodes in their original order; empty
        when ``children`` is missing or null.
    """
    target = PAGE_IDX if page_idx is None else page_idx
    # `or []` also covers a `children` key that is present but null.
    children = doc.get('children') or []
    return [ch for ch in children if ch.get('page_idx') == target]

def _print_page_nodes(nodes, limit=6):
    """Print type, level and a 60-character text preview for the first ``limit`` nodes."""
    for ch in nodes[:limit]:
        print(ch.get('type'), ch.get('node_level'), '|', (ch.get('text') or ch.get('table_text') or '')[:60])


old_page = page_slice(root)
print('Old page nodes:', len(old_page))
_print_page_nodes(old_page)

# new_root only exists after a valid plan has been applied. Keep the try
# limited to the name lookup so an unrelated NameError is not misreported
# as a missing new_root.
try:
    refined_root = new_root
except NameError:
    refined_root = None

if refined_root is None:
    print('No new_root available; apply a valid plan first.')
else:
    new_page = page_slice(refined_root)
    print('\nNew page nodes:', len(new_page))
    _print_page_nodes(new_page)
