# TOC Detection & Parsing (LLM-only)
This notebook tests Table of Contents (TOC) detection and parsing using only LLM outputs.
- Input must be a flat doctree (builder output). If you pass a `content_list` JSON file (a top-level list), `load_flat` wraps it into a flat doctree in memory via `build_flat_doctree`; nothing is written back to disk.
- Detection: calls the LLM on each page to decide `is_toc`.
- Parsing: calls the LLM once with all detected TOC pages to produce a headings tree.

In [1]:
from tree import (
    build_flat_doctree,
    build_toc_page_payload,
    render_toc_parse_with_span_prompt,
    find_toc_pages,
    consolidate_toc_v2,
    gpt_llm_call, qwen_llm_call
)
import os, json

def load_flat(path):
    """Load a JSON file and return it as a flat doctree object.

    Args:
        path: Path to a JSON file. It may hold either an already-built flat
            doctree or a bare ``content_list`` (a top-level JSON list).

    Returns:
        The parsed JSON object unchanged when it is not a list. When it is a
        list, the result of ``build_flat_doctree`` called with the file's
        directory as ``source_dir`` and that directory's base name (or
        ``'document'`` when the name is empty) as ``doc_id``.
    """
    with open(path, 'r', encoding='utf-8') as fh:
        data = json.load(fh)

    if not isinstance(data, list):
        return data

    # A content_list.json may be passed directly: wrap it into a flat doctree.
    source_dir = os.path.dirname(path)
    return build_flat_doctree(
        data,
        source_dir=source_dir,
        doc_id=os.path.basename(source_dir) or 'document',
        include_indices=True,
    )

## 1) Select doctree and provider

In [2]:
# Which LLM backend `llm_call` dispatches to.
PROVIDER = 'gpt'  # 'gpt' | 'qwen'
# Input file: a flat doctree JSON (a content_list JSON list is also accepted by load_flat).
DOCTREE_PATH = './../../../data/users/yiming/dtagent/MinerU_MMLB/698bba535087fa9a7f9009e172a7f763/doctree.json'  # TODO: change to your file

flat = load_flat(DOCTREE_PATH)
# Quick sanity check: document id and number of top-level nodes.
print(
    'doc_id:', flat.get('doc_id'),
    '| children:', len(flat.get('children', [])),
)

def llm_call(messages, images=None):
    """Send a chat request to the backend selected by the global ``PROVIDER``.

    Args:
        messages: Chat messages forwarded unchanged to the backend.
        images: Optional images forwarded unchanged to the backend.

    Returns:
        Whatever the selected backend call returns; ``json_mode=True`` is
        always requested.

    Note:
        Only the exact value ``'qwen'`` selects the Qwen backend; every other
        ``PROVIDER`` value falls through to the GPT backend.
    """
    use_qwen = PROVIDER == 'qwen'
    backend = qwen_llm_call if use_qwen else gpt_llm_call
    model_name = 'qwen-vl-max' if use_qwen else 'gpt-4o'
    return backend(messages, images, model=model_name, json_mode=True)


doc_id: 698bba535087fa9a7f9009e172a7f763 | children: 110


## 2) Single-page detection preview (LLM-only)

In [3]:
# Preview the raw LLM response for one page.
# `render_toc_parse_with_span_prompt` is already imported in the notebook's
# import cell, so it is not re-imported here.
PAGE = 0  # set to a likely TOC page
payload = build_toc_page_payload(flat, PAGE)
print('lines on page', PAGE, ':', len(payload['lines']))
for ln in payload['lines'][:8]:
    print('-', (ln.get('text') or '')[:100])

messages = [
  {'role':'system','content':'You only output JSON.'},
  {'role':'user','content': render_toc_parse_with_span_prompt(payload)},
]
raw = llm_call(messages, images=None)
print('RAW:', raw[:300])
obj = json.loads(raw)
print('is_toc:', obj.get('is_toc'), 'conf:', obj.get('confidence'))
# NOTE(review): this cell uses the span-parse prompt, whose recorded response
# carries `headings`, `start_idx`, `end_idx` and `pages` but no
# `is_toc`/`confidence` (those print as None). Show the parse fields too so the
# preview is informative; swap in the detection prompt if a detection verdict
# is what should be previewed here.
print(
    'headings:', len(obj.get('headings') or []),
    '| start_idx:', obj.get('start_idx'),
    '| end_idx:', obj.get('end_idx'),
    '| pages:', obj.get('pages'),
)


lines on page 0 : 0
RAW: {
  "headings": [],
  "start_idx": 0,
  "end_idx": 0,
  "pages": []
}
is_toc: None conf: None


## 3) Batch detection (find_toc_pages)

In [4]:
# Run TOC detection over the document through `llm_call`.
# NOTE(review): presumably scans up to `max_scan_pages` pages beginning at
# `start_page` — confirm against `tree.find_toc_pages`.
toc_pages = find_toc_pages(
    flat,
    llm_call,
    start_page=0,
    max_scan_pages=50,
)
print('Detected TOC pages:', toc_pages)


Detected TOC pages: [6, 7]


## 4) Parse headings tree from detected TOC pages

In [5]:
# Record the top-level node count before consolidation for comparison.
children_before = len(flat.get('children', []))

# Build the consolidated root from the detected TOC pages via the LLM.
# NOTE(review): the meaning of each option is defined in
# `tree.consolidate_toc_v2` — confirm there before tuning.
new_root = consolidate_toc_v2(
    flat,
    toc_pages,
    llm_call,
    include_geometry=True,
    split_concatenated=True,
    long_text_threshold=300,
    max_text_len=None,
)

children_after = len(new_root.get('children', []))
print('children before/after:', children_before, '→', children_after)

children before/after: 110 → 99


In [6]:
# Indices of every child of the consolidated root whose type is 'toc'.
toc_positions = [
    pos
    for pos, child in enumerate(new_root['children'])
    if child.get('type') == 'toc'
]
print('toc node at:', toc_positions)

if toc_positions:
    # Only the first TOC node is inspected here.
    t = new_root['children'][toc_positions[0]]
    headings = t.get('headings', [])
    print('toc pages:', t.get('pages'), '| headings:', len(headings))
    # Preview at most ten headings, with titles truncated to 100 characters.
    for heading in headings[:10]:
        title = heading.get('title') or ''
        print('-', heading.get('level'), heading.get('page'), '|', title[:100])


toc node at: [17]
toc pages: [6, 7] | headings: 8
- 1 1 | Executive Summary
- 1 None | Chapter 1: Historical Overview of Hamilton County
- 1 23 | Chapter 2: Survey Results
- 1 36 | Chapter 3: Recommendations
- 1 51 | Chapter 4: Preservation in Nebraska
- 1 58 | Appendix A: Inventory of Surveyed Properties
- 1 67 | References
- 1 70 | Glossary


In [7]:
# Show which top-level fields the consolidated root carries.
new_root.keys()

dict_keys(['type', 'doc_id', 'children', 'source_path', 'indices'])

In [8]:
# Preview only the first few nodes: displaying the whole children list bloats
# the saved notebook output and buries the rest of the narrative.
new_root['children'][:5]

[{'type': 'image',
  'node_id': '698bba535087fa9a7f9009e172a7f763#0',
  'node_idx': 0,
  'page_idx': 0,
  'node_level': -1,
  'img_path': 'images/9bb51ce9db382f2eade929642f829dd09a85da36c0fe9a899e03cc3e62dca286.jpg',
  'outline': [75, 82, 534, 346],
  'text': '',
  'description': ''},
 {'type': 'image',
  'node_id': '698bba535087fa9a7f9009e172a7f763#1',
  'node_idx': 1,
  'page_idx': 0,
  'node_level': -1,
  'img_path': 'images/6cb8f0c71dab749b5080f6a7aca9e034ab38bb42b55845ae4bcb9ae73d8f77d0.jpg',
  'outline': [75, 351, 528, 709],
  'text': '',
  'description': ''},
 {'type': 'text',
  'node_id': '698bba535087fa9a7f9009e172a7f763#2',
  'node_idx': 2,
  'page_idx': 1,
  'node_level': -1,
  'text': ''},
 {'type': 'text',
  'node_id': '698bba535087fa9a7f9009e172a7f763#3',
  'node_idx': 3,
  'page_idx': 2,
  'node_level': 1,
  'text': 'Hamilton CountyNebraska Historic Buildings Survey'},
 {'type': 'text',
  'node_id': '698bba535087fa9a7f9009e172a7f763#4',
  'node_idx': 4,
  'page_idx': 2,
