# TOC Detection & Parsing (LLM-only)
This notebook tests Table of Contents (TOC) detection and parsing using only LLM outputs.
- Input must be a flat doctree (builder output). If you pass a content_list (list), it will be wrapped into a flat object temporarily.
- Detection: calls LLM per page to decide is_toc.
- Parsing: calls LLM once with all detected TOC pages to produce a headings tree.

In [1]:
from tree import (
    build_flat_doctree,
    build_toc_page_payload,
    render_toc_parse_with_span_prompt,
    find_toc_pages,
    consolidate_toc_v2,
    gpt_llm_call, qwen_llm_call
)
import os, json

def load_flat(path):
    with open(path, 'r', encoding='utf-8') as f:
        obj = json.load(f)
    if isinstance(obj, list):  # 允许直接给 content_list.json
        src = os.path.dirname(path)
        doc_id = os.path.basename(src) or 'document'
        return build_flat_doctree(obj, source_dir=src, doc_id=doc_id, include_indices=True)
    return obj

## 1) Select doctree and provider

In [2]:
DOCTREE_PATH = './../../../data/users/yiming/dtagent/MinerU_MMLB/Macbook_air/doctree.json'  # TODO: change to your file
PROVIDER = 'gpt'  # 'gpt' | 'qwen'
flat = load_flat(DOCTREE_PATH)
print('doc_id:', flat.get('doc_id'), '| children:', len(flat.get('children', [])))

def llm_call(messages, images=None):
    if PROVIDER == 'qwen':
        return qwen_llm_call(messages, images, model='qwen-vl-max', json_mode=True)
    else:
        return gpt_llm_call(messages, images, model='gpt-4o', json_mode=True)


doc_id: Macbook_air | children: 471


## 2) Single-page detection preview (LLM-only)

In [3]:
from tree import render_toc_parse_with_span_prompt
PAGE = 0  # set to a likely TOC page
payload = build_toc_page_payload(flat, PAGE)
print('lines on page', PAGE, ':', len(payload['lines']))
for ln in payload['lines'][:8]:
    print('-', (ln.get('text') or '')[:100])

messages = [
  {'role':'system','content':'You only output JSON.'},
  {'role':'user','content': render_toc_parse_with_span_prompt(payload)},
]
raw = llm_call(messages, images=None)
print('RAW:', raw[:300])
obj = json.loads(raw)
print('is_toc:', obj.get('is_toc'), 'conf:', obj.get('confidence'))


lines on page 0 : 1
- Congratulations, you and your MacBook Air were made for each other.
RAW: {}
is_toc: None conf: None


## 3) Batch detection (find_toc_pages)

In [4]:
toc_pages = find_toc_pages(flat, llm_call, start_page=0, max_scan_pages=50)
print('Detected TOC pages:', toc_pages)


Detected TOC pages: [4, 5]


## 4) Parse headings tree from detected TOC pages

In [5]:
children_before = len(flat.get('children', []))
new_root = consolidate_toc_v2(
    flat, toc_pages, llm_call,
    include_geometry=True,
    split_concatenated=True,
    long_text_threshold=300,
    max_text_len=None
)
children_after = len(new_root.get('children', []))
print('children before/after:', children_before, '→', children_after)

children before/after: 471 → 460


In [6]:
toc_positions = [i for i,ch in enumerate(new_root['children']) if ch.get('type')=='toc']
print('toc node at:', toc_positions)
if toc_positions:
    t = new_root['children'][toc_positions[0]]
    print('toc pages:', t.get('pages'), '| headings:', len(t.get('headings', [])))
    for h in t.get('headings', [])[:10]:
        print('-', h.get('level'), h.get('page'), '|', (h.get('title') or '')[:100])


toc node at: [36]
toc pages: [4, 5] | headings: 4
- 1 8 | Chapter 1: Ready, Set Up, Go
- 1 26 | Chapter 2: Life with Your MacBook Air
- 1 40 | Chapter 3: Problem, Meet Solution
- 1 58 | Chapter 4: Last, but Not Least


In [7]:
new_root.keys()

dict_keys(['type', 'doc_id', 'children', 'source_path', 'indices'])

In [8]:
new_root['children']

[{'type': 'text',
  'node_id': 'Macbook_air#0',
  'node_idx': 0,
  'page_idx': 0,
  'node_level': -1,
  'text': 'Congratulations, you and your MacBook Air were made for each other.'},
 {'type': 'text',
  'node_id': 'Macbook_air#1',
  'node_idx': 1,
  'page_idx': 1,
  'node_level': 1,
  'text': 'Built-in iSight camera'},
 {'type': 'text',
  'node_id': 'Macbook_air#2',
  'node_idx': 2,
  'page_idx': 1,
  'node_level': -1,
  'text': 'Video chat with up to three friends anywhere in the world at the same time.'},
 {'type': 'text',
  'node_id': 'Macbook_air#3',
  'node_idx': 3,
  'page_idx': 1,
  'node_level': -1,
  'text': 'www.apple.com/macbookair'},
 {'type': 'text',
  'node_id': 'Macbook_air#4',
  'node_idx': 4,
  'page_idx': 1,
  'node_level': -1,
  'text': 'Mac Help Q isight'},
 {'type': 'image',
  'node_id': 'Macbook_air#5',
  'node_idx': 5,
  'page_idx': 1,
  'node_level': -1,
  'img_path': 'images/77c226b9180de0d8c229e1bd944d980866cead4226936407d660e7fe97816308.jpg',
  'outline': [5