# TOC Detection & Parsing (LLM-only)
This notebook tests Table of Contents (TOC) detection and parsing using only LLM outputs.
- Input must be a flat doctree (the builder's output). If you pass a content_list (a JSON list) instead, it is temporarily wrapped into a flat doctree object.
- Detection: calls the LLM on each page to decide `is_toc`.
- Parsing: calls the LLM once with all detected TOC pages to produce a headings tree.

In [1]:
import os, json
from tree import (
  build_flat_doctree,
  build_toc_page_payload, render_toc_detect_prompt, find_toc_pages, build_toc_tree_with_llm
)
from tree import gpt_llm_call, qwen_llm_call

def load_flat(path):
    """Load the JSON file at ``path`` and return it as a flat doctree.

    The file is read as UTF-8. When its top-level JSON value is a list
    (a content_list), it is handed to ``build_flat_doctree`` with the
    file's directory as ``source_dir`` and that directory's base name as
    ``doc_id`` (falling back to ``'document'`` when the base name is
    empty). Any other JSON value is returned exactly as parsed.
    """
    with open(path, 'r', encoding='utf-8') as fh:
        loaded = json.load(fh)

    # Anything that is not a raw content_list is passed through untouched.
    if not isinstance(loaded, list):
        return loaded

    source_dir = os.path.dirname(path)
    return build_flat_doctree(
        loaded,
        source_dir=source_dir,
        doc_id=os.path.basename(source_dir) or 'document',
        include_indices=True,
    )


## 1) Select doctree and provider

In [3]:
# Run configuration: which doctree file to load and which LLM backend to use.
DOCTREE_PATH = './../../../data/users/yiming/dtagent/MinerU_MMLB/User_Manual_1500S_Classic_EN/doctree.json'  # TODO: change to your file
PROVIDER = 'gpt'  # 'gpt' | 'qwen'

# Load the document once; the cells below all work on `flat`.
flat = load_flat(DOCTREE_PATH)

# Quick sanity check of what was loaded: document id and number of children.
children = flat.get('children', [])
print('doc_id:', flat.get('doc_id'), '| children:', len(children))

def llm_call(messages, images=None):
    """Send ``messages`` (and optional ``images``) to the configured LLM.

    The backend is chosen by the module-level ``PROVIDER``: ``'qwen'``
    uses ``qwen_llm_call`` with model ``'qwen-vl-max'``; every other value
    uses ``gpt_llm_call`` with model ``'gpt-4o'``. Both are called with
    ``json_mode=True``.

    Returns whatever the selected backend returns (presumably a JSON
    string, since later cells pass it to ``json.loads`` -- confirm against
    the ``tree`` module).
    """
    use_qwen = PROVIDER == 'qwen'
    backend = qwen_llm_call if use_qwen else gpt_llm_call
    model_name = 'qwen-vl-max' if use_qwen else 'gpt-4o'
    return backend(messages, images, model=model_name, json_mode=True)


doc_id: User_Manual_1500S_Classic_EN | children: 1216


## 2) Single-page detection preview (LLM-only)

In [4]:
# Preview TOC detection on a single page.
# `build_toc_page_payload` and `render_toc_detect_prompt` are already imported
# in the import cell at the top of the notebook, so no import is repeated here.
PAGE = 1  # set to a likely TOC page
payload = build_toc_page_payload(flat, PAGE)

# Show how many lines the page payload holds and a short preview of the first few.
print('lines on page', PAGE, ':', len(payload['lines']))
for ln in payload['lines'][:8]:
    # `text` may be missing or None, hence the `or ''`; truncate long lines for display.
    print('-', (ln.get('text') or '')[:100])

# Same two-message shape for every detection request: a system instruction
# plus the rendered detection prompt for this page.
messages = [
  {'role':'system','content':'You only output JSON.'},
  {'role':'user','content': render_toc_detect_prompt(payload)},
]
raw = llm_call(messages, images=None)

# Print the (truncated) raw reply before parsing so it is visible even if
# `json.loads` raises on a malformed response.
print('RAW:', raw[:300])
obj = json.loads(raw)
print('is_toc:', obj.get('is_toc'), 'conf:', obj.get('confidence'))


lines on page 1 : 14
- Congratulations on the purchase of your WMF coffee machine.
- Congratulations on the purchase of your WMF coffee machine.The WMF 1500 S coffee machine is a fully 
- Follow the User Manual
- > Read the User Manual carefully prior to use. > Please refer to the User Manual, paying special att
- A CAUTION
- Follow the User Manual signs and symbols page 16 Follow the Safety chapter > starting on page 6
- Hazard to life due to electrical shock
- The voltage inside the coffee machine is a hazard to life. $>$ Never open the housing. $>$ Never loo
RAW: {"is_toc":false,"confidence":0.7}
is_toc: False conf: 0.7


## 3) Batch detection (find_toc_pages)

In [5]:
# Run detection across the document with the configured LLM.
# NOTE(review): `start_page` / `max_scan_pages` presumably bound the range of
# pages that are scanned -- confirm against `find_toc_pages` in the `tree` module.
toc_pages = find_toc_pages(
    flat,
    llm_call,
    start_page=1,
    max_scan_pages=50,
)
print('Detected TOC pages:', toc_pages)


Detected TOC pages: [2, 3, 4]


## 4) Parse headings tree from detected TOC pages

In [6]:
# Build the headings tree only when detection found at least one TOC page;
# `toc_tree` is left undefined otherwise.
if not toc_pages:
    print('No TOC pages by LLM. Adjust start_page or test a single page in step 2.')
else:
    toc_tree = build_toc_tree_with_llm(flat, toc_pages, llm_call)
    headings = toc_tree.get('headings', [])

    # Summary: which pages were parsed and how many top-level headings came back.
    print('pages:', toc_tree.get('pages'))
    print('headings:', len(headings))

    # Preview the first ten top-level headings as "level page | title".
    for h in headings[:10]:
        print('-', h.get('level'), h.get('page'), '|', (h.get('title') or '')[:80])


pages: [2, 3, 4]
headings: 13
- 1 6 | 1 Safety
- 1 13 | 2 Introduction
- 1 17 | 3 Operation
- 1 27 | 4 Software
- 1 57 | 5 Other settings
- 1 59 | 6 Care
- 1 82 | 7 HACCP cleaning schedule
- 1 84 | 8 Maintenance and descaling
- 1 86 | 9 Messages and instructions
- 1 93 | 10 Safety and warranty


In [6]:
# Display the full parsed tree. `toc_tree` is only assigned when TOC pages were
# detected, so guard on `toc_pages` to avoid a NameError on a run that found none
# (a None result is not displayed).
toc_tree if toc_pages else None

{'type': 'toc',
 'doc_id': 'User_Manual_1500S_Classic_EN',
 'pages': [2, 3, 4],
 'headings': [{'title': '1 Safety',
   'level': 1,
   'page': 6,
   'children': [{'title': '1.1 General safety instructions',
     'level': 2,
     'page': 6,
     'children': []},
    {'title': '1.2 Intended use', 'level': 2, 'page': 6, 'children': []},
    {'title': '1.3 Conditions for usage and installation',
     'level': 2,
     'page': 12,
     'children': []}]},
  {'title': '2 Introduction',
   'level': 1,
   'page': 13,
   'children': [{'title': '2.1 Parts of the coffee machine',
     'level': 2,
     'page': 14,
     'children': []}]},
  {'title': '3 Operation',
   'level': 1,
   'page': 17,
   'children': [{'title': '3.1 Operation safety instructions',
     'level': 2,
     'page': 18,
     'children': []},
    {'title': '3.2 Switch on coffee machine',
     'level': 2,
     'page': 18,
     'children': []},
    {'title': '3.3 Milk or milk foam (optional)',
     'level': 2,
     'page': 19,
     'c