# TOC Detection & Parsing (LLM-only)
This notebook tests Table of Contents (TOC) detection and parsing using only LLM outputs.
- Input must be a flat doctree (builder output). If the loaded file instead contains a content_list (a JSON list), `load_flat` temporarily wraps it into a flat doctree in memory via `build_flat_doctree`.
- Detection: calls the LLM once per page to decide `is_toc`.
- Parsing: calls the LLM once with all detected TOC pages to produce a headings tree.

In [1]:
import os, json
from tree import (
  build_flat_doctree,
  build_toc_page_payload, render_toc_detect_prompt, find_toc_pages, build_toc_tree_with_llm
)
from tree import gpt_llm_call, qwen_llm_call

def load_flat(path):
    """Load a doctree JSON file and return it as a flat doctree object.

    If the file's top-level JSON value is a list (a raw content_list), it is
    wrapped with ``build_flat_doctree``; any other value is returned exactly
    as parsed.

    Args:
        path: Path to the JSON file to read (UTF-8).

    Returns:
        The parsed JSON object, or the result of ``build_flat_doctree`` when
        the file contained a list.
    """
    with open(path, 'r', encoding='utf-8') as f:
        obj = json.load(f)
    if not isinstance(obj, list):
        return obj
    # source_dir is passed through exactly as spelled by the caller.
    src = os.path.dirname(path)
    # Derive doc_id from the absolute directory so that equivalent spellings
    # ('doctree.json', './doctree.json', 'x/../doctree.json') all yield the
    # real folder name instead of '.', '..' or the generic fallback.
    doc_id = os.path.basename(os.path.dirname(os.path.abspath(path))) or 'document'
    return build_flat_doctree(obj, source_dir=src, doc_id=doc_id, include_indices=True)


## 1) Select doctree and provider

In [2]:
# Backend selector read by llm_call: 'gpt' | 'qwen'.
PROVIDER = 'gpt'
# TODO: change to your file
DOCTREE_PATH = './../../../data/users/yiming/dtagent/MinerU_MMLB/Macbook_air/doctree.json'

# Load the doctree and print a one-line summary as a sanity check.
flat = load_flat(DOCTREE_PATH)
print(
    'doc_id:', flat.get('doc_id'),
    '| children:', len(flat.get('children', [])),
)

def llm_call(messages, images=None):
    """Send a chat request to the backend chosen by the module-level PROVIDER.

    PROVIDER == 'qwen' routes to ``qwen_llm_call`` with model 'qwen-vl-max';
    every other value routes to ``gpt_llm_call`` with model 'gpt-4o'. Both
    calls are made with ``json_mode=True``.

    Args:
        messages: Chat messages forwarded unchanged to the backend.
        images: Optional images forwarded unchanged to the backend.

    Returns:
        Whatever the selected backend call returns.
    """
    use_qwen = PROVIDER == 'qwen'
    backend = qwen_llm_call if use_qwen else gpt_llm_call
    model_name = 'qwen-vl-max' if use_qwen else 'gpt-4o'
    return backend(messages, images, model=model_name, json_mode=True)


doc_id: Macbook_air | children: 471


## 2) Single-page detection preview (LLM-only)

In [3]:
# Preview detection on a single page: build the page payload, show its first
# few lines, then ask the LLM whether the page is a TOC.
# render_toc_detect_prompt is already imported in the first cell, so it is not
# re-imported here.
PAGE = 0  # set to a likely TOC page
payload = build_toc_page_payload(flat, PAGE)
print('lines on page', PAGE, ':', len(payload['lines']))
# Show at most the first 8 lines, each truncated to 100 characters.
for ln in payload['lines'][:8]:
    print('-', (ln.get('text') or '')[:100])

messages = [
  {'role':'system','content':'You only output JSON.'},
  {'role':'user','content': render_toc_detect_prompt(payload)},
]
raw = llm_call(messages, images=None)
# Print only a prefix of the raw reply to keep the cell output short.
print('RAW:', raw[:300])
obj = json.loads(raw)
print('is_toc:', obj.get('is_toc'), 'conf:', obj.get('confidence'))


lines on page 0 : 1
- Congratulations, you and your MacBook Air were made for each other.
RAW: {"is_toc":false,"confidence":0.8}
is_toc: False conf: 0.8


## 3) Batch detection (find_toc_pages)

In [4]:
# Run detection over the document via find_toc_pages, starting at page 0 and
# scanning at most 50 pages, then report which pages were flagged.
toc_pages = find_toc_pages(
    flat,
    llm_call,
    start_page=0,
    max_scan_pages=50,
)
print('Detected TOC pages:', toc_pages)


Detected TOC pages: [4, 5]


## 4) Parse headings tree from detected TOC pages

In [5]:
# Parse the headings tree from the detected TOC pages and preview the first
# ten entries of its 'headings' list (level, page, title truncated to 80 chars).
if toc_pages:
    toc_tree = build_toc_tree_with_llm(flat, toc_pages, llm_call)
    print('pages:', toc_tree.get('pages'))
    print('headings:', len(toc_tree.get('headings', [])))
    for h in toc_tree.get('headings', [])[:10]:
        print('-', h.get('level'), h.get('page'), '|', (h.get('title') or '')[:80])
else:
    # Always bind toc_tree so the following cell neither raises NameError on a
    # fresh kernel nor displays a stale tree left over from an earlier run.
    toc_tree = None
    print('No TOC pages by LLM. Adjust start_page or test a single page in step 2.')


pages: [4, 5]
headings: 6
- 1 None | Chapter 1: Ready, Set Up, Go
- 1 None | Chapter 2: Life with Your MacBook Air
- 1 None | Chapter 3: Problem, Meet Solution
- 1 None | Chapter 4: Last, but Not Least
- 1 None | Looking for Something?
- 1 70 | Index


In [6]:
# Display the full parsed TOC tree (bare last expression -> cell output).
toc_tree

{'type': 'toc',
 'doc_id': 'Macbook_air',
 'pages': [4, 5],
 'headings': [{'title': 'Chapter 1: Ready, Set Up, Go',
   'level': 1,
   'children': [{'title': 'Welcome', 'level': 2, 'page': 8, 'children': []},
    {'title': "What's in the Box", 'level': 2, 'page': 9, 'children': []},
    {'title': 'Setting Up Your MacBook Air',
     'level': 2,
     'page': 10,
     'children': []},
    {'title': 'Setting Up DVD or CD Sharing',
     'level': 2,
     'page': 15,
     'children': []},
    {'title': 'Migrating Information to Your MacBook Air',
     'level': 2,
     'page': 16,
     'children': []},
    {'title': 'Getting Additional Information onto Your MacBook Air',
     'level': 2,
     'page': 19,
     'children': []},
    {'title': 'Putting Your MacBook Air to Sleep or Shutting It Down',
     'level': 2,
     'page': 22,
     'children': []}]},
  {'title': 'Chapter 2: Life with Your MacBook Air',
   'level': 1,
   'children': [{'title': 'Basic Features of Your MacBook Air',
     'level'