In [None]:
!pip uninstall docx



: 

In [1]:
"""
Auto TOC generator for messy DOCX and PDFs.

pip install python-docx pymupdf
(Do not install the separate legacy `docx` package: it provides a conflicting `docx` module.)
"""

import re
import statistics
from collections import Counter, defaultdict
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

import fitz  # PyMuPDF


ModuleNotFoundError: No module named 'exceptions'

In [None]:


# ---------- Helpers for DOCX ----------
def _get_run_font_size_pt(run):
    """Return run font size in points if available, else None."""
    try:
        s = run.font.size
        if s is None:
            return None
        # s is a docx.shared.Length which usually has .pt
        try:
            return float(s.pt)
        except Exception:
            # fallback if it's already numeric
            return float(s)
    except Exception:
        return None


def _get_paragraph_max_font_size_pt(para):
    """Return the largest font size (in points) in effect for a paragraph, or None.

    Explicit run sizes are collected first. The paragraph style's size is
    only taken into account when it can actually apply: the paragraph has no
    runs, or at least one run carries no explicit size of its own. When every
    run sets its own size, the style size is overridden everywhere and is
    ignored, so a small-print paragraph is not reported at its style's size.

    Returns None when neither the runs nor the style report a size.
    """
    runs = list(para.runs)
    sizes = []
    style_applies = not runs
    for run in runs:
        sz = _get_run_font_size_pt(run)
        if sz:
            sizes.append(sz)
        else:
            # This run has no explicit size, so the style size can apply to it.
            style_applies = True

    if style_applies:
        # Best effort: a missing style/font or an unconvertible value simply
        # contributes nothing.
        try:
            sty_sz = para.style.font.size
            if sty_sz:
                try:
                    sizes.append(float(sty_sz.pt))
                except Exception:
                    sizes.append(float(sty_sz))
        except Exception:
            pass

    return max(sizes) if sizes else None


# numbering detection regexes
#
# NUM_RE_1: optional leading whitespace, then either a dotted number
# ("1.2", "1.2.3", also "1." when a word character follows directly) or a
# plain integer, captured as `num`; a word boundary; any run of the
# punctuation characters . - : ) ; optional whitespace; the remainder of the
# text is captured as `title`.
#
# NUM_RE_CHAPTER: the word "chapter" or "section" (case-insensitive), at least
# one whitespace character, then a token made of digits, roman-numeral
# letters and dots captured as `num`; the same trailing punctuation and
# `title` capture as above.
NUM_RE_1 = re.compile(r'^\s*(?P<num>(?:\d+\.)+\d*|\d+)\b[.\-:)]*\s*(?P<title>.*)', re.I)
NUM_RE_CHAPTER = re.compile(r'^\s*(chapter|section)\s+(?P<num>[\dIVXLCDMivxlcdm\.]+)\b[.\-:)]*\s*(?P<title>.*)', re.I)


def extract_docx_headings_improved(docx_path, min_score=2, dedupe_threshold_ratio=0.6):
    """
    Detect likely headings in a DOCX file and return them in document order.

    Every non-empty paragraph is scored with additive heuristics:
      * explicit "Heading..." style: +10
      * text starts with numbering ("1.2 ..." or "Chapter 3 ..."): +4
      * font size at least 1pt above the document median: +2
        (+4 when the difference is 3pt or more)
      * at least one bold run: +1
      * centered paragraph: +1
      * 12 words or fewer: +1

    Parameters:
    - docx_path: path passed to ``Document``.
    - min_score: paragraphs with score >= min_score become candidates;
      paragraphs with an explicit heading style are always candidates.
    - dedupe_threshold_ratio: a candidate whose (case-insensitive) text occurs
      more than ratio * total_paragraphs times among the candidates is treated
      as a repeating header/footer and removed.

    Returns a list of dicts {"text", "level", "page", "score", "reasons"}:
    "text" is the paragraph text with the leading numbering stripped (when a
    title remains), "page" is always None, and "reasons" lists the heuristics
    that fired. Level priority: heading style number -> numbering depth ->
    font-size tier (largest size among the selected headings = level 1) -> 2.
    """
    doc = Document(docx_path)
    paras = [p for p in doc.paragraphs if p.text and p.text.strip()]

    # Measure every paragraph once; the sizes feed both the median and the
    # per-paragraph scoring below.
    para_sizes = [_get_paragraph_max_font_size_pt(p) for p in paras]
    known_sizes = [s for s in para_sizes if s]
    median_size = statistics.median(known_sizes) if known_sizes else None
    total_paras = max(1, len(paras))

    p_infos = []
    for i, (p, max_size) in enumerate(zip(paras, para_sizes)):
        text = p.text.strip()
        num_words = len(text.split())
        style_name = getattr(p.style, "name", "") or ""
        style_level = None
        if style_name.lower().startswith("heading"):
            # try to extract numeric level (Heading 1, Heading 2, ...)
            m = re.search(r'(\d+)', style_name)
            style_level = int(m.group(1)) if m else 1

        is_bold = any((run.bold is True) for run in p.runs)
        is_center = getattr(p, "alignment", None) == WD_PARAGRAPH_ALIGNMENT.CENTER

        # numbering checks
        num_level = None
        starts_numbering = False
        text_no_num = text
        m = NUM_RE_1.match(text)
        if m:
            starts_numbering = True
            # Depth = number of non-empty dot-separated parts, so a trailing
            # dot ("1." in "1.Title") does not count as an extra level.
            num_level = len([part for part in m.group("num").split('.') if part])
            # keep the rest of the title if present, else the full text
            text_no_num = m.group("title").strip() or text
        else:
            m2 = NUM_RE_CHAPTER.match(text)
            if m2:
                starts_numbering = True
                num_level = 1  # treat chapter/section as top level
                text_no_num = m2.group("title").strip() or text

        # compute score heuristics
        score = 0
        reasons = []
        if style_level:
            score += 10
            reasons.append("explicit heading style")
        if starts_numbering:
            score += 4
            reasons.append("starts with numbering")
        if max_size and median_size and max_size >= median_size + 1.0:
            # bigger than the typical text -> more likely a heading
            delta = max_size - median_size
            score += 2 if delta < 3 else 4
            reasons.append(f"font size bigger than median by {delta:.1f}pt")
        if is_bold:
            score += 1
            reasons.append("bold run")
        if is_center:
            score += 1
            reasons.append("centered")
        if num_words <= 12:
            score += 1
            reasons.append("short length")

        p_infos.append({
            "index": i,
            "text": text,
            "text_no_num": text_no_num,
            "style_level": style_level,
            "max_font_size_pt": max_size,
            "is_bold": is_bold,
            "is_center": is_center,
            "starts_numbering": starts_numbering,
            "num_level": num_level,
            "num_words": num_words,
            "score": score,
            "reasons": reasons
        })

    # pick candidates
    candidates = [info for info in p_infos if info["score"] >= min_score or info["style_level"]]

    # dedupe likely running headers/footers: the same text repeated many times
    text_counter = Counter(c["text"].lower() for c in candidates)
    repeat_limit = dedupe_threshold_ratio * total_paras
    filtered = [c for c in candidates if text_counter[c["text"].lower()] <= repeat_limit]

    if not filtered:
        # fallback: take the best scored paragraphs (top 5%, at least one)
        by_score = sorted(p_infos, key=lambda info: info["score"], reverse=True)
        filtered = by_score[:max(1, int(len(by_score) * 0.05))]

    # font-size tiers among the selected headings: largest size => level 1
    sizes = sorted({c["max_font_size_pt"] for c in filtered if c["max_font_size_pt"]}, reverse=True)
    size_to_level = {s: tier + 1 for tier, s in enumerate(sizes)}

    headings = []
    # Order by the recorded paragraph index. Looking the position up by title
    # text would move repeated titles next to their first occurrence.
    for c in sorted(filtered, key=lambda info: info["index"]):
        if c["style_level"]:
            level = c["style_level"]
        elif c["num_level"]:
            # numbering depth indicates nesting directly
            level = c["num_level"]
        elif c["max_font_size_pt"] and c["max_font_size_pt"] in size_to_level:
            level = size_to_level[c["max_font_size_pt"]]
        else:
            # default low-level heading
            level = 2
        headings.append({
            "text": c["text_no_num"],
            "level": int(level),
            "page": None,
            "score": c["score"],
            "reasons": c["reasons"]
        })

    return headings


# ---------- Helpers for PDF ----------
# Leading-number detector for PDF spans: optional whitespace, then a dotted
# number ("1.2"), a plain integer, or "chapter <digits>" (case-insensitive)
# captured as `num`; a word boundary; any run of the punctuation characters
# . - : ) ; optional whitespace; the remainder captured as `title`.
# Unlike NUM_RE_CHAPTER, "section" and roman numerals are not recognised here.
NUM_RE_PDF = re.compile(r'^\s*(?P<num>(?:\d+\.)+\d*|\d+|chapter\s+\d+)\b[.\-:)]*\s*(?P<title>.*)', re.I)


def extract_pdf_headings_improved(pdf_path, size_delta_threshold=1.0, dedupe_threshold_ratio=0.6):
    """
    Extract headings from a PDF using PyMuPDF (fitz).

    Every non-empty text span is collected with its font size and 1-based
    page number. A span becomes a heading candidate when its size is at least
    size_delta_threshold points above the median span size, or when its text
    starts with numbering (see NUM_RE_PDF).

    Parameters:
    - pdf_path: path passed to ``fitz.open``.
    - size_delta_threshold: how many points above the median size a span must
      be to count as a candidate on size alone.
    - dedupe_threshold_ratio: a candidate text found on more than
      ratio * page_count distinct pages (and on more than one page) is
      treated as a running header/footer and removed.

    Returns a list of dicts {"text", "level", "page", "size"} in reading
    order (the order in which the spans were collected). "level" comes from
    the font-size tier among the selected headings (largest = 1), lowered to
    the numbering depth when the text is numbered and that depth is smaller.
    Returns [] when the PDF has no text spans.
    """
    doc = fitz.open(pdf_path)
    try:
        num_pages = len(doc)
        all_spans = []
        for page_num, page in enumerate(doc):
            # get dict for structured access
            page_dict = page.get_text("dict")
            for block in page_dict.get("blocks", []):
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        text = span.get("text", "").strip()
                        if not text:
                            continue
                        size = span.get("size", None)
                        if size is None:
                            continue
                        all_spans.append({
                            "text": text,
                            "size": float(size),
                            "page": page_num + 1,
                            # collection sequence = reading order
                            "order": len(all_spans)
                        })
    finally:
        # Release the file handle on every path, including errors.
        doc.close()

    if not all_spans:
        return []

    median_size = statistics.median(s["size"] for s in all_spans)

    # candidate spans are those large enough or starting with numbering
    size_cutoff = median_size + size_delta_threshold
    candidates = [
        s for s in all_spans
        if s["size"] >= size_cutoff or NUM_RE_PDF.match(s["text"])
    ]

    # Running headers/footers repeat across pages: count DISTINCT pages per
    # text, so a text repeated on a single page is not mistaken for one.
    pages_by_text = defaultdict(set)
    for c in candidates:
        pages_by_text[c["text"].lower()].add(c["page"])
    # A text seen on only one page can never be a running header; without
    # the floor of 1 a single-page PDF would lose every candidate.
    repeat_limit = max(1, dedupe_threshold_ratio * num_pages)
    filtered = [
        c for c in candidates
        if len(pages_by_text[c["text"].lower()]) <= repeat_limit
    ]

    if not filtered:
        # fallback to best by size
        filtered = sorted(candidates, key=lambda x: x["size"], reverse=True)[:50]

    # map unique sizes to levels (largest size => level 1)
    unique_sizes = sorted({c["size"] for c in filtered}, reverse=True)
    size_to_level = {s: tier + 1 for tier, s in enumerate(unique_sizes)}

    headings = []
    # avoid duplicates of the same (text, page) pair
    seen = set()
    # Emit in reading order; sorting by size within a page would scramble
    # the sequence of headings in the resulting TOC.
    for c in sorted(filtered, key=lambda x: x["order"]):
        key = (c["text"].strip(), c["page"])
        if key in seen:
            continue
        seen.add(key)

        # numbering level if present
        num_level = None
        text_no_num = c["text"]
        m = NUM_RE_PDF.match(c["text"])
        if m:
            # Depth = number of non-empty dot-separated parts ("1.2" -> 2,
            # "1." -> 1, "chapter 3" -> 1).
            num_level = len([part for part in m.group("num").split('.') if part])
            # remove number from title portion if present
            text_no_num = (m.group("title") or c["text"]).strip() or c["text"]

        level = size_to_level.get(c["size"], 2)
        if num_level:
            # numbering depth may raise the heading, never push it deeper
            level = min(level, num_level)

        headings.append({
            "text": text_no_num,
            "level": int(level),
            "page": c["page"],
            "size": c["size"]
        })

    return headings






In [None]:
# ---------- Printing / Export ----------
def pretty_print_toc(headings, source_name="DOCX/PDF"):
    """Print the headings as an indented bullet list under a title line.

    Each heading is indented two spaces per level below 1 (levels under 1 get
    no indent) and is followed by " (p.N)" when it has a truthy "page" value.
    """
    print(f"\nExtracted TOC ({source_name}):")
    for entry in headings:
        depth = entry["level"] - 1
        if depth < 0:
            depth = 0
        padding = "  " * depth
        if entry.get("page"):
            suffix = f" (p.{entry['page']})"
        else:
            suffix = ""
        print(f"{padding}- {entry['text']}{suffix}")


def export_toc_markdown(headings):
    """Render the headings as a nested Markdown bullet list and return it as one string.

    Each heading becomes one "- text" line indented two spaces per level
    below 1; lines are joined with newlines (empty input gives "").
    """
    def _bullet(entry):
        pad = "  " * max(0, entry["level"] - 1)
        return f"{pad}- {entry['text']}"

    return "\n".join(map(_bullet, headings))



In [None]:
# ---------- Example usage ----------
if __name__ == "__main__":
    # Demo driver: runs both extractors on sample files and prints each TOC.
    # Each example has its own try/except, so any exception raised while
    # extracting or printing one document is reported as a one-line message
    # and the other example still runs.
    # DOCX example
    docx_file = "example_messy.docx"
    try:
        docx_headings = extract_docx_headings_improved(docx_file)
        pretty_print_toc(docx_headings, source_name="DOCX")
    except Exception as e:
        # Deliberately broad: this is a best-effort demo, not library code.
        print("DOCX extraction failed:", e)

    # PDF example
    pdf_file = "example_long.pdf"
    try:
        pdf_headings = extract_pdf_headings_improved(pdf_file)
        pretty_print_toc(pdf_headings, source_name="PDF")
    except Exception as e:
        # Deliberately broad, same as the DOCX example above.
        print("PDF extraction failed:", e)