<a href="https://colab.research.google.com/github/yomna29/Qwen3_8/blob/main/Qwen3_8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get -y install poppler-utils >/dev/null
!pip -q install "transformers>=4.43" accelerate pillow pdf2image >/dev/null

In [None]:
import os, json, time, math
from typing import List, Tuple

import torch
from PIL import Image
from pdf2image import convert_from_path
from transformers import AutoProcessor
from transformers import AutoModelForVision2Seq


# Speed/compat toggles
os.environ.setdefault("PYTORCH_ENABLE_SDPA", "1")  # NOTE(review): confirm PyTorch actually reads this variable
# PyTorch's documented switch is PYTORCH_ENABLE_MPS_FALLBACK (the previous spelling,
# PYTORCH_MPS_ENABLE_FALLBACK, is not a recognised name). Harmless on non-Apple hardware.
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")


def select_attn_impl(device: str, sm_major: int) -> str:
    """Pick the attention backend name to request when loading the model.

    Args:
        device: "cuda" or "cpu".
        sm_major: Major CUDA compute capability of the GPU (0 when there is none).

    Returns:
        "flash_attention_2" on CUDA devices with compute capability 8.x or newer,
        otherwise "sdpa".
    """
    # T4 is sm_75 → must use SDPA. FlashAttention-2 needs Ampere or newer (sm >= 80),
    # so the threshold is major version 8, not 7.
    if device == "cuda" and sm_major >= 8:
        return "flash_attention_2"
    return "sdpa"


# Detect device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Detect GPU arch (for attention impl); (0, 0) stands for "no GPU".
if device == "cuda":
    sm_major, sm_minor = torch.cuda.get_device_capability()
else:
    sm_major, sm_minor = (0, 0)

attn_impl = select_attn_impl(device, sm_major)

# DType — fp16 on any GPU (bf16 is not supported on sm_75), fp32 on CPU.
dtype = torch.float16 if device == "cuda" else torch.float32

print(f"Device: {device}")
print(f"GPU capability (SM): {sm_major}.{sm_minor}")
print(f"Chosen attention impl: {attn_impl}")
print(f"dtype: {dtype}")


Device: cuda
GPU capability (SM): 8.0
Chosen attention impl: flash_attention_2
dtype: torch.float16


Loading the model

In [None]:
MODEL = "Qwen/Qwen3-VL-8B-Instruct"  # swap for another checkpoint if needed

processor = AutoProcessor.from_pretrained(MODEL, trust_remote_code=True)

# Batched generation pads on the left so every prompt ends right where generation starts.
# If the tokenizer defines no pad token, reuse EOS so padding is possible at all.
tokenizer = getattr(processor, "tokenizer", None)
if tokenizer is None:
    print("[Warn] Processor exposes no tokenizer; padding settings left at their defaults.")
else:
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None and tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token

loader_kwargs = dict(
    dtype=dtype,
    device_map="auto" if device == "cuda" else None,
    trust_remote_code=True,
    attn_implementation=attn_impl,
)

# The kwargs above must actually reach from_pretrained; previously they were built but
# never passed, so the chosen dtype and attention backend were silently ignored.
try:
    model = AutoModelForVision2Seq.from_pretrained(MODEL, **loader_kwargs)
except Exception as e:
    # Typical cause: flash-attn is not installed. Retry with the library's default attention.
    print(f"[Info] Falling back to PyTorch attention: {e}")
    loader_kwargs.pop("attn_implementation", None)
    model = AutoModelForVision2Seq.from_pretrained(MODEL, **loader_kwargs)


# With device_map=None (CPU path) the model still has to be placed explicitly.
if device != "cuda":
    model.to(device)

model.config.use_cache = True
model.eval()
torch.set_grad_enabled(False)
torch.backends.cudnn.benchmark = True
# torch has no `set_float16_matmul_precision`; the real knob is the float32 one.
# Guarded with hasattr so very old torch builds are skipped instead of crashing.
if hasattr(torch, "set_float32_matmul_precision"):
    torch.set_float32_matmul_precision("high")

# Token IDs. Compare against None explicitly: a pad id of 0 is valid and must not
# be replaced by EOS just because it is falsy.
eos_id = getattr(tokenizer, "eos_token_id", None)
pad_id = getattr(tokenizer, "pad_token_id", None)
if pad_id is None:
    pad_id = eos_id

print("Model loaded.")


preprocessor_config.json:   0%|          | 0.00/390 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]



config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.72G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

Model loaded.


In [None]:
import re
from typing import List, Tuple
from PIL import Image


def fit_long_side(img: Image.Image, long_side: int = 1280) -> Image.Image:
    """Shrink ``img`` so its longest side equals ``long_side``; never upscale.

    Args:
        img: Source image.
        long_side: Target length in pixels for the longer edge (must be >= 1).

    Returns:
        ``img`` itself when it already fits, otherwise a bicubic-resized copy with
        the aspect ratio preserved.

    Raises:
        ValueError: If ``long_side`` is smaller than 1.
    """
    if long_side < 1:
        raise ValueError(f"long_side must be >= 1, got {long_side}")

    w, h = img.size
    longest = max(w, h)
    if longest <= long_side:
        # Already small enough (this also covers zero-sized images, avoiding a division by zero).
        return img

    # Integer arithmetic makes the long edge land exactly on `long_side` (no float
    # truncation to long_side - 1), and max(1, ...) keeps very thin images from
    # collapsing to a 0-pixel side, which resize() rejects.
    new_size = (
        max(1, w * long_side // longest),
        max(1, h * long_side // longest),
    )
    return img.resize(new_size, Image.BICUBIC)


# ---------- Prompt: pure Markdown, page header + rich Visual description ----------
# Keep rules ONLY in the system message so they don't get transcribed.
# Strongly nudge the model to produce a *descriptive* illustration line for each visual.
#
# SYSTEM_MD is used as the system-message text for every page.
# NOTE(review): the string contains no `{page_number}` placeholder, so the substitution
# performed by _build_system_prompt currently leaves it unchanged.
# NOTE(review): the leading `<image>` / `<|grounding|>` lines are sent as plain text;
# the GROUNDING_BLOCK regex below keys on `<|grounding|>` to strip an echoed prompt.
# Confirm these markers carry any meaning for the model being used.
SYSTEM_MD = (
"""<image>
<|grounding|>
You are an OCR and document-structure model. Return ONLY Markdown text (no JSON, no HTML, no code fences).

GLOBAL RULES:
- Do NOT output any links, URLs, file paths, or image references like ![...](http://...) or <img ...>.
- Do NOT invent filenames, URLs, or external resources.
- Do NOT repeat these instructions or any system text in the output.

OVERALL FORMAT:
- Do NOT wrap the entire output in ``` fences.
- Use headings (#, ##, ###) for titles and sections that visibly look like headings.
- Preserve lists, sublists, and inline formatting (*bold*, _italic_, `code`) when it is clearly present.
- Maintain the natural reading order of the page.
- Ignore page numbers, running headers, and footers unless they contain important content.

TEXT & MATHEMATICAL CONTENT:
- Transcribe all visible text exactly, preserving original spelling and punctuation.
- Convert mathematical expressions to LaTeX:
  - Inline math → `$ ... $`
  - Display math on its own line → `$$ ... $$`
- If you are unsure about part of an equation, transcribe what you see as faithfully as possible in LaTeX.

TABLES (VERY IMPORTANT):
- If content is arranged in rows and columns like a table, you MUST render it as a Markdown pipe table.
- Do NOT replace tables with prose descriptions.
- Keep the correct number of rows and columns whenever possible.
- If some cells are unreadable, still create the table and put `???` or a close approximation in those cells.
- Use header rows when they are visually present.

DIAGRAMS, FIGURES, IMAGES:
- For any non-text visual element (diagram, flowchart, block diagram, network, chart/graph, schematic, photo, etc.):
  - Add a separate line starting with:
    `Visual: <Short title> — <detailed description>`
  - In the description, explain:
    - The main shapes and layout (boxes, arrows, blocks, axes, etc.).
    - Important labels and how elements are connected or related.
    - For charts/graphs: axes labels, units, direction of change, and key trends or comparisons.
  - Aim for 20–120 words per visual when possible.
- Do NOT use Markdown image syntax like ![alt](url) and do NOT output or invent any URLs.

DOCUMENT FLOW:
- Maintain logical reading order within each page.
- Use horizontal rules (`---`) only for clear major section breaks, not between every page.

Now convert this page into Markdown following these rules exactly.


"""
)

# Short user-turn text: just the page heading, so there is little for the model to echo.
USER_INSTR = "# Page {page_number}"


def _fill_page_number(template: str, page_number: int) -> str:
    """Return ``template`` with every literal ``{page_number}`` token replaced by the number."""
    token = "{page_number}"
    return str(page_number).join(template.split(token))


def _build_system_prompt(page_number: int) -> str:
    """System-message text for the given page (SYSTEM_MD with the page token filled in)."""
    return _fill_page_number(SYSTEM_MD, page_number)


def _build_user_prompt(page_number: int) -> str:
    """User-message text for the given page, e.g. ``# Page 3``."""
    return _fill_page_number(USER_INSTR, page_number)


def build_messages_for_batch(
    images: List[Image.Image],
    page_numbers: List[int],
):
    """
    Create one chat conversation per (image, page number) pair.

    Every conversation is a two-message list:
      - a system message whose content is the system prompt text
      - a user message whose content is [image part, text part '# Page N']

    Pairs are formed with zip, so the result has as many conversations as the
    shorter of the two input lists.
    """

    def conversation(img: Image.Image, pno: int) -> list:
        system_msg = {"role": "system", "content": _build_system_prompt(pno)}
        user_msg = {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": _build_user_prompt(pno)},
            ],
        }
        return [system_msg, user_msg]

    return [conversation(img, pno) for img, pno in zip(images, page_numbers)]


# ---------- Post-processing: strip leaks, enforce header + Visual format ----------

# A line that consists of nothing but a chat role name (optionally followed by ':').
# The earlier pattern removed ANY line that merely started with one of these words,
# which also deleted real content such as "System architecture" or "User manual".
ROLE_LINE = re.compile(r"(?im)^[ \t]*(system|user|assistant)[ \t]*:?[ \t]*$")
# From a known prompt opener up to the next "# Page" (or end of text).
# NOTE(review): "Rules:" is matched anywhere and case-insensitively, so document text
# containing "rules:" is also cut — confirm that trade-off is acceptable.
RULES_BLOCK = re.compile(
    r"(?is)(You are an OCR|Rules:|Extract the page|Output format:)[\s\S]*?(?=#\s*Page|\Z)"
)
# Same openers, but removes everything through the end of the text.
ANY_RULES = re.compile(r"(?is)(You are an OCR|Rules:|Output format:)[\s\S]+$")
# A line containing "# Page <digits>"; group 1 is the page number.
PAGE_HEADER = re.compile(r"(?im)^[^\n#]*#\s*Page\s*(\d+).*$")
# A whole line that opens or closes a ``` fence.
MD_FENCE = re.compile(r"(?m)^\s*```.*$")
# Any run of three or more backticks, wherever it occurs.
BACKTICKS = re.compile(r"`{3,}")
# Leading quote characters, '>' markers and whitespace; group 1 is the remainder.
EXTRANEOUS_QUOTES = re.compile(r"^[\"'>\s]+(.*)$")
# Three or more consecutive newlines.
BLANK_COLLAPSE = re.compile(r"\n{3,}")
# Opening or closing <ref ...> tags.
REF_TAGS = re.compile(r"</?ref[^>]*>", re.IGNORECASE)
# A line that is only "Page <digits>" (no '#').
BARE_PAGE_LINE = re.compile(r"(?im)^\s*Page\s+\d+\s*$")

# Remove any big leaked block that starts at `<|grounding|>` and runs up
# to the next "# Page ..." (or end of text).
GROUNDING_BLOCK = re.compile(
    r"(?is)<\|grounding\|>.*?(?=^#\s*Page\b|\Z)", re.MULTILINE
)

# Lines that sometimes leak; nuke them if they appear. Each entry must match a whole line.
INSTR_LINES = [
    # generic instruction spill patterns
    r"Then write all visible text on the page as Markdown\.?",
    r"For every diagram, figure, image, chart, table, or equation, illustrative description:?\.?",
    r"Visual:\s*<Short Title>\s*—\s*<20–80 words.*?>",
    r"For multi-panel visuals, emit separate lines,.*",
    r"Do not include system(?:/| or )user text, metadata, or URLs\.?",
    r"Transcribe ALL visible text faithfully(?: as Markdown)?\.?",
    r"Preserve headings, lists, tables if obvious\.?",
    r"Extract (?:the page|text) as Markdown \(no backticks, no JSON, no extra explanations\)\.?",
    r"Keep lists and tables if clear\.?",
    r"Begin with '# Page \{page_number\}'\.?",
    r"Return ONLY Markdown \(no backticks, no JSON, no extra explanations\)\.?",
    r"Describe any diagrams\. Do not only copy captions\. Add a 'Visual:' line\.",
    r"Visual:\s*<Short Title>\s*—\s*<20-200 words describing what it shows:.*$",

    # explicit SYSTEM_MD spill patterns (wording of earlier prompt versions)
    r"<\|grounding\|>.*$",
    r"Convert this document to comprehensive Markdown with high accuracy\.?",
    r"\*\*TEXT & STRUCTURE:\*\*.*$",
    r"\*\*MATHEMATICAL CONTENT:\*\*.*$",
    r"\*\*TABLES:\*\*.*$",
    r"\*\*DIAGRAMS & FIGURES:\*\*.*$",
    r"\*\*DOCUMENT FLOW:\*\*.*$",

    # section headings and closing line of the SYSTEM_MD currently defined in this notebook
    r"GLOBAL RULES:",
    r"OVERALL FORMAT:",
    r"TEXT & MATHEMATICAL CONTENT:",
    r"TABLES \(VERY IMPORTANT\):",
    r"DIAGRAMS, FIGURES, IMAGES:",
    r"DOCUMENT FLOW:",
    r"Now convert this page into Markdown following these rules exactly\.?",
]

# One alternation over all entries, anchored to a full (whitespace-trimmed) line.
INSTR_RE = re.compile(r"(?im)^\s*(?:" + "|".join(INSTR_LINES) + r")\s*$")


def enforce_page_header(md: str, page_no: int) -> str:
    """
    Return ``md`` with exactly one canonical '# Page N' header as its first line.

    Every line recognised by PAGE_HEADER (any page-header variant, whatever number
    it carries) is dropped, and a single '# Page {page_no}' line is placed at the
    very top, separated from the remaining text by one blank line. Previously the
    header was inserted where the first variant had been found, so any text that
    preceded it stayed above the header.

    Args:
        md: Page Markdown; None or empty is allowed.
        page_no: Page number to write into the header.

    Returns:
        The header alone when there is no other content, otherwise the header,
        a blank line, and the remaining text.
    """
    header = f"# Page {page_no}"

    body_lines = [
        ln
        for ln in (md or "").strip().splitlines()
        if not PAGE_HEADER.match(ln.strip())
    ]
    body = "\n".join(body_lines).strip()

    return f"{header}\n\n{body}" if body else header


# A "Visual:" label at the start of a line, tolerating leading quote / blockquote
# markers and any capitalisation. Requiring the colon keeps ordinary prose such as
# "Visualization of ..." from being treated as a visual entry.
_VISUAL_LABEL = re.compile(r"(?i)^[\"'>\s]*visual\s*:\s*")
# "<title> - <description>" written with a spaced hyphen or en dash.
_VISUAL_DASH_SPLIT = re.compile(r"(.+?)\s+[-–]\s+(.+)$")


def _format_visual_entry(text: str) -> str:
    """Build one canonical ``Visual: <title> — <description>.`` line.

    ``text`` is everything that followed the label (continuation lines already
    joined with spaces). A label with nothing after it is returned as ``Visual:``.
    """
    body = text.strip()
    if not body:
        return "Visual:"

    if "—" not in body:
        dash = _VISUAL_DASH_SPLIT.match(body)
        if dash:
            # "title - desc" / "title – desc": upgrade the separator to an em dash.
            body = f"{dash.group(1)} — {dash.group(2)}"
        else:
            words = body.split(None, 1)
            if len(words) == 2:
                # No separator at all: treat the first word as the title.
                body = f"{words[0]} — {words[1]}"
            else:
                # Single word: keep it as the title and add a placeholder description.
                body = f"{body} — (descriptive explanation)"

    line = f"Visual: {body}"
    if not line.endswith((".", "!", "?")):
        line += "."
    return line


def normalize_visual_lines(md: str) -> str:
    """
    Normalize 'Visual:' entries to single well-formed lines.

    A line is a visual entry only if it starts with a ``Visual:`` label (any case,
    optional leading quotes or '>' markers). The non-blank lines that follow it are
    folded into the same entry until a blank line or the next ``Visual:`` label, so
    consecutive visuals stay on separate lines. Each entry is rewritten as
    ``Visual: <title> — <description>`` with an em dash separator and terminal
    punctuation. All other lines are passed through untouched.

    Args:
        md: Markdown text.

    Returns:
        The text with every visual entry normalized, lines joined by ``\\n``.
    """
    out: List[str] = []
    entry: List[str] = []   # fragments of the visual entry being accumulated
    in_entry = False

    for ln in md.splitlines():
        stripped = ln.strip()
        label = _VISUAL_LABEL.match(stripped)

        if label:
            # A new label always starts a new entry; finish the previous one first.
            if in_entry:
                out.append(_format_visual_entry(" ".join(entry)))
            entry = [stripped[label.end():]]
            in_entry = True
        elif in_entry and stripped:
            # Wrapped description text belonging to the current entry.
            entry.append(stripped)
        else:
            # Blank line (or ordinary text outside an entry): close any open entry.
            if in_entry:
                out.append(_format_visual_entry(" ".join(entry)))
                in_entry = False
            out.append(ln)

    if in_entry:
        out.append(_format_visual_entry(" ".join(entry)))

    return "\n".join(out)


def clean_qwen_echo(md: str, page_no: int) -> str:
    """
    Turn raw decoded model text into clean Markdown for one page.

    Steps, in order: drop zero-width spaces; delete echoed prompt material
    (grounding block, <ref> tags, role lines, rule blocks), code-fence lines and
    backtick runs; drop leaked instruction lines and bare "Page N" lines; collapse
    runs of blank lines; normalize 'Visual:' lines; enforce the '# Page N' header.

    Returns '# Page N' alone when the input is None, empty or whitespace.
    """
    text = (md or "").replace("\u200b", "").strip()
    if not text:
        return f"# Page {page_no}"

    # Order matters: the grounding block goes first, then the narrower patterns.
    removal_patterns = (
        GROUNDING_BLOCK,
        REF_TAGS,
        ROLE_LINE,
        RULES_BLOCK,
        ANY_RULES,
        MD_FENCE,
        BACKTICKS,
    )
    for pattern in removal_patterns:
        text = pattern.sub("", text)

    # Keep only lines that are neither leaked instructions nor bare "Page N" markers.
    kept = []
    for line in text.splitlines():
        probe = line.strip()
        if INSTR_RE.match(probe) or BARE_PAGE_LINE.match(probe):
            continue
        kept.append(line)
    text = "\n".join(kept)

    text = BLANK_COLLAPSE.sub("\n\n", text).strip()

    text = enforce_page_header(normalize_visual_lines(text), page_no)

    # The header step can leave new blank runs behind; collapse once more.
    return BLANK_COLLAPSE.sub("\n\n", text).strip()


# ---------- Generation wrapper (works with Qwen-VL style models) ----------
def run_batched_generation(
    images: List[Image.Image],
    page_numbers: List[int],
    processor,
    model,
    device,
    eos_id: int,
    pad_id: int,
    max_new_tokens: int = 700,
) -> Tuple[List[str], float]:
    """
    Run one batch of page images through the vision-language model.

    Args:
        images: Page images; must be non-empty.
        page_numbers: Page number for each image, same length as ``images``.
        processor: Processor providing apply_chat_template, __call__ and batch_decode.
        model: Model providing generate.
        device: Device the input tensors are moved to.
        eos_id: Token id passed to generate as eos_token_id.
        pad_id: Token id passed to generate as pad_token_id.
        max_new_tokens: Upper bound on generated tokens per page.

    Returns:
        (texts, elapsed): cleaned Markdown per page, in input order, and the
        wall-clock seconds spent inside ``model.generate``.
    """
    import time
    import torch

    assert len(images) == len(page_numbers) and len(images) > 0

    messages = build_messages_for_batch(images, page_numbers)
    prompts = [
        processor.apply_chat_template(
            m,
            add_generation_prompt=True,
            tokenize=False,
        )
        for m in messages
    ]

    inputs = processor(text=prompts, images=images, return_tensors="pt", padding=True)
    inputs = {k: (v.to(device) if hasattr(v, "to") else v) for k, v in inputs.items()}

    start = time.perf_counter()
    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,      # greedy decoding
            use_cache=True,
            num_beams=1,
            eos_token_id=eos_id,
            pad_token_id=pad_id,
        )
    elapsed = time.perf_counter() - start

    # generate() returns prompt + completion for each row. Decode only the completion:
    # decoding the prompt as well put the whole system prompt into the text and relied on
    # regex clean-up to remove it again, which let prompt fragments through.
    prompt_len = inputs["input_ids"].shape[1]
    completions = out[:, prompt_len:]

    raw_texts = processor.batch_decode(completions, skip_special_tokens=True)
    texts = [clean_qwen_echo(t, pno) for t, pno in zip(raw_texts, page_numbers)]
    return texts, elapsed


In [None]:
from pathlib import Path
import time
import json
from typing import List, Tuple

from PIL import Image
import torch
from pdf2image import convert_from_path

# We keep using fit_long_side and run_batched_generation from the previous cell.


def load_pdf_as_images(pdf_path: str, dpi: int = 120, long_side: int = 896) -> List[Image.Image]:
    """Rasterise every page of ``pdf_path`` at ``dpi``, convert to RGB and shrink to ``long_side``.

    Returns the pages as a list in document order.
    """
    resized_pages = []
    for page in convert_from_path(pdf_path, dpi=dpi):
        rgb_page = page.convert("RGB")
        resized_pages.append(fit_long_side(rgb_page, long_side=long_side))
    return resized_pages


def load_single_image(img_path: str, long_side: int = 896) -> List[Image.Image]:
    """Load one image file as RGB and shrink it to ``long_side``.

    Args:
        img_path: Path of the image file.
        long_side: Maximum length of the longer edge, forwarded to fit_long_side.

    Returns:
        A one-element list, so callers can treat it like a list of PDF pages.
    """
    # The context manager closes the underlying file handle; convert() returns a
    # separate image object, so it remains usable after the file is closed.
    with Image.open(img_path) as src:
        rgb = src.convert("RGB")
    return [fit_long_side(rgb, long_side=long_side)]


def process_path_batched(
    path: str,
    batch_size: int = 2,
    long_side: int = 896,
    dpi: int = 300,
    max_new_tokens: int = 360,
) -> Tuple[List[str], List[float], dict]:
    """
    Pure VL pipeline (no hybrid pdftotext + VL):
      - For PDFs (suffix ".pdf", any case): render every page to an image and run VL on all pages.
      - For anything else: treat the path as a single image and run VL on it.

    Uses the notebook-level ``processor``, ``model``, ``device``, ``eos_id`` and
    ``pad_id`` defined in earlier cells.

    Args:
        path: Input file path.
        batch_size: Pages per generation call; must be >= 1.
        long_side: Maximum length of the longer image edge after resizing.
        dpi: Rendering resolution for PDF pages (ignored for images).
        max_new_tokens: Generation budget per page.

    Returns:
      md_pages: List[str] — cleaned Markdown per page, in page order
      per_page_times: List[float] — batch time divided evenly over its pages
      stats: dict — runtime and environment info

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # range() with step 0 raises an unhelpful error and a negative step would
        # silently process nothing, so reject both up front.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total_start = time.perf_counter()

    # Both input kinds end up as a list of images; only the loader differs.
    if Path(path).suffix.lower() == ".pdf":
        images = load_pdf_as_images(path, dpi=dpi, long_side=long_side)
    else:
        images = load_single_image(path, long_side=long_side)
    page_numbers = list(range(1, len(images) + 1))

    md_pages: List[str] = []
    per_page_times: List[float] = []

    for start in range(0, len(images), batch_size):
        batch_imgs = images[start : start + batch_size]
        batch_pages = page_numbers[start : start + batch_size]

        decoded, elapsed = run_batched_generation(
            batch_imgs,
            batch_pages,
            processor=processor,
            model=model,
            device=device,
            eos_id=eos_id,
            pad_id=pad_id,
            max_new_tokens=max_new_tokens,
        )

        # Batches are processed in page order, so appending keeps md_pages ordered.
        md_pages.extend(txt.strip() for txt in decoded)
        per_page_times.extend([elapsed / len(batch_pages)] * len(batch_pages))
        print(f"[VL] Processed pages {batch_pages} in {elapsed:.2f}s")

    total_time = time.perf_counter() - total_start

    # Environment info comes from names defined in earlier cells; each is optional here.
    env = globals()
    if "model" in env:
        # transformers keeps the active backend on the config as `_attn_implementation`;
        # the un-prefixed attribute read before does not exist, so "torch" was always reported.
        attn_used = getattr(env["model"].config, "_attn_implementation", None) or "torch"
    else:
        attn_used = None

    stats = {
        "num_pages": len(md_pages),
        "avg_time_per_page_sec": round(sum(per_page_times) / len(per_page_times), 2)
        if per_page_times
        else None,
        "total_runtime_sec": round(total_time, 2),
        "vl_pages_used": list(page_numbers),
        "attn_implementation": attn_used,
        "device": env.get("device"),
        "sm": f"{env['sm_major']}.{env['sm_minor']}" if "sm_major" in env else None,
        "dtype": str(env["dtype"]).replace("torch.", "") if "dtype" in env else None,
        "batch_size": batch_size,
        "long_side": long_side,
        "dpi": dpi,
        "model": env.get("MODEL"),
    }

    return md_pages, per_page_times, stats


In [None]:
from pathlib import Path
import json
from enum import Enum
from dataclasses import dataclass

# ---- Quality modes ----
class Quality(Enum):
    """Named speed/quality presets accepted by get_inference_config."""

    BASELINE = "baseline"
    FAST = "fast"
    ACCURATE = "accurate"


@dataclass
class InferenceConfig:
    """Tunable knobs for one pipeline run."""

    dpi: int             # PDF rendering resolution
    max_new_tokens: int  # generation budget per page
    batch_size: int      # pages per generation call
    long_side: int       # max length of the longer image edge, in pixels


# Preset values per mode. ACCURATE is the reference; FAST lowers DPI, token budget
# and image size; BASELINE sits between the two.
_PRESETS = {
    Quality.ACCURATE: dict(dpi=250, max_new_tokens=300, batch_size=16, long_side=512),
    Quality.FAST: dict(dpi=200, max_new_tokens=220, batch_size=16, long_side=448),
    Quality.BASELINE: dict(dpi=230, max_new_tokens=260, batch_size=16, long_side=512),
}


def get_inference_config(quality: Quality) -> InferenceConfig:
    """
    Return a fresh InferenceConfig for the requested quality mode.

    Args:
        quality: A Quality member, or its string value ("baseline", "fast", "accurate").

    Returns:
        A new InferenceConfig instance (safe for the caller to modify).

    Raises:
        ValueError: If ``quality`` is not a recognised mode. Previously any
            unrecognised value — including the plain string "fast" — silently
            produced the BASELINE settings.
    """
    preset = Quality(quality)  # accepts members and their values; rejects anything else
    return InferenceConfig(**_PRESETS[preset])


# ---- Choose mode here ----
QUALITY = Quality.FAST   # one of Quality.BASELINE / Quality.FAST / Quality.ACCURATE
cfg = get_inference_config(QUALITY)

# ---- Config ----
PDF_OR_IMAGE_PATH = "2_Telephony.pdf"   # input: a .pdf, or any image file
OUT_MD = "2_telephony.md"               # every page, concatenated
OUT_JSONL = "per_page.md.jsonl"         # one JSON record per page
OUT_REPORT = "inference_report.json"    # parameters and timing stats

DPI, MAX_NEW_TOKENS = cfg.dpi, cfg.max_new_tokens
BATCH_SIZE, LONG_SIDE = cfg.batch_size, cfg.long_side

# ---- Make sure each output file's parent folder exists ----
for out_path in map(Path, (OUT_MD, OUT_JSONL, OUT_REPORT)):
    out_path.parent.mkdir(parents=True, exist_ok=True)

# ---- Run pipeline ----
md_pages, per_page_times, stats = process_path_batched(
    PDF_OR_IMAGE_PATH,
    batch_size=BATCH_SIZE,
    long_side=LONG_SIDE,
    dpi=DPI,
    max_new_tokens=MAX_NEW_TOKENS,
)

# Markdown: pages separated by one blank line.
Path(OUT_MD).write_text("\n\n".join(md_pages), encoding="utf-8")

# JSONL: {"page": <1-based index>, "markdown": <text>} per line.
with open(OUT_JSONL, "w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(
        json.dumps({"page": page_idx, "markdown": page_md}, ensure_ascii=False) + "\n"
        for page_idx, page_md in enumerate(md_pages, start=1)
    )

# Report: the knobs used for this run plus the stats returned by the pipeline.
report = {
    "path": PDF_OR_IMAGE_PATH,
    "params": {
        "quality": QUALITY.value,
        "batch_size": BATCH_SIZE,
        "long_side": LONG_SIDE,
        "dpi": DPI,
        "max_new_tokens": MAX_NEW_TOKENS,
    },
    "stats": stats,
}
Path(OUT_REPORT).write_text(
    json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
)

# ---- Console summary ----
print("✅ Done.")
summary_rows = (
    ("📄 Pages:", len(md_pages)),
    ("📊 Avg time/page (s):", report["stats"]["avg_time_per_page_sec"]),
    ("📝 Markdown saved to:", OUT_MD),
    ("🧾 Per-page JSONL saved to:", OUT_JSONL),
    ("📒 Report saved to:", OUT_REPORT),
    ("⚙️ Stats:", json.dumps(stats, indent=2)),
)
for label, value in summary_rows:
    print(label, value)


[VL] Processed pages [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] in 30.05s
[VL] Processed pages [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32] in 28.44s
[VL] Processed pages [33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48] in 28.45s
[VL] Processed pages [49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64] in 28.42s
[VL] Processed pages [65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80] in 28.46s
[VL] Processed pages [81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96] in 28.44s
[VL] Processed pages [97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112] in 29.51s
[VL] Processed pages [113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128] in 28.47s
[VL] Processed pages [129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144] in 28.48s
[VL] Processed pages [145, 146, 147] in 13.35s
✅ Done.
📄 Pages: 147
📊 Avg time/page (s): 1.85
📝 Mark