
# MinerU — Clean, Structured Version (no server required)

This notebook wraps the MinerU / `magic_pdf` pipeline in clear, reusable functions.  
It provides the **same functionality** as the original script:
- Iterate PDFs under `downloads/` (recursively if desired)
- Skip already-parsed files via `parsed_list.txt`
- Run document analysis (`doc_analyze`) and export:
  - Markdown (`.md`), images, and content list (`_content_list.json`)
  - Intermediate JSON (`_middle.json`)
  - Layout and span visualization PDFs
- Per-file error handling: a failure is reported on the console and the batch continues with the next PDF

> **Note:** This version is designed to run **with or without a GPU**. If a GPU is available, it will be used; otherwise the pipeline runs on CPU (slower). You don't need vLLM or a model server.


In [None]:

import os
import sys
import json
from pathlib import Path
from typing import Iterable, List, Set, Optional, Dict, Tuple

# Optional: tqdm for progress
try:
    from tqdm import tqdm
except Exception:
    tqdm = lambda x, **k: x  # fallback

# MinerU / magic_pdf
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

# Report which GPUs this process is allowed to see. This line only prints the
# current value of CUDA_VISIBLE_DEVICES (or "(not set)"); it does not change it.
# You can override with: os.environ["CUDA_VISIBLE_DEVICES"] = "0" (or "" for CPU)
# NOTE(review): the variable is presumably only honored when set before CUDA is
# initialized, so the safest place for an override is before the magic_pdf
# imports above -- confirm for your environment.
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES", "(not set)"))


## Configuration

In [None]:

# Root directory (by default, current working directory)
ROOT = Path(os.getcwd())

# Input PDFs directory
INPUT_DIR = ROOT / "downloads"  # change if needed

# Output root (MinerU artifacts); created on first run
OUT_DIR = ROOT / "mineru_out"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Parsed list file to skip already-processed PDFs
PARSED_LIST_FILE = ROOT / "parsed_list.txt"

# Parsing method: SupportedPdfParseMethod has two members, TXT (use the PDF's
# embedded text layer) and OCR (recognize text from page images). TXT matches
# the pipeline in this notebook, which runs doc_analyze with ocr=False and
# pipe_txt_mode.
PARSE_METHOD = SupportedPdfParseMethod.TXT

# Whether to search INPUT_DIR recursively for PDFs
RECURSIVE = True

# Max PDFs to process in one run (None = no limit)
MAX_PDFS: Optional[int] = None

print("ROOT:", ROOT)
print("INPUT_DIR:", INPUT_DIR)
print("OUT_DIR:", OUT_DIR)
print("PARSE_METHOD:", PARSE_METHOD)


## Utilities

In [None]:

def load_parsed_list(path: Path) -> Set[str]:
    """Return the set of already-processed PDF keys stored in *path*.

    The file holds one key per line; surrounding whitespace is stripped and
    blank lines are ignored. When the file does not exist yet, an empty file
    is created and an empty set is returned.
    """
    if not path.exists():
        # First run: create the (empty) list file.
        path.write_text("", encoding="utf-8")
        return set()

    entries: Set[str] = set()
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if entry:
            entries.add(entry)
    return entries

def save_parsed_list(path: Path, parsed: Set[str]) -> None:
    """Overwrite *path* with the keys in *parsed*, one per line, sorted.

    Lines are joined with "\\n"; no trailing newline is written, so an empty
    set produces an empty file.
    """
    ordered_keys = sorted(parsed)
    payload = "\n".join(ordered_keys)
    path.write_text(payload, encoding="utf-8")

def iter_pdfs(root: Path, recursive: bool = True) -> Iterable[Path]:
    """Lazily yield the PDF files found under *root*.

    Args:
        root: Directory to search.
        recursive: When true, descend into sub-directories; otherwise only
            look at the direct children of *root*.

    Yields:
        Paths of regular files whose name ends in ".pdf", compared
        case-insensitively so that "scan.PDF" is found on case-sensitive
        file systems too. Directories that merely have a ".pdf" name are
        skipped, since they cannot be parsed.
    """
    candidates = root.rglob("*") if recursive else root.glob("*")
    for candidate in candidates:
        # Check the name first: it is cheap and avoids a stat() on most entries.
        if candidate.name.lower().endswith(".pdf") and candidate.is_file():
            yield candidate

def safe_name(pdf_path: Path) -> str:
    """Return the base name used for every artifact produced from *pdf_path*.

    File names that look like arXiv downloads (at least three dot-separated
    parts, the first consisting only of digits) are reduced to their first
    two parts, i.e. the arXiv identifier; the title portion is dropped:

        '1610.00402v2.Dynamic_Polygon_Clouds.pdf' -> '1610.00402v2'

    Any other file name is returned without its final extension.
    """
    pieces = pdf_path.name.split(".")
    looks_like_arxiv = len(pieces) >= 3 and pieces[0].isdigit()
    if not looks_like_arxiv:
        return pdf_path.stem
    arxiv_id = ".".join(pieces[:2])
    return arxiv_id

def ensure_dirs(base: Path, name_without_ext: str) -> Tuple[Path, Path, Path]:
    """Create the output directories for one document and return them.

    The result is ``(local_md_dir, image_dir, local_vis_dir)``:
    ``base/md`` and ``base/vis`` are shared by all documents, while
    ``base/images/<name_without_ext>`` is specific to this document.
    Directories that already exist are left untouched.
    """
    targets = (
        base / "md",
        base / "images" / name_without_ext,
        base / "vis",
    )
    for target in targets:
        target.mkdir(parents=True, exist_ok=True)
    return targets


## Core processing

In [None]:

def process_pdf(pdf_path: Path, parse_method=PARSE_METHOD, out_root: Path = OUT_DIR) -> Dict[str, str]:
    """Run the MinerU pipeline on a single PDF and save its artifacts.

    Args:
        pdf_path: PDF file to process.
        parse_method: A ``SupportedPdfParseMethod``. ``OCR`` runs the OCR
            pipeline; any other member runs the text pipeline. ``None`` lets
            the dataset pick a method via ``ds.classify()``.
        out_root: Directory under which the ``md/``, ``images/<name>/`` and
            ``vis/`` output folders are created.

    Returns:
        Output locations as POSIX path strings, keyed by ``"md"``,
        ``"content_list"``, ``"middle_json"``, ``"vis_layout"``,
        ``"vis_spans"`` and ``"images_dir"``.

    Raises:
        Whatever the file read or the magic_pdf pipeline raises; nothing is
        caught here.
    """
    pdf_path = pdf_path.resolve()
    name_without_ext = safe_name(pdf_path)
    local_md_dir, image_dir, local_vis_dir = ensure_dirs(out_root, name_without_ext)

    # Writers for the Markdown/JSON outputs and for extracted images.
    md_writer = FileBasedDataWriter(local_md_dir.as_posix())
    image_writer = FileBasedDataWriter(image_dir.as_posix())

    # PymuDocDataset is built from the raw PDF bytes, not from a reader object
    # or a path, so read the file first.
    pdf_bytes = FileBasedDataReader("").read(pdf_path.as_posix())
    ds = PymuDocDataset(pdf_bytes)

    if parse_method is None:
        # Let MinerU decide whether the document needs OCR.
        parse_method = ds.classify()
    use_ocr = parse_method == SupportedPdfParseMethod.OCR

    # Run doc analysis, then the matching pipeline; the OCR flag and the pipe
    # mode must agree.
    infer_result = ds.apply(doc_analyze, ocr=use_ocr)
    if use_ocr:
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    # Visualizations of layout and spans
    vis_layout_pdf = (local_vis_dir / f"{name_without_ext}_layout.pdf").as_posix()
    vis_spans_pdf = (local_vis_dir / f"{name_without_ext}_spans.pdf").as_posix()
    pipe_result.draw_layout(vis_layout_pdf)
    pipe_result.draw_span(vis_spans_pdf)

    # Markdown + content list + middle json (file names are relative to md_writer's directory)
    md_file = f"{name_without_ext}.md"
    content_list_file = f"{name_without_ext}_content_list.json"
    middle_json_file = f"{name_without_ext}_middle.json"

    # The image directory is passed as the prefix for image references.
    pipe_result.dump_md(md_writer, md_file, image_dir.as_posix())
    pipe_result.dump_content_list(md_writer, content_list_file, image_dir.as_posix())
    pipe_result.dump_middle_json(md_writer, middle_json_file)

    return {
        "md": (local_md_dir / md_file).as_posix(),
        "content_list": (local_md_dir / content_list_file).as_posix(),
        "middle_json": (local_md_dir / middle_json_file).as_posix(),
        "vis_layout": vis_layout_pdf,
        "vis_spans": vis_spans_pdf,
        "images_dir": image_dir.as_posix(),
    }


## Batch runner

In [None]:

def run_batch(input_dir: Path = INPUT_DIR, max_pdfs: Optional[int] = MAX_PDFS) -> None:
    """Process every not-yet-parsed PDF under *input_dir*.

    Args:
        input_dir: Directory searched for PDFs (recursively when the
            module-level ``RECURSIVE`` flag is set).
        max_pdfs: Stop after this many PDFs have been attempted in this run,
            counting failures as well as successes. ``None`` means no limit.

    PDFs whose resolved path is already listed in ``PARSED_LIST_FILE`` are
    skipped and do not count towards *max_pdfs*. A failing PDF is reported
    and left out of the list, so it is retried on the next run.
    """
    parsed = load_parsed_list(PARSED_LIST_FILE)
    attempted = 0
    try:
        for pdf in tqdm(iter_pdfs(input_dir, recursive=RECURSIVE), desc="PDFs"):
            key = pdf.resolve().as_posix()
            if key in parsed:
                continue
            try:
                outs = process_pdf(pdf)
            except Exception as e:
                # Best effort: report the failure and move on to the next PDF.
                print(f"[ERROR] {pdf.name}: {e}")
            else:
                parsed.add(key)
                # Persist right away so a later crash or kernel restart does
                # not throw away the work already done in this run.
                save_parsed_list(PARSED_LIST_FILE, parsed)
                print(f"[OK] {pdf.name} -> {outs['md']}")
            attempted += 1
            if max_pdfs is not None and attempted >= max_pdfs:
                break
    finally:
        # Also runs on KeyboardInterrupt, keeping the list file consistent.
        save_parsed_list(PARSED_LIST_FILE, parsed)

# Status message only: nothing is processed by this cell.
# Call run_batch() to start processing the PDFs under INPUT_DIR.
print("Ready. Place PDFs under:", INPUT_DIR)


## Quick test on a single file (optional)

In [None]:

# Example:
# sample_pdf = next(iter_pdfs(INPUT_DIR, recursive=RECURSIVE), None)
# if sample_pdf is not None:
#     process_pdf(sample_pdf)
# else:
#     print("No PDFs found for quick test.")
