In [1]:
# PDF -> Markdown using PyMuPDF + Ollama native API (/api/generate with "images": [...])
# Run in JupyterLab
# pip install pymupdf requests tqdm

import os
import base64
import time
from typing import List, Tuple
import fitz  # PyMuPDF
import requests
from tqdm import tqdm

In [2]:
# ---- Paths ----
# Root of the dataset checkout. Set the FINANCEBENCH_DATASET_DIR environment variable
# to run on another machine without editing the notebook; the default keeps the
# original location.
DATASET_DIR = os.environ.get(
    "FINANCEBENCH_DATASET_DIR",
    "/home/isztld/hackaton/hackingsocks/00_dataset",
)
# Base name shared by the input PDF and the output Markdown file, defined once so
# the two paths cannot drift apart.
DOC_NAME = "3M_2015_10K"
PDF_PATH = os.path.join(DATASET_DIR, "financebench", "pdfs", f"{DOC_NAME}.pdf")
OUT_MD_PATH = os.path.join(DATASET_DIR, "financebench_md", "md", f"{DOC_NAME}.md")


In [3]:
# ---- Ollama native API ----
# URL that call_ollama_generate() POSTs its JSON payload to.
OLLAMA_URL = "http://localhost:11434/api/generate"
# Value sent as the "model" field of every request payload.
MODEL = "qwen2.5vl:7b"

In [4]:
# ---- Rendering / batching ----
DPI = 170                 # render resolution handed to render_pdf_pages_to_base64(); 150–200 is usually good for SEC filings
PAGES_PER_REQUEST = 2     # lower if you hit 413 / OOM; raise for speed if stable (only referenced by the commented-out batched cell)
TEMPERATURE = 0.1         # sent as options.temperature in the request payload
# NOTE(review): the per-page run recorded in this notebook ended in a ReadTimeout at
# this value, so slow pages may need a larger timeout.
TIMEOUT = 180             # seconds per request (passed as `timeout` to requests.post)
PAGE_BREAK = "\n\n---\n\n"  # Markdown horizontal rule used as the separator between pages

# Optional: limit pages during testing (1-based, inclusive)
START_PAGE = None  # e.g., 1
END_PAGE   = None  # e.g., 10

# ---- Prompt tuned for 10-K PDFs (multi-column, tables, footnotes) ----
# Sent verbatim as the "prompt" field of every request, alongside the page images.
PROMPT_TEMPLATE = """You are a precise SEC document transcriber.
Convert the following PDF page images into clean Markdown.

Rules:
1) Preserve SEC/10-K hierarchy and reading order (handle multi-column pages left→right, top→bottom).
2) Use Markdown headings: # Title, ## PART I/II/III/IV, ### ITEM 1, ITEM 1A, ITEM 7, ITEM 8, etc.
3) Render financial statements/tables as Markdown tables with headers and totals kept intact.
4) Keep lists, sublists, and inline formatting. Use $...$ only for actual math.
5) Remove running headers/footers/page numbers and join hyphenated line breaks (rejoin split words).
6) Keep footnote markers and place footnotes directly after the related table/section when visible.
7) If text is unreadable, write “[illegible]”.
8) Output ONLY Markdown — no extra commentary.
9) Insert a line with '---' between pages, in the exact order provided.

The batch below contains consecutive pages. Produce one continuous Markdown snippet for the batch, with '---' separating pages.
"""

In [5]:
def _page_range(total_pages: int, start_1b=None, end_1b=None) -> Tuple[int, int]:
    """Return 0-based [start, endExclusive] from 1-based inputs (or None)."""
    s = 1 if start_1b is None else max(1, int(start_1b))
    e = total_pages if end_1b is None else min(total_pages, int(end_1b))
    if e < s:
        e = s
    return s - 1, e

def render_pdf_pages_to_base64(pdf_path: str, dpi: int, start_1b=None, end_1b=None) -> List[str]:
    """Render selected PDF pages to base64-encoded PNG strings using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Target render resolution in dots per inch.
        start_1b: First page to render (1-based); None means the first page.
        end_1b: Last page to render (1-based, inclusive); None means the last page.

    Returns:
        One base64 string (UTF-8 decoded) per rendered page, in page order.
    """
    # PyMuPDF renders at 72 dpi for an identity matrix, so scale by dpi / 72.
    scale = dpi / 72.0
    mat = fitz.Matrix(scale, scale)
    imgs: List[str] = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as doc:
        start0, end0_exclusive = _page_range(len(doc), start_1b, end_1b)
        for i in range(start0, end0_exclusive):
            page = doc.load_page(i)
            # alpha=False: opaque image, no transparency channel.
            pix = page.get_pixmap(matrix=mat, alpha=False)
            png_bytes = pix.tobytes("png")
            imgs.append(base64.b64encode(png_bytes).decode("utf-8"))
    return imgs

def chunked(seq, n):
    """Lazily split ``seq`` into consecutive slices of at most ``n`` items.

    Yields ``(chunk, chunk_start_index)`` pairs, where ``chunk_start_index`` is
    the position in ``seq`` of the chunk's first element.
    """
    yield from (
        (seq[start:start + n], start)
        for start in range(0, len(seq), n)
    )

def call_ollama_generate(images_base64: List[str]) -> str:
    """Call Ollama native /api/generate with multiple images, return the text response.

    Up to four attempts are made. An attempt is retried, with a linearly growing
    pause, when the request times out, the connection fails, or the server
    answers with one of the retryable status codes (413, 429, 502, 503, 504).

    Args:
        images_base64: Base64-encoded images sent as the payload's "images" list.

    Returns:
        The stripped "response" field of the JSON reply ("" if missing or empty).

    Raises:
        RuntimeError: On a non-retryable HTTP status, or when every attempt
            ended with a retryable HTTP status.
        requests.exceptions.Timeout, requests.exceptions.ConnectionError:
            Re-raised unchanged if the final attempt fails at the transport level.
    """
    payload = {
        "model": MODEL,
        "prompt": PROMPT_TEMPLATE,
        "images": images_base64,
        "stream": False,
        "options": {
            "temperature": TEMPERATURE
        }
    }
    max_attempts = 4
    retryable_statuses = (413, 429, 502, 503, 504)
    last_status = None
    for attempt in range(max_attempts):
        is_last_attempt = attempt == max_attempts - 1
        backoff_seconds = 2.0 * (attempt + 1)
        try:
            r = requests.post(OLLAMA_URL, json=payload, timeout=TIMEOUT)
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
            # Transport failures used to escape the retry loop and abort the
            # whole run on the first slow page; treat them as retryable.
            if is_last_attempt:
                raise
            time.sleep(backoff_seconds)
            continue
        if r.status_code == 200:
            j = r.json()
            # Native response shape: {"model": ..., "response": "...", "done": true, ...}
            return (j.get("response") or "").strip()
        if r.status_code in retryable_statuses:
            last_status = r.status_code
            # No point in sleeping when there is no further attempt to wait for.
            if not is_last_attempt:
                time.sleep(backoff_seconds)
            continue
        raise RuntimeError(f"Ollama error {r.status_code}: {r.text[:500]}")
    raise RuntimeError(f"Failed after retries. Last status: {last_status}")

In [6]:
# # ---- Run conversion (batched variant, disabled; the per-page cells below are the ones that run) ----
# os.makedirs(os.path.dirname(OUT_MD_PATH), exist_ok=True)
# 
# print("Rendering PDF pages with PyMuPDF ...")
# b64_pages = render_pdf_pages_to_base64(PDF_PATH, DPI, START_PAGE, END_PAGE)
# total_pages = len(b64_pages)
# print(f"Total pages to process: {total_pages}")
# 
# parts = []
# for batch, offset in tqdm(list(chunked(b64_pages, PAGES_PER_REQUEST)), desc="Converting", unit="batch"):
#     md = call_ollama_generate(batch)
#     md = md.strip()
#     # Normalize: ensure a page break at the end of each batch (model also inserts '---')
#     if not md.endswith(PAGE_BREAK.strip()):
#         md += PAGE_BREAK
#     parts.append(md)
# 
# final_md = "".join(parts).strip()
# # Collapse duplicate breaks if any
# while PAGE_BREAK * 2 in final_md:
#     final_md = final_md.replace(PAGE_BREAK * 2, PAGE_BREAK)
# 
# with open(OUT_MD_PATH, "w", encoding="utf-8") as f:
#     f.write(final_md)
# 
# print(f"Saved Markdown to: {OUT_MD_PATH}")

In [7]:
# ---- Run conversion ----
# os.path.dirname() returns "" when OUT_MD_PATH is a bare filename, and
# os.makedirs("") raises; fall back to the current directory in that case.
os.makedirs(os.path.dirname(OUT_MD_PATH) or ".", exist_ok=True)

print("Rendering PDF pages with PyMuPDF ...")
# One base64-encoded PNG string per selected page, in page order.
b64_pages = render_pdf_pages_to_base64(PDF_PATH, DPI, START_PAGE, END_PAGE)
total_pages = len(b64_pages)
print(f"Total pages to process: {total_pages}")

Rendering PDF pages with PyMuPDF ...
Total pages to process: 158


In [21]:
# Transcribe page by page: each request carries exactly one page image.
# Nothing here catches exceptions, so an error raised by call_ollama_generate()
# stops the loop and leaves `parts` holding only the pages completed so far.
parts = []  # stripped Markdown text of each page, in page order
for page in tqdm(b64_pages):
    # `md` is only the current page's Markdown; the whole document lives in `parts`.
    md = call_ollama_generate([page])
    parts.append(md.strip())

  3%|▎         | 4/158 [03:18<2:07:14, 49.57s/it]


ReadTimeout: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=180)

In [None]:
#md = md.strip()

In [18]:
# Write the whole document: `parts` holds one Markdown string per converted page.
# (`md` is only the last page processed, so writing it would drop every other page.)
if len(parts) != len(b64_pages):
    # Best effort: still save what exists, but make an incomplete run visible.
    print(f"Warning: only {len(parts)} of {len(b64_pages)} pages were converted; writing a partial file.")
final_md = PAGE_BREAK.join(parts)
with open(OUT_MD_PATH, "w", encoding="utf-8") as f:
    f.write(final_md)