In [18]:
# PDF -> Markdown using PyMuPDF + Ollama native API (/api/generate with "images": [...])
# Run in JupyterLab
# pip install pymupdf requests tqdm

import os
import base64
import time
from typing import List, Tuple
import fitz  # PyMuPDF
import requests
from tqdm import tqdm

from typing import List, Optional
import json
import time
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [19]:
# ---- Paths (yours) ----
# Input PDF to convert.
PDF_PATH = "/home/isztld/hackaton/hackingsocks/00_dataset/financebench/pdfs/3M_2015_10K.pdf"
# Directory that receives the generated Markdown.
OUT_MD_DIR = "/home/isztld/hackaton/hackingsocks_david/00_dataset/financebench_md/md"
# Derive the output file name from the input PDF so that pointing PDF_PATH at another
# filing cannot silently overwrite (or mislabel) a previously converted document.
OUT_MD_PATH = os.path.join(
    OUT_MD_DIR,
    os.path.splitext(os.path.basename(PDF_PATH))[0] + ".md",
)


In [20]:
# ---- Ollama native API ----
# Endpoint every request is POSTed to (see call_ollama_generate).
OLLAMA_URL = "http://localhost:11434/api/generate"
# Model tag sent in each request payload.
MODEL = "qwen2.5vl:7b" # NOTE(review): "32b" here is presumably the larger alternative tag — confirm

In [21]:
# ---- Rendering / batching ----
# DPI is converted to a zoom factor (DPI / 72) when pages are rasterized.
DPI = 170                 # 150–200 is usually good for SEC filings
# NOTE: only the commented-out batch runner references PAGES_PER_REQUEST;
# the active loops send one page per request.
PAGES_PER_REQUEST = 2     # lower if you hit 413 / OOM; raise for speed if stable
# Sampling temperature, sent in the request's "options".
TEMPERATURE = 0.1
TIMEOUT = 180             # read timeout in seconds per request (connect timeout is set in call_ollama_generate)
# Markdown horizontal rule used as the page separator.
PAGE_BREAK = "\n\n---\n\n"

# Optional: limit pages during testing (1-based, inclusive); None means "no limit"
START_PAGE = None  # e.g., 1
END_PAGE   = None  # e.g., 10

# ---- Prompt tuned for 10-K PDFs (multi-column, tables, footnotes) ----
# Sent verbatim as the "prompt" field of every request; the page images travel
# separately in the "images" field.
PROMPT_TEMPLATE = """You are a precise SEC document transcriber.
Convert the following PDF page images into clean Markdown.

Rules:
1) Preserve SEC/10-K hierarchy and reading order (handle multi-column pages left→right, top→bottom).
2) Use Markdown headings: # Title, ## PART I/II/III/IV, ### ITEM 1, ITEM 1A, ITEM 7, ITEM 8, etc.
3) Render financial statements/tables as Markdown tables with headers and totals kept intact.
4) Keep lists, sublists, and inline formatting. Use $...$ only for actual math.
5) Remove running headers/footers/page numbers and join hyphenated line breaks (rejoin split words).
6) Keep footnote markers and place footnotes directly after the related table/section when visible.
7) If text is unreadable, write “[illegible]”.
8) Output ONLY Markdown — no extra commentary.
9) Insert a line with '---' between pages, in the exact order provided.

The batch below contains consecutive pages. Produce one continuous Markdown snippet for the batch, with '---' separating pages.
"""

In [22]:
def _page_range(total_pages: int, start_1b=None, end_1b=None) -> Tuple[int, int]:
    """Return 0-based [start, endExclusive] from 1-based inputs (or None)."""
    s = 1 if start_1b is None else max(1, int(start_1b))
    e = total_pages if end_1b is None else min(total_pages, int(end_1b))
    if e < s:
        e = s
    return s - 1, e

def render_pdf_pages_to_base64(pdf_path: str, dpi: int, start_1b=None, end_1b=None) -> List[str]:
    """Render selected PDF pages to base64-encoded PNG strings using PyMuPDF.

    Args:
        pdf_path: Path of the PDF to open.
        dpi: Target resolution; converted to a zoom factor of ``dpi / 72``.
        start_1b: First page to render (1-based), or None for the first page.
        end_1b: Last page to render (1-based, inclusive), or None for the last page.

    Returns:
        One base64 string (UTF-8 decoded) per rendered page, in page order.
    """
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)
    imgs: List[str] = []
    # Open the document as a context manager so it is closed even if rendering
    # a page raises; the original left the document open.
    with fitz.open(pdf_path) as doc:
        start0, end_excl = _page_range(len(doc), start_1b, end_1b)
        for i in range(start0, end_excl):
            pix = doc.load_page(i).get_pixmap(matrix=mat, alpha=False)
            imgs.append(base64.b64encode(pix.tobytes("png")).decode("utf-8"))
    return imgs

def chunked(seq, n):
    """Yield ``(chunk, start_index)`` pairs covering ``seq`` in slices of at most ``n`` items.

    ``start_index`` is the position in ``seq`` of the chunk's first element.
    """
    starts = range(0, len(seq), n)
    yield from ((seq[begin:begin + n], begin) for begin in starts)

#def call_ollama_generate(images_base64: List[str]) -> str:
#    """Call Ollama native /api/generate with multiple images, return the text response."""
#    payload = {
#        "model": MODEL,
#        "prompt": PROMPT_TEMPLATE,
#        "images": images_base64,
#        "stream": False,
#        "options": {
#            "temperature": TEMPERATURE
#        }
#    }
#    for attempt in range(4):
#        r = requests.post(OLLAMA_URL, json=payload, timeout=TIMEOUT)
#        if r.status_code == 200:
#            j = r.json()
#            # Native response shape: {"model": ..., "response": "...", "done": true, ...}
#            return (j.get("response") or "").strip()
#        if r.status_code in (413, 429, 502, 503, 504):
#            time.sleep(2.0 * (attempt + 1))
#            continue
#        raise RuntimeError(f"Ollama error {r.status_code}: {r.text[:500]}")
#    raise RuntimeError("Failed after retries.")


# ---- one global Session with connection-level retries & keep-alive ----
# Only failures to *establish* a connection are retried at this layer.
# HTTP status handling (408/413/429/5xx, Retry-After) and read timeouts are left to
# call_ollama_generate, which has its own retry loop. Retrying them here as well
# would (a) hide the responses from that function, because an exhausted status retry
# surfaces as requests.exceptions.RetryError instead of a response, and (b) multiply
# the attempts, turning one slow page into many back-to-back long-running requests.
_retry = Retry(
    total=4,                          # upper bound on retries (not total tries)
    connect=4,                        # retries for connection-establishment errors
    read=0,                           # do not re-send a request that already reached the server
    backoff_factor=0.6,               # exponential backoff between connect retries
    allowed_methods=frozenset(["POST"]),
    respect_retry_after_header=False, # Retry-After is honoured by call_ollama_generate
)
_adapter = HTTPAdapter(pool_connections=10, pool_maxsize=10, max_retries=_retry, pool_block=True)
SESSION = requests.Session()
SESSION.mount("http://", _adapter)
SESSION.mount("https://", _adapter)

def call_ollama_generate(images_base64: List[str]) -> str:
    """Send page images to Ollama's /api/generate endpoint and return the generated text.

    The request is built from the module-level MODEL, PROMPT_TEMPLATE and TEMPERATURE
    and POSTed to OLLAMA_URL through SESSION. Up to four attempts are made: network
    errors and 408/429/5xx responses are retried with exponential backoff (honouring
    a numeric Retry-After header when present).

    Args:
        images_base64: Base64-encoded images, sent as the request's "images" list.

    Returns:
        The "response" field of the JSON reply with surrounding whitespace removed,
        or "" when that field is missing or empty.

    Raises:
        RuntimeError: on HTTP 413, on any other non-retryable status, when a 200
            reply is not valid JSON, when the session's own retry policy gives up,
            or when all attempts are exhausted.
    """
    max_attempts = 4
    retryable_statuses = (408, 429, 500, 502, 503, 504)

    # Serialize once, outside the loop: the body embeds every image and is identical
    # on each attempt, so re-encoding it per retry would be wasted work.
    body = json.dumps(
        {
            "model": MODEL,
            "prompt": PROMPT_TEMPLATE,
            "images": images_base64,
            "stream": False,
            "options": {"temperature": TEMPERATURE},
        }
    )

    # Split timeout into (connect, read) so slow generations don't look like connect failures.
    timeout = (10, TIMEOUT)  # 10s connect, TIMEOUT read

    last_failure = "no response received"
    for attempt in range(max_attempts):
        is_last_attempt = attempt == max_attempts - 1
        try:
            # The context manager closes the response on every exit path. The previous
            # try/finally called r.close() even when the POST itself had raised, i.e.
            # on an unbound or stale `r`, and hid that behind a bare `except: pass`.
            with SESSION.post(
                OLLAMA_URL,
                data=body,
                headers={"Content-Type": "application/json"},
                timeout=timeout,
            ) as r:
                text = r.text  # read the whole body so the connection can be reused
                status = r.status_code

                if status == 200:
                    try:
                        j = r.json()
                    except ValueError as e:
                        raise RuntimeError(f"Invalid JSON from Ollama: {text[:500]}") from e
                    return (j.get("response") or "").strip()

                if status == 413:
                    # Likely request too large; retrying usually won't help.
                    raise RuntimeError(
                        "Ollama returned 413 (Payload Too Large). "
                        "Reduce image count/size or model context; server said: "
                        f"{text[:500]}"
                    )

                if status not in retryable_statuses:
                    # Other statuses: fail fast with context.
                    raise RuntimeError(f"Ollama error {status}: {text[:500]}")

                last_failure = f"{status}: {text[:500]}"
                retry_after = r.headers.get("Retry-After")

        except requests.exceptions.RetryError as e:
            # The session's adapter already retried and gave up; retrying again here
            # would only multiply the attempts, so report it in this function's
            # usual exception type.
            raise RuntimeError(f"Ollama kept returning retryable errors: {e}") from e
        except (requests.Timeout, requests.ConnectionError) as e:
            # Exponential backoff on network hiccups.
            if is_last_attempt:
                raise RuntimeError(f"Network error talking to Ollama: {e}") from e
            time.sleep(0.5 * (2 ** attempt))
            continue

        # Retryable status: no point sleeping when there is no attempt left.
        if is_last_attempt:
            break

        if retry_after:
            try:
                sleep_s = max(float(retry_after), 0.5)
            except ValueError:
                # Non-numeric Retry-After (e.g. an HTTP date): use the minimum delay.
                sleep_s = 0.5
        else:
            # Deterministic exponential backoff with a small linear increment.
            sleep_s = 0.5 * (2 ** attempt) + (0.1 * attempt)
        time.sleep(sleep_s)

    raise RuntimeError(f"Failed after retries. Last response: {last_failure}")

In [23]:
# # ---- Run conversion ----
# os.makedirs(os.path.dirname(OUT_MD_PATH), exist_ok=True)
# 
# print("Rendering PDF pages with PyMuPDF ...")
# b64_pages = render_pdf_pages_to_base64(PDF_PATH, DPI, START_PAGE, END_PAGE)
# total_pages = len(b64_pages)
# print(f"Total pages to process: {total_pages}")
# 
# parts = []
# for batch, offset in tqdm(list(chunked(b64_pages, PAGES_PER_REQUEST)), desc="Converting", unit="batch"):
#     md = call_ollama_generate(batch)
#     md = md.strip()
#     # Normalize: ensure a page break at the end of each batch (model also inserts '---')
#     if not md.endswith(PAGE_BREAK.strip()):
#         md += PAGE_BREAK
#     parts.append(md)
# 
# final_md = "".join(parts).strip()
# # Collapse duplicate breaks if any
# while PAGE_BREAK * 2 in final_md:
#     final_md = final_md.replace(PAGE_BREAK * 2, PAGE_BREAK)
# 
# with open(OUT_MD_PATH, "w", encoding="utf-8") as f:
#     f.write(final_md)
# 
# print(f"Saved Markdown to: {OUT_MD_PATH}")

In [24]:
# ---- Run conversion: make sure the output folder exists, then rasterize the PDF ----
out_dir = os.path.dirname(OUT_MD_PATH)
os.makedirs(out_dir, exist_ok=True)

print("Rendering PDF pages with PyMuPDF ...")
b64_pages = render_pdf_pages_to_base64(
    pdf_path=PDF_PATH,
    dpi=DPI,
    start_1b=START_PAGE,
    end_1b=END_PAGE,
)
total_pages = len(b64_pages)  # number of rendered pages (after START_PAGE/END_PAGE)
print(f"Total pages to process: {total_pages}")

Rendering PDF pages with PyMuPDF ...
Total pages to process: 158


In [25]:
# Smoke test on a single page before the full run. Index 39 is 0-based within
# b64_pages, i.e. page 40 of the PDF when START_PAGE is None.
md = call_ollama_generate([b64_pages[39]])

In [26]:
# Convert every rendered page, one request per page.
# Results are appended as they arrive, so if the loop is interrupted the pages
# converted so far remain available in `all_pages`.
# NOTE: re-running this cell resets `all_pages` and starts again from the first page.
all_pages = []
for page in tqdm(b64_pages):
    md = call_ollama_generate([page])  # `md` always holds the most recent page's Markdown
    all_pages.append(md)
    #time.sleep(10)

 18%|█▊        | 28/158 [19:08<1:28:51, 41.01s/it]


KeyboardInterrupt: 

In [8]:
# Inspect whatever `md` currently holds (the most recently converted page).
md

"```markdown\n# FINANCIAL CONDITION AND LIQUIDITY\n\n3M continues its transition to a better-optimized capital structure and is adding leverage at a measured pace. The strength and stability of 3M's business model and strong free cash flow capability, together with proven capital markets access, enable the Company to implement this strategy. Investing in 3M's businesses to drive organic growth remains the first priority, thus 3M will continue to deploy capital towards research and development, capital expenditures, and commercialization capability. Investment in organic growth will be supplemented by complementary acquisitions. 3M will also continue to return cash to shareholders through dividends and share repurchases. Sources for cash availability in the United States, such as ongoing cash flow from operations and access to capital markets, have historically been sufficient to fund dividend payments to shareholders and share repurchases, as well as funding U.S. acquisitions and other

In [None]:
# NOTE(review): this repeats the `all_pages` conversion loop but collects into `parts`.
# The save cell writes `all_pages`, not `parts`, so running only this cell does not
# change what gets written — confirm which list is intended.
parts = []
for page in tqdm(b64_pages):
    md = call_ollama_generate([page])
    parts.append(md.strip())  # call_ollama_generate already strips its return value

In [None]:
# Trim surrounding whitespace from the last result. This is a no-op when `md` came
# from call_ollama_generate, which already returns a stripped string.
md = md.strip()

In [13]:
# ---- Save the converted pages ----
os.makedirs(os.path.dirname(OUT_MD_PATH), exist_ok=True)

# Join with PAGE_BREAK rather than "": every page's text is already stripped, so an
# empty separator would glue the last line of one page onto the first line of the
# next (e.g. a paragraph running straight into the following page's heading).
final_md = PAGE_BREAK.join(all_pages).strip()

with open(OUT_MD_PATH, "w", encoding="utf-8") as f:
    f.write(final_md)

In [None]:
# Display the current value of `md` for a final visual check.
md