In [1]:
import os
import re
import fitz                                # PyMuPDF
from PIL import Image
import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoProcessor,
    AutoModelForImageTextToText
)

In [2]:
def convert_pdf_to_images(pdf_path: str) -> list[Image.Image]:
    doc = fitz.open(pdf_path)
    images = []
    for page in doc:
        pix = page.get_pixmap()
        mode = "RGBA" if pix.alpha else "RGB"
        img = Image.frombytes(mode, [pix.width, pix.height], pix.samples)
        images.append(img)
    return images

In [3]:
def load_nanonets_model(model_path: str):
    model = AutoModelForImageTextToText.from_pretrained(
        model_path,
        torch_dtype="auto",
        device_map="auto",
        attn_implementation="flash_attention_2"
    )
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    processor = AutoProcessor.from_pretrained(model_path)
    return model, tokenizer, processor

In [4]:
def ocr_image_with_nanonets(
    image: Image.Image,
    model: AutoModelForImageTextToText,
    processor: AutoProcessor,
    max_new_tokens: int = 4096
) -> str:
    prompt = (
        "Extract the text from the above document as if you were reading it naturally. "
        "Return the tables in html format. Return the equations in LaTeX representation. "
        "If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; "
        "otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. "
        "Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. "
        "Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."
    )
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": prompt},
        ]},
    ]
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = processor(
        text=[text],
        images=[image],
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False
    )
    generated = processor.batch_decode(
        output_ids[:, inputs.input_ids.shape[-1]:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )
    return generated[0]

In [5]:
def extract_judge(text: str) -> str:
    match = re.search(r"Judge[:\s]+([A-Za-z .,\-]+)", text)
    return match.group(1).strip() if match else ''

In [6]:
def ocr_pdfs_to_dataframe(pdf_paths: list[str], model_path: str) -> pd.DataFrame:
    model, tokenizer, processor = load_nanonets_model(model_path)
    records = []

    for pdf_path in pdf_paths:
        # convert each page to image
        pages = convert_pdf_to_images(pdf_path)
        page_texts = [ocr_image_with_nanonets(img, model, processor) for img in pages]
        combined_text = "\n".join(page_texts)

        case_name  = os.path.splitext(os.path.basename(pdf_path))[0]
        judge_name = extract_judge(combined_text)

        records.append({
            "case_name":  case_name,
            "judge_name": judge_name,
            "text":       combined_text
        })

    return pd.DataFrame(records, columns=["case_name", "judge_name", "text"])

In [7]:
pdf_list = [
    "presentations\hist-presentation\e5yrggr.pdf"
]
df = ocr_pdfs_to_dataframe(pdf_list, "nanonets/Nanonets-OCR-s")
df.head()

  "presentations\hist-presentation\e5yrggr.pdf"


model.safetensors.index.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.51G [00:00<?, ?B/s]

  "presentations\hist-presentation\e5yrggr.pdf"


ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.