In [None]:
import io
from PyPDF2 import PdfReader, PdfWriter
import google.generativeai as genai
from langchain.schema import Document

def load_page_from_gemini(page_bytes: bytes, page_num: int, file_name: str) -> str:
    """
    Send a single-page PDF (as bytes) to Gemini and return the extracted text.

    The page is uploaded through the Gemini file-upload API, handed to the
    model together with an extraction prompt, and the uploaded copy is removed
    again once the request has finished (successfully or not).

    Args:
        page_bytes: Raw bytes of a PDF document containing the page to OCR.
        page_num: Page number used for the in-memory file name, the upload
            display name and the progress message.
        file_name: Name of the source PDF; used in the upload display name.

    Returns:
        The text of the model response (``response.text``).

    Raises:
        Exceptions raised by the upload or by the generation request are
        propagated unchanged. Failures while deleting the uploaded file are
        only reported, never raised.
    """
    import os  # local import: keeps this function self-contained within the notebook

    # Prefer a key supplied through the environment over a literal in the
    # notebook; the placeholder is kept as a fallback so existing usage
    # (editing the string in place) still works when the variable is unset.
    api_key = os.environ.get("GOOGLE_API_KEY", "your api key")
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-2.5-pro')

    # Wrap the bytes in a file-like object and give it a name.
    page_file = io.BytesIO(page_bytes)
    page_file.name = f"page_{page_num}.pdf"

    # mime_type is passed explicitly because the upload is an in-memory
    # object rather than a path on disk.
    sample_file = genai.upload_file(
        path=page_file,
        display_name=f"{file_name}-page-{page_num}",
        mime_type="application/pdf"
    )

    print(f"Uploaded page {page_num} as: {sample_file.uri}")

    try:
        response = model.generate_content([
            sample_file,
            "Extract all text from this page, including image descriptions and tables, in a structured format."
        ])
        return response.text
    finally:
        # One upload is created per page, so clean up to avoid piling up
        # remote files. Cleanup is best-effort and must not mask the result
        # or the original error.
        try:
            genai.delete_file(sample_file.name)
        except Exception as cleanup_error:
            print(f"WARNING: could not delete uploaded page {page_num}: {cleanup_error}")


def extract_pdf_pagewise(pdf_path: str) -> Document:
    """
    Extract the text of a scanned PDF one page at a time using Gemini.

    Every page is written to its own in-memory PDF, sent to
    ``load_page_from_gemini`` and the returned text is appended under a
    ``--- Page N ---`` separator.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A ``Document`` whose ``page_content`` is the concatenated text of all
        pages and whose metadata holds the file's base name under ``"source"``.
    """
    import os  # local import: keeps this function self-contained within the notebook

    reader = PdfReader(pdf_path)
    # os.path.basename gives the same result as splitting on "/" for POSIX
    # paths and additionally understands the platform's own separator.
    file_name = os.path.basename(pdf_path)

    # Collect the per-page sections and join once at the end instead of
    # growing a string with repeated "+=".
    sections = []
    for page_num, page in enumerate(reader.pages, start=1):
        writer = PdfWriter()
        writer.add_page(page)

        # Serialise the single-page PDF into memory; getvalue() returns the
        # whole buffer regardless of the stream position.
        buffer = io.BytesIO()
        writer.write(buffer)

        page_text = load_page_from_gemini(buffer.getvalue(), page_num, file_name)
        sections.append(f"\n--- Page {page_num} ---\n" + page_text)

    return Document(page_content="".join(sections), metadata={"source": file_name})


In [None]:
# Run the page-by-page Gemini extraction on a sample document.
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted before the cell can run anywhere else.
pdf_path = "/Users/sameersingh/Documents/DataViz/data/Report on Title 2024-11-18.pdf"
# pdf_text is the Document returned by extract_pdf_pagewise.
pdf_text = extract_pdf_pagewise(pdf_path)
print(pdf_text.page_content[:1000])  # preview: only the first 1000 characters


In [None]:
import io
from PyPDF2 import PdfReader, PdfWriter
import google.generativeai as genai
from langchain.schema import Document

# 🔑 Gemini API key
# Read the key from the GOOGLE_API_KEY environment variable so that no secret
# has to be stored in the notebook; the placeholder remains as a fallback so
# the previous "edit the string in place" workflow keeps working.
import os

API_KEY = os.environ.get("GOOGLE_API_KEY", "your api key")
genai.configure(api_key=API_KEY)


def load_pdf_chunk_from_gemini(chunk_bytes: bytes, chunk_num: int, file_name: str) -> str:
    """
    Send a multi-page PDF chunk to Gemini and return the extracted text.

    The chunk is uploaded through the Gemini file-upload API, passed to the
    model with an extraction prompt, and the uploaded copy is removed again
    once the request has finished (successfully or not).

    Args:
        chunk_bytes: Raw bytes of a PDF document holding the chunk's pages.
        chunk_num: Chunk number used for the in-memory file name, the upload
            display name and the progress message.
        file_name: Name of the source PDF; used in the upload display name.

    Returns:
        The text of the model response (``response.text``).

    Raises:
        Exceptions raised by the upload or by the generation request are
        propagated unchanged. Failures while deleting the uploaded file are
        only reported, never raised.
    """
    model = genai.GenerativeModel("gemini-2.5-pro")

    # The upload is an in-memory object, so it gets a name here and an
    # explicit mime_type below.
    chunk_file = io.BytesIO(chunk_bytes)
    chunk_file.name = f"chunk_{chunk_num}.pdf"

    sample_file = genai.upload_file(
        path=chunk_file,
        display_name=f"{file_name}-chunk-{chunk_num}",
        mime_type="application/pdf"
    )

    print(f"Uploaded chunk {chunk_num} as: {sample_file.uri}")

    try:
        response = model.generate_content([
            sample_file,
            "Extract all text (including from images & tables) from this PDF chunk in correct page order. "
            "Preserve structure and readability."
        ])
        return response.text
    finally:
        # One upload is created per chunk, so clean up to avoid piling up
        # remote files. Cleanup is best-effort and must not mask the result
        # or the original error.
        try:
            genai.delete_file(sample_file.name)
        except Exception as cleanup_error:
            print(f"WARNING: could not delete uploaded chunk {chunk_num}: {cleanup_error}")


def extract_pdf_by_chunks(pdf_path: str, pages_per_chunk: int = 10) -> Document:
    """
    Extract scanned PDF text by sending multiple pages at once to Gemini OCR.
    """
    reader = PdfReader(pdf_path)
    text = ""
    file_name = pdf_path.split("/")[-1]

    total_pages = len(reader.pages)
    chunk_num = 1

    for start in range(0, total_pages, pages_per_chunk):
        end = min(start + pages_per_chunk, total_pages)

        # Collect pages into one chunk
        writer = PdfWriter()
        for i in range(start, end):
            writer.add_page(reader.pages[i])

        chunk_bytes = io.BytesIO()
        writer.write(chunk_bytes)
        chunk_bytes.seek(0)

        # OCR this chunk
        chunk_text = load_pdf_chunk_from_gemini(chunk_bytes.getvalue(), chunk_num, file_name)

        text += f"\n--- Chunk {chunk_num} (Pages {start+1}-{end}) ---\n{chunk_text}\n"
        chunk_num += 1

    return Document(page_content=text, metadata={"source": file_name})


In [None]:
# Run the chunked extraction on a sample document.
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted before the cell can run anywhere else.
pdf_path = "/Users/sameersingh/Documents/DataViz/data/Report on Title 2024-11-18.pdf"
# Example: 200 pages → 10 pages per chunk → 20 API calls
# NOTE(review): this notebook defines extract_pdf_by_chunks in two cells; the
# one executed most recently is the one that runs here.
result_doc = extract_pdf_by_chunks(pdf_path, pages_per_chunk=10)
# Preview only the first 2000 characters of the extracted text.
print(result_doc.page_content[:2000])

### Using Claude for OCR

In [None]:
import base64
import io
from PyPDF2 import PdfReader, PdfWriter
import anthropic
from langchain.schema import Document

# Claude client
# Read the key from the ANTHROPIC_API_KEY environment variable so that no
# secret has to be stored in the notebook; the placeholder remains as a
# fallback so the previous "edit the string in place" workflow keeps working.
import os

client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY", "your api key"))


def ocr_pdf_chunk_with_claude(
    chunk_bytes: bytes,
    chunk_num: int,
    file_name: str,
    max_tokens: int = 7000,
) -> str:
    """
    Send a multi-page PDF chunk (bytes) to Claude and return the extracted text.

    The chunk is base64-encoded and attached to a single user message as a
    ``document`` block, followed by a text block carrying the extraction
    instructions. The request is made through the module-level ``client``.

    Args:
        chunk_bytes: Raw bytes of a PDF document holding the chunk's pages.
        chunk_num: Chunk number; used in the truncation warning.
        file_name: Name of the source PDF; used in the truncation warning.
        max_tokens: Upper bound on the number of tokens Claude may generate
            for this chunk. Defaults to 7000.

    Returns:
        The concatenated text of all text blocks in the response, or an empty
        string when the response has no text content.
    """
    pdf_data = base64.standard_b64encode(chunk_bytes).decode("utf-8")

    message = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=max_tokens,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "document",
                        "source": {
                            "type": "base64",
                            "media_type": "application/pdf",
                            "data": pdf_data
                        }
                    },
                    {
                        "type": "text",
                        "text": (
                            "Extract all text from this PDF chunk. "
                            "If there are forms or checkboxes marked by pen, mention them clearly. "
                            "Preserve the order and formatting."
                        )
                    }
                ],
            }
        ],
    )

    # A response cut off at the token limit would otherwise look like a
    # complete extraction; make the truncation visible to the caller.
    if getattr(message, "stop_reason", None) == "max_tokens":
        print(
            f"WARNING: output for chunk {chunk_num} of {file_name} reached the "
            f"max_tokens limit ({max_tokens}); the extracted text is probably truncated."
        )

    # Join every text block instead of reading only the first content block,
    # so text is not dropped if the reply is split across several blocks.
    return "".join(
        block.text
        for block in (message.content or [])
        if getattr(block, "type", None) == "text"
    )


def extract_pdf_by_chunks(pdf_path: str, pages_per_chunk: int = 10) -> Document:
    """
    Extract scanned PDF text by splitting into chunks (e.g., 10 pages each)
    and sending each chunk to Claude OCR.
    """
    reader = PdfReader(pdf_path)
    text = ""
    file_name = pdf_path.split("/")[-1]

    total_pages = len(reader.pages)
    chunk_num = 1

    for start in range(0, total_pages, pages_per_chunk):
        end = min(start + pages_per_chunk, total_pages)

        # Merge multiple pages into one chunk
        writer = PdfWriter()
        for i in range(start, end):
            writer.add_page(reader.pages[i])

        chunk_bytes = io.BytesIO()
        writer.write(chunk_bytes)
        chunk_bytes.seek(0)

        # OCR this chunk
        print(f"Processing pages {start+1} to {end}...")
        chunk_text = ocr_pdf_chunk_with_claude(chunk_bytes.getvalue(), chunk_num, file_name)

        text += f"\n--- Chunk {chunk_num} (Pages {start+1}-{end}) ---\n{chunk_text}\n"
        chunk_num += 1

    return Document(page_content=text, metadata={"source": file_name})


In [None]:
# Run the chunked extraction on a sample document.
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted before the cell can run anywhere else.
pdf_path = "/Users/sameersingh/Documents/DataViz/data/Report on Title 2024-11-18.pdf"

# Example: 200 pages → 10 pages per chunk → 20 API calls (much faster than 200 calls)
# NOTE(review): this notebook defines extract_pdf_by_chunks in two cells; the
# one executed most recently is the one that runs here.
result_doc = extract_pdf_by_chunks(pdf_path, pages_per_chunk=10)

print(result_doc.page_content[:2000])  # preview: only the first 2000 characters

In [None]:
# Print the complete extracted text of result_doc (assigned in an earlier
# cell); unlike the preview cells this is not truncated.
print(result_doc.page_content)

In [None]:
from mistralai import Mistral
## Run OCR on a scanned PDF using the Mistral OCR API
def ocr_scanned_pdf(pdf_path: str) -> str:
    """Perform OCR on a scanned PDF with Mistral and return the text.

    The file is uploaded with purpose ``"ocr"``, a signed URL is requested for
    the upload, and that URL is passed to the ``mistral-ocr-latest`` model.
    The markdown of every returned page is concatenated, one page per line
    break.

    Args:
        pdf_path (str): Path of the PDF file to process.

    Returns:
        str: The markdown text of all pages, each followed by a newline.
            An empty string is returned when any step fails; the error is
            printed rather than raised.
    """
    import os  # local import: keeps this function self-contained within the notebook

    try:
        print(f"INFO : OCR start for file {pdf_path}")
        # Prefer a key supplied through the environment over a literal in the
        # notebook; the placeholder is kept as a fallback so existing usage
        # still works when the variable is unset.
        api_key = os.environ.get("MISTRAL_API_KEY", "your api key")
        client = Mistral(api_key=api_key)

        # Open the file in a context manager so the handle is closed as soon
        # as the upload finishes, even when the upload raises.
        with open(pdf_path, "rb") as pdf_file:
            uploaded_pdf = client.files.upload(
                file={
                    "file_name": pdf_path,
                    "content": pdf_file,
                },
                purpose="ocr",
            )

        # The result of this lookup is not used.
        # NOTE(review): presumably kept as a sanity check that the upload
        # exists — confirm whether it is still needed.
        client.files.retrieve(file_id=uploaded_pdf.id)
        signed_url = client.files.get_signed_url(file_id=uploaded_pdf.id)

        ocr_response = client.ocr.process(
            model="mistral-ocr-latest",
            document={
                "type": "document_url",
                "document_url": signed_url.url,
            },
            include_image_base64=False,
        )

        text = "".join(page.markdown + "\n" for page in ocr_response.pages)
        print(f"INFO : OCR completed for the file {pdf_path}")
        return text
    except Exception as exp:
        # Deliberate best-effort behaviour: report the failure and hand back
        # an empty string instead of raising.
        print(f"ERROR : error in ocr_scanned_pdf : {exp}")
        return ""

In [None]:
# Run the Mistral OCR helper on a sample document; text is "" if it failed.
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted before the cell can run anywhere else.
text = ocr_scanned_pdf("/Users/sameersingh/Documents/DataViz/data/Construction_Contract_letters.pdf")

## Loading data using a Python library

In [None]:
# Print the complete OCR text (text is assigned in an earlier cell).
print(text)

In [19]:
# Save the OCR text to a .txt file.
# The path is held in one variable so that the confirmation message always
# names the file that was actually written (it previously said "output.txt"
# while writing "Template.txt").
output_path = "Template.txt"
with open(output_path, "w", encoding="utf-8") as f:
    f.write(text)

print(f"Text saved successfully in {output_path}")

Text saved successfully in output.txt
