## Handwritten Notes to Text
The following notebook tests 4 OCR models:
- Microsoft TrOCR
- Google Document AI Vision OCR v2
- OpenAI GPT-4o
- HuggingFaceM4/idefics2-8b
    > **_NOTE:_**  Do not run this model on a local CPU environment; use a Colab GPU runtime or the chip cluster.


#### Load image

Load and display `../data/test-note-copy.png` (or point `IMAGE_PATH` at your own image).

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

# Relative path of the sample note shared by every OCR section below.
IMAGE_PATH = "../data/test-note-copy.png"

# Decode the file and normalise it to 3-channel RGB before previewing it.
img = Image.open(IMAGE_PATH).convert("RGB")

# Preview the note without axis ticks or frame.
preview_fig, preview_ax = plt.subplots()
preview_ax.imshow(img)
preview_ax.axis("off")
plt.show()

print("Loaded:", IMAGE_PATH)

#### 1. Microsoft TrOCR

In [None]:
from transformers import VisionEncoderDecoderModel, TrOCRProcessor

def run_trocr(image_path, model_name="microsoft/trocr-base-handwritten"):
    """Transcribe the image at ``image_path`` with a TrOCR checkpoint.

    Args:
        image_path: Path to an image file that PIL can open.
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the processor and the model. Defaults to the handwritten
            base checkpoint.

    Returns:
        The first decoded sequence with special tokens removed and
        surrounding whitespace stripped.
    """
    # Load processor and model from the same identifier so the two can never
    # be pointed at different checkpoints by accident.
    processor = TrOCRProcessor.from_pretrained(model_name)
    model = VisionEncoderDecoderModel.from_pretrained(model_name)

    image = Image.open(image_path).convert("RGB")
    pixel_values = processor(image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)

    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

# Run TrOCR on the sample note and print its transcription.
trocr_text = run_trocr(image_path=IMAGE_PATH)
print("TrOCR Result:\n", trocr_text)


#### 2. Google Document AI Vision OCR v2

In [None]:
import os
from dotenv import load_dotenv
from google.cloud import documentai
from PIL import Image

# Load .env variables
load_dotenv()

# Resolve GOOGLE_APPLICATION_CREDENTIALS against the parent directory
# (presumably the repository root that holds the .env file; confirm).
# The resolved path is exported only when it points at an existing file, so a
# value that already works from the current directory is left untouched, and
# an unset variable no longer crashes os.path.join with a TypeError.
GOOGLE_CREDS_REL = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if GOOGLE_CREDS_REL:
    GOOGLE_CREDS = os.path.abspath(os.path.join("..", GOOGLE_CREDS_REL))
    if os.path.isfile(GOOGLE_CREDS):
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_CREDS
else:
    GOOGLE_CREDS = None

# Google DocAI setup
GCP_PROJECT_ID = os.getenv("GCP_PROJECT_ID")
GCP_LOCATION = os.getenv("GCP_LOCATION", "us")
GCP_PROCESSOR_ID = os.getenv("GCP_PROCESSOR_ID")

# Fail with a readable message instead of building "projects/None/...".
missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("GCP_PROJECT_ID", GCP_PROJECT_ID),
        ("GCP_PROCESSOR_ID", GCP_PROCESSOR_ID),
    )
    if not setting_value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

processor_name = (
    f"projects/{GCP_PROJECT_ID}/locations/{GCP_LOCATION}/processors/{GCP_PROCESSOR_ID}"
)
client = documentai.DocumentProcessorServiceClient()

# Test connectivity to processor
try:
    client.get_processor(name=processor_name)
except Exception as e:
    # Chain the original exception so the underlying traceback is preserved.
    raise RuntimeError(f"Error connecting to Document AI:\n{e}") from e

# OCR function
def run_docai(image_path, mime_type=None):
    """Send a file to the configured Document AI processor and return its text.

    Args:
        image_path: Path to the document or image to process.
        mime_type: Optional MIME type to send with the file. When omitted it
            is guessed from the file extension, falling back to
            ``"image/png"`` if the extension is not recognised.

    Returns:
        The ``text`` field of the processed document, stripped of surrounding
        whitespace.
    """
    # Imported locally because this cell does not import mimetypes itself.
    import mimetypes

    with open(image_path, "rb") as f:
        file_bytes = f.read()

    if mime_type is None:
        # Previously every non-".pdf" file was labelled image/png, so JPEG or
        # TIFF inputs (and upper-case ".PDF") were sent with the wrong type.
        mime_type = mimetypes.guess_type(str(image_path))[0] or "image/png"

    request = {
        "name": processor_name,
        "raw_document": {"content": file_bytes, "mime_type": mime_type},
    }

    result = client.process_document(request=request)
    document = result.document
    return document.text.strip()

# Preview check: Image.open raises if IMAGE_PATH is missing or is not a
# recognisable image. The context manager closes the file handle again,
# which the bare Image.open call never did.
with Image.open(IMAGE_PATH) as image:
    pass

docai_text = run_docai(IMAGE_PATH)
print("\n Google DocAI Extracted Text:")
print(docai_text)

#### 3. OpenAI GPT-4o (gpt-4o-mini)

In [None]:
import base64
import mimetypes
import os
from dotenv import load_dotenv
from openai import OpenAI

# Read settings from the local .env file, then look up the OpenAI API key.
load_dotenv()

OPENAI_KEY = os.environ.get("OPENAI_API_KEY")

def run_gpt4o(image_path, model="gpt-4o-mini", openai_client=None):
    """Ask an OpenAI vision-capable chat model to transcribe a handwritten image.

    Args:
        image_path: Path to the image file to transcribe.
        model: Name of the chat model to call. Defaults to ``"gpt-4o-mini"``.
        openai_client: Optional client object exposing
            ``chat.completions.create``. When omitted, a new ``OpenAI`` client
            is built from ``OPENAI_KEY``.

    Returns:
        The text content of the first choice, stripped of surrounding
        whitespace, or an empty string if the response carries no content.
    """
    if openai_client is None:
        # Bug fix: the module-level `client` is the Document AI client created
        # in the previous section and has no `chat` attribute, so an OpenAI
        # client must be created here.
        openai_client = OpenAI(api_key=OPENAI_KEY)

    with open(image_path, "rb") as f:
        image_bytes = f.read()

    # Derive the MIME type from the actual file name; the old lookup always
    # used ".png" regardless of the input. Unknown extensions keep image/png.
    mime_type = mimetypes.guess_type(str(image_path))[0] or "image/png"
    b64 = base64.b64encode(image_bytes).decode("utf-8")
    data_url = f"data:{mime_type};base64,{b64}"

    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Extract the handwritten text:"},
                    {"type": "image_url", "image_url": {"url": data_url}}
                ]
            }
        ]
    )
    # Message content can be None; avoid calling .strip() on it.
    content = response.choices[0].message.content
    return (content or "").strip()

# Transcribe the sample note with the OpenAI model and print the result.
gpt_text = run_gpt4o(image_path=IMAGE_PATH)
print("GPT-4o-mini Result:\n", gpt_text)

#### 4. HuggingFaceM4/idefics2-8b

> **_NOTE:_**  Do not run this on a local CPU environment; use a Colab GPU runtime or the chip cluster, because the model is too large to load on a CPU.

In [None]:
# !pip install transformers accelerate pillow --quiet

# from transformers import AutoModelForVision2Seq, AutoProcessor
# from google.colab import files
# from PIL import Image
# import torch

# # Upload handwritten note image
# print("Upload handwritten note image (png/jpg)")
# uploaded = files.upload()
# file_name = list(uploaded.keys())[0]
# img = Image.open(file_name).convert("RGB")
# display(img)

# # Load model + processor
# model_id = "HuggingFaceM4/idefics2-8b"
# processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# model = AutoModelForVision2Seq.from_pretrained(
#     model_id,
#     torch_dtype=torch.float16,
#     device_map="auto",
#     trust_remote_code=True,
# )

# # Include <image> token in the prompt
# prompt = "Extract the handwritten text: <image>"

# inputs = processor(
#     text=[prompt],
#     images=[img],
#     return_tensors="pt"
# ).to("cuda")

# generated_ids = model.generate(
#     **inputs,
#     max_new_tokens=300,
# )

# output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

# print("IDEFICS2 OCR Output:")
# print(output.strip())