In [None]:
!pip install --upgrade pip
!pip uninstall torch torchvision torchaudio -y
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install transformers pillow accelerate einops timm

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never commit an access token to the notebook: read it from the HF_TOKEN
# environment variable, and fall back to a hidden interactive prompt when unset.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
import torch

from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image

In [None]:
# Single source of truth for the checkpoint used by both the model and its processor.
MODEL_ID = "microsoft/Florence-2-large"

# Use the GPU with half precision when CUDA is available; otherwise run on the
# CPU in full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# trust_remote_code=True executes the modelling code shipped with the checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch_dtype,
    trust_remote_code=True,
    attn_implementation="eager",
)
model = model.to(device)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

In [None]:
# Load the input picture. The context manager closes the file handle once the
# pixels are read, and convert("RGB") normalises grayscale / CMYK / palette
# images to three channels before they reach the processor.
with Image.open("ocr-or-caption-image-here.jpg") as source_image:
    image = source_image.convert("RGB")

# Task token sent to the model. Swap in "<DETAILED_CAPTION>" to request a
# caption instead of OCR.
prompt = "<OCR>"

In [None]:
# Build the model inputs from the task prompt and the image, then move them to
# the same device / dtype that the model was loaded with.
inputs = processor(text=prompt, images=image, return_tensors="pt")
inputs = inputs.to(device, torch_dtype)

# Decoding configuration: beam search with 3 beams, no sampling, up to 1024 new tokens.
# NOTE(review): use_cache=False looks like a deliberate workaround — confirm before removing.
generation_options = dict(
    max_new_tokens=1024,
    num_beams=3,
    do_sample=False,
    use_cache=False,
)

# Only the token ids and pixel values are forwarded to generate().
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    **generation_options,
)

In [None]:
# Decode the first (only) generated sequence, keeping special tokens because
# post_process_generation receives the raw decoded text.
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
parsed_answer = processor.post_process_generation(
    generated_text,
    task=prompt,
    image_size=(image.width, image.height),
)

# Look the result up by the active task token instead of a hard-coded "<OCR>",
# so changing `prompt` does not require editing this cell as well.
raw_output = parsed_answer[prompt]
# Collapse line breaks so the result prints on a single line.
clean_output = raw_output.replace('\n', ' ').strip()

# Label the output with the task name, e.g. "<OCR>" -> "OCR".
task_label = prompt.strip("<>")
print(f"{task_label}:", clean_output)