In [20]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM 

# 1. Setup Device
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# 2. Load Model 
# Note: We keep attn_implementation="eager" to avoid the first error you saw.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large", 
    torch_dtype=torch_dtype, 
    trust_remote_code=True,
    attn_implementation="eager"
).to(device)

processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)

# 3. Load Image
# Make sure this path is correct on your machine!
url = "../Pictures/Testing_Pictures/kermit.jpg" 
try:
    image = Image.open(url).convert("RGB")
except FileNotFoundError:
    raise FileNotFoundError(f"Image not found at {url}. Please check the path.")



prompt = ""
inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)

generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=4096,
    num_beams=3,
    do_sample=False,
    use_cache=False  
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
parsed_answer = processor.post_process_generation(generated_text, task="<OD>", image_size=(image.width, image.height))

print(parsed_answer)

{'<OD>': {'bboxes': [[175.6160125732422, 0.7199999690055847, 797.1840209960938, 478.3199768066406]], 'labels': ['Kermit the Frog in The Muppets']}}
