# Single Image

In [2]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM 

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large", 
    torch_dtype=torch_dtype, 
    trust_remote_code=True,
    attn_implementation="eager"
).to(device)

processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)

url = "../Pictures_&_Videos/Testing_Pictures/kermit.jpg" 
try:
    image = Image.open(url).convert("RGB")
except FileNotFoundError:
    raise FileNotFoundError(f"Image not found at {url}. Please check the path.")



prompt = ""
inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)

generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=4096,
    num_beams=3,
    do_sample=False,
    use_cache=False  
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
parsed_answer = processor.post_process_generation(generated_text, task="<OD>", image_size=(image.width, image.height))

print(parsed_answer)

{'<OD>': {'bboxes': [[175.6160125732422, 0.7199999690055847, 797.1840209960938, 478.3199768066406]], 'labels': ['Kermit the Frog in The Muppets']}}


# Live Video Feed

In [3]:
import torch
import cv2
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large",
    torch_dtype=torch_dtype,
    trust_remote_code=True,
    attn_implementation="eager"
).to(device)

processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-large",
    trust_remote_code=True
)


cap = cv2.VideoCapture(0)  # 0 = default webcam

if not cap.isOpened():
    raise RuntimeError("❌ Cannot open webcam.")

print("✅ Webcam opened. Press 'q' to quit.\n")


prompt = ""  

while True:
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame.")
        break

    cv2.imshow("Live Camera Feed", frame)

    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    
    inputs = processor(
        text=prompt,
        images=pil_image,
        return_tensors="pt"
    ).to(device, torch_dtype)

    
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=512,
        num_beams=3,
        do_sample=False,
        use_cache=False
    )

    
    generated_text = processor.batch_decode(
        generated_ids, 
        skip_special_tokens=False
    )[0]

    parsed_answer = processor.post_process_generation(
        generated_text,
        task="<OD>",   
        image_size=(pil_image.width, pil_image.height)
    )

    print("\n--- NEW FRAME ---")
    print(parsed_answer['<OD>']['labels'])

    # Quit with q
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


✅ Webcam opened. Press 'q' to quit.


--- NEW FRAME ---
['woman on couch with phone in living room', 'woman on video call with phone and glasses']

--- NEW FRAME ---
['woman on couch in living room with kitchen in background']

--- NEW FRAME ---
['woman on couch with phone in living room', 'woman on phone in kitchen with kitchen cabinets in background']

--- NEW FRAME ---
['woman on couch with phone in living room', 'woman on phone in kitchen with kitchen cabinets in background']

--- NEW FRAME ---
['woman on couch with phone in living room', 'woman on phone in kitchen with wooden cabinets and countertop']

--- NEW FRAME ---
['woman on couch with phone in living room', 'woman on sofa with phone and coffee cup', 'woman on bed with phone on couch in living area', 'woman in living-room with phone']

--- NEW FRAME ---
['woman on couch with phone in living room', 'woman on reclining sofa with remote control']

--- NEW FRAME ---
['woman on couch with phone in living room', 'woman on video ca