# Single Image

In [1]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM 

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large", 
    torch_dtype=torch_dtype, 
    trust_remote_code=True,
    attn_implementation="eager"
).to(device)

processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large", trust_remote_code=True)

url = "../Pictures_&_Videos/Testing_Pictures/kermit.jpg" 
try:
    image = Image.open(url).convert("RGB")
except FileNotFoundError:
    raise FileNotFoundError(f"Image not found at {url}. Please check the path.")



prompt = ""
inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)

generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    pixel_values=inputs["pixel_values"],
    max_new_tokens=4096,
    num_beams=3,
    do_sample=False,
    use_cache=False  
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
parsed_answer = processor.post_process_generation(generated_text, task="<OD>", image_size=(image.width, image.height))

print(parsed_answer)

  from .autonotebook import tqdm as notebook_tqdm
`torch_dtype` is deprecated! Use `dtype` instead!


{'<OD>': {'bboxes': [[175.6160125732422, 0.7199999690055847, 797.1840209960938, 478.3199768066406]], 'labels': ['Kermit the Frog in The Muppets']}}


# Live Video Feed

In [9]:
import torch
import cv2
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large",
    torch_dtype=torch_dtype,
    trust_remote_code=True,
    attn_implementation="eager"
).to(device)

processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-large",
    trust_remote_code=True
)


cap = cv2.VideoCapture(0)  # 0 = default webcam

if not cap.isOpened():
    raise RuntimeError("Cannot open webcam.")

print("Webcam opened. Press 'q' to quit.\n")


prompt = ""  

while True:
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame.")
        break

    cv2.imshow("Live Camera Feed", frame)

    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    
    inputs = processor(
        text=prompt,
        images=pil_image,
        return_tensors="pt"
    ).to(device, torch_dtype)

    
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=4096,
        num_beams=3,
        do_sample=False,
        use_cache=False
    )

    
    generated_text = processor.batch_decode(
        generated_ids, 
        skip_special_tokens=False
    )[0]

    parsed_answer = processor.post_process_generation(
        generated_text,
        task="<OD>",   
        image_size=(pil_image.width, pil_image.height)
    )

    print("\n--- NEW FRAME ---")
    print(parsed_answer['<OD>']['labels'])

    # Quit with q
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


Webcam opened. Press 'q' to quit.


--- NEW FRAME ---
['man in hospital bed with SAF25 t-shirt']

--- NEW FRAME ---
['man in black t-shirt with SAF25 logo on bed']

--- NEW FRAME ---
['man in hospital bed with SAF25 t-shirt']

--- NEW FRAME ---
['man with beard in bed with SAF25 t-shirt']

--- NEW FRAME ---
['man with beard and mustache in black t-shirt on bed']

--- NEW FRAME ---
['man with beard in bed with SAF25 t-shirt']

--- NEW FRAME ---
['man in hospital bed with SAF25 t-shirt']

--- NEW FRAME ---
['man in hospital bed with SAF25 t-shirt']

--- NEW FRAME ---
['man in bed with green t-shirt and black shirt']

--- NEW FRAME ---
['man in hospital bed with SAF25 t-shirt']

--- NEW FRAME ---
['man in bed with green t-shirt and black shirt']

--- NEW FRAME ---
['man in hospital bed with SAF25 t-shirt']

--- NEW FRAME ---
['man with beard and mustache in black t-shirt on bed']

--- NEW FRAME ---
['man with beard and mustache in black shirt on bed']

--- NEW FRAME ---
['man with beard a

# Video Feed

In [None]:
import torch
import cv2
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print("Using device:", device)


model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large",
    torch_dtype=torch_dtype,
    trust_remote_code=True,
    attn_implementation="eager"
).to(device)

processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-large",
    trust_remote_code=True
)


cap = cv2.VideoCapture("../Pictures_&_Videos/ROS_BAGS/front_stereo_camera/right/image_compressed/temp_stream.mp4")  # 0 = default webcam

if not cap.isOpened():
    raise RuntimeError("Cannot open webcam.")

print(" Webcam opened. Press 'q' to quit.\n")

cap.set(cv2.CAP_PROP_POS_FRAMES, 100)

prompt = ""

process_every_n_frames = 5


while True:
    ret, frame = cap.read()
    ret, frame = cap.read()
    ret, frame = cap.read()
    ret, frame = cap.read()
    ret, frame = cap.read()
    if not ret:
        print("Failed to grab frame.")
        break

    pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    pil_image = pil_image.resize((640, 480))
    
    frame = cv2.resize(frame, (640, 480))
    cv2.imshow("Live Camera Feed", frame)

    inputs = processor(
        text=prompt,
        images=pil_image,
        return_tensors="pt"
    ).to(device, torch_dtype)

    
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=512,
        num_beams=3,
        do_sample=False,
        use_cache=False
    )

    
    generated_text = processor.batch_decode(
        generated_ids, 
        skip_special_tokens=False
    )[0]

    parsed_answer = processor.post_process_generation(
        generated_text,
        task="<OD>",   
        image_size=(pil_image.width, pil_image.height)
    )

    print("\n--- NEW FRAME ---")
    print(parsed_answer['<OD>']['labels'])

    # Quit with q
    if cv2.waitKey(30) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()


Using device: cuda:0
 Webcam opened. Press 'q' to quit.


--- NEW FRAME ---
['box']

--- NEW FRAME ---
['box']

--- NEW FRAME ---
['box']

--- NEW FRAME ---
['box']

--- NEW FRAME ---
['crate']

--- NEW FRAME ---
['box', 'box']

--- NEW FRAME ---
['purple shipping containers in a warehouse']

--- NEW FRAME ---
['crate']

--- NEW FRAME ---
['box']

--- NEW FRAME ---
['box']

--- NEW FRAME ---
['purple shipping containers in a warehouse']

--- NEW FRAME ---
['purple shipping containers in a warehouse']

--- NEW FRAME ---
['purple shipping containers in a warehouse']

--- NEW FRAME ---
['purple shipping containers in warehouse']

--- NEW FRAME ---
['purple shipping containers in warehouse']

--- NEW FRAME ---
['warehouse with purple crates and yellow caution tape']

--- NEW FRAME ---
['box']

--- NEW FRAME ---
['box']

--- NEW FRAME ---
['warehouse']

--- NEW FRAME ---
['a warehouse']

--- NEW FRAME ---
['warehouse']

--- NEW FRAME ---
['warehouse']

--- NEW FRAME ---
['warehouse']

--- N