In [8]:
import os
from pathlib import Path

import torch
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
import imageio.v3 as iio
import numpy as np
import av
from IPython.display import Video

In [5]:
# Identifier of the base checkpoint; the loading cell passes it to
# from_pretrained() for both the model and the processor.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
# Identifier of the adapter attached to the model via model.load_adapter().
# NOTE(review): the name suggests a Cataract-1K fine-tune — confirm.
ADAPTER_ID = "kida1122/qwen2.5-7b-instruct-cataract1k"

In [6]:
# Instantiate the base model from MODEL_ID, requesting bfloat16 weights and
# automatic device placement.
model_load_options = {
    "device_map": "auto",
    "torch_dtype": torch.bfloat16,
}
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(MODEL_ID, **model_load_options)

# The processor comes from the same identifier as the model.
processor = Qwen2_5_VLProcessor.from_pretrained(MODEL_ID)

# Attach the adapter referenced by ADAPTER_ID to the loaded model.
model.load_adapter(ADAPTER_ID)

Loading checkpoint shards: 100%|██████████| 5/5 [00:11<00:00,  2.27s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


## Single-Video Inference

In [14]:
# Locate the demo clip under the user's home directory. Path.home() is used
# instead of os.getenv("HOME"): the latter returns None when HOME is unset,
# which would silently yield a bogus "None/surgery/..." path.
home = str(Path.home())
# Kept as a plain string because the inference cell passes it on unchanged
# as the "video" entry of the conversation.
video_path = str(
    Path(home)
    / "surgery/datasets/cataract1k/videos/test"
    / "case_4687_Phacoemulsification_117.50_122.50.mp4"
)

# Fail here with a clear message instead of later, inside the player or the
# processor.
if not Path(video_path).is_file():
    raise FileNotFoundError(f"Test video not found: {video_path}")

# Last expression of the cell, so the notebook renders the embedded player.
Video(video_path, embed=True)

In [15]:
prompt = "What do you see in the video?"

# One user turn: the clip first, then the question about it.
video_entry = {"type": "video", "video": video_path}
text_entry = {"type": "text", "text": prompt}
conversation = [{"role": "user", "content": [video_entry, text_entry]}]

# Render the chat template and tokenize in a single call, then move the
# resulting batch onto the model's device.
template_options = dict(
    add_generation_prompt=True,
    tokenize=True,
    video_fps=8,
    padding=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = processor.apply_chat_template(conversation, **template_options)
inputs = inputs.to(model.device)

# Generate, then decode only the part of the first output row that lies past
# the input length (presumably the echoed prompt precedes it — confirm).
output_ids = model.generate(**inputs, max_new_tokens=1024)
prompt_length = inputs.input_ids.shape[1]
generated_ids = output_ids[0, prompt_length:]
output_text = processor.decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
print(output_text)

The video shows an eye during a surgery. The camera is focused on the inside of the eye, and we can see the cornea, lens, and other parts of the eye. There are also surgical instruments visible in the video.


## Multi-Video Inference

In [None]:
# Directory holding the clips to caption. It is a relative path, so it is
# resolved against the notebook's current working directory.
test_dir = "datasets/cataract1k/videos/test"
# Destination of the plain-text report. Defined here because a notebook has no
# argparse-style `args` object to read an output path from.
output_file = "multi_video_inference_results.txt"

# Keep only .mp4 entries so stray files or sub-directories are not fed to the
# processor, and sort them because os.listdir() returns names in arbitrary
# order (a sorted list makes the report reproducible across runs).
test_videos = sorted(
    name for name in os.listdir(test_dir) if name.lower().endswith(".mp4")
)

prompt = "What do you see in the video?"
output_texts = []

# The report is opened before the loop and written one clip at a time: an
# unwritable path fails before any inference is spent, and a failure on a
# later clip does not discard the responses already produced.
# UTF-8 is set explicitly so the report does not depend on the locale default.
with open(output_file, "w", encoding="utf-8") as f:
    for vid in test_videos:
        conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        # Full path of the clip inside test_dir.
                        "video": os.path.join(test_dir, vid)
                    },
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ]

        # Process conversation
        inputs = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            tokenize=True,
            video_fps=8,
            padding=True,
            return_dict=True,
            return_tensors="pt"
        ).to(model.device)

        # Generate response and decode only the positions past the input length
        output_ids = model.generate(**inputs, max_new_tokens=1024)
        generated_ids = output_ids[0, inputs.input_ids.shape[1]:]
        output_text = processor.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        result = {
            "video": vid,
            "prompt": prompt,
            "response": output_text
        }
        output_texts.append(result)

        f.write(f"Video: {result['video']}\n")
        f.write(f"Prompt: {result['prompt']}\n")
        f.write(f"Response: {result['response']}\n")
        f.write("-" * 50 + "\n\n")
        # Push the entry to disk now so it survives a crash on a later clip.
        f.flush()