In [12]:
#!pip install git+https://github.com/huggingface/transformers
!pip install transformers==4.48.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting transformers==4.48.0
  Downloading transformers-4.48.0-py3-none-any.whl.metadata (44 kB)
Downloading transformers-4.48.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m104.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.50.0.dev0
    Uninstalling transformers-4.50.0.dev0:
      Successfully uninstalled transformers-4.50.0.dev0
Successfully installed transformers-4.48.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [5]:
import os
import torch
from pathlib import Path
from transformers import AutoProcessor
import numpy as np
import decord
import random

In [6]:
# Backbone selector: "qwen" loads Qwen2.5-VL; any other value loads
# LLaVA-NeXT-Video. It lives in its own name so that `model` only ever refers
# to the loaded network (previously `model` was a string first and the network
# afterwards).
MODEL_NAME = "qwen"

if MODEL_NAME == "qwen":
    # Imported inside the branch so the class is only required when selected.
    from transformers import Qwen2_5_VLForConditionalGeneration

    MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
    ADAPTER_ID = "kida1122/qwen2.5-vl-7b-instruct-cataract1k"

    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )
else:
    from transformers import LlavaNextVideoForConditionalGeneration

    MODEL_ID = "llava-hf/LLaVa-NeXT-Video-7b-hf"
    ADAPTER_ID = "kida1122/llava-next-video-7b-hf-cataract1k"

    model = LlavaNextVideoForConditionalGeneration.from_pretrained(
        MODEL_ID,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )

# The processor is loaded from the same base checkpoint as the model.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Load adapter
model.load_adapter(ADAPTER_ID)

Loading checkpoint shards: 100%|██████████| 5/5 [00:11<00:00,  2.34s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


## Single Video Inference

In [7]:
from IPython.display import Video

# Build the path of the demo clip underneath the directory named by $HOME.
home = os.environ.get("HOME")
video_path = "{}/surgery-sft/datasets/cataract1k/videos/test/case_4687_Capsulorhexis_52.50_57.50.mp4".format(home)

# Trailing expression so the notebook renders the clip as the cell output.
Video(video_path, embed=True)

In [8]:
def sample_images(vr, sample_fps=2):
    """
    Sample frames from a video at (approximately) a target frame rate.

    Frames are taken at a fixed integer stride of ``int(native_fps / sample_fps)``
    starting from frame 0.

    Args:
        vr: Video reader exposing ``len(vr)``, ``get_avg_fps()`` and
            ``get_batch(indices)`` (e.g. a decord ``VideoReader``).
        sample_fps: Target frames per second for sampling. Must be positive.
            If it exceeds the video's native frame rate, every frame is
            returned exactly once.

    Returns:
        Whatever ``vr.get_batch`` returns for the selected frame indices.

    Raises:
        ValueError: If ``sample_fps`` is not positive.
    """
    if sample_fps <= 0:
        raise ValueError(f"sample_fps must be positive, got {sample_fps!r}")

    # Public length API instead of the private `_num_frame` attribute.
    num_frames = len(vr)
    native_fps = vr.get_avg_fps()

    # Clamp the stride to 1: when sample_fps > native_fps the integer division
    # would give 0 and every sampled index would collapse onto frame 0.
    stride = max(int(native_fps / sample_fps), 1)

    # Clamp the divisor to 1 so a native frame rate below 1 fps cannot trigger
    # a ZeroDivisionError; int() keeps range() happy for a float sample_fps.
    n_samples = int(sample_fps * num_frames // max(int(native_fps), 1))

    # Drop any index that would fall past the end of the video.
    frames_idx = [stride * i for i in range(n_samples) if stride * i < num_frames]

    # Clips shorter than one sampling interval would otherwise yield no frames.
    if not frames_idx and num_frames > 0:
        frames_idx = [0]

    return vr.get_batch(frames_idx)

In [9]:
# System prompt prepended to the conversation when `add_system_message` is set.
# Two sentences separated by a single newline.
SYSTEM_MESSAGE = (
    "You are a vision language model specialized in analyzing cataract surgery videos.\n"
    "Your task is to analyze the provided surgical video frames and extract relevant clinical information."
)

In [10]:
# Single-clip demo: build a chat prompt for `video_path`, sample frames from the
# clip, run generation and print the decoded answer.
prompt = "What do you see in the video?"
vr = decord.VideoReader(video_path)

# Toggle to prepend SYSTEM_MESSAGE as a system turn.
add_system_message = True

conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": video_path
            },
            {
                "type": "text",
                "text": prompt
            }
        ]
    }
]

if add_system_message:
    conversation.insert(0, {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": SYSTEM_MESSAGE
            }
        ]
    })

# Render the conversation to a prompt string (tokenisation happens in `processor`).
texts = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

video_inputs = sample_images(vr)

# No `truncation` / `max_length` here, matching the multi-video loop below:
# truncating the tokenised prompt can cut video placeholder tokens so they no
# longer line up with the frames passed in `videos`.
model_inputs = processor(
    text=texts,
    videos=video_inputs.asnumpy(),
    return_tensors="pt",
).to(model.device)

# Generate response
output_ids = model.generate(**model_inputs,
                            max_new_tokens=1024,
                            do_sample=True,
                            repetition_penalty=1.1,
                            temperature=0.6)
# Skip the first `input_ids.shape[1]` positions so only the tokens after the
# prompt are decoded.
generated_ids = output_ids[0, model_inputs.input_ids.shape[1]:]
output_text = processor.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
print(output_text)

The video shows an eye undergoing cataract surgery. The cornea has been partially opened, revealing the lens inside which appears cloudy due to the presence of a cataract. Surgical instruments are being used to manipulate the lens. The surgeon's hands are visible holding forceps near the lens, likely preparing for its removal or manipulation as part of the procedure. The surrounding tissue around the eye looks slightly inflamed, indicating that this is an active surgical site.


## Multi-Video Inference

In [11]:
# Batch demo: pick a few random non-"idle" test clips, run the same prompt on
# each one and write all responses to a text file in the home directory.

# Fall back to Path.home() so an unset $HOME does not yield a "None/..." path.
home_path = os.getenv("HOME") or str(Path.home())
test_dir = f"{home_path}/surgery/datasets/cataract1k/videos/test" ## change according to your folder structure
test_vid_count = 5

# Sorted so the candidate order does not depend on the filesystem's listing order.
candidate_videos = sorted(vid for vid in os.listdir(test_dir) if "idle" not in vid)
# random.sample raises ValueError when asked for more items than exist, so cap
# the sample size at the number of available clips.
test_videos = random.sample(candidate_videos, min(test_vid_count, len(candidate_videos)))

prompt = "What do you see in the video?"
output_texts = []

output_file = f"{home_path}/outputs.txt"

# Toggle to prepend SYSTEM_MESSAGE as a system turn.
add_system_message = True

for vid in test_videos:
    full_path = os.path.join(test_dir, vid)
    vr = decord.VideoReader(full_path)

    conversation = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": full_path
                },
                {
                    "type": "text",
                    "text": prompt
                }
            ]
        }
    ]

    if add_system_message:
        conversation.insert(0, {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": SYSTEM_MESSAGE
                }
            ]
        })

    # Render the conversation to a prompt string (tokenisation happens in `processor`).
    texts = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)

    video_inputs = sample_images(vr)

    model_inputs = processor(
        text=texts,
        videos=video_inputs.asnumpy(),
        return_tensors="pt",
    ).to(model.device)

    # Generate response
    output_ids = model.generate(**model_inputs,
                                max_new_tokens=1024,
                                do_sample=True,
                                repetition_penalty=1.1,
                                temperature=0.6)
    # Skip the first `input_ids.shape[1]` positions so only the tokens after
    # the prompt are decoded.
    generated_ids = output_ids[0, model_inputs.input_ids.shape[1]:]
    output_text = processor.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    result = {
        "video": vid,
        "prompt": prompt,
        "response": output_text
    }
    output_texts.append(result)

# Explicit UTF-8 so non-ASCII characters in a response cannot fail under a
# platform-dependent default encoding.
with open(output_file, "w", encoding="utf-8") as f:
    for result in output_texts:
        f.write(f"Video: {result['video']}\n")
        f.write(f"Prompt: {result['prompt']}\n")
        f.write(f"Response: {result['response']}\n")
        f.write("-" * 50 + "\n\n")