In [None]:
from dotenv import load_dotenv
from pathlib import Path
import sys


# Make the parent of the current working directory importable so that
# packages living next to the notebook folder can be imported below.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
_parent_dir = Path("..").resolve().as_posix()
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

# Load environment variables via python-dotenv; the return value is discarded
# so the cell produces no output.
_ = load_dotenv()

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoProcessor

# Location handed to both loaders below; replace the placeholder with a real
# checkpoint before running.
CHECKPOINT_PATH = "path/to/checkpoint"

# The model and its processor are loaded from the same checkpoint location.
model = AutoPeftModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=CHECKPOINT_PATH,
)
processor = AutoProcessor.from_pretrained(
    pretrained_model_name_or_path=CHECKPOINT_PATH,
)

In [None]:
from training_toolkit.src.common.video_readers import get_video_reader


# Build the video reader once at module level; process_raw_video() below calls
# it as VIDEO_READER(video_path, 8).
# NOTE(review): which reader get_video_reader() returns is not visible here —
# confirm against training_toolkit.src.common.video_readers.
VIDEO_READER = get_video_reader()


def process_raw_video(video_path, model, processor, gen_kwargs=None):
    """Generate a caption for one video with the given model and processor.

    Reads a clip through the module-level ``VIDEO_READER``, formats a one-turn
    captioning request with the processor's chat template, runs
    ``model.generate`` and decodes the generated sequences.

    Args:
        video_path: Video location, passed unchanged to ``VIDEO_READER``.
        model: Object exposing ``.device`` and ``.generate(**inputs, **kwargs)``.
        processor: Object exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        gen_kwargs: Optional mapping of extra keyword arguments forwarded to
            ``model.generate``. ``None`` (the default) forwards nothing extra.

    Returns:
        The result of ``processor.batch_decode`` on the output of
        ``model.generate``, decoded with ``skip_special_tokens=True``.
        NOTE(review): the whole generate() output is decoded, so the text
        presumably still contains the prompt — confirm before parsing it.
    """
    # Treat a missing/None mapping as "no extra generation arguments" instead
    # of failing on `**None`.
    if gen_kwargs is None:
        gen_kwargs = {}

    # NOTE(review): the second argument is presumably the number of frames to
    # sample from the video — confirm against the reader's signature.
    video_clip = VIDEO_READER(video_path, 8)

    # Use the chat template to format the prompt correctly. Only the user turn
    # is supplied (no caption), since the caption is what we want generated.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Provide a detailed caption for this video."},
                {"type": "video"},
            ],
        },
    ]

    # add_generation_prompt=True makes the template end with the assistant turn
    # marker (e.g. "ASSISTANT: ") so the model continues with its answer.
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Tokenize the text, preprocess the clip, and move the resulting tensors to
    # the device the model lives on.
    batch = processor(
        text=prompt,
        videos=[video_clip],
        return_tensors="pt",
    ).to(model.device)

    out = model.generate(**batch, **gen_kwargs)
    generated_text = processor.batch_decode(out, skip_special_tokens=True)

    return generated_text


In [None]:
# Placeholder; point this at a real video file before running.
VIDEO_PATH = "path/to/video.mp4"

# `max_new_tokens` limits only the generated continuation. `max_length` would
# also count the prompt tokens, and a prompt carrying video placeholders can
# exceed a small cap on its own, leaving no room for the caption.
# Sampling is enabled, so the output differs from run to run unless a seed is
# set beforehand.
process_raw_video(
    VIDEO_PATH,
    model,
    processor,
    gen_kwargs={"max_new_tokens": 256, "do_sample": True},
)