In [None]:
import os
from pathlib import Path
import sys

# Restrict this process to GPU 0. This assignment lives in the first cell so it
# runs before the later cell that imports torch.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Put the resolved working directory on the import path (the `llava` imports in
# the next cell are presumably meant to resolve from here — confirm). The
# membership check keeps the cell idempotent: re-running it no longer appends a
# duplicate entry to sys.path each time.
repo_root = Path(".").resolve().as_posix()
if repo_root not in sys.path:
    sys.path.append(repo_root)

In [None]:
import torch

from llava.constants import (
    IMAGE_TOKEN_INDEX,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    DEFAULT_IMAGE_PATCH_TOKEN,
)
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import (
    tokenizer_image_token,
    process_images,
    get_model_name_from_path,
)
from llava.model import (
    LlavaConfig,
    LlavaMistralForCausalLM,
    LlavaLlamaForCausalLM,
    LlavaGemmaForCausalLM,
    LlavaGemmaConfig,
    LlavaPhi3Config,
    LlavaPhi3ForCausalLM,
)

## 1. Load the model

In [None]:
# Placeholder checkpoint location — replace with a real path before running.
# The following cell expands "~" in it and hands it to load_pretrained_model.
model_path = "path/to/checkpoint"

In [None]:
# Called once before the checkpoint is loaded.
# NOTE(review): presumably skips default weight initialisation to speed up
# model construction — confirm against llava.utils.
disable_torch_init()

# Expand a leading "~" and derive the model name from the resulting path.
model_path = os.path.expanduser(model_path)
model_name = get_model_name_from_path(model_path)

# The loader hands back four objects; the second positional argument is left
# as None, exactly as before.
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path,
    None,
    model_name,
)

In [None]:
# Move the inner model to the GPU in bfloat16, then do the same for its
# multimodal projector.
# NOTE(review): if mm_projector is a registered submodule of get_model(), the
# second call is redundant — confirm before removing it.
model.get_model().to(device="cuda", dtype=torch.bfloat16)
model.get_model().mm_projector.to(device="cuda", dtype=torch.bfloat16)

## 2. Build the prompt and prepare the video input

In [None]:
# Conversation template: used as the key into conv_templates when the prompt
# is built.
conv_mode = "gemma"

# Not referenced by any other cell visible in this notebook; kept so the
# notebook namespace is unchanged.
num_chunks, chunk_idx = 1, 0

# Decoding settings forwarded to model.generate(). Sampling is switched on in
# the generate cell whenever temperature is greater than zero.
temperature = 0.1
top_p = None
num_beams = 1

In [None]:
# The bare question, kept separately from the token-augmented version below.
cur_prompt = "Describe the scene."

# Prefix the question with the image placeholder token followed by a newline.
image_tokens = DEFAULT_IMAGE_TOKEN + "\n"
qs = image_tokens + cur_prompt

# Work on a copy of the selected template: add the first role's turn with the
# question, add the second role's turn with no message (None), then render.
conv = conv_templates[conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

print(prompt)

# Tokenize the rendered prompt.
# NOTE(review): the loop below and the `**text_inputs` unpacking in the
# generate cell treat the result as a mapping of name -> tensor; confirm that
# this fork's tokenizer_image_token returns a mapping rather than a bare tensor.
text_inputs = tokenizer_image_token(
    prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
)

# Move every entry to the GPU, replacing values in place.
for key, value in text_inputs.items():
    text_inputs[key] = value.to("cuda")

In [None]:
# Placeholder video location — replace before running.
video = "path/to/video"

# Run the processor and keep its "pixel_values" entry, then move that tensor to
# the GPU as bfloat16.
# NOTE(review): the processor receives the path string as-is; confirm that this
# fork's image_processor accepts a video path rather than decoded frames.
video_tensor = image_processor(video, return_tensors="pt")["pixel_values"]
video_tensor = video_tensor.to("cuda", dtype=torch.bfloat16)

## 3. Generate

In [None]:
# Generate under inference mode with CUDA autocast enabled.
# Two optional arguments were left disabled in this call and remain so:
# image_sizes and no_repeat_ngram_size=3.
with torch.inference_mode():
    with torch.amp.autocast("cuda"):
        output_ids = model.generate(
            **text_inputs,
            images=video_tensor,
            do_sample=temperature > 0,  # sample only for a positive temperature
            temperature=temperature,
            top_p=top_p,
            num_beams=num_beams,
            max_new_tokens=1024,
            use_cache=True,
        )

# Decode the batch, dropping special tokens, and keep the first sequence with
# surrounding whitespace stripped.
decoded_batch = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
outputs = decoded_batch[0].strip()
outputs