In [1]:
import os
from pathlib import Path
import sys

# Restrict this process to GPU 0 through the CUDA_VISIBLE_DEVICES variable.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

# Append the absolute, POSIX-style path of the current working directory to
# the import search path (presumably so the local `llava` package resolves
# when the notebook is started from the repository root — confirm).
_cwd_posix = Path(".").resolve().as_posix()
sys.path.append(_cwd_posix)

In [2]:
import torch

from llava.constants import (
    IMAGE_TOKEN_INDEX,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    DEFAULT_IMAGE_PATCH_TOKEN,
)
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import (
    tokenizer_image_token,
    process_images,
    get_model_name_from_path,
)
from llava.model import (
    LlavaConfig,
    LlavaMistralForCausalLM,
    LlavaLlamaForCausalLM,
    LlavaGemmaForCausalLM,
    LlavaGemmaConfig,
    LlavaPhi3Config,
    LlavaPhi3ForCausalLM,
)

[2024-05-08 06:09:12,310] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
# Relative path of the checkpoint directory that the loading cell below
# hands to `load_pretrained_model`.
model_path = "checkpoints/llava_gemma_mamba_v18_adapter_vcgpt"

In [4]:
# llava helper; presumably skips torch's default weight initialisation to
# speed up loading — confirm against llava.utils.
disable_torch_init()
# Expand a leading "~" in the checkpoint path (a no-op for the relative path
# configured above).
model_path = os.path.expanduser(model_path)
# Model name derived from the checkpoint path; passed on to the loader.
model_name = get_model_name_from_path(model_path)
# The second positional argument is passed as None (presumably the
# "model base" slot of the loader — confirm against llava.model.builder).
# The loader returns the tokenizer, the model, the image processor and a
# context length, in that order.
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

<All keys matched successfully>
DONE LOADING


In [5]:
# Move the inner model returned by `get_model()` onto the GPU in bfloat16.
backbone = model.get_model()
backbone.to("cuda", dtype=torch.bfloat16)

# Apply the same device/dtype move to the multimodal projector. As the last
# expression of the cell, the returned module's repr is what gets displayed.
backbone.mm_projector.to("cuda", dtype=torch.bfloat16)

Sequential(
  (0): Linear(in_features=576, out_features=2048, bias=True)
  (1): GELU(approximate='none')
  (2): Linear(in_features=2048, out_features=2048, bias=True)
)

In [11]:
# Prompting / generation configuration used by the cells below.

# Key into `conv_templates` that selects the conversation template.
conv_mode = "gemma"

# Decoding settings forwarded to `model.generate`.
temperature, top_p, num_beams = 0.5, None, 1

# Not referenced by any visible cell (presumably left over from a sharded
# evaluation script — confirm before removing).
num_chunks, chunk_idx = 1, 0

In [12]:
def build_prompt(text):
    """Render a single-turn conversation prompt for one question.

    The question is prefixed with ``DEFAULT_IMAGE_TOKEN`` and a newline,
    appended as the first role's message to a copy of the template selected
    by the global ``conv_mode``, followed by an empty (``None``) message for
    the second role.

    Args:
        text: The question to ask about the visual input.

    Returns:
        Whatever the template's ``get_prompt()`` returns for that conversation.
    """
    # Work on a copy so the shared template in `conv_templates` is untouched.
    conversation = conv_templates[conv_mode].copy()

    # First role: the image placeholder followed by the question.
    conversation.append_message(conversation.roles[0], f"{DEFAULT_IMAGE_TOKEN}\n{text}")
    # Second role: left open (None) for the model to fill in.
    conversation.append_message(conversation.roles[1], None)

    return conversation.get_prompt()


def get_text_batch(texts, tokenizer, device="cuda"):
    """Build prompts for a batch of questions, tokenize them and move them to `device`.

    Args:
        texts: Iterable of question strings; each one is wrapped with
            ``build_prompt``.
        tokenizer: Tokenizer object forwarded unchanged to
            ``tokenizer_image_token``.
        device: Target device for every returned value. Defaults to
            ``"cuda"``, which is what this helper previously hard-coded.

    Returns:
        The mapping returned by ``tokenizer_image_token`` (called with the
        full list of prompts, ``IMAGE_TOKEN_INDEX`` and
        ``return_tensors="pt"``), with each value replaced in place by the
        result of its ``.to(device)``.
    """
    prompts = [build_prompt(text) for text in texts]

    # tokenize the prompts
    inputs = tokenizer_image_token(prompts, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")

    # Snapshot the items so values can be reassigned while looping; the same
    # mapping object is kept and returned.
    for key, value in list(inputs.items()):
        inputs[key] = value.to(device)

    return inputs


def get_video_batch(videos, image_processor, device="cuda", dtype=torch.bfloat16):
    """Preprocess a batch of videos and move the pixel values to `device`.

    Args:
        videos: Batch of video inputs, forwarded unchanged to
            ``image_processor`` (the notebook passes a list of file paths).
        image_processor: Callable invoked as
            ``image_processor(videos, return_tensors="pt")``; its result must
            be indexable by ``"pixel_values"``.
        device: Target device. Defaults to ``"cuda"``, the value this helper
            previously hard-coded.
        dtype: Target dtype. Defaults to ``torch.bfloat16``, the value this
            helper previously hard-coded.

    Returns:
        The ``"pixel_values"`` entry after ``.to(device, dtype=dtype)``.
    """
    processed = image_processor(videos, return_tensors="pt")
    return processed["pixel_values"].to(device, dtype=dtype)

In [13]:
# Questions to ask. The commented-out entries are alternative phrasings that
# can be re-enabled to enlarge the batch.
texts = [
    "How many birds are there?",
    # "How many birds are there in total?",
    # "How many birds are there roughly?",
    # "How many birds are there at all?"
]

# Video files to query (presumably paired index-by-index with `texts` —
# confirm); paths are relative to the working directory.
videos = [
    "videos_zero_shot/birds.mp4",
    # "videos_zero_shot/fish.mp4",
    # "videos_zero_shot/human.mp4",
    # "videos_zero_shot/swamp.mp4"
]

# Tokenized prompts and preprocessed pixel values, both already moved to the
# GPU by the helper functions.
text_inputs = get_text_batch(texts, tokenizer)
video_tensor = get_video_batch(videos, image_processor)

In [14]:
# Sanity check: display the device the tokenized input ids ended up on.
text_inputs["input_ids"].device

device(type='cuda', index=0)

In [15]:
# Run generation without autograd bookkeeping and under bfloat16 autocast.
#
# The model was moved to bfloat16 in an earlier cell, but
# `torch.amp.autocast("cuda")` without an explicit `dtype` autocasts to
# float16. float16 has a much narrower range than bfloat16, so activations can
# overflow to inf/nan, which then surfaces during sampling as
# "probability tensor contains either `inf`, `nan` or element < 0".
# Pinning autocast to bfloat16 keeps it consistent with the model weights.
with torch.inference_mode(), torch.amp.autocast("cuda", dtype=torch.bfloat16):
    output_ids = model.generate(
        **text_inputs,
        images=video_tensor,
        # image_sizes=[image.size],
        # Sample only for a positive temperature; otherwise decode
        # deterministically (greedy / beam search depending on num_beams).
        do_sample=temperature > 0,
        temperature=temperature,
        top_p=top_p,
        num_beams=num_beams,
        # no_repeat_ngram_size=3,
        max_new_tokens=1024,
        use_cache=True,
    )

# NOTE: only the first sequence of the batch is decoded and displayed; index
# the `batch_decode` result differently when querying several videos at once.
outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
outputs

RuntimeError: probability tensor contains either `inf`, `nan` or element < 0