In [1]:
import os
from dotenv import find_dotenv, load_dotenv

# Load variables from the .env file located by find_dotenv() into the process
# environment; the return value of load_dotenv is deliberately discarded.
_ = load_dotenv(find_dotenv())

# Read HF_TOKEN from the environment (os.getenv returns None when it is unset).
# Presumably a Hugging Face access token — confirm.
# NOTE(review): hf_token is not referenced by any later cell visible in this notebook.
hf_token = os.getenv("HF_TOKEN")

In [2]:
from pathlib import Path
import sys

# Append the absolute, POSIX-style path of the local "custom_llava" directory to sys.path.
# Presumably needed so modules inside it can resolve their own top-level imports —
# TODO confirm; the imports below use the `custom_llava.llava...` package path.
sys.path.append(Path("custom_llava").resolve().as_posix())

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [5]:
# Checkpoint directory, relative to the notebook's working directory; it is passed to
# load_pretrained_model in a later cell.
model_path = "custom_llava/checkpoints/llava_gemma_v1_pretrain"

In [8]:
from custom_llava.llava.constants import (
    IMAGE_TOKEN_INDEX,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    DEFAULT_IMAGE_PATCH_TOKEN,
)
from custom_llava.llava.conversation import conv_templates, SeparatorStyle
from custom_llava.llava.model.builder import load_pretrained_model
from custom_llava.llava.utils import disable_torch_init
from custom_llava.llava.mm_utils import (
    tokenizer_image_token,
    process_images,
    get_model_name_from_path,
)
from custom_llava.llava.model import (
    LlavaConfig,
    LlavaMistralForCausalLM,
    LlavaLlamaForCausalLM,
    LlavaGemmaForCausalLM,
)

In [9]:
# disable_torch_init comes from custom_llava.llava.utils; presumably it skips default
# weight initialisation to speed up loading — TODO confirm against the helper.
disable_torch_init()
# Expand a leading "~" in the checkpoint path (a no-op for the relative path set above).
model_path = os.path.expanduser(model_path)
# Derive the model name from the checkpoint path via the project helper.
model_name = get_model_name_from_path(model_path)
# model_name = "llava_gemma"
# Load the tokenizer, model, image processor and context length in one call. The second
# positional argument is passed as None (presumably the base-model path — verify in builder).
# NOTE(review): the saved output of this cell is `KeyError: 'LlavaGemmaConfig'`, yet later
# cells use `model` and `tokenizer`, and the execution counts are out of order. Confirm
# this cell succeeds on a fresh kernel (Restart & Run All).
tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)

KeyError: 'LlavaGemmaConfig'

In [7]:
# model = AutoModelForCausalLM.from_pretrained(
#     model_path,
#     torch_dtype=torch.bfloat16,
#     device_map="cuda",
# )
# tokenizer = AutoTokenizer.from_pretrained(model_path)

In [8]:
# Move the model's vision tower to the GPU. Because this is the last expression in the
# cell, the repr of the returned module is shown as the cell output.
model.get_vision_tower().to("cuda")

LanguageBindVideoTower(
  (video_tower): CLIPVisionTransformer(
    (embeddings): CLIPVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
      (position_embedding): Embedding(257, 1024)
    )
    (patch_dropout): PatchDropout()
    (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-23): 24 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_fe

In [9]:
# Settings consumed by the prompt-building and generation cells below.
# Key into conv_templates selecting the conversation template.
conv_mode = "gemma"
# NOTE(review): num_chunks and chunk_idx are not used by any cell visible in this notebook.
num_chunks = 1
chunk_idx = 0
# Passed to model.generate; sampling is enabled there only when temperature > 0.
temperature = 0.5
# Passed to model.generate unchanged (None here).
top_p = None
# Passed to model.generate as the beam count.
num_beams = 1

In [10]:
# The raw question; cur_prompt keeps the text as it was before image tokens are prepended.
cur_prompt = "What is unusual about this image?"
qs = cur_prompt

# Build one image placeholder per video frame, optionally wrapped in start/end markers,
# and put the placeholders (followed by a newline) in front of the question.
frame_token = DEFAULT_IMAGE_TOKEN
if model.config.mm_use_im_start_end:
    frame_token = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
num_frames = model.get_vision_tower().config.num_frames
image_tokens = " ".join([frame_token] * num_frames) + "\n"
qs = image_tokens + qs

# Fill a fresh copy of the conversation template: the first role carries the question,
# the second role is appended with no message (None).
conv = conv_templates[conv_mode].copy()
first_role, second_role = conv.roles[0], conv.roles[1]
conv.append_message(first_role, qs)
conv.append_message(second_role, None)
prompt = conv.get_prompt()

# Tokenize the prompt with the project helper, add a leading dimension, move to the GPU.
prompt_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
input_ids = prompt_ids.unsqueeze(0).cuda()

# Single-image variant, kept for reference (disabled):
# url = "https://www.ilankelman.org/stopsigns/australia.jpg"
# image = Image.open(requests.get(url, stream=True).raw)
# image_tensor = process_images([image], image_processor, model.config)[0]

# Run the processor on the video file and keep its 'pixel_values' entry on the GPU.
video = '1.mp4'
processed_video = image_processor(video, return_tensors='pt')
video_tensor = processed_video['pixel_values'].to("cuda")

# Move the multimodal projector to the GPU in float16; as the last expression of the cell,
# the returned module is displayed as the cell output.
mm_projector = model.get_model().mm_projector
mm_projector.to("cuda", dtype=torch.float16)

Sequential(
  (0): Linear(in_features=1024, out_features=2048, bias=True)
  (1): GELU(approximate='none')
  (2): Linear(in_features=2048, out_features=2048, bias=True)
)

In [11]:
# qs = "What is unusual about this image?"
# cur_prompt = qs

# conv = conv_templates[conv_mode].copy()
# conv.append_message(conv.roles[0], qs)
# conv.append_message(conv.roles[1], None)
# prompt = conv.get_prompt()

# input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

In [12]:
# embeds = model.get_model().embed_tokens(input_ids)
# print(input_ids.shape)
# embeds.shape

In [13]:
# Sampling options taken from the configuration cell; sampling is switched on only for a
# positive temperature.
sampling_kwargs = {
    "do_sample": temperature > 0,
    "temperature": temperature,
    "top_p": top_p,
    "num_beams": num_beams,
}

# Generate under inference mode. The video tensor is cast to half precision and moved to
# the GPU inside the context, directly in the call, so no extra reference to it is kept.
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=video_tensor.half().cuda(),
        # image_sizes=[image.size],
        **sampling_kwargs,
        # no_repeat_ngram_size=3,
        max_new_tokens=1024,
        use_cache=True,
    )

# print(output_ids)

# Decode the batch without special tokens, keep the first sequence and trim whitespace;
# the trailing bare name displays the text as the cell output.
decoded_batch = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
outputs = decoded_batch[0].strip()
outputs

"The person is sitting in close proximity to the camera. There is no indication of discomfort or evasion, and the individual appears to be enjoying the process. The lie detector confirms the truthfulness of the answers, indicating a genuine reflection on the subject matter.\nIn this context, the individual's demeanor is light-hearted and candid, with moments of self-deprecation and humor. The person's body language is open, and they speak with a sense of ease, showing no signs of nervousness or deception.\nThe individual's responses to questions about their personal life and experiences are delivered with a mix of humor and sincerity. The lie detector does not signal any deception, and the person's emotional state is one of reflection and openness.\nThe person's body language is relaxed, and they speak with a sense of ease, indicating a level of comfort with the conversation. The lie detector's confirmation of truthfulness suggests a genuine reflection on the subject matter.\nThe indiv

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the "google/gemma-2b-it" checkpoint.
# NOTE(review): this rebinds `tokenizer`, replacing the one returned by
# load_pretrained_model earlier in the notebook. Re-running the prompt or generation cells
# after this one would use this tokenizer instead — consider a distinct name.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")

In [None]:
# Show both the tokenizer's class and its bound apply_chat_template method.
# A bare `type(tokenizer)` on a non-final line is evaluated and then discarded (IPython
# displays only the last expression by default), so both values are returned as one tuple.
type(tokenizer), tokenizer.apply_chat_template

In [None]:
from transformers import GemmaTokenizer

In [None]:
# Print the tokenizer's chat template. print() is used rather than a bare expression so
# that any newlines in the template are rendered instead of shown as an escaped repr.
print(tokenizer.chat_template)