In [None]:
from dotenv import load_dotenv
from pathlib import Path
import sys


# Append the resolved parent directory to the import path so that modules
# located one level above this notebook become importable.
_parent_dir = Path("..").resolve()
sys.path.append(_parent_dir.as_posix())

# Run python-dotenv's loader; its return value is deliberately discarded so the
# cell produces no output.
_ = load_dotenv()

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

# Location of the fine-tuned checkpoint; both the model and the processor
# below are loaded from this path.
CHECKPOINT_PATH = "paligemma_2024-07-24_12-18-50/checkpoint-288"

# Load the model through PEFT's auto class.
# NOTE(review): this presumably expects the checkpoint to contain adapter
# weights plus a reference to the base model — confirm against the training run.
model = AutoPeftModelForCausalLM.from_pretrained(CHECKPOINT_PATH)
# Disabled alternatives, kept for reference: load the checkpoint as a plain
# PaliGemma model, or load the public "mix" checkpoint from the Hub instead.
# model = PaliGemmaForConditionalGeneration.from_pretrained(CHECKPOINT_PATH)
# model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma-3b-mix-224")
# The processor is read from the same checkpoint path as the model.
# NOTE(review): assumes the processor files were saved next to the weights — confirm.
processor = AutoProcessor.from_pretrained(CHECKPOINT_PATH)

In [None]:
from PIL import Image


# Open the sample picture and leave it as the cell's last expression so the
# notebook renders a preview of it.
TEST_IMAGE_PATH = "assets/test_image.jpg"
image = Image.open(TEST_IMAGE_PATH)
image

In [None]:
from training_toolkit.src.data_presets.json import JSON_PROMPT, token2json

In [None]:
# Preprocess the image previewed earlier in the notebook (`image`) together with
# the extraction prompt. JSON_PROMPT is used as the model input so the prompt the
# model sees is the same constant the toolkit defines, and the tensors are
# moved to the model's device (mirroring what process_raw_video does).
inputs = processor(
    images=image,
    text=JSON_PROMPT,
    return_tensors="pt",
).to(model.device)

# Autoregressively generate.
# Sampling is enabled (do_sample=True), so the output can differ between runs;
# for other decoding strategies see https://huggingface.co/blog/how-to-generate
generated_ids = model.generate(**inputs, max_new_tokens=256, do_sample=True)

# Next we turn each predicted token ID back into a string using the decode method.
# We chop off the prompt, which consists of image tokens and our text prompt:
# generate() returns the input sequence followed by the newly generated tokens,
# so the prompt length is exactly the width of the input_ids tensor. This avoids
# re-deriving the length from token counts and hand-tuned offsets.
num_prompt_tokens = inputs["input_ids"].shape[1]
generated_text = processor.batch_decode(
    generated_ids[:, num_prompt_tokens:],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]

# Convert the generated token string into a JSON-like Python structure.
generated_json = token2json(generated_text)
print(generated_json)

In [None]:
from training_toolkit.src.common.video_readers import get_video_reader


# Build the video reader once at module level; process_raw_video calls it with
# a video path and an integer.
# NOTE(review): the integer is presumably the number of frames to sample —
# confirm against get_video_reader.
VIDEO_READER = get_video_reader()


def process_raw_video(
    video_path,
    model,
    processor,
    gen_kwargs=None,
    num_frames=8,
    prompt="Provide a detailed caption for this video.",
):
    """Run a single video through a chat-template based multimodal model.

    The video is read with the module-level ``VIDEO_READER``, wrapped in a
    one-turn user conversation (text instruction followed by a video
    placeholder), preprocessed, and passed to ``model.generate``.

    Args:
        video_path: Path of the video, forwarded unchanged to ``VIDEO_READER``.
        model: Object exposing ``.device`` and ``.generate(**inputs, **kwargs)``.
        processor: Object exposing ``apply_chat_template``, ``__call__`` (with
            ``text``, ``videos`` and ``return_tensors``) and ``batch_decode``.
        gen_kwargs: Optional mapping of extra keyword arguments for
            ``model.generate``. ``None`` (the default) means no extra arguments.
        num_frames: Second positional argument passed to ``VIDEO_READER``
            (assumed to be the number of frames to sample — confirm against
            ``get_video_reader``). Defaults to 8, the previously hard-coded value.
        prompt: Text instruction placed in the user turn. Defaults to the
            previously hard-coded captioning instruction.

    Returns:
        The list returned by ``processor.batch_decode`` for the generated
        sequences, decoded with special tokens skipped. The prompt is not
        stripped here, so the decoded text may include it.
    """
    video_clip = VIDEO_READER(video_path, num_frames)

    # Use the chat template to format the prompt correctly; only the user turn
    # is supplied, so the model has to produce the answer itself.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "video"},
            ],
        },
    ]

    # add_generation_prompt=True asks the template to append the assistant
    # prefix (e.g. "ASSISTANT: ") so generation starts at the answer.
    prompt_text = processor.apply_chat_template(
        conversation, add_generation_prompt=True
    )

    batch = processor(
        text=prompt_text,
        videos=[video_clip],
        return_tensors="pt",
    ).to(model.device)

    # Treat a missing gen_kwargs as "no extra options" instead of failing on **None.
    out = model.generate(**batch, **(gen_kwargs or {}))
    generated_text = processor.batch_decode(out, skip_special_tokens=True)

    return generated_text


In [None]:
# Placeholder path — point this at a real video file before running the cell.
VIDEO_PATH = "path/to/video.mp4"

# Budget the generation with max_new_tokens rather than max_length: max_length
# also counts the prompt (including the visual placeholder tokens), so a value
# of 256 can leave no room for any new tokens. max_new_tokens matches the
# setting used for the image example earlier in the notebook.
process_raw_video(
    VIDEO_PATH,
    model,
    processor,
    gen_kwargs={"max_new_tokens": 256, "do_sample": True},
)