In [1]:
!pip install transformers
!pip install torch
!pip install accelerate
!pip install einops

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
import os
from typing import Tuple, List
import requests
import gc

import cv2
import zipfile
from PIL import Image
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:
class Model:
    """
    A class representing a model for keyframe captioning.

    Attributes:
        model_id (str): The ID of the model.
        revision (str): The revision of the model.
        model (AutoModelForCausalLM): The pretrained model for caption generation.
        tokenizer (AutoTokenizer): The tokenizer for the model.
        prompt (str): A default prompt for generating captions.

    Methods:
        __init__(model_id: str, revision: str): Initializes the Model object.
        init_model(model_id: str, revision: str, cache_dir: str = "./model_cache") -> Tuple[AutoModelForCausalLM, AutoTokenizer]: Initializes the model and tokenizer.
        encode_image(image) -> torch.Tensor: Encodes the input image.
        run_inference(enc_image: torch.Tensor, prompt: str) -> str: Performs inference to generate a caption.
    """

    # Weight file names that mark a directory as holding an exported checkpoint
    # (single-file and sharded variants, safetensors and legacy .bin).
    _WEIGHT_FILES = (
        "model.safetensors",
        "model.safetensors.index.json",
        "pytorch_model.bin",
        "pytorch_model.bin.index.json",
    )

    def __init__(self, model_id: str, revision: str):
        """
        Initializes a Model object.

        Args:
            model_id (str): The ID of the model.
            revision (str): The revision of the model.
        """
        self.model_id = model_id
        self.revision = revision
        self.model, self.tokenizer = self.init_model(model_id, revision)
        self.prompt = "Describe the image with as much detail as possible. Generate as much information that can be turned into meta data as possible."

    @staticmethod
    def _has_local_checkpoint(model_dir: str) -> bool:
        """
        Tells whether a directory contains model weights.

        Args:
            model_dir (str): The directory to inspect.

        Returns:
            bool: True if any known weight file exists directly inside model_dir.
        """
        return any(
            os.path.isfile(os.path.join(model_dir, weight_file))
            for weight_file in Model._WEIGHT_FILES
        )

    def init_model(self, model_id: str, revision: str, cache_dir: str = "./model_cache") -> "Tuple[AutoModelForCausalLM, AutoTokenizer]":
        """
        Initializes the model and tokenizer.

        If an exported copy of the model exists under
        ``<cache_dir>/<model_id with '/' replaced by '_'>/<revision>`` it is loaded
        from there. Otherwise the model is fetched from the hub, with ``cache_dir``
        passed through as the download cache directory.

        Args:
            model_id (str): The ID of the model.
            revision (str): The revision of the model.
            cache_dir (str, optional): The directory to cache the model. Defaults to "./model_cache".

        Returns:
            model (AutoModelForCausalLM): The pretrained model for caption generation.
            tokenizer (AutoTokenizer): The tokenizer for the model.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # Half precision is only used on GPU; the CPU path stays in float32.
        torch_type = torch.float16 if device == "cuda" else torch.float32

        model_dir = os.path.join(cache_dir, model_id.replace('/', '_'), revision)

        if self._has_local_checkpoint(model_dir):
            # Load the model and tokenizer from the local disk. trust_remote_code
            # is passed here as well, matching the hub loading path below.
            model = AutoModelForCausalLM.from_pretrained(
                model_dir, trust_remote_code=True, torch_dtype=torch_type
            )
            tokenizer = AutoTokenizer.from_pretrained(model_dir)
        else:
            # No local export: download from the hub, passing cache_dir so the
            # parameter actually controls where the files are cached.
            model = AutoModelForCausalLM.from_pretrained(
                model_id, trust_remote_code=True, revision=revision,
                torch_dtype=torch_type, cache_dir=cache_dir
            )
            tokenizer = AutoTokenizer.from_pretrained(
                model_id, revision=revision, cache_dir=cache_dir
            )

        model = model.to(device)
        return model, tokenizer

    def encode_image(self, image) -> torch.Tensor:
        """
        Encodes the input image.

        Args:
            image: The input image.

        Returns:
            enc_image: The encoded image, as returned by the model's ``encode_image``.
        """
        enc_image = self.model.encode_image(image)
        return enc_image

    def run_inference(self, enc_image: torch.Tensor, prompt: str) -> str:
        """
        Performs inference to generate a caption.

        Args:
            enc_image: The encoded image.
            prompt (str): The prompt for generating captions.

        Returns:
            prompt_response: The generated caption.
        """
        prompt_response = self.model.answer_question(enc_image, prompt, self.tokenizer)
        return prompt_response

In [4]:
def get_column_values(csv_file: str, column_name: str) -> List[str]:
    """
    Reads one column of a CSV file into a plain Python list.

    Args:
        csv_file (str): Path of the CSV file to read.
        column_name (str): Name of the column to extract.

    Returns:
        The column's values as a list, or None when the CSV has no such column.
    """
    frame = pd.read_csv(csv_file)
    # Guard clause: a missing column is signalled with None rather than an error.
    if column_name not in frame.columns:
        return None
    column = frame[column_name]
    return column.values.tolist()


def get_filepaths_from_csv(csv_file: str, filename_column: str, directory: str = "videos/keyframes") -> List[str]:
    """
    Builds the full path of every file listed in a CSV column.

    The CSV is read from ``os.path.join(directory, csv_file)``. Each filename is
    placed in a sub-folder of ``directory`` whose name is derived from the
    filename itself: everything before the last underscore, with the remaining
    underscores removed.

    Args:
        csv_file (str): CSV file name, joined onto ``directory``.
        filename_column (str): Name of the column holding the filenames.
        directory (str, optional): Root directory of the CSV and the sub-folders.
            Defaults to "videos/keyframes".

    Returns:
        List[str]: One path per row, ``<directory>/<sub-folder>/<filename>``.

    Raises:
        ValueError: If ``filename_column`` is not a column of the CSV.
    """
    csv_filepath = os.path.join(directory, csv_file)
    filenames = get_column_values(csv_filepath, filename_column)
    if filenames is None:
        # get_column_values signals a missing column with None; fail with a
        # clear message instead of a TypeError from iterating over None.
        raise ValueError(f"Column '{filename_column}' not found in {csv_filepath}")

    # NOTE(review): the sub-folder name is joined with '' so underscores inside
    # the prefix are dropped ("my_clip_0001.jpg" -> "myclip"); confirm this
    # matches how the keyframe folders are actually named.
    return [
        os.path.join(directory, ''.join(filename.split('_')[:-1]), filename)
        for filename in filenames
    ]


def caption_images(model: Model, base_prompt: str, tasks, csv_file: str = "extracted_keyframes.csv", filename_column: str = 'Filename', directory: str = "videos/keyframes") -> str:
    """
    Captions every keyframe listed in a CSV and writes the results back to it.

    For each image, every task in ``tasks`` is sent to the model together with
    ``base_prompt`` (and the row's subtitle, when one is available). The
    per-image ``{task: response}`` dicts are stored in a new "Caption" column
    and the CSV is overwritten in place.

    Args:
        model (Model): Wrapper providing ``encode_image`` and ``run_inference``.
        base_prompt (str): Instructions placed at the start of every prompt.
        tasks: Iterable of task descriptions; each becomes a key of the caption dict.
        csv_file (str, optional): CSV file name, joined onto ``directory``.
        filename_column (str, optional): Column holding the image filenames.
        directory (str, optional): Root directory of the CSV and the keyframes.

    Returns:
        str: Path of the CSV file that was updated.
    """
    # Every read and write of the CSV goes through the same resolved path.
    csv_filepath = os.path.join(directory, csv_file)
    full_filepaths = get_filepaths_from_csv(csv_file, filename_column, directory)
    # NOTE(review): 'SubtitleTEST' looks like a leftover test column name;
    # confirm which column actually holds the subtitles.
    subtitle_list = get_column_values(csv_filepath, 'SubtitleTEST')
    caption_list = []

    for i, filepath in enumerate(full_filepaths):
        subtitle = subtitle_list[i] if subtitle_list is not None else None
        # Empty CSV cells are read back as NaN; treat them as "no subtitle"
        # instead of putting the text "nan" into the prompt.
        has_subtitle = subtitle is not None and not pd.isna(subtitle)

        # Close the image file as soon as it has been encoded.
        with Image.open(filepath) as image:
            enc_image = model.encode_image(image)

        task_responses = {}
        for task in tasks:
            # Use subtitles if available, otherwise don't include them in the prompt
            if has_subtitle:
                prompt = f"{base_prompt} \nSUBTITLES: {subtitle}\nTASK: {task}"
            else:
                prompt = f"{base_prompt} \nTASK: {task}"
            task_responses[task] = model.run_inference(enc_image, prompt)
        caption_list.append(task_responses)

    df = pd.read_csv(csv_filepath)
    df["Caption"] = caption_list
    df.to_csv(csv_filepath, index=False)

    return csv_filepath

In [5]:
def unzip_files(zip_file, output_directory):
    """
    Extracts every member of a zip archive into a directory.

    Args:
        zip_file: Path of the zip archive to read.
        output_directory: Directory the archive members are extracted into.
    """
    with zipfile.ZipFile(zip_file, mode='r') as archive:
        archive.extractall(path=output_directory)

# Keyframe archive and the folder it is unpacked into.
# Note: despite its name, `input_dir` holds the path of a zip file, not a directory.
input_dir = '/content/keyframes.zip'
output_dir = '/content/keyframes'

# Unpack the archive so the keyframe files are available under `output_dir`.
unzip_files(input_dir, output_dir )

In [6]:
# Location of the keyframe index CSV. It is absolute, so joining it onto
# `directory` inside caption_images leaves it unchanged.
csv_filepath = "/content/keyframes/extracted_keyframes.csv"

filename_column = 'Filename'
directory = os.path.join("/content","keyframes")

# Model checkpoint and the pinned revision to load.
model_id = "vikhyatk/moondream2"
revision = "2024-05-20"
model = Model(model_id, revision)

# One model query is issued per task and per keyframe.
# Each entry must end with a comma: adjacent string literals are silently
# concatenated into a single task otherwise.
# NOTE(review): the second task repeats "for this scene"; wording left as is.
task_list = [
    "Caption the scene. Describe it with as much information as possible.",
    "Generate detailed information for this scene for this scene.",
    "What is the language used in the video this keyframe was captured from?",
    "What kind of video is this, is it a tutorial, a lecture, and the likes.",
]

# Shared instructions placed at the start of every prompt.
prompt = """
Given keyframe extracted from a scene and the corresponding SUBTITLES - the subtitles transcribed for this scene.
Generate detailed information for this scene for TASK - instructions on what exactly to capture.
Use both the image and the SUBTITLES to infer the information.
If SUBTITLES is not provided, infer the information only from the keyframe image.
If the TASK cannot be completed, then return "NONE".
""".strip()

# Caption every keyframe and write the results back into the CSV.
csv_filepath = caption_images(
    model=model,
    base_prompt=prompt,
    tasks=task_list,
    directory=directory,
    csv_file=csv_filepath,
    filename_column=filename_column,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

configuration_moondream.py:   0%|          | 0.00/3.41k [00:00<?, ?B/s]

moondream.py:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

vision_encoder.py:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

modeling_phi.py:   0%|          | 0.00/48.0k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.73G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
