In [1]:
import os
import numpy as np
from glob import glob
from PIL import Image
import random
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import io

def display_frames_grid(frames, title="Video Sample Frames", fps=16):
    """
    Display video frames in a roughly square grid of subplots.

    The grid side is ``ceil(sqrt(number of frames))``; unused cells are hidden.

    Args:
        frames (np.ndarray or sequence): Frames indexed along the first axis.
            Each ``frames[i]`` is handed to ``imshow`` (presumably an
            (H, W, C) image - confirm against the caller).
        title (str): Title of the grid.
        fps (int): Not used by this function; kept so existing calls that
            pass it keep working.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    num_frames = len(frames)
    if num_frames == 0:
        raise ValueError("display_frames_grid needs at least one frame")

    grid_size = int(np.ceil(np.sqrt(num_frames)))  # Approximate square grid

    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid; without it a
    # single frame yields a bare Axes object that has no `.flat`.
    fig, axes = plt.subplots(grid_size, grid_size, figsize=(10, 10), squeeze=False)
    fig.suptitle(title, fontsize=16)

    for i, ax in enumerate(axes.flat):
        if i < num_frames:
            ax.imshow(frames[i])
            ax.axis('off')
        else:
            # Hide the leftover cells of the square grid.
            ax.set_visible(False)

    plt.show()

def create_gif_from_frames(frames, title, fps=1):
    """
    Write the frames to an animated, endlessly looping GIF in the current directory.

    Args:
        frames: Non-empty sequence of frames, each accepted by ``Image.fromarray``.
        title (str): File stem; the GIF is saved as ``./{title}.gif``.
        fps (int or float): Playback rate. Each frame is shown for
            ``int(1000 / fps)`` milliseconds.

    Returns:
        str: Path of the GIF file that was written.

    Raises:
        ValueError: If ``frames`` is empty or ``fps`` is not positive.
    """
    # Validate before doing any work so bad input fails with a clear message
    # instead of an IndexError / ZeroDivisionError further down.
    if len(frames) == 0:
        raise ValueError("create_gif_from_frames needs at least one frame")
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps}")

    pil_images = [Image.fromarray(frame) for frame in frames]
    gif_path = f"./{title}.gif"
    first_image, remaining_images = pil_images[0], pil_images[1:]
    first_image.save(
        gif_path,
        save_all=True,
        append_images=remaining_images,
        duration=int(1000 / fps),
        loop=0,
    )
    return gif_path



class VideoFrameDataset(Dataset):
    """
    Dataset of video clips stored as one JPEG per frame under
    ``root_dir/<activity>/<camera>/<subject>_<session>_<...>.jpg``.

    Each item is one (activity, camera, subject, session) clip reduced to
    ``sequence_length`` frames.
    """

    def __init__(self, root_dir, sequence_length=16):
        """
        Args:
            root_dir (str): Root directory where the dataset is stored.
            sequence_length (int): Number of frames returned per clip.

        Raises:
            ValueError: If ``sequence_length`` is smaller than 1.
        """
        if sequence_length < 1:
            raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")
        self.root_dir = root_dir
        self.sequence_length = sequence_length
        self.video_samples = self._get_all_video_samples()

    def _get_all_video_samples(self):
        """
        Scan ``root_dir`` and list every clip found.

        Directory entries and subject/session pairs are sorted so that a given
        index refers to the same clip on every run (``os.listdir`` and set
        iteration order are otherwise arbitrary).

        Returns:
            list of tuples: Each tuple contains (activity, camera, subject_id, session_id)
        """
        video_samples = []
        for activity in sorted(os.listdir(self.root_dir)):  # Iterate over activities
            activity_path = os.path.join(self.root_dir, activity)
            if not os.path.isdir(activity_path):
                continue  # Skip non-folder items

            for camera in sorted(os.listdir(activity_path)):  # Iterate over cameras
                camera_path = os.path.join(activity_path, camera)
                if not os.path.isdir(camera_path):
                    continue

                # Find unique subject-session pairs in this camera folder
                subject_sessions = set()
                for frame_path in glob(os.path.join(camera_path, "*.jpg")):
                    parts = os.path.basename(frame_path).split("_")
                    # Files with fewer than three "_"-separated parts do not
                    # follow the <subject>_<session>_<...> pattern and are ignored.
                    if len(parts) >= 3:
                        subject_sessions.add((parts[0], parts[1]))

                # Add all (Activity, Camera, Subject, Session) combinations
                for subject_id, session_id in sorted(subject_sessions):
                    video_samples.append((activity, camera, subject_id, session_id))

        return video_samples

    def _get_frames_from_video_sample(self, activity, camera, subject_id, session_id):
        """
        Collect the frame files that belong to one clip.

        Returns:
            list: Sorted list of frame file paths.
        """
        video_sample_path = os.path.join(self.root_dir, activity, camera)
        pattern = os.path.join(video_sample_path, f"{subject_id}_{session_id}_*.jpg")
        # NOTE(review): the sort is lexicographic, so it matches temporal order
        # only if the frame index in the file name is zero-padded - confirm.
        return sorted(glob(pattern))

    def _select_uniform_frames(self, frames):
        """
        Reduce (or pad) a clip to exactly ``self.sequence_length`` frames.

        Short clips are padded by repeating the last frame. Longer clips are
        divided into ``sequence_length`` equal segments that span the whole
        clip, and one frame is taken from each segment at a shared random
        offset. When the clip length is an exact multiple of
        ``sequence_length`` this is a plain fixed-stride selection.

        Args:
            frames (list): Non-empty list of frame paths in temporal order.
                The list is not modified.

        Returns:
            list: Selected frame file paths, in the order they appear in ``frames``.

        Raises:
            ValueError: If ``frames`` is empty.
        """
        total = len(frames)
        wanted = self.sequence_length
        if total == 0:
            raise ValueError("frames must not be empty")

        if total < wanted:
            # Build a new list instead of `+=` so the caller's list is untouched.
            return list(frames) + [frames[-1]] * (wanted - total)

        # Width of the narrowest segment; also the range of the random offset,
        # which guarantees start + offset never leaves its own segment.
        step = max(total // wanted, 1)
        offset = random.randint(0, step - 1) if step > 1 else 0
        # Segment i starts at floor(i * total / wanted), so the last pick lands
        # near the end of the clip instead of at index wanted * step - 1.
        return [frames[(i * total) // wanted + offset] for i in range(wanted)]

    def __len__(self):
        """
        Returns the total number of video samples in the dataset.
        """
        return len(self.video_samples)

    def __getitem__(self, idx):
        """
        Load one clip as a stacked array of RGB frames.

        Returns:
            np.ndarray: Stacked array of selected frames (frame index first).
            str: Corresponding activity label.
            str: Camera folder name.
            tuple: (subject_id, session_id) for reference.

        Raises:
            ValueError: If no frame files are found for the clip.
        """
        activity, camera, subject_id, session_id = self.video_samples[idx]
        frames = self._get_frames_from_video_sample(activity, camera, subject_id, session_id)

        if not frames:
            raise ValueError(f"No frames found for {activity}/{camera}/{subject_id}_{session_id}")

        selected_frames = self._select_uniform_frames(frames)

        # Load and stack frames as NumPy arrays
        frame_arrays = []
        for frame_path in selected_frames:
            # The context manager releases the file handle once the pixels
            # have been copied into the array.
            with Image.open(frame_path) as img:
                frame_arrays.append(np.array(img.convert('RGB')))
        return np.stack(frame_arrays), activity, camera, (subject_id, session_id)
# Example usage:
# dataset = VideoFrameDataset(root_dir="/mnt/Data1/RGB_sd", sequence_length=16)
# print(len(dataset))
# frames, activity, camera, (subject_id, session_id) = dataset[0]
# gif_path = create_gif_from_frames(frames, f"{activity}_{camera}_{subject_id}_{session_id}")


# Install Libraries and Load the Model

In [2]:
!pip install --upgrade -q accelerate bitsandbytes
!pip install git+https://github.com/huggingface/transformers.git
# we need av to be able to read the video
!pip install -q av sentencepiece protobuf


Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-ajq6csxu
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-ajq6csxu
  Resolved https://github.com/huggingface/transformers.git to commit c8a2b25f915a7745d57c92635415e2517b739bc8
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [3]:
!nvidia-smi

import torch
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)


Mon Mar 17 11:55:56 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.06             Driver Version: 570.124.06     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3060        On  |   00000000:04:00.0  On |                  N/A |
| 32%   27C    P8             10W /  170W |     826MiB /  12288MiB |      9%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                     

In [4]:
from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
import torch

# One checkpoint id shared by the processor and the model so the two always match.
MODEL_ID = "llava-hf/LLaVA-NeXT-Video-7B-hf"

# Load the weights in 4-bit and run the compute in float16.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)

processor = LlavaNextVideoProcessor.from_pretrained(MODEL_ID)
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map='auto',
    quantization_config=quantization_config,
)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
from torchvision.transforms import ToPILImage

def generate_answer(instance, question, processor, model, max_new_tokens=100):
    """
    Ask the model one question about one dataset sample.

    Args:
        instance: Sample tuple ``(frames, activity, camera, (subject_id, session_id))``.
            Only ``frames`` is used.
        question (str): Question text inserted into the prompt.
        processor: Called with ``videos=`` / ``text=`` to build the model
            inputs, and used again for ``batch_decode``.
        model: Object providing ``device`` and ``generate``.
        max_new_tokens (int): Forwarded to ``model.generate``.

    Returns:
        str: The first decoded sequence, with special tokens skipped. Callers
        in this notebook split it on "ASSISTANT:" to isolate the reply.
    """
    # Only the frames are needed; the full unpacking still enforces the tuple layout.
    frames, _activity, _camera, (_subject_id, _session_id) = instance

    # The processor is given PIL images, so convert every frame first.
    pil_converter = ToPILImage()
    clip = list(map(pil_converter, frames))

    prompt = f"USER: <video>\nQuestion: {question}\nASSISTANT:"

    # Both the prompt and the clip are wrapped in a list (a batch of one).
    batch = processor(videos=[clip], text=[prompt], padding=True, return_tensors="pt")
    batch = batch.to(model.device)

    generated_ids = model.generate(**batch, max_new_tokens=max_new_tokens)
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

In [28]:
# Smoke test: build the dataset and ask one question about the first clip.
dataset = VideoFrameDataset(root_dir="/mnt/Data1/RGB_sd", sequence_length=16)
print(len(dataset))
sample = dataset[0]  # This should be a tuple: (frames, activity, camera, (subject_id, session_id))
question = "What is the person doing in this video?"
answer = generate_answer(sample, question, processor, model)
# Keep everything after the FIRST "ASSISTANT:" marker so a reply that repeats
# the marker is not truncated; if the marker is missing, show the full text
# instead of failing with an IndexError.
_, marker, reply = answer.partition("ASSISTANT:")
print("Model Answer:", reply if marker else answer)

885
Model Answer:  The person in the video is standing in a kitchen and appears to be using a smartphone or tablet to look up information or possibly browse the internet. They are holding the device in their hands and seem to be focused on the screen. The kitchen is equipped with various appliances and counters, suggesting a domestic or commercial setting where food preparation might take place.


In [1]:
import csv
import os

def generate_and_save_answers(dataset, question, processor, model, csv_filename="output_answers.csv", max_new_tokens=100):
    """
    Loops over all dataset instances, generates answers for the given question using the provided
    processor and model, and saves the results incrementally to a CSV file.

    Rows are appended, so running this twice against the same file adds the
    samples twice. A sample that raises is reported and skipped.

    Args:
        dataset: Indexable object with ``len()`` whose items are
            ``(frames, activity, camera, (subject_id, session_id))``.
        question (str): Question asked about every sample.
        processor: Forwarded to ``generate_answer``.
        model: Forwarded to ``generate_answer``.
        csv_filename (str): Output CSV path (opened in append mode).
        max_new_tokens (int): Forwarded to ``generate_answer``.

    Returns:
        str: ``csv_filename``.
    """
    fieldnames = ["activity", "camera", "subject_id", "session_id", "question", "answer"]

    # A header is needed for a brand-new file and also for a file that exists
    # but is empty (e.g. left behind by an earlier run that failed early).
    needs_header = not os.path.exists(csv_filename) or os.path.getsize(csv_filename) == 0

    with open(csv_filename, mode="a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()

        for idx in range(len(dataset)):
            try:
                instance = dataset[idx]
                answer = generate_answer(instance, question, processor, model, max_new_tokens=max_new_tokens)

                # Keep only the part after the FIRST "ASSISTANT:" so a reply
                # that repeats the marker is not cut short; if the marker is
                # absent, keep the whole text rather than dropping the sample.
                _, marker, tail = answer.partition("ASSISTANT:")
                cleaned_answer = tail if marker else answer

                # Unpack metadata from the instance (frames, activity, camera, (subject_id, session_id))
                _, activity, camera, (subject_id, session_id) = instance

                # Build the output dictionary
                output_entry = {
                    "activity": activity,
                    "camera": camera,
                    "subject_id": subject_id,
                    "session_id": session_id,
                    "question": question,
                    "answer": cleaned_answer
                }

                # Write the entry immediately to avoid data loss
                writer.writerow(output_entry)
                f.flush()  # Push the row out of Python's buffer right away
                print(f"Processed sample {idx}: {output_entry}")

            except Exception as e:
                # Deliberate best-effort: one bad sample must not stop the run.
                print(f"Error processing sample {idx}: {e}")
                continue  # Skip to the next sample instead of stopping

    print("Completed processing dataset.")
    return csv_filename


In [30]:
# Full pass over the dataset; rows are appended to the CSV as they are produced,
# and the cell displays the returned file name.
generate_and_save_answers(
    dataset,
    question,
    processor,
    model,
    csv_filename="output_answers.csv",
    max_new_tokens=100,
)

Processed sample 0: {'activity': 'Cleaning the kitchen', 'camera': 'camera_2_fps_15', 'subject_id': '05', 'session_id': '3', 'question': 'What is the person doing in this video?', 'answer': ' The person in the video is cleaning a kitchen counter. They are using a sponge and a cloth to wipe down the counter, likely to remove dirt, stains, or food particles from the surface.'}
Processed sample 1: {'activity': 'Cleaning the kitchen', 'camera': 'camera_2_fps_15', 'subject_id': '06', 'session_id': '3', 'question': 'What is the person doing in this video?', 'answer': ' The person in the video is cleaning a kitchen counter. They are using a mop to wipe down the counter, and there is a laptop on the counter that is displaying a graphical user interface with a blue background and a white bar at the bottom. The person is wearing a white shirt and appears to be focused on their task.'}
Processed sample 2: {'activity': 'Cleaning the kitchen', 'camera': 'camera_2_fps_15', 'subject_id': '09', 'sessi

'output_answers.csv'