In [1]:
import os
import json

import torch
import numpy as np
from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
import PIL
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cuda


In [3]:
def get_dataset(root_dir_data: str, split: str, sample: int = None):
    '''
    Build an in-memory dataset of frames and captions for one split.

    Video ids are read from ``<root_dir_data>/<split>.json``. For every id, frames are
    loaded from ``<root_dir_data>/<split>/images/<id>/`` and captions from
    ``<root_dir_data>/<split>/metas/<id>.json``. A video is skipped when its frame
    directory is missing or yields no frames, or when its metadata file is missing.

    Frame tensors are moved to the module-level ``device``.

    Args:
        root_dir_data (str): Root directory of the dataset.
        split (str): Split of the dataset.
        sample (int): Number of frames to sample from each video, evenly spaced over
            the ordered frame list. ``None`` keeps every frame.

    Returns:
        Dict[str, Dict[str, Any]]: Maps each loaded video id to
            ``{'captions': Dict[str, str], 'images': torch.Tensor}``.

    Raises:
        AssertionError: If ``<root_dir_data>/<split>.json`` does not exist.
    '''
    # Local imports: a bare `import PIL` does not guarantee that the `PIL.Image`
    # submodule is loaded, and `re` is only needed for frame ordering.
    import re
    from PIL import Image

    # Caption fields copied out of meta['original_metadata'].
    caption_keys = ('frame_caption', 'music_caption', 'caption', 'polish_caption')

    def natural_key(name):
        '''
        Sort key that orders embedded numbers numerically ('f2' before 'f10').

        re.split with a capturing group alternates text / digits / text ..., so odd
        positions are always digit runs and keys stay comparable element by element.
        '''
        parts = re.split(r'(\d+)', name)
        return [int(part) if pos % 2 else part for pos, part in enumerate(parts)]

    def get_captions_meta(root_dir_data, split, video_id):
        '''
        Get captions from the metadata file.

        Args:
            root_dir_data (str): Root directory of the dataset.
            split (str): Split of the dataset.
            video_id (str): Video id.

        Returns:
            Dict[str, str]: Dictionary of captions, or None if the metadata file
                does not exist.
        '''
        meta_file = os.path.join(root_dir_data, split, 'metas', f'{video_id}.json')
        if not os.path.exists(meta_file):
            return None
        with open(meta_file, 'r') as f:
            original_metadata = json.load(f)['original_metadata']

        captions = {key: original_metadata[key] for key in caption_keys}
        print("Successfully loaded captions, id: ", video_id)
        return captions

    def load_frames(root_dir_data, split, video_id, sample=None):
        '''
        Load the frames of a video in filename order.

        Args:
            root_dir_data (str): Root directory of the dataset.
            split (str): Split of the dataset.
            video_id (str): Video id.
            sample (int): Number of evenly spaced frames to keep. None keeps all.

        Returns:
            torch.Tensor: Frames of shape (num_frames, height, width, 3) on `device`,
                or None if the frame directory is missing or no frame is selected.
        '''
        images_dir = os.path.join(root_dir_data, split, 'images', video_id)
        if not os.path.isdir(images_dir):
            return None

        # os.listdir order is arbitrary; sort so frames keep their temporal order.
        frame_files = sorted(os.listdir(images_dir), key=natural_key)
        if len(frame_files) == 0:
            return None

        if sample is not None:
            # Pick the evenly spaced frames first so only those are decoded.
            indices = np.linspace(0, len(frame_files) - 1, sample).astype(int)
            frame_files = [frame_files[i] for i in indices]
            if len(frame_files) == 0:
                return None

        frames = []
        for image_file in frame_files:
            # The context manager closes the file; np.array forces the decode
            # before that happens. RGB guarantees the trailing dimension of 3.
            with Image.open(os.path.join(images_dir, image_file)) as image:
                frames.append(np.array(image.convert('RGB')))

        # All frames of a video must share one size, otherwise np.stack raises.
        frames = np.stack(frames)

        # put to device
        frames = torch.tensor(frames).to(device)

        print("Successfully loaded frames, id: ", video_id)
        return frames

    # Get all video ids from <split>.json
    id_file = os.path.join(root_dir_data, f'{split}.json')
    assert os.path.exists(id_file), f'{id_file} does not exist'
    with open(id_file, 'r') as f:
        video_ids = json.load(f)

    dataset = {}
    print(f'Loading {len(video_ids)} videos')
    for video_id in tqdm.tqdm(video_ids):
        images = load_frames(root_dir_data, split, video_id, sample=sample)
        if images is None:
            continue

        captions = get_captions_meta(root_dir_data, split, video_id)
        # TODO: video and audio
        if captions is not None:
            dataset[video_id] = {
                'captions': captions,
                'images': images
            }

    print(f'Loaded {len(dataset)} videos')

    return dataset

In [4]:
def load_llava_next_model(model_path: str):
    '''
    Load a 4-bit quantized LLaVA-NeXT-Video model together with its processor.

    Args:
        model_path (str): Path to the model.

    Returns:
        Tuple[LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor]:
            The quantized model and the matching processor, in that order.
    '''
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )

    processor = LlavaNextVideoProcessor.from_pretrained(model_path)
    model = LlavaNextVideoForConditionalGeneration.from_pretrained(
        model_path,
        quantization_config=quantization_config,
        device_map='auto'
    )

    # Deliberately no `model.to(device)`: device_map='auto' already places the
    # weights, and calling `.to()` on a bitsandbytes-quantized model can raise
    # depending on the installed transformers / bitsandbytes versions.
    return model, processor

In [20]:
def generate_caption_video(model, processor, dataset, generate_kwargs: dict = None):
    '''
    Generate captions for videos in the given dataset.

    Every video is prompted with "Describe the video." through the processor's
    chat template, and only the newly generated tokens are decoded as the caption.

    Args:
        model (torch.nn.Module): Model.
        processor (LlavaNextVideoProcessor): Processor.
        dataset (Dict[str, Dict[str, Any]]): Dataset; each value must hold the
            video frames under the 'images' key.
        generate_kwargs (dict): Optional overrides merged over the default
            generation settings (max_new_tokens=200, do_sample=True, top_p=0.9).

    Returns:
        Dict[str, str]: Dictionary of captions keyed by video id.
    '''

    conversation_video = [
        {
            "role": "user",
            "content": [
                    {"type": "text", "text": "Describe the video."},
                    {"type": "video"},
                ],
        },
    ]

    prompt = processor.apply_chat_template(conversation_video, add_generation_prompt=True)
    print(prompt)

    effective_kwargs = {"max_new_tokens": 200, "do_sample": True, "top_p": 0.9}
    if generate_kwargs:
        effective_kwargs.update(generate_kwargs)

    outputs = {}

    for video_id, entry in dataset.items():
        inputs = processor([prompt], videos=[entry['images']], padding=True, return_tensors="pt").to(model.device)
        output = model.generate(**inputs, **effective_kwargs)

        # generate() returns the prompt tokens followed by the new tokens. Slicing
        # off the prompt is safer than splitting the decoded text on 'ASSISTANT: ':
        # that returned the whole prompt when nothing was generated and cut the
        # caption when the reply itself contained the marker.
        prompt_length = inputs["input_ids"].shape[1]
        generated_text = processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
        print(generated_text)
        outputs[video_id] = generated_text

    return outputs

In [7]:
# Run configuration: dataset root, the split to read, and frames sampled per video.
root_dir_data = '/home/saberwu2002/disk-data/data/MMTrail_processed'
split, sample = 'test', 30
dataset = get_dataset(root_dir_data=root_dir_data, split=split, sample=sample)

Loading 80 videos


  5%|▌         | 4/80 [00:03<01:14,  1.02it/s]

Successfully loaded frames, id:  0erDutDPHxc
Successfully loaded captions, id:  0erDutDPHxc


 30%|███       | 24/80 [00:07<00:15,  3.69it/s]

Successfully loaded frames, id:  -2x2NMwBDzE
Successfully loaded captions, id:  -2x2NMwBDzE


 46%|████▋     | 37/80 [00:11<00:13,  3.29it/s]

Successfully loaded frames, id:  _mCzE_THQaQ
Successfully loaded captions, id:  _mCzE_THQaQ


 61%|██████▏   | 49/80 [00:15<00:09,  3.36it/s]

Successfully loaded frames, id:  1tqn9d_w3qo
Successfully loaded captions, id:  1tqn9d_w3qo


 64%|██████▍   | 51/80 [00:35<00:31,  1.09s/it]

Successfully loaded frames, id:  1eTqgXxDfvA
Successfully loaded captions, id:  1eTqgXxDfvA


100%|██████████| 80/80 [01:00<00:00,  1.33it/s]

Successfully loaded frames, id:  0CkL_9X3rS8
Successfully loaded captions, id:  0CkL_9X3rS8
Loaded 6 videos





In [8]:
# Load the LLaVA-NeXT-Video checkpoint and its processor from local disk.
model_path = '/home/saberwu2002/disk-data/checkpoints/llava-next-video-7b-hf'
model, processor = load_llava_next_model(model_path=model_path)

Loading checkpoint shards: 100%|██████████| 3/3 [00:16<00:00,  5.66s/it]


In [21]:
# Caption every video currently held in `dataset`.
captions = generate_caption_video(model=model, processor=processor, dataset=dataset)

USER: <video>
Describe the video. ASSISTANT:


The video shows a man standing in front of a band, wearing a suit and a blue shirt, holding a baton which is pointed towards the musicians. The man is using a pair of binoculars, presumably to watch a performer who is not visible in the shot. The setting appears to be a band performance, with the audience watching the man on stage while the performer continues with the music, probably playing an instrument that's not visible in the shot.
The video shows a woman with long hair and adorned with multiple necklaces and bangles, wearing a blue sari with a gold border, standing in front of a brightly lit background with vibrant colors. She appears to be an actress in a South Asian context, perhaps from a television show given her attire and the environment. She is engaged in a serious conversation with someone off-camera, her expression indicating concern or a thoughtful demeanor, as she listens intently. Another person, not in full view, is seated or standing behind her, contributing to the

In [22]:
# Persist the generated captions as JSON in the current working directory.
with open('captions.json', mode='w') as captions_fp:
    json.dump(obj=captions, fp=captions_fp)

### Evaluation

In [23]:
import json
import os
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sentence_transformers import SentenceTransformer, util

In [24]:
def compute_bleu_score(reference_summary, candidate_summary):
    """Sentence-level BLEU of a candidate text against a single reference.

    Both texts are lower-cased and tokenized with ``nltk.word_tokenize``. The score
    comes from ``sentence_bleu`` with ``SmoothingFunction().method1`` smoothing.

    Args:
        reference_summary (str): Reference text.
        candidate_summary (str): Text being scored.

    Returns:
        The value returned by ``sentence_bleu``.
    """

    def _tokenize(text):
        # Case-insensitive comparison: normalise before tokenizing.
        return nltk.word_tokenize(text.lower())

    return sentence_bleu(
        [_tokenize(reference_summary)],
        _tokenize(candidate_summary),
        smoothing_function=SmoothingFunction().method1,
    )

def compute_similarity(
    reference_summary,
    candidate_summary,
    model_path='/home/saberwu2002/disk-data/checkpoints/sentence-transformers_all-MiniLM-L6-v2',
):
    """Cosine similarity between the sentence embeddings of two texts.

    The SentenceTransformer is loaded once per ``model_path`` and reused on later
    calls. Reloading it for every pair made per-video evaluation needlessly slow.
    The cached model stays in memory for as long as this function object lives.

    Args:
        reference_summary (str): Reference text.
        candidate_summary (str): Text compared against the reference.
        model_path (str): Location of the SentenceTransformer checkpoint.

    Returns:
        float: ``util.cos_sim`` of the two embeddings, as a Python scalar.
    """
    # Per-path cache stored on the function itself, so no extra import or
    # module-level global is needed.
    model_cache = compute_similarity.__dict__.setdefault('_model_cache', {})
    if model_path not in model_cache:
        model_cache[model_path] = SentenceTransformer(model_path)
    model = model_cache[model_path]

    embedding1 = model.encode(reference_summary, convert_to_tensor=True)
    embedding2 = model.encode(candidate_summary, convert_to_tensor=True)
    similarity = util.cos_sim(embedding1, embedding2)
    return similarity.item()

In [25]:
def load_metadata_for_video(metadata_folder, video_filename):
    """Return the reference caption stored in a video's metadata JSON.

    The metadata file is ``<metadata_folder>/<video_filename without extension>.json``.
    The value returned is its ``["original_metadata"]["caption"]`` entry, or ``None``
    when that file does not exist.
    """
    stem, _extension = os.path.splitext(video_filename)
    metadata_path = os.path.join(metadata_folder, stem + ".json")

    if not os.path.exists(metadata_path):
        return None

    with open(metadata_path, 'r') as handle:
        payload = json.load(handle)
    return payload["original_metadata"]["caption"]

def load_generated_titles(output_json):
    """Load generated titles from a JSON file mapping video name to caption.

    Args:
        output_json (str): Path of the JSON file holding the generated captions.

    Returns:
        dict: A new dict with the same key/value pairs as the JSON object.
    """
    # Use a context manager so the file handle is closed deterministically;
    # the previous `json.load(open(...))` left it to the garbage collector.
    with open(output_json, 'r') as file:
        captions = json.load(file)
    return dict(captions.items())

In [26]:
def evaluate_titles(metadata_file, output_file):
    """Compare generated titles with reference titles and compute BLEU and similarity scores.

    Args:
        metadata_file (str): Folder holding the per-video metadata JSON files
            (passed straight to ``load_metadata_for_video``).
        output_file (str): JSON file with the generated titles.

    Returns:
        dict: Maps each video name to its reference title, generated title and
            scores. When no reference is available the scores are ``None`` and
            an ``"error"`` entry is added.
    """
    generated_titles = load_generated_titles(output_file)

    results = {}

    for video_filename, generated_title in generated_titles.items():
        # Use the argument, not the notebook-level `metadata_folder` global the
        # previous version silently depended on.
        reference_title = load_metadata_for_video(metadata_file, video_filename)

        # A missing metadata file (None) and an empty caption are both reported
        # as "not found", matching the established output format.
        if not reference_title:
            results[video_filename] = {
                "reference_title": None,
                "generated_title": generated_title,
                "bleu_score": None,
                "similarity_score": None,
                "error": "Metadata file not found"
            }
            continue

        bleu_score = compute_bleu_score(reference_title, generated_title)
        similarity_score = compute_similarity(reference_title, generated_title)

        results[video_filename] = {
            "reference_title": reference_title,
            "generated_title": generated_title,
            "bleu_score": bleu_score,
            "similarity_score": similarity_score
        }
    return results

In [None]:
def save_results(results, output_json):
    """Write the evaluation results to ``output_json`` as JSON indented by 4 spaces."""
    with open(output_json, mode='w') as out_fp:
        json.dump(obj=results, fp=out_fp, indent=4)

In [29]:
# Inputs: per-video metadata folder and the generated captions; output: the results file.
metadata_folder = "/home/saberwu2002/disk-data/data/MMTrail_processed/test/metas"
output_file = "/home/saberwu2002/CS229-Project/benchmark/llava-next-caption/captions.json"
output_json = "results.json"

# Score every generated caption against its reference, then persist the scores.
evaluation_results = evaluate_titles(metadata_file=metadata_folder, output_file=output_file)
save_results(results=evaluation_results, output_json=output_json)

print(f"Evaluation completed. Results saved to {output_json}")

Evaluation completed. Results saved to results.json
