# In [35]:
import os
import time
import json
import glob
import moviepy.editor as mp
import re

import google.generativeai as genai
from google.genai.types import HttpOptions, Part


# Environment configuration, applied at import time. The first three values are
# placeholders that must be replaced with real settings before running.
# NOTE(review): presumably these are read by the Google client libraries
# imported above -- confirm which SDK actually consumes each variable.
os.environ.update(
    {
        "GOOGLE_API_KEY": "<TODO>",
        "GOOGLE_CLOUD_PROJECT": "<TODO>",
        "GOOGLE_CLOUD_LOCATION": "<YOUR_LOCATION>",
        "GOOGLE_GENAI_USE_VERTEXAI": "True",
    }
)

def split_video_into_clips(video_path, output_dir, clip_duration=15):
    """Split a video into consecutive fixed-length clips saved as MP4 files.

    Clips are written to ``output_dir`` as ``clip_<start>_<end>.mp4``, where
    ``<start>`` and ``<end>`` are the clip boundaries in whole seconds,
    zero-padded to five digits. Video is encoded with libx264 and audio with
    aac.

    Args:
        video_path (str): Path to the input video file.
        output_dir (str): Directory to write the clips to; created if missing.
        clip_duration (int, optional): Length of each clip in seconds.
            Defaults to 15. It is used as the ``range`` step, so it must be a
            non-zero integer.

    Returns:
        list: Paths of the written clips, in chronological order.

    Note:
        The video duration is truncated to whole seconds, so a fractional
        tail shorter than one second is not exported.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    video = mp.VideoFileClip(video_path)
    try:
        video_duration = int(video.duration)

        clip_paths = []

        for start_time in range(0, video_duration, clip_duration):
            # The last clip may be shorter than clip_duration.
            end_time = min(start_time + clip_duration, video_duration)
            clip = video.subclip(start_time, end_time)

            clip_filename = os.path.join(output_dir, f"clip_{start_time:05d}_{end_time:05d}.mp4")
            clip.write_videofile(clip_filename, codec="libx264", audio_codec="aac", verbose=False, logger=None)

            clip_paths.append(clip_filename)
    finally:
        # Release the source file's reader resources even if encoding fails.
        video.close()

    return clip_paths

def retrieve_clips(clips_dir, extension="mp4"):
    """Return the sorted paths of all files in a directory with an extension.

    Args:
        clips_dir (str): Directory to search (not recursive).
        extension (str, optional): File extension without the leading dot.
            Defaults to "mp4".

    Returns:
        list: Matching paths sorted lexicographically; empty if nothing
        matches or the directory does not exist.
    """
    # Escape the directory so glob metacharacters in its name ("[", "]", "*",
    # "?") are matched literally instead of being read as a pattern.
    pattern = os.path.join(glob.escape(clips_dir), f"*.{extension}")
    return sorted(glob.glob(pattern))

def generate_caption_from_video(model, clip_path):
    """Upload a video clip and ask the model to describe it.

    The clip is uploaded as "video/mp4", then its state is polled every
    5 seconds while it is "PROCESSING". Once processing ends, the file is
    sent to the model together with a fixed description prompt.

    Args:
        model: Object exposing ``generate_content``; the caller in this file
            passes a ``genai.GenerativeModel``.
        clip_path (str): Path to the clip to caption.

    Returns:
        str: The text of the model's response.

    Raises:
        RuntimeError: If the uploaded file ends up in the "FAILED" state.
    """
    media = genai.upload_file(path=clip_path, mime_type="video/mp4")

    while media.state.name == "PROCESSING":
        print("processing video...")
        time.sleep(5)  # check periodically
        media = genai.get_file(media.name)

    # Fail early with a clear message instead of sending an unusable file
    # to the model.
    if media.state.name == "FAILED":
        raise RuntimeError(f"Video processing failed for {clip_path}")

    response = model.generate_content([
        media,
        "Describe the video in detail."
    ])

    return response.text


def _parse_clip_times(clip_path, default_start=0, default_end=15):
    """Return (start, end) seconds parsed from a ``<prefix>_<start>_<end>.mp4`` name.

    Falls back to ``(default_start, default_end)`` when the filename does not
    have exactly three underscore-separated parts or the times are not integers.
    """
    stem = os.path.basename(clip_path).replace(".mp4", "")
    parts = stem.split("_")
    if len(parts) != 3:
        return default_start, default_end
    try:
        return int(parts[1]), int(parts[2])
    except ValueError:
        # Three-part name whose time fields are not numeric, e.g. "my_video_clip".
        return default_start, default_end


def process_clips(video_file_path, clips_dir, output_json_path, split_video=False):
    """Caption every clip in a directory with Gemini and save the results as JSON.

    Args:
        video_file_path (str): Path to the input video file. Its base name is
            stored in the output; the file itself is only read when
            ``split_video`` is True.
        clips_dir (str): Directory containing the ``.mp4`` video clips.
        output_json_path (str): Path to save the output JSON file. Its parent
            directory is created if it does not exist.
        split_video (bool, optional): Whether to first split the video into
            clips inside ``clips_dir``. Defaults to False.

    Returns:
        list: One dict per clip, in sorted filename order, with keys
        "timestamp" (``"<start>~<end>"``, zero-padded seconds) and "caption".

    The function:
    1. Optionally splits the video into clips
    2. Initializes the Gemini model
    3. Gets the list of clips from clips_dir
    4. For each clip:
        - Extracts the timestamp from the filename
        - Generates a caption using Gemini
        - Adds the caption and timestamp to the results
    5. Saves all results to the JSON file
    """
    if split_video:
        split_video_into_clips(video_file_path, clips_dir)

    # Create the output directory up front so a missing directory cannot make
    # the final write fail after every clip has already been captioned.
    output_dir = os.path.dirname(output_json_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    model = genai.GenerativeModel('gemini-2.0-flash-001')  # or 'gemini-2.0-pro-exp'

    captions = []
    for clip_path in retrieve_clips(clips_dir):
        start_time, end_time = _parse_clip_times(clip_path)

        print(f"Captioning {clip_path}...")
        caption = generate_caption_from_video(model, clip_path)
        print(caption)

        captions.append({
            "timestamp": f"{start_time:05d}~{end_time:05d}",
            "caption": caption,
        })

    result_json = {
        "video_file_path": os.path.basename(video_file_path),
        "captions": captions,
    }

    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(result_json, f, ensure_ascii=False, indent=4)

    print(f"Saved results to {output_json_path}")

    return captions



# In [None]:
if __name__ == "__main__":

    # Where the caption JSON is written, where the pre-cut clips are read
    # from, and which source video those clips belong to.
    output_json_path = "data/4.json"
    clips_dir = "clips"
    video_file_path = "dataset/MovieChat-1K-test/videos/4.mp4"

    # Step 1 (disabled): cut the source video into clips before captioning.
    # split_video_into_clips(video_file_path, output_dir=clips_dir)

    # Step 2: caption every clip found in clips_dir and save the JSON file.
    results = process_clips(video_file_path, clips_dir, output_json_path, split_video=False)

    print(results)
