In [None]:
import os
# Restrict this kernel to physical GPUs 2 and 3. This is set in the first cell,
# before any CUDA-using library is imported, so that it is in place when CUDA
# initialises. Visible devices are re-indexed from 0 inside the process.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"

## Split The Video

Split the video into separate clips using [`PySceneDetect`](https://www.scenedetect.com/).

In [None]:
# Source video; consumed by the (commented-out) scenedetect command in the next cell.
video_file = 'how_cuda_programming_works_gtc2022.mp4'

In [None]:
# One-off step: uncomment to split `video_file` into per-scene clips written to
# ./video_clips. Later cells read the .mp4 files from that folder, so it must
# already exist when this line stays commented out.
# !scenedetect -i "$video_file" split-video -o video_clips

In [None]:
from video2doc.video_understanding import VideoUnderstandingWithAria, load_video, fix_json, timestamp_to_seconds, get_frame_at_timestamp, split_video_into_scenes
from video2doc.whisper import Whisper, extract_audio, merge_chunks_by_timerange

In [None]:
# Notebook-level model objects; process_video_to_doc reads both as globals.
# NOTE(review): with CUDA_VISIBLE_DEVICES="2,3" set in the first cell, 'cuda:0'
# should map to physical GPU 2 — confirm on the target machine.
whisper = Whisper(device='cuda:0')
# NOTE(review): 'rhymes-ai/Aria' looks like a model identifier resolved by
# VideoUnderstandingWithAria — confirm where the weights are loaded from.
video_understanding_with_aria = VideoUnderstandingWithAria('rhymes-ai/Aria')

In [None]:
import json
def process_video_to_doc(video_file: str, video_name: str, output_folder: str):
    """Transcribe a video, split it into scenes and append them to a markdown file.

    For every scene returned by ``split_video_into_scenes`` one frame is saved as
    ``<video_name>_scene_<n>.jpg`` inside ``output_folder``, and the scene's
    transcription followed by a link to that image is appended to
    ``<output_folder>/readme.md``.

    The function relies on the notebook-level ``whisper`` and
    ``video_understanding_with_aria`` objects being defined.

    Args:
        video_file: Path of the video to process.
        video_name: Prefix used for the saved frame image file names.
        output_folder: Directory that receives the images and ``readme.md``;
            created (including parents) if it does not exist.

    Returns:
        Path of the markdown file. The file is opened in append mode, so calling
        this function again adds to the existing content instead of replacing it.
    """
    from pathlib import Path

    # Create output directory if it doesn't exist
    output_path = Path(output_folder)
    output_path.mkdir(parents=True, exist_ok=True)

    audio_path = extract_audio(video_file)
    transcription = whisper(audio_path)

    frames, timestamps = load_video(video_file)
    scenes = split_video_into_scenes(video_understanding_with_aria, frames, timestamps)

    # Collect one markdown section per scene and join once at the end.
    sections = []

    for i, scene in enumerate(scenes, 1):
        start = scene['start_time']
        end = scene['end_time']
        scene_transcription = merge_chunks_by_timerange(transcription['chunks'], start, end)

        # Prefer a frame 3 time units before the scene end, but never earlier than
        # the (floored) midpoint. The floor division can land before `start` when
        # the timestamps are fractional and the scene is short, so clamp to
        # `start` to stay inside the scene.
        frame_time = max(start, end - 3, (start + end) // 2)
        frame = get_frame_at_timestamp(video_file, frame_time)
        image_filename = f"{video_name}_scene_{i}.jpg"
        frame.save(output_path / image_filename)

        # Add scene to markdown - only transcription and image
        sections.append(f"{scene_transcription}\n\n![Scene {i}]({image_filename})\n\n")

    # Append, so several clips processed in sequence share a single readme.md.
    markdown_path = output_path / "readme.md"
    with open(markdown_path, "a", encoding="utf-8") as f:
        f.write("".join(sections))

    return markdown_path

In [None]:
def list_mp4_files(folder_path: str) -> list[str]:
    """Return the paths of the ``.mp4`` files in ``folder_path`` in scene order.

    Files are ordered by the number following ``Scene-`` in their file name.
    Files without such a number are placed last. Ties (equal or missing scene
    numbers) are broken by file name so the result does not depend on the
    arbitrary order returned by ``os.listdir``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        List of ``folder_path``-joined paths whose name ends in ``.mp4``
        (case-insensitive).
    """
    import os
    import re

    scene_pattern = re.compile(r'Scene-(\d+)')

    def sort_key(path: str) -> tuple[float, str]:
        # Match against the file name only: a directory component that happens
        # to contain "Scene-<n>" must not override the clip's own number.
        filename = os.path.basename(path)
        match = scene_pattern.search(filename)
        scene_number = int(match.group(1)) if match else float('inf')
        return scene_number, filename

    mp4_files = [
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        if entry.lower().endswith('.mp4')
    ]

    return sorted(mp4_files, key=sort_key)

In [None]:
# Scene clips to process, ordered by list_mp4_files (by the "Scene-<n>" number).
video_clips = list_mp4_files('./video_clips')

In [None]:
import torch

# Process every clip in order. All clips write into the current directory, where
# process_video_to_doc appends to ./readme.md and saves frames as
# clip_<i>_scene_<n>.jpg. Because the markdown file is opened in append mode,
# re-running this cell adds the same sections again.
for i, video_clip in enumerate(video_clips):
    # Release cached, unused GPU memory before starting the next clip.
    torch.cuda.empty_cache()
    print(f'Processing video clip: {video_clip}')
    process_video_to_doc(video_clip, f"clip_{i}", './')