In [1]:
import os
# Expose only GPU index 3 to this process. This runs in the first cell, before the
# model libraries below are imported — presumably so that it takes effect before
# CUDA is initialised (TODO confirm the index matches the target machine).
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

## Split The Video

Split the video into separate clips using [`PySceneDetect`](https://www.scenedetect.com/).

In [2]:
# Path of the source video; interpolated into the scenedetect command in the next cell.
video_file = 'How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg].mkv'

In [3]:
# PySceneDetect CLI call that splits `video_file` into clips written to ./video_clips.
# Left commented out — presumably it was already run once; uncomment to regenerate the clips.
# !scenedetect -i "$video_file" split-video -o video_clips

In [4]:
from video2doc.video_understanding import VideoUnderstandingWithAriaHQQInt4, load_video, fix_json, timestamp_to_seconds, get_frame_at_timestamp, split_video_into_scenes
from video2doc.whisper import Whisper, extract_audio, merge_chunks_by_timerange

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Transcription model, placed on 'cuda:0' — i.e. the first *visible* GPU
# (with CUDA_VISIBLE_DEVICES="3" set above this is presumably physical GPU 3).
whisper = Whisper(device='cuda:0')
# Video-understanding model loaded from the 'rhymes-ai/Aria' checkpoint; the class name
# suggests an HQQ int4-quantised variant — see video2doc.video_understanding to confirm.
video_understanding_with_aria = VideoUnderstandingWithAriaHQQInt4('rhymes-ai/Aria')

Loading checkpoint shards: 100%|█████████████████████| 12/12 [00:02<00:00,  4.12it/s]
100%|████████████████████████████████████████████████| 28/28 [05:08<00:00, 11.01s/it]
100%|█████████████████████████████████████████████| 199/199 [00:00<00:00, 697.48it/s]
100%|██████████████████████████████████████████████| 197/197 [00:12<00:00, 15.19it/s]
100%|█████████████████████████████████████████████| 197/197 [00:00<00:00, 535.32it/s]
  0%|                                                          | 0/5 [00:00<?, ?it/s]

using flash attention


 20%|██████████                                        | 1/5 [00:39<02:36, 39.13s/it]

using flash attention


 40%|████████████████████                              | 2/5 [00:40<00:51, 17.16s/it]

using flash attention


 60%|██████████████████████████████                    | 3/5 [00:43<00:20, 10.29s/it]

using flash attention


 80%|████████████████████████████████████████          | 4/5 [00:44<00:06,  6.89s/it]

using flash attention


100%|██████████████████████████████████████████████████| 5/5 [00:46<00:00,  9.29s/it]


In [6]:
import json
def process_video_to_doc(video_file: str, video_name: str, output_folder: str):
    """Turn one video into markdown: per-scene transcription text plus a still frame.

    The audio is extracted and transcribed with the module-level ``whisper``
    object, and the frames are split into scenes via ``split_video_into_scenes``
    using ``video_understanding_with_aria``. For every scene, the transcription
    chunks are passed through ``merge_chunks_by_timerange`` for the scene's
    start/end times, one frame is saved as ``<video_name>_scene_<n>.jpg`` inside
    ``output_folder``, and the text followed by an image link is appended to
    ``<output_folder>/readme.md``.

    Args:
        video_file: Path of the video to process.
        video_name: Prefix for the saved frame image file names.
        output_folder: Directory receiving the images and ``readme.md``;
            created (including parents) when it does not exist.

    Returns:
        Path of the markdown file. It is opened in append mode, so repeated
        calls with the same ``output_folder`` accumulate content.
    """
    from pathlib import Path

    destination = Path(output_folder)
    destination.mkdir(parents=True, exist_ok=True)

    # Audio track -> transcription.
    transcription = whisper(extract_audio(video_file))

    # Frames and their timestamps -> list of scenes.
    frames, timestamps = load_video(video_file)
    scenes = split_video_into_scenes(video_understanding_with_aria, frames, timestamps)

    sections = []
    for scene_number, scene in enumerate(scenes, start=1):
        scene_start, scene_end = scene['start_time'], scene['end_time']
        scene_text = merge_chunks_by_timerange(transcription['chunks'], scene_start, scene_end)

        # Still frame: 3 units before the scene end, but never earlier than the
        # (floored) midpoint of the scene.
        snapshot_time = max(scene_end - 3, (scene_start + scene_end) // 2)
        snapshot = get_frame_at_timestamp(video_file, snapshot_time)
        image_name = f"{video_name}_scene_{scene_number}.jpg"
        snapshot.save(destination / image_name)

        # Only the transcription and the image link go into the document.
        sections.append(f"{scene_text}\n\n![Scene {scene_number}]({image_name})\n\n")

    readme_path = destination / "readme.md"
    with open(readme_path, "a", encoding="utf-8") as readme:
        readme.write("".join(sections))

    return readme_path

In [7]:
def list_mp4_files(folder_path: str) -> list[str]:
    """Return the ``.mp4`` files directly inside ``folder_path``, ordered by scene number.

    The scene number is taken from the last ``Scene-<digits>`` occurrence in the
    *file name* (the directory part of the path is ignored, so a folder or video
    title that happens to contain ``Scene-<n>`` cannot hijack the ordering).
    Files without a scene number are placed after all numbered ones. Ties are
    broken by path so the result does not depend on ``os.listdir`` order.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        Paths (``folder_path`` joined with each file name) of entries whose name
        ends with ``.mp4``, case-insensitively, in the order described above.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder_path`` cannot be listed.
    """
    import os
    import re

    scene_pattern = re.compile(r'Scene-(\d+)')

    def get_scene_number(path: str) -> float:
        # Inspect the base name only, and prefer the last match: the scene
        # suffix follows the video title, which may itself contain "Scene-<n>".
        numbers = scene_pattern.findall(os.path.basename(path))
        return int(numbers[-1]) if numbers else float('inf')

    mp4_files = [
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.lower().endswith('.mp4')
    ]

    # The path is the secondary key so equal scene numbers sort deterministically.
    return sorted(mp4_files, key=lambda path: (get_scene_number(path), path))

In [8]:
# Clips in ./video_clips (the scenedetect output folder used above), as returned by list_mp4_files.
video_clips = list_mp4_files('./video_clips')

In [9]:
import torch

# Process every clip in list order. Each call appends to ./hqq/readme.md
# (process_video_to_doc opens it in "a" mode), so re-running this cell adds the
# same content again unless the file is removed first.
for i, video_clip in enumerate(video_clips):
    # Release cached, unused GPU memory before handling the next clip.
    torch.cuda.empty_cache()
    print(f'Processing video clip: {video_clip}')
    # Frame images are saved as clip_<i>_scene_<n>.jpg inside ./hqq.
    process_video_to_doc(video_clip, f"clip_{i}", './hqq')

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-001.mp4
using flash attention
Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-002.mp4




using flash attention
Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-003.mp4




using flash attention
Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-004.mp4




using flash attention
Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-005.mp4




using flash attention
Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-006.mp4




using flash attention
Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-007.mp4




using flash attention
Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-008.mp4




using flash attention
Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-009.mp4




using flash attention
Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-010.mp4




using flash attention

Attempt 1 failed.
Error: Expecting value: line 1 column 1 (char 0)
Problematic string:
The video can be split into the following scenes:

```json
{
    "scenes": [
        {
            "start_time": "00:00",
            "end_time": "00:40",
            "title": "Opening Visualization",
            "description": "The video begins with a visual representation of various neural network operations using green squares to illustrate interconnectedness. The central text states: 'but not all threads want to work independently.' This is followed by an explanation that threads are rarely completely independent, emphasizing the collaborative nature of neural network threads."
        },
        {
            "start_time": "00:40",
            "end_time": "00:40",
            "title": "Reiteration of Visualization",
            "description": "The visual representation is reiterated with consistent green squares and the same text, reinforcing the earlier point about thread

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-011.mp4




using flash attention
Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-012.mp4




using flash attention
Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-013.mp4




using flash attention
Processing video clip: ./video_clips/How GPU Computing Works ｜ GTC 2021 [3l10o0DYJXg]-Scene-014.mp4




using flash attention
