In [2]:
import os
from moviepy.editor import VideoFileClip
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import scenedetect
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
import whisperx
import openai
from PIL import Image
import numpy as np
from langchain.chat_models import AzureChatOpenAI
from langchain.schema.messages import HumanMessage, AIMessage
import ast

torchvision is not available - cannot save figures


In [3]:
# Azure OpenAI connection settings.
# Secrets must never be stored in the notebook: the API key is read from the
# environment. NOTE(review): a key was previously committed in this cell and
# should be rotated.
# Your endpoint should look like https://YOUR_RESOURCE_NAME.openai.azure.com/
api_base = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://westus3-gpt-4o.openai.azure.com/")
api_key = os.environ.get("AZURE_OPENAI_API_KEY")
if not api_key:
    # Fail early with an actionable message instead of an opaque auth error later.
    raise RuntimeError("Set the AZURE_OPENAI_API_KEY environment variable before running this notebook.")
deployment_name = "gpt-4o"
api_version = "2024-02-01"

# Shared chat client used by the description/script generation steps below.
client = AzureChatOpenAI(
    api_key=api_key,
    api_version=api_version,
    azure_endpoint=api_base,
    deployment_name=deployment_name,
    max_tokens=1024,
    temperature=0,
)

In [4]:
# client = OpenAI(api_key="your openai api here")

# Step 1: Scene Detection
def scene_detection(video_path):
    """Detect scene boundaries in a video using a ContentDetector.

    Args:
        video_path: Path of the video file to analyse.

    Returns:
        The value of ``SceneManager.get_scene_list()``. Other code in this
        notebook treats each entry as a ``(start, end)`` pair whose items
        expose ``get_seconds()``.

    Raises:
        Whatever the underlying library raises; the video manager is released
        before the exception propagates.
    """
    # NOTE(review): the library logs "VideoManager is deprecated and will be
    # removed." when this runs; consider migrating to its replacement API once
    # the installed version is confirmed.
    video_manager = VideoManager([video_path])
    scene_manager = SceneManager()
    scene_manager.add_detector(ContentDetector())
    try:
        video_manager.set_downscale_factor()
        video_manager.start()
        scene_manager.detect_scenes(frame_source=video_manager)
        return scene_manager.get_scene_list()
    finally:
        # Always release the video source, even when detection fails midway.
        video_manager.release()

# Step 2: Split Video into Multiple Clips
def split_video(video_path, scene_list, output_dir):
    """Cut one clip per detected scene out of the source video.

    Args:
        video_path: Path of the source video.
        scene_list: Iterable of ``(start, end)`` pairs whose items expose
            ``get_seconds()``.
        output_dir: Directory that receives the clips; created if missing.

    Returns:
        List of clip paths named ``clip_1.mp4``, ``clip_2.mp4``, ... in scene order.
    """
    # Make the function usable on its own instead of relying on the caller
    # having created the directory beforehand.
    os.makedirs(output_dir, exist_ok=True)

    clip_paths = []
    for index, scene in enumerate(scene_list, start=1):
        start, end = scene[0].get_seconds(), scene[1].get_seconds()
        clip_path = os.path.join(output_dir, f"clip_{index}.mp4")
        ffmpeg_extract_subclip(video_path, start, end, targetname=clip_path)
        clip_paths.append(clip_path)
    return clip_paths

# Step 3: Speech Recognition using WhisperX
def speech_recognition(video_path):
    """Transcribe the speech of a video with WhisperX.

    Args:
        video_path: Path of the media file handed to ``model.transcribe``.

    Returns:
        The transcript as a single string. If the result has a ``"text"`` entry
        it is returned unchanged; otherwise the ``"text"`` of every entry in
        ``"segments"`` is joined with spaces. On any error the message is
        printed and ``"No transcription."`` is returned.
    """
    try:
        model = whisperx.load_model("large", device='cpu')
        result = model.transcribe(video_path)
        text = result.get("text")
        if text is None:
            # NOTE(review): WhisperX results may carry only "segments" (no
            # top-level "text"); previously that raised KeyError, which was
            # swallowed below and reported as "No transcription.".
            segments = result.get("segments", [])
            text = " ".join(segment["text"].strip() for segment in segments)
        return text
    except Exception as e:
        # Best-effort step: report the problem and let the pipeline continue.
        print(f"Error during ASR: {e}")
        return "No transcription."

# Step 4: Generate Clip-Level Video Descriptions
def generate_clip_descriptions(clip_paths, scenes, frames_per_clip=10):
    """Ask the chat model for an audio description of every clip.

    For each clip, ``frames_per_clip`` frames are sampled, encoded as base64
    JPEG data URLs and sent to ``client`` together with the instruction prompt
    and the clip's scene time range.

    Args:
        clip_paths: Paths of the clips, in scene order.
        scenes: Scene entries paired one-to-one with ``clip_paths``; each entry
            is interpolated into the prompt as-is.
        frames_per_clip: Number of frames sampled per clip (default 10).

    Returns:
        A single string with the per-clip responses separated by blank lines.
    """
    descriptions = []
    for clip_path, scene in zip(clip_paths, scenes):
        clip = VideoFileClip(clip_path)
        try:
            frames = sample_frames(clip, frames_per_clip)
        finally:
            # The sampled frames are already in a list, so the clip can be
            # released right away instead of leaking one reader per scene.
            clip.close()

        images = [Image.fromarray(frame) for frame in frames]
        # Convert frames to base64 for OpenAI Image API
        image_files = [convert_to_base64(image) for image in images]

        prompt = f"""You are an expert in understanding scene transitions based on visual features in a video.
                  For the given sequence of images per timestamp, identify different scenes in the video.
                  Generate pure audio description for each scene with time ranges.
                  The output should be a list of audio descriptions for each scene.
                  {scene}"""

        # The instructions are sent as user text next to the images; they were
        # previously wrapped in an AIMessage, i.e. presented to the model as
        # something the assistant itself had said.
        content = [{"type": "text", "text": prompt}]
        for image_file in image_files:
            content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image_file}"},
            })

        response = client.invoke([HumanMessage(content=content)])
        descriptions.append(response.content)

    # Separate the clips with a blank line so consecutive responses do not run
    # into each other.
    return "\n\n".join(descriptions)

# Helper function to sample frames
def sample_frames(video, num_frames):
    """Sample ``num_frames`` evenly spaced frames from a clip.

    Args:
        video: Object exposing ``fps``, ``duration`` and ``get_frame(t)`` with
            ``t`` in seconds.
        num_frames: Number of frames to return.

    Returns:
        List with the results of ``video.get_frame`` at the sampled timestamps.
        Clips with fewer frames than ``num_frames`` yield repeated frames.
    """
    # Clamp to one frame: for clips shorter than a single frame the count is 0,
    # which made linspace run from 0 to -1 and request negative timestamps.
    total_frames = max(int(video.fps * video.duration), 1)
    frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
    return [video.get_frame(i / video.fps) for i in frame_indices]

# Helper function to convert PIL image to base64
def convert_to_base64(image):
    """Return the JPEG encoding of ``image`` as a base64 text string.

    ``image`` only needs a ``save(fp, format=...)`` method; the JPEG bytes are
    written to an in-memory buffer, base64-encoded and decoded as UTF-8.
    """
    import base64
    import io

    jpeg_buffer = io.BytesIO()
    image.save(jpeg_buffer, format="JPEG")
    jpeg_bytes = jpeg_buffer.getvalue()
    encoded = base64.b64encode(jpeg_bytes)
    return encoded.decode("utf-8")

# Step 5: Generate a Coherent Script using GPT-4
def generate_script(clip_descriptions):
    """Merge clip-level descriptions into one coherent audio-description script.

    Args:
        clip_descriptions: Either a single string (what
            ``generate_clip_descriptions`` returns) or an iterable of strings,
            which is joined with blank lines.

    Returns:
        The ``content`` of the chat model's response.
    """
    prompt = (
        "You are an expert at understanding audio descriptions of different scenes in a video. "
        "Generate full audio description of each scene with non-overlapping time ranges. "
        "Keep as many scenes as possible covering all time ranges. "
        "Use character names wherever possible in the audio descriptions. "
        "Keep the audio description for each time range within one short sentence.\n\n"
    )
    if isinstance(clip_descriptions, str):
        # Joining a plain string would insert "\n\n" between every character.
        prompt += clip_descriptions
    else:
        prompt += "\n\n".join(clip_descriptions)

    # ``client`` is the LangChain chat model configured in this notebook, which
    # is driven through ``invoke`` (it has no ``chat_completions`` attribute).
    # The output length is governed by the client's own ``max_tokens`` setting.
    response = client.invoke([HumanMessage(content=prompt)])
    return response.content

# Example usage
if __name__ == "__main__":
    # The input video can be overridden with the MMVID_VIDEO_PATH environment
    # variable; the original local path is kept as the default.
    video_path = os.environ.get(
        "MMVID_VIDEO_PATH",
        r"D:\MM-VID\Y2meta.app-Arcane Season 2 _ Official Teaser Trailer-(720p).mp4",
    )
    if not os.path.isfile(video_path):
        # Stop before the expensive steps with a clear message.
        raise FileNotFoundError(f"Video file not found: {video_path}")
    output_dir = "clips"
    os.makedirs(output_dir, exist_ok=True)

    # Step 1: Scene Detection
    scenes = scene_detection(video_path)
    print("Detected scenes:", scenes)

    # Step 2: Split Video into Multiple Clips
    clip_paths = split_video(video_path, scenes, output_dir)
    print("Video clips created:", clip_paths)

    # Step 3: Speech Recognition using WhisperX
    transcript = speech_recognition(video_path)
    print("Transcript:", transcript)



VideoManager is deprecated and will be removed.


Detected scenes: [(00:00:00.000 [frame=0, fps=24.000], 00:00:02.958 [frame=71, fps=24.000]), (00:00:02.958 [frame=71, fps=24.000], 00:00:06.458 [frame=155, fps=24.000]), (00:00:06.458 [frame=155, fps=24.000], 00:00:10.000 [frame=240, fps=24.000]), (00:00:10.000 [frame=240, fps=24.000], 00:00:12.292 [frame=295, fps=24.000]), (00:00:12.292 [frame=295, fps=24.000], 00:00:13.667 [frame=328, fps=24.000]), (00:00:13.667 [frame=328, fps=24.000], 00:00:15.958 [frame=383, fps=24.000]), (00:00:15.958 [frame=383, fps=24.000], 00:00:17.208 [frame=413, fps=24.000]), (00:00:17.208 [frame=413, fps=24.000], 00:00:18.083 [frame=434, fps=24.000]), (00:00:18.083 [frame=434, fps=24.000], 00:00:20.750 [frame=498, fps=24.000]), (00:00:20.750 [frame=498, fps=24.000], 00:00:21.958 [frame=527, fps=24.000]), (00:00:21.958 [frame=527, fps=24.000], 00:00:23.500 [frame=564, fps=24.000]), (00:00:23.500 [frame=564, fps=24.000], 00:00:24.417 [frame=586, fps=24.000]), (00:00:24.417 [frame=586, fps=24.000], 00:00:26.87

In [5]:
# Scratch cell: rebuilds the per-clip request payload (prompt + base64 frames)
# without calling the model; the API call at the bottom is commented out, so
# `descriptions` stays empty.
descriptions = []
for clip_path, scene in zip(clip_paths, scenes):
    clip = VideoFileClip(clip_path)
    try:
        frames = sample_frames(clip, 10)
    finally:
        # Release the clip once its frames are in memory; it was previously
        # left open for every scene.
        clip.close()
    images = [Image.fromarray(frame) for frame in frames]
    # Convert frames to base64 for OpenAI Image API
    image_files = [convert_to_base64(image) for image in images]
    content = []
    prompt = f"""You are an expert in understanding scene transitions based on visual features in a video.
                For the given sequence of images per timestamp, identify different scenes in the video.
                Generate pure audio description for each scene with time ranges.
                The output should be a list of audio descriptions for each scene.
                {scene}"""

    for image_file in image_files:
        content.append({
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{image_file}"},
                    })
    # response = client.invoke(
    #     [AIMessage(content=prompt),
    #         HumanMessage(content=content)]
    # )

    # descriptions.append(response.content)
        


In [7]:
# Step 4: Generate Clip Descriptions
# Describe every clip with the chat model, then show the combined text.
clip_descriptions = generate_clip_descriptions(clip_paths=clip_paths, scenes=scenes)
print(f"Clip Descriptions: {clip_descriptions}")

# clip_descriptions = clip_descriptions + f'\n\n ASR:{transcript}'

# # Step 5: Generate a Coherent Script
# script = generate_script(clip_descriptions)
# print("Generated Script:", script)

Clip Descriptions: 1. **Scene 1: 00:00:00.000 - 00:00:00.500**
   - Audio Description: "The screen is completely black."

2. **Scene 2: 00:00:00.500 - 00:00:02.958**
   - Audio Description: "A dark, foggy environment slowly becomes visible. Two figures, possibly humanoid, emerge from the mist, illuminated by faint blue lights. The scene is eerie and mysterious, with the figures moving slowly through the fog."1. **Scene 1 (00:00:02.958 - 00:00:03.958)**
   - Audio Description: "The screen is completely black."

2. **Scene 2 (00:00:03.958 - 00:00:06.458)**
   - Audio Description: "A dimly lit room with a broken, elevated platform in the center. The platform has a small, damaged structure on it, resembling a monument or a piece of machinery. The room appears to be in disrepair, with cracks and debris visible."1. **Scene 1 (00:00:06.458 - 00:00:09.000)**
   - Audio Description: "A close-up of armored boots stepping over a large, futuristic weapon with glowing green lights. The background i