In [295]:
# %pip install -q openai 
# %pip install python-dotenv
# %pip install torch torchvision
# %pip install pandas numpy matplotlib 
# %pip install imageio
# %pip install certifi
# %pip install opencv-python moviepy pillow
# %pip install natsort

In [296]:
import os
from openai import OpenAI

from dotenv import load_dotenv

from torchvision import datasets, transforms
from torch.utils.data import DataLoader

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models

import imageio

from openai import OpenAI 
import cv2
from moviepy.editor import VideoFileClip
import moviepy.editor as mp
import time
import base64
import json
from io import BytesIO
from collections import defaultdict

import subprocess

import requests
from PIL import Image
import matplotlib.pyplot as plt

import numpy as np




In [297]:
load_dotenv()
openai_api_key = os.getenv("API_KEY")

MODEL = "gpt-4o"

client = OpenAI(
    api_key=openai_api_key
)

# Process input video

In [298]:

VIDEO_PATH = "tedtalk/ted_3.mp4"

def process_video(video_path, seconds_per_frame=2):
    base64Frames = []
    base_video_path, _ = os.path.splitext(video_path)

    video = cv2.VideoCapture(video_path)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = video.get(cv2.CAP_PROP_FPS)
    frames_to_skip = int(fps * seconds_per_frame)
    curr_frame = 0

    # Loop through the video and extract frames at specified sampling rate
    while curr_frame < total_frames - 1:
        video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
        curr_frame += frames_to_skip
    video.release()

    # Extract audio from video if audio track is present
    audio_path = None
    clip = VideoFileClip(video_path)
    if clip.audio is not None:
        audio_path = f"{base_video_path}.mp3"
        clip.audio.write_audiofile(audio_path, bitrate="32k")
        clip.audio.close()

    clip.close()

    print(f"Extracted {len(base64Frames)} frames")
    if audio_path:
        print(f"Extracted audio to {audio_path}")
    else:
        print("No audio track found in the video")

    return base64Frames, audio_path

# Extract 1 frame per second. You can adjust the `seconds_per_frame` parameter to change the sampling rate
base64Frames, audio_path = process_video(VIDEO_PATH, seconds_per_frame=1)

transcription_text = ""
if audio_path:
    transcription = client.audio.transcriptions.create(
        model="whisper-1",
        file=open(audio_path, "rb"),
    )
    transcription_text = transcription.text
else:
    transcription_text = "No audio track available to transcribe."

MoviePy - Writing audio in tedtalk/ted_3.mp3


                                                        

MoviePy - Done.
Extracted 11 frames
Extracted audio to tedtalk/ted_3.mp3




In [299]:
## Categorize the video
categorization_response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": """You are a classifier. Classify the provided video into one of the following categories: Vox (talking head), TEDTalk (standing talking), TaiChi, MGIF Respond with only category (one word)."""},
        {"role": "user", "content": [
            "These are the frames from the video.",
            *map(lambda x: {"type": "image_url", 
                            "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
            {"type": "text", "text": f"The audio transcription is: {transcription_text}"}
            ],
        }
    ],
    temperature=0,
)

# Extract the classification result and store it in a variable
category = categorization_response.choices[0].message.content.strip()

# Print categorization response
print(category)

TEDTalk


# Refinements

generate refine image for each frame

In [300]:
def base64_to_image(base64_string):
    img_data = base64.b64decode(base64_string)
    img = Image.open(BytesIO(img_data))

    # Convert image to 'RGBA' format
    if img.mode != 'RGBA':
        img = img.convert('RGBA')

    img_buffer = BytesIO()
    img.save(img_buffer, format="PNG")
    img_buffer.seek(0)  # Reset buffer position to the beginning
    return img_buffer

# List to hold URLs of refined images
refined_image_urls = []

# Process each base64 encoded frame
for base64_string in base64Frames:
    image_binary = base64_to_image(base64_string)
    image_binary.seek(0)  # Reset buffer position to the beginning after writing

    # Ensure the image is under the size limit (4MB)
    if image_binary.getbuffer().nbytes > 4000000:
        print("Image exceeds the 4MB size limit.")
        continue

    # Call DALL-E 2 to edit the frame
    response = client.images.create_variation(
        model="dall-e-2",
        image=image_binary,
        n=1,
        size="1024x1024",
    )
    
    # Extract the URL of the refined image and store it
    image_url = response.data[0].url
    refined_image_urls.append(image_url)

# Print URLs or do something with the refined images
print(refined_image_urls)


['https://oaidalleapiprodscus.blob.core.windows.net/private/org-kJ4senBWYHe37wdtKBeAwzbB/user-9njMec9uaOq5EPKA2k4J0TEd/img-l6LEaUzNMGNH7PpGsCAttYxQ.png?st=2024-08-02T15%3A17%3A09Z&se=2024-08-02T17%3A17%3A09Z&sp=r&sv=2023-11-03&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2024-08-02T00%3A03%3A05Z&ske=2024-08-03T00%3A03%3A05Z&sks=b&skv=2023-11-03&sig=nd9F8Au6AWGtJcmb5oDZF7mKIRWd4WL9nOHNdPRjeeY%3D', 'https://oaidalleapiprodscus.blob.core.windows.net/private/org-kJ4senBWYHe37wdtKBeAwzbB/user-9njMec9uaOq5EPKA2k4J0TEd/img-tj8N0DuuEQnsky0wxBpuUwgz.png?st=2024-08-02T15%3A17%3A21Z&se=2024-08-02T17%3A17%3A21Z&sp=r&sv=2023-11-03&sr=b&rscd=inline&rsct=image/png&skoid=6aaadede-4fb3-4698-a8f6-684d7786b067&sktid=a48cca56-e6da-484e-a814-9c849652bcb3&skt=2024-08-01T21%3A18%3A34Z&ske=2024-08-02T21%3A18%3A34Z&sks=b&skv=2023-11-03&sig=YD4yzuFSlcVTSbx7CHlDIqQc2sKezVlJkaD6L0iRKlc%3D', 'https://oaidalleapiprodscus.blob.core.windows.

# Save images
output frames

In [301]:
# Function to get video resolution
def get_video_resolution(video_path):
    video = cv2.VideoCapture(video_path)
    width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video.release()
    return width, height

def save_images(urls, base_path, target_width, target_height):
    if not os.path.exists(base_path):
        os.makedirs(base_path)
    
    for i, url in enumerate(urls):
        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        # Use the LANCZOS resampling filter for high-quality downsampling
        img = img.resize((target_width, target_height), Image.LANCZOS)  # Directly using Image.LANCZOS
        img.save(os.path.join(base_path, f'image_{i}.png'))  # Saves images as image_0.png, image_1.png, etc.
        
# Usage example with paths and video resolution function as previously defined
video_name = VIDEO_PATH.split('/')[1].split('.')[0]  # Extract video name from path
video_results_path = f'results/{video_name}'
original_width, original_height = get_video_resolution(VIDEO_PATH)

# Ensure the specific video results folder exists
if not os.path.exists(video_results_path):
    os.makedirs(video_results_path)

# Final call to save_images with the URL list and the specific path for this video
save_images(refined_image_urls, video_results_path, original_width, original_height)


input frames

In [302]:
def save_images(base64Frames, category, video_name):
    base_path = f'{category}/{video_name}'
    if not os.path.exists(base_path):
        os.makedirs(base_path)

    for i, base64_string in enumerate(base64Frames):
        image_binary = BytesIO(base64.b64decode(base64_string))
        img = Image.open(image_binary)
        img.save(os.path.join(base_path, f'img_{i}.png'))


seconds_per_frame = 1
save_images(base64Frames, category.lower(), video_name)

In [303]:
def create_horizontal_collage(image_paths, output_path, frame_count=7):
    # Open all images and ensure they are the same height
    images = [Image.open(image_path) for image_path in image_paths[:frame_count]]
    widths, heights = zip(*(i.size for i in images))
    
    # Calculate the total width and maximum height of the collage
    total_width = sum(widths)
    max_height = max(heights)

    # Create a new image with the total width and max height
    collage_image = Image.new('RGB', (total_width, max_height))

    # Paste each image into the collage image
    x_offset = 0
    for img in images:
        collage_image.paste(img, (x_offset, 0))
        x_offset += img.width

    # Save the collage image
    collage_image.save(output_path)
    print(f"Collage saved to {output_path}")


# output
frame_indices = list(range(len(base64Frames)))
image_paths = [f"results/{video_name}/image_{index}.png" for index in frame_indices]
output_path = f"results/{video_name}/collage.png"
create_horizontal_collage(image_paths, output_path)


# input
image_paths = [f"{category.lower()}/{video_name}/img_{index}.png" for index in frame_indices]
output_path = f"{category.lower()}/{video_name}/collage.png"
create_horizontal_collage(image_paths, output_path)


Collage saved to results/ted_3/collage.png
Collage saved to tedtalk/ted_3/collage.png
