# Analysis

In [None]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from moviepy.editor import VideoFileClip
import numpy as np

def get_video_duration(file_path):
    """Return the duration of the video at ``file_path``, or ``None`` on failure.

    Args:
        file_path: Path of the video file to open with ``VideoFileClip``.

    Returns:
        The clip's ``duration`` attribute, or ``None`` if opening or reading
        the clip raised (the error is printed, not re-raised).
    """
    try:
        video = VideoFileClip(file_path)
        try:
            return video.duration
        finally:
            # Close the clip even when reading the duration fails, so the
            # underlying reader is never left open.
            video.close()
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def process_videos(file_paths):
    """Measure every video in ``file_paths`` concurrently.

    Each path is handed to ``get_video_duration`` on a thread pool. As results
    arrive, a line per file is printed; only durations that are not ``None``
    are kept.

    Returns:
        A list of the durations that could be read, in completion order.
    """
    collected = []
    with ThreadPoolExecutor() as pool:
        # Map each submitted future back to the path it was created for.
        pending = {}
        for path in file_paths:
            pending[pool.submit(get_video_duration, path)] = path

        for finished in as_completed(pending):
            path = pending[finished]
            try:
                seconds = finished.result()
                if seconds is None:
                    print(f"{os.path.basename(path)}: Unable to get duration")
                    continue
                collected.append(seconds)
                print(f"{os.path.basename(path)}: {seconds:.2f} seconds")
            except Exception as e:
                print(f"Error processing {path}: {e}")
    return collected

def check_videos_duration(folder_path):
    """Print duration statistics for all videos found under ``folder_path``.

    The folder is walked recursively; files whose lower-cased name ends with
    one of the known video extensions are measured via ``process_videos``.
    Prints count, min, max, mean, median and standard deviation, or a notice
    when no duration could be read.
    """
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv', '.flv')

    # Collect every matching file anywhere below the folder.
    file_paths = [
        os.path.join(dirpath, name)
        for dirpath, _, names in os.walk(folder_path)
        for name in names
        if name.lower().endswith(video_extensions)
    ]

    durations = process_videos(file_paths)

    if not durations:
        print("No valid video files found.")
        return

    values = np.array(durations)

    print("\nVideo Duration Statistics:")
    print(f"Total videos: {len(values)}")
    print(f"Shortest duration: {np.min(values):.2f} seconds")
    print(f"Longest duration: {np.max(values):.2f} seconds")
    print(f"Average duration: {np.mean(values):.2f} seconds")
    print(f"Median duration: {np.median(values):.2f} seconds")
    print(f"Standard deviation: {np.std(values):.2f} seconds")

# Run the duration report for every video below folder_path.
# NOTE(review): '/' makes os.walk traverse the entire filesystem — presumably
# a placeholder for the real dataset folder; confirm before running.
folder_path = '/'
check_videos_duration(folder_path)

# delete_invalid_videos

In [None]:
import os
import cv2
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

# Folder whose video files are checked (and deleted when invalid) by
# delete_invalid_videos below.
INPUT_FOLDER = "/data1/code/sora/dj_sora_challenge/input/all_cut_13/"
MIN_DURATION = 0.5  # default min_duration for is_valid_video (frame_count / fps)

def is_valid_video(video_path, min_duration=MIN_DURATION):
    """Return True when the video opens and lasts at least ``min_duration``.

    The duration is computed as ``frame_count / fps`` from the properties
    reported by ``cv2.VideoCapture``.

    Args:
        video_path: Path of the video file to inspect.
        min_duration: Minimum acceptable duration, in the same unit as
            ``frame_count / fps``.

    Returns:
        ``False`` if the file cannot be opened, reports a non-positive frame
        rate, is shorter than ``min_duration``, or if any exception occurs
        (the error is printed); ``True`` otherwise.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            return False

        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # A zero frame rate makes the duration undefined; treat the file as
        # invalid instead of relying on a ZeroDivisionError.
        if fps <= 0:
            return False

        return frame_count / fps >= min_duration
    except Exception as e:
        print(f"Error processing {video_path}: {str(e)}")
        return False
    finally:
        # Release the capture on every path, including early returns and
        # errors, so handles are not leaked across many files.
        if cap is not None:
            cap.release()

def delete_invalid_videos(input_folder):
    """Delete every video directly inside ``input_folder`` that fails validation.

    Files are selected by extension (non-recursive), checked in parallel with
    ``is_valid_video`` on a process pool, and removed from disk when the check
    returns a falsy value. A summary of processed and deleted counts is
    printed at the end.
    """
    extensions = ('.mp4', '.avi', '.mov', '.mkv')
    video_files = []
    for entry in os.listdir(input_folder):
        if entry.lower().endswith(extensions):
            video_files.append(entry)

    deleted_videos = []

    with ProcessPoolExecutor() as pool:
        # Remember which file name each future belongs to.
        pending = {}
        for name in video_files:
            pending[pool.submit(is_valid_video, os.path.join(input_folder, name))] = name

        progress = tqdm(as_completed(pending), total=len(video_files), desc="Processing videos")
        for finished in progress:
            name = pending[finished]
            if finished.result():
                continue

            target = os.path.join(input_folder, name)
            try:
                os.remove(target)
                deleted_videos.append(name)
                print(f"Deleted: {name}")
            except Exception as e:
                print(f"Error deleting {name}: {str(e)}")

    print(f"Total videos processed: {len(video_files)}")
    print(f"Invalid videos deleted: {len(deleted_videos)}")


# Destructive: permanently removes files from INPUT_FOLDER that fail
# is_valid_video (os.remove, no backup is made).
delete_invalid_videos(INPUT_FOLDER)

# Caption

In [None]:
import json
import os
from tqdm import tqdm
import requests
import base64
import cv2
import time
import random

def call_gpt_vision_api_with_retry(api_key, system_prompt, base64_images, max_retries=5, initial_delay=1, max_delay=60):
    """Call ``call_gpt_vision_api`` until it yields a result or retries run out.

    Between attempts the function sleeps for an exponentially growing delay
    (``initial_delay * 2**attempt`` plus up to one second of random jitter),
    capped at ``max_delay``. No sleep happens after the final attempt.

    Returns:
        The first non-``None`` result, or ``None`` when every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            outcome = call_gpt_vision_api(api_key, system_prompt, base64_images)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {str(e)}")
        else:
            if outcome is not None:
                return outcome

        more_attempts_left = attempt < max_retries - 1
        if more_attempts_left:
            backoff = initial_delay * (2 ** attempt) + random.uniform(0, 1)
            delay = min(backoff, max_delay)
            print(f"Retrying in {delay:.2f} seconds...")
            time.sleep(delay)

    print(f"All {max_retries} attempts failed. Returning None.")
    return None

def call_gpt_vision_api(api_key,
                        system_prompt,
                        base64_images,
                        transcription=None,
                        #user_prompt=None,
                        max_tokens=500,
                        temperature=1.0,
                        use_audio=False,
                        model='gpt-4o-mini-2024-07-18',
                        timeout=60):
    """Send video frames to a chat-completions endpoint and return the reply text.

    Args:
        api_key: Bearer token placed in the ``Authorization`` header.
        system_prompt: Text of the system message.
        base64_images: Iterable of base64-encoded JPEG frames; each one is
            sent as a low-detail ``image_url`` data URL.
        transcription: Audio transcription text, only sent when ``use_audio``
            is true.
        max_tokens: ``max_tokens`` value sent in the request.
        temperature: ``temperature`` value sent in the request.
        use_audio: When true, append the transcription as an extra text part.
        model: Model identifier sent in the request.
        timeout: Timeout in seconds passed to ``requests.post``; without one
            the request could block forever and the ``Timeout`` handler below
            would never trigger.

    Returns:
        The ``content`` of the first choice's message, or ``None`` when the
        response has no choices or the request failed (the failure is
        printed, not raised).
    """
    # NOTE(review): the endpoint URL is empty here — presumably redacted; it
    # must be filled in before the request can succeed.
    api_url = ''
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}'
    }

    # Build the user message once; the audio variant only adds one text part.
    user_content = [
        "These are the frames from the video.",
        *({"type": "image_url",
           "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}
          for x in base64_images),
    ]
    if use_audio:
        user_content.append(
            {"type": "text", "text": f"The audio transcription is: {transcription}"})

    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": f"{system_prompt}"},
            {"role": "user", "content": user_content},
        ],
        # Honour the caller's settings instead of a hard-coded 500.
        "max_tokens": max_tokens,
        "temperature": temperature,
    }

    try:
        response = requests.post(api_url, headers=headers, json=data, timeout=timeout)
        response.raise_for_status()
        result = response.json()

        print(result)
        if 'choices' in result and result['choices']:
            return result['choices'][0]['message']['content']
        else:
            print('No results returned from the API, return None.')
            return None

    except requests.exceptions.HTTPError as errh:
        if errh.response.status_code == 401:
            print('Invalid API key provided.')
        elif errh.response.status_code == 429:
            print(
                'API request limit has been reached. Please try again later.')
        else:
            print(f'HTTP error occurred: {errh}')
    except requests.exceptions.ConnectionError:
        print('Network error occurred. Please check your connection.')
    except requests.exceptions.Timeout:
        print('The request timed out. Please try again later.')
    except requests.exceptions.RequestException as err:
        print(f'An error occurred: {err}')
    except Exception as e:
        print(f'An unexpected error occurred: {e}')

    print('API request failed, return None.')
    return None
    
def process_video(video_path, seconds_per_frame=1,extract_audio=False):
    """Sample frames from a video and return them as base64-encoded JPEGs.

    Args:
        video_path: Path of the video opened with ``cv2.VideoCapture``.
        seconds_per_frame: Sampling interval; one frame is taken every
            ``int(fps * seconds_per_frame)`` frames (at least every frame).
        extract_audio: Currently ignored — audio extraction is not
            implemented; kept so existing callers keep working.

    Returns:
        A list of base64 strings (UTF-8 decoded), one per sampled frame that
        could be read and encoded. Sampling stops at the first failed read.
    """
    base64Frames = []

    video = cv2.VideoCapture(video_path)
    try:
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = video.get(cv2.CAP_PROP_FPS)
        # Always advance by at least one frame: a step of 0 (fps reported as
        # 0, or fps * seconds_per_frame < 1) would re-read the same frame
        # forever and grow the list without bound.
        frames_to_skip = max(1, int(fps * seconds_per_frame))
        curr_frame = 0

        # Loop through the video and extract frames at specified sampling rate
        while curr_frame < total_frames - 1:
            video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
            success, frame = video.read()
            if not success:
                break
            encoded, buffer = cv2.imencode(".jpg", frame)
            # Skip frames that could not be encoded rather than storing an
            # unusable buffer.
            if encoded:
                base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
            curr_frame += frames_to_skip
    finally:
        # Release the capture even if reading or encoding raises.
        video.release()

    print(f"Extracted {len(base64Frames)} frames")

    return base64Frames

def process_jsonl(input_file, output_file, api_key):
    """Caption every video listed in a JSONL file and write the result.

    Each non-blank input line must be a JSON object whose ``videos`` list
    holds the video path at index 0. For every record, frames are sampled
    with ``process_video`` and described with
    ``call_gpt_vision_api_with_retry``; on success the description is stored
    under ``text`` wrapped in the ``<__dj__video>`` / ``<|__dj__eoc|>``
    markers. Records are written to ``output_file`` one per line; a record is
    written unchanged when its video is missing, yields no frames, or cannot
    be captioned.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to (over)write.
        api_key: API key forwarded to the captioning call.
    """
    # The prompt is the same for every record, so build it once.
    system_prompt = "You are a helpful video description generator. I'll give you a few of the middle frames of the video clip, which you need to summarize into a description of the video clip. Please provide a video description that complies with these requirements. Describe the basic and necessary information of the video in the third person, be as concise as possible. 2. Output the video description directly. Begin with 'In this video'. 3. Limit the video description to 100 words. Here are the video frames:"

    # Explicit UTF-8 keeps reading and writing independent of the locale.
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in tqdm(infile, desc="Processing videos"):
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # crashing in json.loads.
            if not line.strip():
                continue

            data = json.loads(line)
            video_path = data['videos'][0]

            if not os.path.exists(video_path):
                print(f"Video file not found: {video_path}")
                outfile.write(json.dumps(data) + '\n')
                continue

            # Process video and get frames
            base64_frames = process_video(video_path, seconds_per_frame=1, extract_audio=False)

            if not base64_frames:
                # Without frames the model would have nothing to describe, so
                # skip the API call and its retries for this record.
                print(f"No frames extracted, skipping caption: {video_path}")
                outfile.write(json.dumps(data) + '\n')
                continue

            # Call GPT Vision API with retry
            description = call_gpt_vision_api_with_retry(api_key, system_prompt, base64_frames)

            if description:
                data['text'] = f"<__dj__video>{description}<|__dj__eoc|>"
            else:
                print(f"Failed to get description for video after all retries: {video_path}")

            outfile.write(json.dumps(data) + '\n')


# Input JSONL listing the videos to caption, and the JSONL file that
# process_jsonl (over)writes with the captioned records.
input_file = "/data1/code/sora/dj_sora_challenge/input/cut_videos.jsonl"
output_file = "/data1/code/sora/dj_sora_challenge/input/cut_videos_gpt_caption.jsonl" 
api_key = ""  # NOTE(review): empty here — presumably redacted; must be set before running

# Caption every video in input_file and write the results to output_file.
process_jsonl(input_file, output_file, api_key)

In [2]:
import whisper
# Load the "large-v3" Whisper model on the CPU.
model = whisper.load_model(name="large-v3",device="cpu")
# Transcribe a single sample video and keep only the 'text' field of the result.
tr = whisper.transcribe(model, "/data1/wangqiurui/code/sora/dj_sora_challenge/input/videos/dj_video_00009.mp4")['text']



In [3]:
# Display the transcription text produced in the previous cell.
tr

' But only if you stick to a $20 a day budget. That is a recipe for disaster. One of you wants to take a cab, the other wants to walk. One wants to sleep in a decent hotel.'