In [132]:
import os
import re
import subprocess

import pandas as pd
from moviepy.editor import AudioFileClip
# Use a pipeline as a high-level helper
from transformers import pipeline

In [133]:
def MP4ToMP3(mp4: str, mp3: str) -> None:
    """Extract the audio of a media file and write it to an audio file.

    Parameters
    ----------
    mp4 : str
        Path of the input file opened with ``AudioFileClip``.
    mp3 : str
        Path of the audio file to write.
    """
    audio_clip = AudioFileClip(mp4)
    try:
        audio_clip.write_audiofile(mp3)
    finally:
        # Close the clip even when writing fails, so it is never left open.
        audio_clip.close()

In [134]:
# Write the audio of ShortVideo.mov to ShortVideo.mp3.
MP4ToMP3("ShortVideo.mov", "ShortVideo.mp3")

python(20258) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(20259) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(20260) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


MoviePy - Writing audio in ShortVideo.mp3


                                                                      

MoviePy - Done.




In [136]:
def transcribe_audio(audio_file:str, whisper_model:str="openai/whisper-medium.en")->dict:
    """Transcribe an audio file with a ``transformers`` speech-recognition pipeline.

    Parameters
    ----------
    audio_file : str
        Path of the audio file handed to the pipeline.
    whisper_model : str
        Model identifier passed as ``model`` when the pipeline is built.

    Returns
    -------
    dict
        The value the pipeline returns when called with
        ``return_timestamps=True``.
    """
    # The pipeline is built on every call with a fixed 30-second chunk length.
    pipeline_options = {"model": whisper_model, "chunk_length_s": 30}
    recognizer = pipeline("automatic-speech-recognition", **pipeline_options)
    return recognizer(audio_file, return_timestamps=True)



In [137]:
# Transcribe ShortVideo.mp3 with the default model.
transcription = transcribe_audio("ShortVideo.mp3")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
python(20280) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [150]:
def detect_frame_changes(input_video:str, detection_threshold:float=0.03, scene_detection_output:str="scene_detection_output")->None:
    """Run ffmpeg scene detection on a video and save the selected frames.

    ffmpeg is asked to keep only frames whose ``scene`` score exceeds
    ``detection_threshold``, to print their metadata to
    ``detected_frames.txt`` and to write them as ``frame%03d.png`` images,
    all inside ``scene_detection_output``.

    Parameters
    ----------
    input_video : str
        Path of the video passed to ffmpeg with ``-i``.
    detection_threshold : float
        Threshold used in the ``select='gt(scene,...)'`` filter expression.
    scene_detection_output : str
        Directory for the outputs; created if it does not exist.

    Raises
    ------
    subprocess.CalledProcessError
        If ffmpeg exits with a non-zero status.
    """
    os.makedirs(scene_detection_output, exist_ok=True)
    metadata_file = os.path.join(scene_detection_output, "detected_frames.txt")
    frame_pattern = os.path.join(scene_detection_output, "frame%03d.png")
    filter_graph = (
        f"select='gt(scene,{detection_threshold})',"
        f"metadata=print:file='{metadata_file}'"
    )
    # An argument list (no shell) keeps paths with spaces or quotes intact
    # and prevents them from being interpreted as shell syntax.
    command = [
        "ffmpeg",
        "-i", input_video,
        "-filter_complex", filter_graph,
        "-start_number", "0",
        "-fps_mode", "vfr",
        frame_pattern,
    ]
    # check=True surfaces an ffmpeg failure here instead of leaving later
    # cells to fail on a missing or stale detected_frames.txt.
    subprocess.run(command, check=True)

# Run scene detection on ShortVideo.mov with the default threshold and output directory.
detect_frame_changes("ShortVideo.mov")

ffmpeg version 6.0 Copyright (c) 2000-2023 the FFmpeg developers
  built with Apple clang version 15.0.0 (clang-1500.0.40.1)
  configuration: --prefix=/usr/local/Cellar/ffmpeg/6.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-l

In [146]:
def read_ffmpeg_scene_detection_output(detected_frames_output: str) -> list[dict]:
    """Parse frame numbers and timestamps from a scene-detection text file.

    Every line containing ``frame:<n>`` followed by ``pts_time:<t>`` yields
    one record; all other lines are ignored.

    Parameters
    ----------
    detected_frames_output : str
        Path of the text file to read.

    Returns
    -------
    list[dict]
        One ``{"frame_id": int, "pts_time": float}`` dict per matching line,
        in file order. Empty when no line matches.
    """
    with open(detected_frames_output, "r") as file:
        content = file.read()

    # Capture both values from the same line so they cannot get out of step.
    # The time may be a whole number ("15"), a decimal ("15.05") or use an
    # exponent; the decimal point is matched literally.
    record_pattern = re.compile(
        r"frame:(\d+)[^\n]*?pts_time:([-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)"
    )
    return [
        {"frame_id": int(frame), "pts_time": float(seconds)}
        for frame, seconds in record_pattern.findall(content)
    ]

In [151]:
def create_frame_transition_df(
    detected_frames_file:str=os.path.join("scene_detection_output", "detected_frames.txt"),
    same_transition_time_threshold:float=1
)->pd.DataFrame:
    """Load detected frames and drop those that closely follow the previous one.

    A detected frame is kept when it is the first one, or when its
    ``pts_time`` is more than ``same_transition_time_threshold`` after the
    previous detected frame's ``pts_time``.

    Parameters
    ----------
    detected_frames_file : str
        Path of the file parsed by ``read_ffmpeg_scene_detection_output``.
    same_transition_time_threshold : float
        Minimum gap, in the units of ``pts_time``, for a frame to be kept.

    Returns
    -------
    pd.DataFrame
        Columns ``frame_id`` and ``pts_time``; kept rows retain their
        original index. Empty (with both columns) when nothing was detected.
    """
    detections = pd.DataFrame(
        read_ffmpeg_scene_detection_output(detected_frames_file),
        columns=["frame_id", "pts_time"],
    )
    if detections.empty:
        # Nothing detected: return the empty, correctly-labelled frame.
        return detections

    gaps = detections["pts_time"].diff()
    # The first row has no predecessor (its gap is NaN) and is always kept.
    # Building the mask this way avoids writing into a NumPy view of the data.
    keep = gaps.isna() | gaps.gt(same_transition_time_threshold)
    return detections[keep]

# Build the transition table from the default detection file; the bare name below displays it.
frame_transition_df = create_frame_transition_df()
frame_transition_df

Unnamed: 0,frame_id,pts_time
0,0,1.2
3,3,15.05
4,4,35.8667
7,7,51.0833
13,13,60.3167


In [148]:
def wrap_transciption_in_df(transcription:dict)->pd.DataFrame:
    """Build a DataFrame from the ``"chunks"`` entry of a transcription.

    Parameters
    ----------
    transcription : dict
        Mapping with a ``"chunks"`` key whose value pandas can turn into rows.

    Returns
    -------
    pd.DataFrame
        The frame constructed from ``transcription["chunks"]``.
    """
    chunk_records = transcription["chunks"]
    return pd.DataFrame(chunk_records)

# Put the transcription chunks into a DataFrame; the bare name below displays it.
time_text_df = wrap_transciption_in_df(transcription)
time_text_df

Unnamed: 0,timestamp,text
0,"(0.0, 2.58)",We used to do a lot of document understanding.
1,"(2.58, 5.28)",This was my internship project a few ages ago.
2,"(5.84, 10.44)",To understand handwriting in very difficult d...
3,"(10.44, 13.16)",so that you can optimize processes in the bac...
4,"(14.28, 17.72)","I have a background in computer vision, so I ..."
5,"(17.78, 22.48)",with mobile apps and previous hackathon chall...
6,"(23.12, 23.72)",I missed there.
7,"(24.06, 26.5)",Previous hackathon challenges used to be arou...
8,"(26.5, 28.5)",You can do a lot of cool stuff with them.
9,"(28.5, 32.0)","You can still do them today, but surprise, yo..."


In [149]:
def create_slide_transitions_df(
    frame_transition_df:pd.DataFrame,
    time_text_df:pd.DataFrame,
)->pd.DataFrame:
    """Turn transition times into consecutive, numbered slide intervals.

    The boundaries are ``0.0``, every value of
    ``frame_transition_df["pts_time"]`` in order, and the last element of the
    final ``time_text_df["timestamp"]`` entry. Each pair of neighbouring
    boundaries becomes one row.

    Parameters
    ----------
    frame_transition_df : pd.DataFrame
        Frame with a ``pts_time`` column.
    time_text_df : pd.DataFrame
        Frame with a ``timestamp`` column whose last entry is indexable.

    Returns
    -------
    pd.DataFrame
        Columns ``timestamps`` (a ``(start, end)`` tuple) and ``slide_num``
        (numbered from 1).
    """
    transcript_end = time_text_df["timestamp"].iloc[-1][-1]
    boundaries = [0.0, *frame_transition_df["pts_time"].tolist(), transcript_end]

    rows = []
    for position in range(len(boundaries) - 1):
        interval = (boundaries[position], boundaries[position + 1])
        rows.append({"timestamps": interval, "slide_num": position + 1})
    return pd.DataFrame(rows)

# Combine the transition times with the transcript's end time; the bare name below displays the result.
slide_transitions = create_slide_transitions_df(frame_transition_df, time_text_df)
slide_transitions

Unnamed: 0,timestamps,slide_num
0,"(0.0, 1.2)",1
1,"(1.2, 15.05)",2
2,"(15.05, 35.8667)",3
3,"(35.8667, 51.0833)",4
4,"(51.0833, 60.3167)",5
5,"(60.3167, 86.6)",6


In [116]:
# time_text_df.to_csv("time_text_df.csv")
# slide_transitions.to_csv("slide_transitions.csv")

In [121]:
# import ast

# def convert_to_tuple(s):
#     try:
#         return ast.literal_eval(s)
#     except (ValueError, SyntaxError):
#         return None  # Handle cases where conversion is not possible

# # Apply the conversion function to the entire column
# time_text_df = time_text_df.rename(columns={'timestamp': 'timestamps'})
# time_text_df['timestamps'] = time_text_df['timestamps'].apply(convert_to_tuple)
# slide_transitions['timestamps'] = slide_transitions['timestamps'].apply(convert_to_tuple)