In [1]:
from moviepy.editor import AudioFileClip

In [2]:
def MP4ToMP3(mp4: str, mp3: str) -> None:
    """Extract the audio track of a media file and write it to ``mp3``.

    The input path is handed directly to moviepy's ``AudioFileClip``, so it
    does not have to be an ``.mp4`` file specifically (this notebook passes a
    ``.mov`` file).

    Parameters
    ----------
    mp4 : str
        Path of the source media file to read the audio from.
    mp3 : str
        Path of the audio file to write.
    """
    audio_clip = AudioFileClip(mp4)
    try:
        audio_clip.write_audiofile(mp3)
    finally:
        # Release the clip even when writing fails, so the underlying
        # reader is not left open for the rest of the kernel session.
        audio_clip.close()

In [4]:
# Write the audio of ShortVideo.mov to ShortVideo.mp3; the transcription cell
# below reads that mp3 file.
MP4ToMP3("ShortVideo.mov", "ShortVideo.mp3")

MoviePy - Writing audio in ShortVideo.mp3


                                                                      

MoviePy - Done.




In [16]:
# Build a Hugging Face `pipeline` for the "automatic-speech-recognition" task
# using the English-only Whisper medium checkpoint.
from transformers import pipeline

# NOTE(review): chunk_length_s=30 presumably makes the pipeline process the
# audio in 30-second chunks — confirm against the transformers ASR pipeline docs.
pipe = pipeline(
    "automatic-speech-recognition", 
    model="openai/whisper-medium.en",
    chunk_length_s=30,
)

config.json: 100%|██████████| 1.95k/1.95k [00:00<00:00, 1.32MB/s]
model.safetensors: 100%|██████████| 3.06G/3.06G [05:27<00:00, 9.34MB/s]
generation_config.json: 100%|██████████| 1.92k/1.92k [00:00<00:00, 1.67MB/s]
tokenizer_config.json: 100%|██████████| 805/805 [00:00<00:00, 4.19MB/s]
vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 2.78MB/s]
tokenizer.json: 100%|██████████| 2.41M/2.41M [00:01<00:00, 1.47MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.49MB/s]
normalizer.json: 100%|██████████| 52.7k/52.7k [00:00<00:00, 25.3MB/s]
added_tokens.json: 100%|██████████| 34.6k/34.6k [00:00<00:00, 7.62MB/s]
special_tokens_map.json: 100%|██████████| 1.83k/1.83k [00:00<00:00, 4.15MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
preprocessor_config.json: 100%|██████████| 185k/185k [00:00<00:00, 9.78MB/s]


In [17]:
# Transcribe the extracted audio. return_timestamps=True is requested because a
# later cell reads transcription["chunks"], whose rows carry (start, end) times.
transcription = pipe("ShortVideo.mp3", return_timestamps=True)

python(14511) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [18]:
# Display the raw pipeline result: a dict whose 'text' entry holds the full
# transcript (its "chunks" entry is turned into a DataFrame further down).
transcription

{'text': " We used to do a lot of document understanding. This was my internship project a few ages ago. To understand handwriting in very difficult documents so that you can optimize processes in the back end. I have a background in computer vision, so I played a lot with mobile apps and previous hackathon challenges. I missed there. Previous hackathon challenges used to be around computer vision. You can do a lot of cool stuff with them. You can still do them today, but surprise, you don't have to do them manually. You can just ask the API to do it for you. And we do a lot, we used to do a lot of conversational interfaces. So 2017, when I came to Los Haca, I did chatbots and stuff. So this won't be the project for today. Okay? Okay, so I said a lot of used to used to used to. So what are we currently building? So we have two projects. One of them is computer board contracts, and the other one is understanding satellite images. The first one is how we can understand the law. How can w

In [20]:
import os

In [71]:
import subprocess

detection_threshold = 0.03 # Lower => more sensitive
input_video = "ShortVideo.mov"
scene_detection_output = "scene_detection_output"
os.makedirs(scene_detection_output, exist_ok=True)

# ffmpeg filter graph: keep only frames whose scene-change score exceeds the
# threshold and print their metadata (frame number, pts, pts_time) to a text
# file. The single quotes are part of ffmpeg's own filter-graph quoting.
scene_filter = (
    f"select='gt(scene,{detection_threshold})',"
    f"metadata=print:file='{scene_detection_output}/detected_frames.txt'"
)

# Pass the arguments as a list instead of a shell string: paths containing
# spaces or quote characters no longer break the command, and nothing is
# interpreted by a shell. check=True raises CalledProcessError when ffmpeg
# fails, rather than letting later cells read stale or missing output.
ffmpeg_result = subprocess.run(
    [
        "ffmpeg",
        "-i", input_video,
        "-filter_complex", scene_filter,
        "-start_number", "0",
        "-fps_mode", "vfr",
        f"{scene_detection_output}/frame%03d.png",
    ],
    check=True,
)
# Show the exit status as the cell output (0 on success).
ffmpeg_result.returncode


ffmpeg version 6.0 Copyright (c) 2000-2023 the FFmpeg developers
  built with Apple clang version 15.0.0 (clang-1500.0.40.1)
  configuration: --prefix=/usr/local/Cellar/ffmpeg/6.0_1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-l

0

In [31]:
import re
def read_ffmpeg_scene_detection_output(detected_frames_output: str) -> list[dict]:
    """Parse the frame list written by ffmpeg's ``metadata=print`` filter.

    Every line that contains both a ``frame:<int>`` field and a
    ``pts_time:<number>`` field yields one record. The two values are taken
    from the same line, so a frame number can never be paired with the
    timestamp of a different frame. Timestamps may be integers (``3``),
    decimals (``15.05``) or use exponent notation.

    Parameters
    ----------
    detected_frames_output : str
        Path of the text file produced by ffmpeg.

    Returns
    -------
    list[dict]
        One ``{"frame_id": int, "pts_time": float}`` dict per detected frame,
        in file order. Empty when the file contains no matching line.
    """
    with open(detected_frames_output, "r") as file:
        content = file.read()

    # `.` does not match newlines here, so `frame:` and `pts_time:` must sit
    # on the same line. The fractional part is optional because ffmpeg prints
    # whole-second timestamps without a decimal point.
    record_pattern = re.compile(
        r"frame:(\d+)\b.*?pts_time:(\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)"
    )

    return [
        {"frame_id": int(frame_id), "pts_time": float(pts_time)}
        for frame_id, pts_time in record_pattern.findall(content)
    ]

In [33]:
import pandas as pd

In [98]:
frame_transition_df = pd.DataFrame(read_ffmpeg_scene_detection_output("scene_detection_output/detected_frames.txt"))
threshold = 1 # Number of seconds
# Keep a detected frame only when it comes more than `threshold` seconds after
# the previously detected frame. diff() is NaN for the first row and
# NaN > threshold is False, so the first row is switched on explicitly below.
# copy=True requests a writable array: to_numpy() may otherwise hand back a
# read-only view (pandas Copy-on-Write), and the assignment would then fail.
mask = (frame_transition_df['pts_time'].diff() > threshold).to_numpy(copy=True)
mask[0] = True
frame_transition_df = frame_transition_df[mask]
frame_transition_df

Unnamed: 0,frame_id,pts_time
0,0,1.2
3,3,15.05
4,4,35.8667
7,7,51.0833
13,13,60.3167


In [99]:
# One row per transcript chunk: a `timestamp` (start, end) tuple and its `text`.
time_text_df = pd.DataFrame(transcription["chunks"])
time_text_df

Unnamed: 0,timestamp,text
0,"(0.0, 2.58)",We used to do a lot of document understanding.
1,"(2.58, 5.28)",This was my internship project a few ages ago.
2,"(5.84, 10.44)",To understand handwriting in very difficult d...
3,"(10.44, 13.16)",so that you can optimize processes in the bac...
4,"(14.28, 17.72)","I have a background in computer vision, so I ..."
5,"(17.78, 22.48)",with mobile apps and previous hackathon chall...
6,"(23.12, 23.72)",I missed there.
7,"(24.06, 26.5)",Previous hackathon challenges used to be arou...
8,"(26.5, 28.5)",You can do a lot of cool stuff with them.
9,"(28.5, 32.0)","You can still do them today, but surprise, yo..."


In [100]:
# Slide boundaries: 0.0, then every pts_time in frame_transition_df, then the
# end time of the last transcript chunk.
slide_timestamps = [
    0.0,
    *frame_transition_df["pts_time"].tolist(),
    time_text_df["timestamp"].iloc[-1][-1],
]
# Each pair of consecutive boundaries becomes one slide, numbered from 1.
slide_transitions = pd.DataFrame(
    [
        {
            "timestamps": (slide_timestamps[idx], slide_timestamps[idx + 1]),
            "slide_num": idx + 1,
        }
        for idx in range(len(slide_timestamps) - 1)
    ]
)
slide_transitions

Unnamed: 0,timestamps,slide_num
0,"(0.0, 1.2)",1
1,"(1.2, 15.05)",2
2,"(15.05, 35.8667)",3
3,"(35.8667, 51.0833)",4
4,"(51.0833, 60.3167)",5
5,"(60.3167, 86.6)",6
