# Transcribe Video
Transcribe the audio from an MP4 file into a text document using OpenAI Whisper.

In [10]:
# Path of the video file whose audio track is extracted and transcribed.
INPUT_MP4 = 'Zach-GenAI-Demo-V5.mp4'
# Path of the text file the transcript is written to (replaced on every run).
OUTPUT_TRANSCRIPT = 'transcript.txt'

## Turn mp4 into wav file

In [11]:
# Intermediate WAV file extracted from INPUT_MP4; later cells split and then delete it.
audio_file_path = 'audio.wav'
import subprocess

# Pass the arguments as a list (no shell) so file names containing spaces or
# shell metacharacters are handed to ffmpeg verbatim.
# -y overwrites a WAV left over from a previous run instead of prompting/failing.
command = ["ffmpeg", "-y", "-i", INPUT_MP4, audio_file_path]

# check=True raises CalledProcessError when ffmpeg exits with a non-zero status,
# so a failed conversion stops the notebook here rather than in a later cell.
completed = subprocess.run(command, check=True)
completed.returncode

ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with Apple clang version 14.0.0 (clang-1400.0.29.202)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/5.1.2_5 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --enable-libsoxr --enable-libzmq --enable-libzimg --disable-libjack

0

## Split wav file into chunks to send to API

In [12]:
import os
from concurrent.futures import ThreadPoolExecutor
from pydub import AudioSegment
from pydub.silence import split_on_silence
from tqdm import tqdm

# Directory that receives the exported chunk files; used as the default
# output_dir of split_audio and removed again by the clean-up cell.
audio_chunks_dir = 'audio-chunks'
#source: https://github.com/temavasilev/audio-splitter/blob/main/audio_splitter/main.py
def save_chunk(chunk, start_time, output_dir, output_format):
    """Export one audio chunk to ``output_dir``.

    The file is named ``chunk_<label>.<output_format>``, where ``<label>`` is
    ``start_time`` converted to a string and left-padded with zeros to at
    least four characters.

    Args:
        chunk: Object exposing ``export(path, format=...)`` (a pydub
            ``AudioSegment`` in this notebook).
        start_time: Label used in the file name; callers pass either a chunk
            index or a start offset in milliseconds.
        output_dir: Directory the file is written to.
        output_format: Audio format, also used as the file extension.
    """
    file_name = f'chunk_{str(start_time).zfill(4)}.{output_format}'
    # Use os.path.join explicitly: the bare ``join`` name is only imported in a
    # later cell, so relying on it fails with NameError on a fresh kernel.
    chunk.export(os.path.join(output_dir, file_name), format=output_format)

def merge_short_chunks(chunks, min_chunk_length_ms):
    """Greedily concatenate adjacent chunks into longer ones.

    Walks ``chunks`` in order and keeps appending the next chunk to the
    current one while their combined length stays below
    ``min_chunk_length_ms``; otherwise the current chunk is emitted and the
    next chunk starts a new group.

    Args:
        chunks: Sequence of objects supporting ``len()`` and ``+``
            (pydub ``AudioSegment`` objects in this notebook).
        min_chunk_length_ms: Length limit; a merged chunk is only extended
            while the combined length is strictly below this value.

    Returns:
        A new list of merged chunks in the original order. An empty input
        yields an empty list.
    """
    # Nothing to merge (e.g. no non-silent audio was detected); indexing
    # chunks[0] below would otherwise raise IndexError.
    if not chunks:
        return []

    merged_chunks = []
    current_chunk = chunks[0]

    for chunk in chunks[1:]:
        if len(current_chunk) + len(chunk) < min_chunk_length_ms:
            # Build a new object rather than using += so a mutable first
            # element of the caller's list is never modified in place.
            current_chunk = current_chunk + chunk
        else:
            merged_chunks.append(current_chunk)
            current_chunk = chunk

    # Flush the group that was still being accumulated.
    merged_chunks.append(current_chunk)
    return merged_chunks

def split_audio(input_file=audio_file_path, output_dir=audio_chunks_dir, chunk_length_ms=60_000, output_format='wav', silence_based=True, replace_output_dr=False):
    """Split an audio file into chunks and export each chunk to ``output_dir``.

    Args:
        input_file: Path of the audio file to load with pydub.
        output_dir: Directory that receives the chunk files; created if missing.
        chunk_length_ms: In silence-based mode, the length limit passed to
            ``merge_short_chunks``; in fixed-size mode, the length of each chunk.
        output_format: Export format and file extension of the chunks.
        silence_based: When true, split on silence and merge short pieces;
            otherwise cut fixed-size chunks of ``chunk_length_ms``.
        replace_output_dr: When true and ``output_dir`` already exists, remove
            the files it contains before exporting.

    Raises:
        Exception: Any error raised while exporting a chunk is re-raised after
            all export tasks have finished.
    """
    # Load the input audio file using Pydub
    audio = AudioSegment.from_file(input_file)

    # Create the output directory if it doesn't exist, or optionally empty it.
    # os.listdir / os.path.join are used explicitly because the bare
    # ``listdir`` / ``join`` names are only imported in a later cell.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    elif replace_output_dr:
        for entry_name in os.listdir(output_dir):
            os.remove(os.path.join(output_dir, entry_name))

    if silence_based:
        min_silence_len = 100  # Minimum length of silence in milliseconds
        silence_thresh = -40   # Silence threshold in dB
        chunks = split_on_silence(audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh)

        # Merge adjacent chunks shorter than the specified length; skip the
        # merge when nothing was detected so an empty result cannot crash it.
        if chunks:
            chunks = merge_short_chunks(chunks, chunk_length_ms)

        # In this mode each file is labelled with the chunk's index.
        labelled_chunks = list(enumerate(chunks))
        progress_desc = "Processing chunks based on silence"
    else:
        # In this mode each file is labelled with its start offset in ms; the
        # final chunk holds whatever remains after the last full chunk.
        # NOTE(review): save_chunk pads labels to 4 characters, so offsets
        # above 9999 ms do not sort lexicographically in playback order —
        # confirm before relying on sorted file names in fixed-size mode.
        audio_length_ms = len(audio)
        labelled_chunks = [
            (start_time, audio[start_time:min(start_time + chunk_length_ms, audio_length_ms)])
            for start_time in range(0, audio_length_ms, chunk_length_ms)
        ]
        progress_desc = "Processing fixed-size chunks"

    # The context manager closes the progress bar even when an export fails.
    with tqdm(total=len(labelled_chunks), desc=progress_desc) as pbar:
        futures = []
        # Save chunks in parallel; leaving the executor block waits for all tasks.
        with ThreadPoolExecutor() as executor:
            for label, chunk in labelled_chunks:
                future = executor.submit(save_chunk, chunk, label, output_dir, output_format)
                future.add_done_callback(lambda _finished: pbar.update(1))
                futures.append(future)

        # Exceptions raised in worker threads are stored on the future and stay
        # invisible unless result() is called; surface the first failure here.
        for future in futures:
            future.result()


In [13]:
from time import sleep

# NOTE(review): purpose of this one-second pause is not evident from the
# notebook; presumably it waits for the WAV file to be fully written — confirm
# whether it is still needed, since the ffmpeg conversion cell runs synchronously.
sleep(1)
# Split with the default arguments (silence-based splitting of the extracted
# WAV into the chunk directory); replace_output_dr=True removes files already
# present in an existing chunk directory before exporting.
split_audio(replace_output_dr=True)

Processing chunks based on silence: 100%|██████████| 8/8 [00:00<00:00, 213.07it/s]


## Loop over the WAV chunks, send each one to Whisper, and transcribe it

In [14]:
from openai import OpenAI
from dotenv import load_dotenv

# Load variables from the local .env file; override=True lets values from the
# file replace variables that are already set in the environment.
load_dotenv('.env', override=True)
# Client used by the transcription cell.
# NOTE(review): presumably reads the API key from the environment populated
# above (e.g. OPENAI_API_KEY) — confirm against the openai package docs.
client = OpenAI()

In [15]:
from os import listdir
from os.path import join

# Build the path of every entry in the chunk directory, ordered
# lexicographically, and display the resulting list as the cell output.
audio_file_paths = sorted(
    join(audio_chunks_dir, entry_name) for entry_name in listdir(audio_chunks_dir)
)
audio_file_paths

['audio-chunks/chunk_0000.wav',
 'audio-chunks/chunk_0001.wav',
 'audio-chunks/chunk_0002.wav',
 'audio-chunks/chunk_0003.wav',
 'audio-chunks/chunk_0004.wav',
 'audio-chunks/chunk_0005.wav',
 'audio-chunks/chunk_0006.wav',
 'audio-chunks/chunk_0007.wav']

In [16]:
# Mode "w" truncates any transcript left over from a previous run, so no
# separate delete step is needed. UTF-8 is set explicitly because transcripts
# can contain non-ASCII characters that the platform default encoding may not
# be able to represent.
with open(OUTPUT_TRANSCRIPT, "w", encoding="utf-8") as out_txt_file:
    for chunk_index, audio_file_chunk in enumerate(tqdm(audio_file_paths)):
        with open(audio_file_chunk, "rb") as input_audio_file:
            transcription = client.audio.transcriptions.create(
                model="whisper-1",
                file=input_audio_file
            )
        # Separate consecutive chunks with a single space; writing the texts
        # back to back would glue the last word of one chunk to the first word
        # of the next.
        if chunk_index:
            out_txt_file.write(" ")
        out_txt_file.write(transcription.text.strip())

100%|██████████| 8/8 [00:34<00:00,  4.36s/it]


## Clean Up

In [17]:
import shutil

# Remove the intermediate artifacts. Each removal is guarded so the cell can be
# re-run (or run after a partially failed pipeline) without raising
# FileNotFoundError for something that is already gone.
if os.path.exists(audio_file_path):
    os.remove(audio_file_path)
if os.path.isdir(audio_chunks_dir):
    shutil.rmtree(audio_chunks_dir)