<a href="https://colab.research.google.com/github/theaidran/subtitles_from_audio/blob/main/subtitles_from_audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install git+https://github.com/m-bain/whisperx.git > /dev/null
!pip install ffmpeg-python
!pip install torch  # Install PyTorch if not already available
import mimetypes
import subprocess
import json
import os
from google.colab import files  # Import to trigger file downloads in Colab

# Colab form inputs
# The `# @param` assignments below are the user-tunable settings of this cell;
# the `#@markdown` lines are the explanatory text that accompanies them.
#@markdown Upload your audio file (wav, mp3, mp4, mkv) to the Colab default directory (menu on the left).
#@markdown Insert your file name:
# Path of the media file to probe and transcribe. Its base name (without
# extension) is reused for the generated .srt and .mp4 file names.
media_file = "/content/myfile.wav" # @param {"type":"string","placeholder":"your file "}

#@markdown myfile.srt file will be automatically downloaded when ready

#@markdown ---
# Master switch for video rendering: when False only the .srt file is written
# and none of the settings below are read.
embed_subtitles_into_mp4_video = False  # @param {"type":"boolean"}
# Only consulted when embedding is enabled and no image file is given; when it
# is False in that case, the media file itself is used as the video input.
use_black_background = True  #@param {type:"boolean"}
# A non-empty value takes precedence over use_black_background and is used as a
# looped still-image background.
optional_image_file = ""  # @param {"type":"string","placeholder":"jpeg, png, tif, etc"}
# "default" leaves the subtitle style untouched; a number is forwarded to the
# subtitles filter as FontSize=<value>.
font_size = "default"  # @param ["default","12","14","16","18","20","22","24","26","28","30","32","34","36","None"] {"allow-input":true}
# Frame size of the generated black background; it is not applied when an image
# background or the original media file is used.
output_resolution = "1280x720"  # @param ["3840x2160","1920x1080","1280x720","854x480"]

#@markdown Embedding time depends on the file formats involved: for a '.wav' input at an output resolution of 1280x720, it takes about half the duration of the audio file.

#@markdown File myfile_with_subtitles.mp4 will be in /content when ready
# Import required libraries
import whisperx
import ffmpeg  # Import the ffmpeg-python binding
import torch  # Import torch for device detection
import wave  # To calculate the duration of the .wav file

# Helper function to convert time in seconds to SRT time format
def convert_time_srt_format(seconds):
    """Format a time offset as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        seconds: Offset from the start of the media, in seconds (int or float).

    Returns:
        The timestamp string, e.g. ``"00:01:02,345"``. Negative offsets are
        clamped to ``"00:00:00,000"`` because SRT has no negative timestamps.
    """
    # Round once to whole milliseconds and do the rest in integer arithmetic.
    # Truncating the fractional part separately loses a millisecond whenever
    # the float sits just below the intended value (1.001 -> 0.99999... ms).
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

# Function to detect the file type
def detect_file_type(file_path):
    """Return the MIME type guessed from ``file_path``'s name, or None if unknown.

    The guess comes from ``mimetypes.guess_type``; the encoding half of its
    result is discarded.
    """
    guessed = mimetypes.guess_type(file_path)
    return guessed[0]

# Function to get media info using ffprobe
def get_media_info(file_path):
    """Query ffprobe for the container format of ``file_path``.

    Args:
        file_path: Path of the media file to inspect.

    Returns:
        The parsed ffprobe JSON output (a dict) for the ``format=format_name``
        entry.

    Raises:
        RuntimeError: If ffprobe exits with a non-zero status or prints
            nothing, with ffprobe's own error text included in the message.
    """
    result = subprocess.run(
        ['ffprobe', '-v', 'error', '-show_entries', 'format=format_name', '-of', 'json', file_path],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    # Surface ffprobe's diagnostics instead of letting an empty or partial
    # stdout turn into an unrelated JSON/KeyError further down.
    if result.returncode != 0 or not result.stdout.strip():
        details = result.stderr.decode('utf-8', errors='replace').strip() or 'no error output'
        raise RuntimeError(f"ffprobe could not read '{file_path}': {details}")
    return json.loads(result.stdout)

# Function to get audio duration from a .wav or other media files
def get_audio_duration(file_path):
    """Return the duration of ``file_path`` in seconds and print it.

    WAV containers are measured with the standard-library ``wave`` reader
    (frames / frame rate). Every other format, and any WAV file that ``wave``
    cannot parse, is measured by asking ffprobe for ``format=duration``.

    Args:
        file_path: Path of the media file.

    Returns:
        The duration in seconds as a float.

    Raises:
        ValueError: If ffprobe's duration output is not a number.
    """
    media_info = get_media_info(file_path)
    format_name = media_info['format']['format_name']

    duration = None
    if 'wav' in format_name:
        try:
            with wave.open(file_path, "rb") as audio:
                duration = audio.getnframes() / float(audio.getframerate())
        except (wave.Error, EOFError, ZeroDivisionError):
            # The wave module rejects encodings it does not support (for
            # example IEEE-float WAV); let ffprobe measure those instead.
            duration = None

    if duration is None:
        # For other media formats like mp3, mp4, etc.
        result = subprocess.run(
            ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', file_path],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        duration = float(result.stdout.strip())

    print(f"Detected duration: {duration} seconds for {file_path}")
    return duration

# Helper function to run FFmpeg and capture stderr output for debugging
def run_ffmpeg_with_error_capture(ffmpeg_command):
    """Run an ffmpeg-python command, printing FFmpeg's stderr if it fails.

    Args:
        ffmpeg_command: An object exposing ``run(...)``; it is run with
            ``overwrite_output=True`` and with stdout/stderr captured.

    Raises:
        ffmpeg.Error: Re-raised unchanged after its stderr has been printed.
    """
    try:
        # Capture both stdout and stderr
        ffmpeg_command.run(overwrite_output=True, capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        # Decode leniently: FFmpeg may echo file names or metadata that are not
        # valid UTF-8, and a decode failure here would hide the real error.
        print(f"FFmpeg Error: {e.stderr.decode('utf-8', errors='replace') if e.stderr else 'No stderr output available.'}")
        # Bare raise keeps the original exception and traceback intact.
        raise

# Step 1: Validate form inputs (media file and optional image)
if not media_file:
    raise ValueError("Please provide a media file using the form.")

# Fail early with an actionable message instead of letting ffprobe or the
# transcription model fail later on a path that does not exist.
if not os.path.isfile(media_file):
    raise FileNotFoundError(
        f"Media file not found: {media_file}. Upload it to Colab and check the file name in the form."
    )

# The background image is only read when a video is rendered, so only check it then.
if embed_subtitles_into_mp4_video and optional_image_file and not os.path.isfile(optional_image_file):
    raise FileNotFoundError(
        f"Background image not found: {optional_image_file}. Upload it to Colab or clear the field."
    )

# Extract the base name of the media file (without extension); it names the
# generated .srt and .mp4 files.
base_name = os.path.splitext(os.path.basename(media_file))[0]

# Step 2: Detect file type and duration
media_type = detect_file_type(media_file)
audio_duration = get_audio_duration(media_file)

print(f"Detected file type: {media_type}")
print(f"Audio/Media duration: {audio_duration}")

# Step 3: Load WhisperX large-v2 model and transcribe the audio
print("Loading WhisperX large-v2 model...")
device = "cuda" if torch.cuda.is_available() else "cpu"  # Automatically set to GPU if available, otherwise use CPU
# WhisperX loads the model in float16 unless told otherwise, which is not
# usable on CPU; pick int8 there so the CPU fallback above actually works.
# On GPU this passes the same value WhisperX would use by default.
compute_type = "float16" if device == "cuda" else "int8"
model = whisperx.load_model("large-v2", device=device, compute_type=compute_type)

print("Transcribing media with WhisperX...")
result = model.transcribe(media_file)

# Detect language from the transcription
language_code = result['language']
print(f"Detected language: {language_code}")

# Align the transcription with timestamps, using an alignment model chosen
# for the detected language.
model_a, metadata = whisperx.load_align_model(language_code=language_code, device=device)
result_aligned = whisperx.align(result["segments"], model_a, metadata, media_file, device=device)

# Step 4: Save the transcription as a .srt file with the same base name as media_file
srt_filename = f"{base_name}.srt"
# Write UTF-8 explicitly: transcripts routinely contain non-ASCII text, and the
# platform default encoding is not guaranteed to be able to represent it.
with open(srt_filename, "w", encoding="utf-8") as srt_file:
    for i, segment in enumerate(result_aligned["segments"]):
        start_time = segment["start"]
        end_time = segment["end"]
        text = segment["text"]

        # Convert start and end time to SRT format (hh:mm:ss,ms)
        start_srt = convert_time_srt_format(start_time)
        end_srt = convert_time_srt_format(end_time)

        # One SRT cue: 1-based index, time range, text, then a blank separator line.
        srt_file.write(f"{i + 1}\n")
        srt_file.write(f"{start_srt} --> {end_srt}\n")
        srt_file.write(f"{text.strip()}\n\n")

print(f"Subtitles generated and saved as: {srt_filename}")

# Properly escape the SRT file path
# Backslash-escapes ':' and ' ' so the name can be placed inside the
# `subtitles='...'` filter string built in Step 6.
# NOTE(review): other special characters (e.g. apostrophes, backslashes) are
# not escaped here — confirm whether such file names need support.
srt_filename_escaped = srt_filename.replace(":", "\\:").replace(" ", "\\ ")

# Step 5: Handle background (either image, black background, or no background)
# Default name of the intermediate video (background + audio, no subtitles yet).
# It is only written by the image and black-background branches below.
output_video = f"{base_name}_output.mp4"

if embed_subtitles_into_mp4_video:
    # An image file, when given, wins over the black-background option because
    # it is checked first.
    if optional_image_file:
        print("Creating video with image background and audio...")

        # Use the provided image as the background and resize it to ensure height is divisible by 2
        image_input = (
            ffmpeg.input(optional_image_file, loop=1)
            .filter('scale', 'trunc(iw/2)*2', 'trunc(ih/2)*2')  # Ensure width and height are divisible by 2
        )
        audio_input = ffmpeg.input(media_file)  # Audio input

        # Combine image and audio, with error capture
        # NOTE(review): shortest=None is presumably emitted by ffmpeg-python as
        # the bare `-shortest` flag, ending the output with the audio — confirm.
        run_ffmpeg_with_error_capture(
            ffmpeg.output(image_input, audio_input, output_video, vcodec='libx264', acodec='aac', pix_fmt='yuv420p', shortest=None, strict='experimental')
        )

    elif use_black_background:
        print("Creating video with black background and audio...")

        # Create a black background video with the specified resolution
        # This writes a 1-second clip to black_background.mp4 in the working
        # directory; nothing in this step removes that file afterwards.
        run_ffmpeg_with_error_capture(
            ffmpeg.input(f'color=c=black:s={output_resolution}', f='lavfi').output('black_background.mp4', t=1, vcodec='libx264')
        )

        # Define separate inputs for background and audio
        background_input = ffmpeg.input('black_background.mp4', stream_loop=-1)  # Loop the black background
        audio_input = ffmpeg.input(media_file)  # Audio input

        # Combine video and audio, with error capture
        run_ffmpeg_with_error_capture(
            ffmpeg.output(background_input, audio_input, output_video, vcodec='libx264', acodec='aac', pix_fmt='yuv420p', shortest=None, strict='experimental')
        )

    else:
        # If no background is needed, just use the original media file
        # No intermediate file is created: Step 6 reads media_file directly.
        print("Using the original video file as input...")
        output_video = media_file

# Step 6: Add subtitles to the video if "embed_subtitles_into_mp4_video" is selected
output_video_with_subtitles = f"{base_name}_with_subtitles.mp4"

if embed_subtitles_into_mp4_video:
    print("Embedding subtitles into the video with custom font size...")

    # The form offers both "default" and "None" (and allows free input), so
    # treat "default", "None" and blank alike: no font-size override. Passing
    # them through would produce an invalid style such as "FontSize=None".
    font_size_value = str(font_size).strip()
    if font_size_value.lower() in ("", "default", "none"):
        font_size_style = ""
    else:
        font_size_style = f"FontSize={font_size_value}"

    # Only append force_style when there is a style to force.
    subtitles_filter = f"subtitles='{srt_filename_escaped}'"
    if font_size_style:
        subtitles_filter += f":force_style='{font_size_style}'"

    # Burn the subtitles into the video, re-encoding video and audio.
    run_ffmpeg_with_error_capture(
        ffmpeg.input(output_video).output(
            output_video_with_subtitles,
            vcodec='libx264',
            acodec='aac',
            vf=subtitles_filter
        )
    )
    print(f"Final video with subtitles saved as: {output_video_with_subtitles}")
else:
    # No video is rendered in this mode, so do not announce a "final video";
    # the variable is still pointed at output_video as before.
    print("Not embedding subtitles. Skipping subtitle embedding.")
    output_video_with_subtitles = output_video

# Step 7: Automatically trigger download of the SRT file
print(f"Downloading the generated SRT file: {srt_filename}")
files.download(srt_filename)  # Automatically download the SRT file


  Running command git clone --filter=blob:none --quiet https://github.com/m-bain/whisperx.git /tmp/pip-req-build-y89fbyrv
Detected duration: 66.38 seconds for /content/myfile.wav
Detected file type: audio/x-wav
Audio/Media duration: 66.38
Loading WhisperX large-v2 model...


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.4.1+cu121. Bad things might happen unless you revert torch to 1.x.
Transcribing media with WhisperX...
Detected language: en (1.00) in first 30s of audio...
Detected language: en
Subtitles generated and saved as: myfile.srt
Creating video with black background and audio...
Embedding subtitles into the video with custom font size...
Final video with subtitles saved as: myfile_with_subtitles.mp4
Downloading the generated SRT file: myfile.srt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>