In [1]:
!pip install -q streamlit
!pip install speechbrain
!pip install transformers
!pip install ffmpeg
!pip install moviepy
!pip install pyngrok

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.8/164.8 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m98.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for validators (setup.py) ... [?25l[?25hdone
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting speechbrain
  Downloading speechbrain-0.5

In [2]:
%%writefile app.py
import os
import tempfile

import streamlit as st
import torchaudio
from moviepy.editor import VideoFileClip
from pyngrok import ngrok
from speechbrain.pretrained import EncoderDecoderASR

# Function to convert video to audio
def video2audio(video_path, audio_path):
    """Extract the audio track of a video file and write it to ``audio_path``.

    Failures are reported to the user through ``st.error`` instead of being
    raised; in that case nothing is written and the function returns early.

    Args:
        video_path: Path of the video file to read.
        audio_path: Path of the audio file to create.

    Returns:
        None.
    """
    if not os.path.exists(video_path):
        st.error("The video file could not be found.")
        return

    # Load the video file
    video = VideoFileClip(video_path)
    try:
        # Extract the audio from the video; this is None when the video has
        # no audio track, which previously crashed with AttributeError.
        audio = video.audio
        if audio is None:
            st.error("The video file does not contain an audio track.")
            return

        # Save the audio file
        audio.write_audiofile(audio_path)
    finally:
        # Always release the clip's underlying reader resources, even when
        # writing the audio fails.
        video.close()


# Function to segment audio into batches
def audio2batch(audio_path, batch_dir_path, batch_size_sec=5):
    """Split an audio file into consecutive fixed-length WAV files.

    The pieces are written to ``batch_dir_path`` as ``batch_1.wav``,
    ``batch_2.wav``, ... in playback order. A final, shorter piece is written
    when the audio length is not a multiple of the batch length.

    Any ``batch_<n>.wav`` file already present in ``batch_dir_path`` is deleted
    first, so pieces left over from an earlier, longer input cannot be mixed
    with the new ones.

    Args:
        audio_path: Path of the audio file to split (loaded with torchaudio).
        batch_dir_path: Output directory; created if it does not exist.
        batch_size_sec: Length of each full batch in seconds (default 5).

    Returns:
        None.

    Raises:
        ValueError: If ``batch_size_sec`` is shorter than one audio sample.
    """
    # create the output directory if it doesn't already exist
    os.makedirs(batch_dir_path, exist_ok=True)

    # load the audio file using torchaudio; dimension 1 of the waveform is
    # the sample (time) axis, as used by the slicing below
    waveform, sample_rate = torchaudio.load(audio_path)
    total_samples = waveform.size(1)

    # work in whole samples rather than float seconds so no trailing sample is
    # lost to floating-point rounding
    samples_per_batch = int(batch_size_sec * sample_rate)
    if samples_per_batch <= 0:
        raise ValueError("batch_size_sec must span at least one audio sample")

    # drop batch files left behind by a previous call
    prefix, suffix = "batch_", ".wav"
    for existing_name in os.listdir(batch_dir_path):
        existing_path = os.path.join(batch_dir_path, existing_name)
        is_batch_file = (
            existing_name.startswith(prefix)
            and existing_name.endswith(suffix)
            and existing_name[len(prefix):-len(suffix)].isdigit()
            and os.path.isfile(existing_path)
        )
        if is_batch_file:
            os.remove(existing_path)

    # number of batches that are a full batch_size_sec long
    num_batches = total_samples // samples_per_batch

    for i, start_sample in enumerate(range(0, total_samples, samples_per_batch)):
        # slicing past the end is clamped, which yields the shorter last batch
        batch_waveform = waveform[:, start_sample:start_sample + samples_per_batch]

        # save the current batch as a new WAV file
        output_file_path = os.path.join(batch_dir_path, f"batch_{i+1}.wav")
        torchaudio.save(output_file_path, batch_waveform, sample_rate)

        if i < num_batches:
            print(f'audit batch::{output_file_path}')
            print(f'batch {i+1} / {num_batches} saved')
        else:
            print('partial_batch saved.')
    return


# Function to transcribe audio batches
def transcribe(asr_model, batch_dir_path):
    """Transcribe the numbered batch files of a directory in numeric order.

    Files are expected to be named like ``batch_<n>.wav``; the number between
    the first ``_`` and the following ``.`` decides the order. Directory
    entries without such a number (for example ``.DS_Store``) are skipped
    instead of aborting the whole transcription.

    Args:
        asr_model: Object with a ``transcribe_file(path)`` method returning
            the transcription of one audio file as a string.
        batch_dir_path: Directory holding the batch files.

    Returns:
        str: The transcriptions of all batches joined by single spaces
        (an empty string when the directory holds no batch files).
    """
    def batch_number(file_name):
        # Same parsing the sort has always used; None marks a non-batch entry.
        try:
            return int(file_name.split("_")[1].split(".")[0])
        except (IndexError, ValueError):
            return None

    # keep only the entries that carry a batch number
    numbered_files = []
    for file_name in os.listdir(batch_dir_path):
        number = batch_number(file_name)
        if number is None:
            print(f"skipping non-batch file: {file_name}")
            continue
        numbered_files.append((number, file_name))

    # sort the batch files based on their numerical order
    numbered_files.sort(key=lambda pair: pair[0])
    batch_files = [file_name for _, file_name in numbered_files]

    transcriptions = []
    for i, batch_file in enumerate(batch_files):
        transcription = asr_model.transcribe_file(os.path.join(batch_dir_path, batch_file))
        transcriptions.append(transcription)
        print(f"batch {i+1} / {len(batch_files)} transcribed.")

    subtitles = " ".join(transcriptions)
    return subtitles


# Set up the Streamlit UI: render the page title. This is a top-level statement,
# so it runs before the ASR models are loaded further down the script.
st.title("Video Transcription")


# Function to load the ASR models
def load_models():
    """Load the three pretrained SpeechBrain LibriSpeech ASR models.

    Each model is fetched from the ``speechbrain/<name>`` source and stored
    under ``pretrained_models/<name>``. The models run on the GPU when CUDA is
    available and fall back to the CPU otherwise, so the app also starts on a
    runtime without a GPU.

    Returns:
        tuple: ``(asr_model_1, asr_model_2, asr_model_3)`` -- the transformer,
        CRDNN + transformer-LM and CRDNN + RNN-LM models, in that order.
    """
    # Imported here because this is the only place the file needs torch.
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Order matters: callers unpack the result positionally.
    model_names = (
        "asr-transformer-transformerlm-librispeech",
        "asr-crdnn-transformerlm-librispeech",
        "asr-crdnn-rnnlm-librispeech",
    )

    return tuple(
        EncoderDecoderASR.from_hparams(
            source=f"speechbrain/{model_name}",
            savedir=f"pretrained_models/{model_name}",
            run_opts={"device": device},
        )
        for model_name in model_names
    )


# Load the ASR models. Streamlit re-executes this script on every widget
# interaction, so the loader is wrapped in st.cache_resource: the models are
# created once per server process and reused on later reruns.
asr_model_1, asr_model_2, asr_model_3 = st.cache_resource(load_models)()

# Model selection dropdown: label shown to the user -> loaded model
model_options = {
    "EncoderDecoderASR (Transformer)": asr_model_1,
    "EncoderDecoderASR (CRDNN - Transformer)": asr_model_2,
    "EncoderDecoderASR (CRDNN - RNN)": asr_model_3,
}

selected_model = st.selectbox("Select the ASR model", list(model_options.keys()))

# File uploader for video selection
video_file = st.file_uploader("Upload a video file", type=["mp4"])

# Transcription button
transcribe_button = st.button("Transcribe Video")

# Output area for subtitles
subtitles_output = st.empty()

# Perform transcription when the button is clicked
if transcribe_button and video_file:
    # Every run gets its own scratch directory, removed when the block exits.
    # This keeps concurrent sessions from overwriting each other's files and
    # keeps batches of an earlier upload out of this transcription.
    with tempfile.TemporaryDirectory() as work_dir:
        # The file name comes from the client: keep only its base name so the
        # upload is always written inside work_dir.
        video_name = os.path.basename(video_file.name) or "video.mp4"
        video_path = os.path.join(work_dir, video_name)
        with open(video_path, "wb") as f:
            f.write(video_file.read())

        # Convert video to audio
        audio_path = os.path.join(work_dir, "audio.wav")
        video2audio(video_path, audio_path)

        # video2audio reports problems via st.error and returns without
        # writing the audio file; only continue when the audio exists.
        if os.path.exists(audio_path):
            # Convert audio to batches
            batch_dir_path = os.path.join(work_dir, "temp_batches")
            audio2batch(audio_path, batch_dir_path)

            # Perform transcription
            asr_model = model_options[selected_model]
            subtitles = transcribe(asr_model, batch_dir_path)

            # Display subtitles
            subtitles_output.text(subtitles)
elif transcribe_button:
    # The button was pressed without an upload: say so instead of doing nothing.
    st.warning("Please upload a video file before transcribing.")


Writing app.py


In [None]:
# Install the localtunnel npm package that the last command of this cell runs.
!npm install localtunnel
# Start the Streamlit app in the background; stdout and stderr are redirected
# to /content/logs.txt, so look there if the app does not come up.
!streamlit run app.py &>/content/logs.txt &
# Open a public localtunnel URL that forwards to local port 8501
# (presumably the port the Streamlit server listens on -- confirm in logs.txt).
!npx localtunnel --port 8501 

[K[?25h[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35msaveError[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[34;40mnotice[0m[35m[0m created a lockfile as package-lock.json. You should commit this file.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35menoent[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No description
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No repository field.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No README data
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No license field.
[0m
+ localtunnel@2.0.2
added 22 packages from 22 contributors and audited 22 packages in 1.683s

3 packages are looking for funding
  run `npm fund` for details

found [92m0[0m vulnerabilities

[K[?25hnpx: installed 22 in 2.121s
your url is: https://dry-webs-show.loca.lt
