# setup

### install dependencies

In [None]:
!pip install speechbrain
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting speechbrain
  Downloading speechbrain-0.5.14-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.0/519.0 kB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.0-py3-none-any.whl (16 kB)
Collecting sentencepiece (from speechbrain)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from speechbrain)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting ruamel.yaml>=0.17.8 (from hyperpyyaml->speechbrain)
  Downloading ruamel.yaml-0.17.26-

In [None]:
!pip install ffmpeg
!pip install moviepy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ffmpeg
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ffmpeg
  Building wheel for ffmpeg (setup.py) ... [?25l[?25hdone
  Created wheel for ffmpeg: filename=ffmpeg-1.4-py3-none-any.whl size=6083 sha256=c46248d7a5bf529d5cf55ccdf46a058b115e4c2101fcdb29462b0d332e452f33
  Stored in directory: /root/.cache/pip/wheels/8e/7a/69/cd6aeb83b126a7f04cbe7c9d929028dc52a6e7d525ff56003a
Successfully built ffmpeg
Installing collected packages: ffmpeg
Successfully installed ffmpeg-1.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### toolkit

In [None]:
from speechbrain.pretrained import EncoderDecoderASR
from speechbrain import alignment

In [None]:
import os
import torchaudio

In [None]:
from moviepy.editor import VideoFileClip

# helper functions

### video2audio conversion

In [None]:
def video2audio(video_path, audio_path):
    """Extract the audio track of a video and write it to ``audio_path``.

    Args:
        video_path: Path of the video file opened with ``VideoFileClip``.
        audio_path: Destination path passed to ``write_audiofile``.

    Raises:
        ValueError: If the clip exposes no audio track (``audio`` is ``None``).
    """
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            raise ValueError(f"no audio track found in {video_path!r}")
        audio.write_audiofile(audio_path)
    finally:
        # Always release whatever the clip holds open, even when writing fails.
        video.close()

### audio2batch segmentation

In [None]:
def audio2batch(audio_path, batch_dir_path, batch_size_sec=5):
    """Split an audio file into consecutive fixed-length WAV chunks.

    Chunks are written to ``batch_dir_path`` as ``batch_1.wav``,
    ``batch_2.wav``, ... in playback order. Samples remaining after the last
    full chunk are written as one additional, shorter file.

    Args:
        audio_path: Audio file read with ``torchaudio.load``.
        batch_dir_path: Output directory; created when missing. Files named
            ``batch_<number>.wav`` that already exist in it are deleted
            before writing, so chunks of an earlier run are never mixed
            with the new ones.
        batch_size_sec: Length of a full chunk in seconds. Defaults to 5.

    Raises:
        ValueError: If ``batch_size_sec`` covers less than one sample.
    """
    os.makedirs(batch_dir_path, exist_ok=True)

    waveform, sample_rate = torchaudio.load(audio_path)

    # Work in whole samples: converting to seconds and back can lose the
    # final sample to floating-point rounding.
    total_samples = waveform.size(1)
    samples_per_batch = int(batch_size_sec * sample_rate)
    if samples_per_batch <= 0:
        raise ValueError("batch_size_sec must cover at least one sample")

    # Drop chunk files left over from a previous (possibly longer) input.
    prefix = "batch_"
    for name in os.listdir(batch_dir_path):
        stem, ext = os.path.splitext(name)
        stale_path = os.path.join(batch_dir_path, name)
        if (
            ext == ".wav"
            and stem.startswith(prefix)
            and stem[len(prefix):].isdigit()
            and os.path.isfile(stale_path)
        ):
            os.remove(stale_path)

    num_batches = total_samples // samples_per_batch

    for i in range(num_batches):
        start_sample = i * samples_per_batch
        end_sample = start_sample + samples_per_batch
        batch_waveform = waveform[:, start_sample:end_sample]

        output_file_path = os.path.join(batch_dir_path, f"batch_{i+1}.wav")
        torchaudio.save(output_file_path, batch_waveform, sample_rate)

        print(f'batch {i+1} / {num_batches} saved')

    # Whatever is left after the last full chunk becomes a shorter final file.
    start_sample = num_batches * samples_per_batch
    if start_sample < total_samples:
        last_batch_waveform = waveform[:, start_sample:total_samples]

        output_file_path = os.path.join(batch_dir_path, f"batch_{num_batches+1}.wav")
        torchaudio.save(output_file_path, last_batch_waveform, sample_rate)
        print('partial_batch saved.')

    return

### transcription method

In [None]:
def transcribe(asr_model, batch_dir_path):
    """Transcribe every numbered chunk file in a directory, in numeric order.

    A file takes part when the text between its first ``_`` and the
    following ``.`` parses as an integer (e.g. ``batch_12.wav`` -> 12).
    Anything else found in the directory, such as hidden files or
    sub-directories, is skipped instead of aborting the run.

    Args:
        asr_model: Object exposing ``transcribe_file(path)``; each call is
            expected to return a string.
        batch_dir_path: Directory containing the chunk files.

    Returns:
        The per-chunk transcriptions joined with single spaces; an empty
        string when no chunk file is found.
    """
    def chunk_number(file_name):
        # Same parsing rule the sort key always used; None marks "not a chunk".
        try:
            return int(file_name.split("_")[1].split(".")[0])
        except (IndexError, ValueError):
            return None

    numbered_files = []
    for file_name in os.listdir(batch_dir_path):
        number = chunk_number(file_name)
        if number is None:
            continue
        if not os.path.isfile(os.path.join(batch_dir_path, file_name)):
            continue
        numbered_files.append((number, file_name))

    # Numeric sort, so that batch_10 comes after batch_9 rather than batch_1.
    numbered_files.sort(key=lambda pair: pair[0])
    batch_files = [file_name for _, file_name in numbered_files]

    transcriptions = []
    for i, batch_file in enumerate(batch_files):
        transcription = asr_model.transcribe_file(os.path.join(batch_dir_path, batch_file))
        transcriptions.append(transcription)
        print(f"batch {i+1} / {len(batch_files)} transcripted.")

    subtitles = " ".join(transcriptions)
    return subtitles

### setup models and paths

In [None]:
# intermediate WAV file: written by video2audio(), then read by audio2batch()
audio_path = "audio.wav"

# directory that receives the chunk files written by audio2batch()
# and is later scanned by transcribe()
batch_dir_path = "temp_batches/"

In [None]:
import torch

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on a runtime without CUDA.
ASR_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


def _load_librispeech_asr(model_name):
    """Load the pretrained SpeechBrain model ``speechbrain/<model_name>``.

    The downloaded files are stored under ``pretrained_models/<model_name>``
    and the model is placed on ``ASR_DEVICE``.
    """
    return EncoderDecoderASR.from_hparams(
        source=f"speechbrain/{model_name}",
        savedir=f"pretrained_models/{model_name}",
        run_opts={"device": ASR_DEVICE},
    )


# Three interchangeable candidates; the input cell picks one as `asr_model`.
asr_model_1 = _load_librispeech_asr("asr-transformer-transformerlm-librispeech")
asr_model_2 = _load_librispeech_asr("asr-crdnn-transformerlm-librispeech")
asr_model_3 = _load_librispeech_asr("asr-crdnn-rnnlm-librispeech")

Downloading (…)ain/hyperparams.yaml:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading normalizer.ckpt:   0%|          | 0.00/1.70k [00:00<?, ?B/s]

Downloading asr.ckpt:   0%|          | 0.00/291M [00:00<?, ?B/s]

Downloading lm.ckpt:   0%|          | 0.00/381M [00:00<?, ?B/s]

Downloading tokenizer.ckpt:   0%|          | 0.00/324k [00:00<?, ?B/s]

Downloading (…)ain/hyperparams.yaml:   0%|          | 0.00/5.39k [00:00<?, ?B/s]

Downloading (…)main/normalizer.ckpt:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading asr.ckpt:   0%|          | 0.00/436M [00:00<?, ?B/s]

Downloading lm.ckpt:   0%|          | 0.00/381M [00:00<?, ?B/s]

Downloading tokenizer.ckpt:   0%|          | 0.00/324k [00:00<?, ?B/s]

# implementation

### input

In [None]:
# video file whose speech is going to be transcribed
video_path = "video_sample.mp4"
# model passed to transcribe(); any of the asr_model_* objects loaded above
asr_model = asr_model_2

### run the pipeline

In [None]:
# write the video's audio track to audio_path, then cut that file into
# chunk files inside batch_dir_path
video2audio(video_path, audio_path)
audio2batch(audio_path, batch_dir_path)

MoviePy - Writing audio in audio.wav




MoviePy - Done.
batch 1 / 11 saved
batch 2 / 11 saved
batch 3 / 11 saved
batch 4 / 11 saved
batch 5 / 11 saved
batch 6 / 11 saved
batch 7 / 11 saved
batch 8 / 11 saved
batch 9 / 11 saved
batch 10 / 11 saved
batch 11 / 11 saved
partial_batch saved.


In [None]:
# transcribe the chunk files in numeric order and join the results into one string
subtitles = transcribe(asr_model, batch_dir_path)




batch 1 / 12 transcripted.
batch 2 / 12 transcripted.
batch 3 / 12 transcripted.
batch 4 / 12 transcripted.
batch 5 / 12 transcripted.
batch 6 / 12 transcripted.
batch 7 / 12 transcripted.
batch 8 / 12 transcripted.
batch 9 / 12 transcripted.
batch 10 / 12 transcripted.
batch 11 / 12 transcripted.
batch 12 / 12 transcripted.


### outputs

In [None]:
# show the complete transcript
print(subtitles)

WELCOME TO THE ENGLISH IN A MINUTE MORE JOY'S GOING TO HARDEN'S AND YOU GOT TO BE AROUND FRIENDS AIN'T FOOD AND MAY BE LISTENED TO MUSED IT BUT DO YOUR PARTIES EVER HAVE THE ANIMAL HARDY ANIMAL HE'S ONE OF THEM ARE YOU AS READY AS I I AM FOR THE WEEK END OH YES I'M GOING TO MY BROTHER'S BIRTH DAY PARTY ON FRIDAY MY COUSIN IS HAVING HER GRADUATION PARTY SATURDAY AND I'M DE LA HAYE OAKYARD HE CALLED SONTAG'S WELL YOU ARE QUITE HARDY ANIMAL HERE THAT'S NOTHING YOU SHOULD HAVE SEEN ME IN COLLEGE A PARTY ANIMAL IS NOT AN ACTUAL ANIMAL IT IS A PERSON WHO REALLY ENJOYS GOING TO PARTIES BOTH OF THEM TIME IS SPENT FINDING OUT WHERE THE LATEST AND BEST PARTY IS GOING TO BE AND THAT'S ENGLISH IN A MINUTE
