In [1]:
import logging
# Configure root logging before the remaining imports so that INFO messages
# emitted while they load (e.g. the "PyTorch version ... available." line in
# this cell's output) are rendered with this format.
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(asctime)s - %(message)s')

import pandas as pd
import numpy as np

from pydub import AudioSegment

from evaluate import load

import os

# Model identifiers handed to the stt helpers further down:
#   embedding_model -> stt.transcribe_segments (alternative: speechbrain/spkrec-ecapa-voxceleb)
#   pyannote_model  -> stt.apply_diarization
embedding_model = 'pyannote/embedding'
pyannote_model = "pyannote/speaker-diarization-3.1"

import torch
# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

from IPython.display import clear_output

# --- Voice activity detection / speaker matching settings ---
use_vad = True            # when False the full audio is exported instead of the Silero VAD output
silero_threshold = 0.5    # passed to stt.apply_silero_vad_to_wav
cosine_sim_lim = 0.2      # passed to stt.transcribe_segments

# --- Whisper settings ---
whisper_model = 'turbo'
whisper_beam = 3
whisper_ns_prob = 0.2

# Options dict handed to stt.transcribe_segments; built from the scalars above
# so the two cannot drift apart.
whisper_config = dict(
    beam_size=whisper_beam,
    no_speech_threshold=whisper_ns_prob,
    condition_on_previous_text=False,
)

import data_extraction as da
import stt
import utils

INFO - 2025-06-03 16:29:01,839 - PyTorch version 2.7.0 available.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eye4got/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Movie index lives next to the subtitle files; the helper is given both the
# parquet path and the subtitle directory.
movie_index_fp = os.path.join(da.sub_dir, 'movie_index.parquet')
films_list_df = da.get_or_create_subtitles_data(movie_index_fp, da.sub_dir)

# TODO: add download scripts for transcript downloads

# zenodo_get.download(
#     record_or_doi=4881008,
#     output_dir=os.path.join('data')
# )

# Unzip zip files

In [3]:
# Credits start times per movie. Later cells read the `movie` and
# `credits_start_sec` columns to cut the audio off where the credits begin.
credits_df = da.get_credits_timestamps()

In [4]:
# Read the manually prepared Annie Hall transcript that serves as the
# reference text for the CER / WER scores.
manual_transcript_fp = os.path.join(da.transcription_dir, 'manual', 'Annie Hall.txt')
with open(manual_transcript_fp) as manual_file:
    raw_annie_hall_man_txt = manual_file.read()

# Flatten to a single line: every newline becomes a space.
annie_hall_man_txt = raw_annie_hall_man_txt.replace('\n', ' ')

def calc_cer_wer(movie_name: str, ref_txt: str) -> tuple:
    """Score a movie's generated transcript against a reference text.

    The transcript parquet for ``movie_name`` is read from
    ``da.transcription_dir``, its ``text`` column is concatenated into a single
    string and lightly normalised (see below), and the result is compared with
    ``ref_txt`` using the ``cer`` and ``wer`` metrics from ``evaluate``.

    Args:
        movie_name: Movie title used to fill ``da.transcript_df_fp``.
        ref_txt: Reference transcript. It is passed to the metrics as-is.
            NOTE(review): no lower-casing or punctuation stripping is applied
            to it here, so it presumably needs to be normalised the same way as
            the prediction by the caller - confirm.

    Returns:
        ``(cer_score, wer_score)`` as returned by the metrics' ``compute``.
    """
    transcript_fp = os.path.join(da.transcription_dir, da.transcript_df_fp.format(movie_name=movie_name))
    transcript_df = pd.read_parquet(transcript_fp)

    # Segments that are exactly ' Thank you.' are excluded from scoring
    # (presumably spurious model output - TODO confirm).
    segment_txt = transcript_df.loc[transcript_df['text'].ne(' Thank you.'), 'text']

    # Normalise the prediction: strip . , " ? then lower-case and turn hyphens
    # into spaces. Raw string so the backslashes reach the regex engine without
    # triggering Python's invalid-escape-sequence warning.
    trans_txt = ''.join(segment_txt.str.replace(r'[\.,"\?]', '', regex=True)).lower().replace('-', ' ')

    cer, wer = load("cer"), load("wer")
    cer_score = cer.compute(predictions=[trans_txt], references=[ref_txt])
    wer_score = wer.compute(predictions=[trans_txt], references=[ref_txt])

    return cer_score, wer_score

In [5]:
# Show the Whisper options currently held in the shared config dict.
whisper_config

{'beam_size': 3,
 'no_speech_threshold': 0.2,
 'condition_on_previous_text': False}

In [6]:
# Build the grid of pipeline settings to evaluate: one dict per combination,
# in the same order nested loops would produce (right-most `for` varies
# fastest). A comprehension is used so the iteration variables stay local and
# do not overwrite the notebook-level settings of the same name (use_vad,
# silero_threshold, whisper_model, embedding_model, cosine_sim_lim).
# Trailing comments keep the wider ranges that were tried previously.
config_list = [
    {
        'use_vad': vad_flag,
        'silero_threshold': vad_threshold,
        'whisper_model': model_name,
        'embedding_model': embedder_name,
        'cosine_sim_lim': sim_limit,
        'beam_size': beam_size,
        'no_speech_threshold': no_speech_threshold,
    }
    for vad_flag in [True]  # (True, False)
    for vad_threshold in [0.5]  # (0.4, 0.45, 0.5)
    for model_name in ['turbo']  # , 'large')
    for embedder_name in ["speechbrain/spkrec-ecapa-voxceleb", "pyannote/embedding"]
    for sim_limit in [0.2]  # (0.15, 0.2, 0.25)
    for beam_size in (1, 3, 7)
    for no_speech_threshold in (0.1, 0.2)
]

In [None]:
# Grid search: run the full VAD -> diarization -> transcription pipeline on
# Annie Hall once per entry in config_list and score each transcript against
# the manual reference (annie_hall_man_txt).
movie_name = 'Annie Hall'
mp3_filename = 'Annie Hall.mp3'

# These locations depend only on the movie, not on the configuration.
vad_df_path = os.path.join(da.voice_activity_dir, f'{movie_name}-vad.parquet')
seg_df_path = os.path.join(da.diarization_dir, f'{movie_name}-diarization.parquet')
curr_transcript_fp = os.path.join(da.transcription_dir, da.transcript_df_fp.format(movie_name=movie_name))
wav_filepath = os.path.join(da.trans_mp3_dir, f'{movie_name}_speech_only.wav')

results = []

# 1-based counter so the final run is logged as "N / N".
for run_no, config in enumerate(config_list, start=1):
    logging.info(f'NEW CONFIG RUN: \t{run_no} / {len(config_list)}')
    # presumably removes the movie's outputs from the previous run so each
    # configuration starts from scratch - confirm in data_extraction
    da.wipe_movie_files(movie_name)

    # Per-run copy of the Whisper options: the shared whisper_config dict (and
    # the other notebook-level settings) are left untouched, so cells executed
    # after this one do not silently inherit the last grid point.
    run_whisper_config = {
        **whisper_config,
        'beam_size': config['beam_size'],
        'no_speech_threshold': config['no_speech_threshold'],
    }

    if config['use_vad']:
        stt.apply_silero_vad_to_wav(mp3_filename, wav_filepath, vad_df_path, config['silero_threshold'], credits_df)

    else:
        # No VAD: export the full audio as wav, cut off at the credits start
        # time when one is known for this movie.
        full_audio = AudioSegment.from_mp3(os.path.join(da.trans_mp3_dir, mp3_filename))
        if movie_name in credits_df.movie.values:
            credits_ts = credits_df[credits_df.movie.eq(movie_name)]['credits_start_sec'].iloc[0]
            # pydub slices in milliseconds
            full_audio = full_audio[:int(credits_ts*1000)]
        full_audio.export(wav_filepath, format="wav")
        del full_audio

    stt.apply_diarization(movie_name, wav_filepath, pyannote_model, seg_df_path, device)
    stt.transcribe_segments(curr_transcript_fp, seg_df_path, wav_filepath, config['whisper_model'], run_whisper_config, config['embedding_model'], config['cosine_sim_lim'], device)

    # Delete Wav File afterwards as they are quick to generate and consume too much space
    if os.path.exists(wav_filepath):
        os.remove(wav_filepath)

    cer, wer = calc_cer_wer(movie_name, annie_hall_man_txt)
    # Store the settings next to the scores so each result is self-describing
    # rather than tied to config_list only by list position.
    results.append({**config, 'cer': cer, 'wer': wer})

INFO - 2025-06-03 16:29:06,980 - NEW CONFIG RUN: 	0 / 12
INFO - 2025-06-03 16:29:06,982 - Applying Silero VAD to Annie Hall
INFO - 2025-06-03 16:29:47,501 - Slicing up Audio from Annie Hall to speech only


In [41]:
# Torch (pyannote) isn't familiar with MP3 files, so convert to wav for effective performance
# Perform diarization to help separate narration in audio description from dialogue in original movie
# Finally use OpenAI's Whisper to convert to a transcript
#
# Each stage is skipped when its parquet output already exists, so re-running
# this cell only processes movies that are still missing results.

mp3_files = [x for x in os.listdir(da.trans_mp3_dir) if os.path.splitext(x)[-1].lower() == '.mp3']

for mp3_filename in mp3_files:
    movie_name = utils.remove_ext(mp3_filename)
    vad_df_path = os.path.join(da.voice_activity_dir, f'{movie_name}-vad.parquet')
    seg_df_path = os.path.join(da.diarization_dir, f'{movie_name}-diarization.parquet')
    curr_transcript_fp = os.path.join(da.transcription_dir, da.transcript_df_fp.format(movie_name=movie_name))
    wav_filepath = os.path.join(da.trans_mp3_dir, f'{movie_name}_speech_only.wav')

    # If either diarization or transcript is missing, we'll need to generate the wav file
    if not os.path.exists(curr_transcript_fp) or not os.path.exists(seg_df_path):
        if use_vad:
            stt.apply_silero_vad_to_wav(mp3_filename, wav_filepath, vad_df_path, silero_threshold, credits_df)

        else:
            # No VAD: export the full audio as wav, cut off at the credits
            # start time when one is known for this movie.
            full_audio = AudioSegment.from_mp3(os.path.join(da.trans_mp3_dir, mp3_filename))
            if movie_name in credits_df.movie.values:
                credits_ts = credits_df[credits_df.movie.eq(movie_name)]['credits_start_sec'].iloc[0]
                # pydub slices in milliseconds
                full_audio = full_audio[:int(credits_ts*1000)]
            full_audio.export(wav_filepath, format="wav")
            del full_audio

    # Only perform diarization if parquet doesn't exist
    if not os.path.exists(seg_df_path):
        stt.apply_diarization(movie_name, wav_filepath, pyannote_model, seg_df_path, device)

    # Only perform transcription if parquet doesn't exist
    if not os.path.exists(curr_transcript_fp):
        # Same positional order as the grid-search cell's call:
        # (..., embedding_model, cosine_sim_lim, device). Leaving out
        # cosine_sim_lim would shift `device` into its position.
        stt.transcribe_segments(curr_transcript_fp, seg_df_path, wav_filepath, whisper_model, whisper_config, embedding_model, cosine_sim_lim, device)

    # Delete Wav File afterwards as they are quick to generate and consume too much space
    if os.path.exists(wav_filepath):
        os.remove(wav_filepath)

# Sweep up any wav files left behind in the mp3 directory
utils.clean_up_missed_wav_files(da.trans_mp3_dir)

INFO - 2025-06-03 12:08:10,111 - Applying Silero VAD to Annie Hall


### Subtitles Edited

File Completely Empty: X-Men, Finding Neverland, Mr Mrs Smith
Grease: Line 6916
Hangover Part II: Timestamps messed up line 5578
Super Mario Bros. Movie: Line 3877, Missing hours 
The Social Network: Counter 658, 1507, 1526

Index Titles Edited:
- Goodbye Columbus
- Monsters Inc
- What's Up, Doc


In [None]:
# full_subs_df_list = []

# for movie_cat in ('Blockbusters', 'Oscar'):
#     cat_mask = films_list_df.fame_category.eq(movie_cat)
#     for year in films_list_df.year.unique():
#         year_dir = os.path.join(da.sub_by_year_dir, movie_cat, str(year))
#         for movie_fp in os.listdir(year_dir):
#             full_subs_df_list.append(da.extract_single_subs_file(os.path.join(year_dir, movie_fp)))
                
# full_subs_df = pd.concat(full_subs_df_list)

# full_subs_df['movie'] = full_subs_df['movie'].str.strip().str.replace('-', ' ')
# films_list_df['movie'] = films_list_df['movie'].str.strip().str.replace('-', ' ').str.replace("'", ' ').str.replace('&', 'and')

In [None]:
# films_list_df.loc[films_list_df.movie.eq('Don t Look Up'), 'movie'] = 'Dont Look Up'
# films_list_df.loc[films_list_df.movie.eq('Goodbye,Columbus'), 'movie'] = 'Goodbye Columbus'
# films_list_df.loc[films_list_df.movie.eq('Summer of  42'), 'movie'] = 'Summer of 42'
# films_list_df.loc[films_list_df.movie.eq('What s Up, Doc_'), 'movie'] = 'What s Up, Doc'
# films_list_df.loc[films_list_df.movie.eq('Monsters, Inc.'), 'movie'] = 'Monsters Inc'

# combined_subs_df = full_subs_df.merge(films_list_df, how='left')
# combined_subs_df.to_parquet(da.sub_df_dir)

In [None]:
# all_transcripts_df_list = []
# longitudinal_movies = [utils.remove_ext(x) for x in os.listdir(da.trans_mp3_dir)]

# for filename in os.listdir(da.transcription_dir):
#     if filename.split('-')[0] in longitudinal_movies:
#         all_transcripts_df_list.append(pd.read_parquet(os.path.join(da.transcription_dir, filename)))

# all_transcripts_df = pd.concat(all_transcripts_df_list)
# all_transcripts_df.to_parquet(da.all_transcripts_df_dir)