In [1]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(asctime)s - %(message)s')

import pandas as pd

from evaluate import load

import os

pyannote_model = 'pyannote/speaker-diarization-3.1'
embedding_model = "pyannote/embedding" # speechbrain/spkrec-ecapa-voxceleb

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from IPython.display import clear_output

use_vad = True
narr_cosine_sim_lim = 0.14
diag_cosine_sim_lim = 1

whisper_model = 'turbo'
silero_threshold = 0.5

whisper_config = {
    'beam_size': 1,
    'no_speech_threshold': 0.1,
    'condition_on_previous_text': False
}

import data_extraction as da
import stt

import warnings
warnings.filterwarnings("ignore")

logging.getLogger("speechbrain").setLevel(logging.WARNING)
logging.getLogger("pyannote").setLevel(logging.WARNING)

INFO - 2025-06-11 13:28:05,489 - PyTorch version 2.7.0 available.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eye4got/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO - 2025-06-11 13:28:10,315 - Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO - 2025-06-11 13:28:10,316 - Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


In [2]:
films_list_df = da.get_or_create_subtitles_data(os.path.join(da.sub_dir, 'movie_index.parquet'), da.sub_dir)

# TODO: add download scripts for transcript downloads

# zenodo_get.download(
#     record_or_doi=4881008,
#     output_dir=os.path.join('data')
# )

# Unzip zip files

In [3]:
credits_df = da.get_credits_timestamps()

In [4]:
with open(os.path.join(da.transcription_dir, 'manual', 'Annie Hall.txt')) as fileobj:
    raw_annie_hall_man_txt = fileobj.read()
annie_hall_man_txt = raw_annie_hall_man_txt.replace('\n', ' ')

def calc_cer_wer(movie_name: str, ref_txt: str):
    trans_df = pd.read_parquet(os.path.join(da.transcription_dir, da.transcript_df_fp.format(movie_name=movie_name)))
    trans_df = trans_df[trans_df['text'].ne(' Thank you.')]['text']
    trans_txt = ''.join(trans_df.str.replace('[\.,"\?]', '', regex=True)).lower().replace('-', ' ')
    
    cer, wer = load("cer"), load("wer")
    cer_score = cer.compute(predictions=[trans_txt], references=[ref_txt])
    wer_score = wer.compute(predictions=[trans_txt], references=[ref_txt])
    
    return cer_score, wer_score

In [5]:
config_list = []
for use_vad in [True]: # (True, False):
    for silero_threshold in [0.5]: # (0.4, 0.45, 0.5):
        for whisper_model in ['turbo']: # , 'large'
            for embedding_model in ["pyannote/embedding"]: # "speechbrain/spkrec-ecapa-voxceleb"
                for cosine_sim_lim in [0.14]: # 0.15, 0.2, 
                    for beam_size in [1]:
                        for no_speech_threshold in [0.1]: # , 0.2
                            config = {
                                'use_vad': use_vad,
                                'silero_threshold': silero_threshold,
                                'whisper_model': whisper_model,
                                'embedding_model': embedding_model,
                                'cosine_sim_lim': cosine_sim_lim,
                                'beam_size': beam_size,
                                'no_speech_threshold': no_speech_threshold
                            }
                            
                            config_list.append(config)

In [6]:
movie_name = 'Annie Hall'
mp3_filename = 'Annie Hall.mp3'

results = []

for ii, config in enumerate(config_list):
    logging.info(f'NEW CONFIG RUN: \t{ii} / {len(config_list)}')
    # da.wipe_movie_files(movie_name)
    
    use_vad = config['use_vad']
    silero_threshold = config['silero_threshold']
    whisper_model = config['whisper_model']
    embedding_model = config['embedding_model']
    cosine_sim_lim = config['cosine_sim_lim']
    whisper_config['beam_size'] = config['beam_size']
    whisper_config['no_speech_threshold'] = config['no_speech_threshold']
    
    vad_df_path = os.path.join(da.voice_activity_dir, f'{movie_name}-vad.parquet')
    seg_df_path = os.path.join(da.diarization_dir, f'{movie_name}-diarization.parquet')
    curr_transcript_fp = os.path.join(da.transcription_dir, da.transcript_df_fp.format(movie_name=movie_name))
    wav_filepath = os.path.join(da.trans_mp3_dir, f'{movie_name}_speech_only.wav')

    stt.apply_silero_vad_to_wav(mp3_filename, wav_filepath, vad_df_path, silero_threshold, credits_df)
    stt.apply_diarization(movie_name, wav_filepath, pyannote_model, seg_df_path, device)
    stt.add_pyannote_cosine_sim(seg_df_path, wav_filepath, min_seg_sec=0.3, device=device)
    stt.transcribe_segments(curr_transcript_fp, seg_df_path, wav_filepath, whisper_model, whisper_config, narr_cosine_sim_lim, diag_cosine_sim_lim, device)
        
    cer, wer = calc_cer_wer(movie_name, annie_hall_man_txt)
    results.append({'cer': cer, 'wer': wer})
    print(results[-1])

INFO - 2025-06-11 13:28:10,374 - NEW CONFIG RUN: 	0 / 1
INFO - 2025-06-11 13:28:10,375 - Applying Silero VAD to Annie Hall
INFO - 2025-06-11 13:28:49,472 - Slicing up audio from Annie Hall to speech only
INFO - 2025-06-11 13:28:50,954 - Started pyannote pipeline for Annie Hall
INFO - 2025-06-11 13:38:01,037 - Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/pyannote/models--pyannote--embedding/snapshots/4db4899737a38b2d618bbd74350915aa10293cb2/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.7.0+cu126. Bad things might happen unless you revert torch to 1.x.


INFO - 2025-06-11 13:38:29,007 - Segment: 1 / 222
INFO - 2025-06-11 13:39:00,396 - Segment: 51 / 222
INFO - 2025-06-11 13:39:42,854 - Segment: 101 / 222
INFO - 2025-06-11 13:40:01,105 - Segment: 151 / 222
INFO - 2025-06-11 13:40:19,586 - Segment: 201 / 222


{'cer': 0.19520435967302452, 'wer': 0.2622857142857143}
