In [1]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(asctime)s - %(message)s')

import pandas as pd
import numpy as np

from evaluate import load

import os

diarization_model = 'pyannote/speaker-diarization-3.1' # 'BUT-FIT/DiCoW_v2'

embedding_model = "pyannote/embedding" # speechbrain/spkrec-ecapa-voxceleb

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from IPython.display import clear_output

use_vad = True
narr_cosine_sim_lim = 0.14
diag_cosine_sim_lim = 0.3

whisper_model = 'turbo'
silero_threshold = 0.5

whisper_config = {
    'beam_size': 7,
    'no_speech_threshold': 0.1,
    'condition_on_previous_text': False
}

import data_extraction as da
import stt

import warnings
warnings.filterwarnings("ignore")

logging.getLogger("speechbrain").setLevel(logging.WARNING)
logging.getLogger("pyannote").setLevel(logging.WARNING)

# TODO: Try use DiCoW to improve pyannote whisper combination https://github.com/BUTSpeechFIT/DiCoW/blob/c0e86b9ac529c086aa2197b95abf04f8fabb4fd7/app.py

INFO - 2025-06-17 20:05:10,937 - PyTorch version 2.7.1 available.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eye4got/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
films_list_df = da.get_or_create_subtitles_data(os.path.join(da.sub_dir, 'movie_index.parquet'), da.sub_dir)

# TODO: add download scripts for transcript downloads

# zenodo_get.download(
#     record_or_doi=4881008,
#     output_dir=os.path.join('data')
# )

# Unzip zip files

In [3]:
credits_df = da.get_credits_timestamps()

In [4]:
with open(os.path.join(da.transcription_dir, 'manual', 'Annie Hall.txt')) as fileobj:
    raw_annie_hall_man_txt = fileobj.read()
annie_hall_man_txt = raw_annie_hall_man_txt.replace('\n', ' ')

def calc_cer_wer(movie_name: str, ref_txt: str):
    trans_df = pd.read_parquet(os.path.join(da.transcription_dir, da.transcript_df_fp.format(movie_name=movie_name)))
    trans_df = trans_df[trans_df['text'].ne(' Thank you.')]['text']
    trans_txt = ''.join(trans_df.str.replace('[\.,"\?]', '', regex=True)).lower().replace('-', ' ')
    
    cer, wer = load("cer"), load("wer")
    cer_score = cer.compute(predictions=[trans_txt], references=[ref_txt])
    wer_score = wer.compute(predictions=[trans_txt], references=[ref_txt])
    
    return cer_score, wer_score

In [11]:
config_list = []
for silero_threshold in [0.5]: # (0.4, 0.45, 0.5):
    for whisper_model in ['turbo']: # , 'large'
        for cosine_sim_lim in [0.14]: # np.arange(0.4, 1.2, 0.2): # 0.15, 0.2, 
            for beam_size in [7]:
                for speech_cond in [True, False]:
                    for no_speech_threshold in [0.1]: # , 0.2
                        for min_seg_sec in [0]: #np.arange(0, 0.6, 0.1):
                            config = {
                                'use_vad': use_vad,
                                'silero_threshold': silero_threshold,
                                'whisper_model': whisper_model,
                                'cosine_sim_lim': cosine_sim_lim,
                                'beam_size': beam_size,
                                'no_speech_threshold': no_speech_threshold,
                                'condition_on_previous_text': speech_cond,
                                'min_seg_sec': min_seg_sec
                            }
                            
                            config_list.append(config)

In [13]:
movie_name = 'Annie Hall'
mp3_filename = 'Annie Hall.mp3'

vad_df_path = os.path.join(da.voice_activity_dir, f'{movie_name}-vad.parquet')
seg_df_path = os.path.join(da.diarization_dir, f'{movie_name}-diarization.parquet')
curr_transcript_fp = os.path.join(da.transcription_dir, da.transcript_df_fp.format(movie_name=movie_name))
wav_filepath = os.path.join(da.trans_mp3_dir, f'{movie_name}_speech_only.wav')

# Perform initial calculations which aren't impacted by config parameter sweeps
stt.apply_silero_vad_to_wav(mp3_filename, wav_filepath, vad_df_path, silero_threshold=silero_threshold, credits_df=credits_df)
stt.apply_diarization(movie_name, wav_filepath, diarization_model, seg_df_path, vad_df_path, device)
stt.add_pyannote_cosine_sim(seg_df_path, wav_filepath, min_seg_sec=0, device=device)

INFO - 2025-06-17 20:24:40,448 - Applying Silero VAD to Annie Hall
INFO - 2025-06-17 20:25:22,117 - Slicing up audio from Annie Hall to speech only
INFO - 2025-06-17 20:25:26,252 - Started pyannote pipeline for Annie Hall
INFO - 2025-06-17 20:33:06,623 - Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/pyannote/models--pyannote--embedding/snapshots/4db4899737a38b2d618bbd74350915aa10293cb2/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.7.1+cu126. Bad things might happen unless you revert torch to 1.x.


In [14]:
results = []

for ii, config in enumerate(config_list):
    logging.info(f'NEW CONFIG RUN: \t{ii} / {len(config_list)}')
    # da.wipe_movie_files(movie_name)
    
    use_vad = config['use_vad']
    silero_threshold = config['silero_threshold']
    whisper_model = config['whisper_model']
    cosine_sim_lim = config['cosine_sim_lim']
    whisper_config['beam_size'] = config['beam_size']
    whisper_config['no_speech_threshold'] = config['no_speech_threshold']
    whisper_config['condition_on_previous_text'] = config['condition_on_previous_text']
    min_seg_sec = config['min_seg_sec']
    
    # stt.apply_pyannote_vad_to_wav(mp3_filename, wav_filepath, vad_df_path, device, credits_df)
    # stt.apply_silero_vad_to_wav(mp3_filename, wav_filepath, vad_df_path, silero_threshold=silero_threshold, credits_df=credits_df)
    # stt.apply_diarization(movie_name, wav_filepath, diarization_model, seg_df_path, device)
    # stt.add_pyannote_cosine_sim(seg_df_path, wav_filepath, min_seg_sec=min_seg_sec, device=device)
    stt.transcribe_segments(curr_transcript_fp, seg_df_path, wav_filepath, whisper_model, whisper_config, narr_cosine_sim_lim, device)
    
    cer, wer = calc_cer_wer(movie_name, annie_hall_man_txt)
    results.append({'cer': cer, 'wer': wer})
    print(results[-1])

INFO - 2025-06-17 20:33:21,996 - NEW CONFIG RUN: 	0 / 2
INFO - 2025-06-17 20:33:34,615 - Segment: 1 / 248
INFO - 2025-06-17 20:34:05,664 - Segment: 51 / 248
INFO - 2025-06-17 20:34:35,852 - Segment: 101 / 248
INFO - 2025-06-17 20:35:03,173 - Segment: 151 / 248
INFO - 2025-06-17 20:35:35,840 - Segment: 201 / 248
INFO - 2025-06-17 20:36:11,706 - NEW CONFIG RUN: 	1 / 2


{'cer': 0.16, 'wer': 0.21657142857142858}


INFO - 2025-06-17 20:36:23,398 - Segment: 1 / 248
INFO - 2025-06-17 20:37:16,218 - Segment: 51 / 248
INFO - 2025-06-17 20:40:08,283 - Segment: 101 / 248
INFO - 2025-06-17 20:40:34,215 - Segment: 151 / 248
INFO - 2025-06-17 20:41:02,713 - Segment: 201 / 248


{'cer': 0.16, 'wer': 0.21657142857142858}


In [None]:
trans_df = pd.read_parquet(os.path.join(da.transcription_dir, da.transcript_df_fp.format(movie_name=movie_name)))
trans_df = trans_df[trans_df['text'].ne(' Thank you.')]['text']
trans_txt = ''.join(trans_df.str.replace('[\.,"\?]', '', regex=True)).lower().replace('-', ' ')

stt.visualise_wer_differences(trans_txt, annie_hall_man_txt)

in a logo the capital letters [1m[31mu and a[0m|[1m[33myou and[0m unite into a single shape united artists words appear annie hall a jack rollins charles h [1m[31mjoffe production[0m|[1m[33mjaffe productions[0m copyright united artists corporation 1977 edited by ralph rosenblum art director mel bourne costume designer ruth morley director of photography gordon willis written by woody allen and marshall brickman produced by charles h [1m[31mjoffe[0m|[1m[33mjaffe[0m directed by woody allen a bespectacled man in a tweed jacket speaks directly to us young [1m[31malvy[0m|[1m[33malvi[0m sits next to his mother [1m[31mnodding his head[0m|[1m[33mwhy are you depressed harvey[0m a modest two [1m[31mstorey[0m|[1m[33mstory[0m house a snack bar on the boardwalk bumper cars in a classroom a series of teachers scrawl on the blackboard and scowl at the students young [1m[31malvy[0m|[1m[33malvie[0m slaps his forehead he goes over to the girl at the next desk an