In [None]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(asctime)s - %(message)s')

import pandas as pd
import numpy as np

from pydub import AudioSegment

from evaluate import load

import os

# Model identifiers. `pyannote_model` is handed to stt.apply_diarization further down.
# NOTE(review): `embedding_model` is not read by any cell visible in this notebook —
# presumably it is consumed inside `stt`; confirm or remove.
pyannote_model = 'pyannote/speaker-diarization-3.1'
embedding_model = "pyannote/embedding" # speechbrain/spkrec-ecapa-voxceleb

import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from IPython.display import clear_output

# NOTE(review): `use_vad` is never read in the cells visible here — confirm it is still needed.
use_vad = True
# Both limits are forwarded to stt.transcribe_segments. Presumably they are cosine-similarity
# cut-offs used to tell narration from dialogue — TODO confirm the exact semantics in `stt`.
narr_cosine_sim_lim = 0.14
diag_cosine_sim_lim = 1

# Whisper model name, forwarded to stt.transcribe_segments.
whisper_model = 'turbo'
# Threshold forwarded to stt.apply_silero_vad_to_wav.
silero_threshold = 0.5

# Forwarded to stt.transcribe_segments together with `whisper_model`; presumably passed
# through to Whisper's transcribe call as keyword options — TODO confirm in `stt`.
whisper_config = {
    'beam_size': 1,
    'no_speech_threshold': 0.1,
    'condition_on_previous_text': False
}

import data_extraction as da
import stt
import utils

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and syntax warnings raised by this
# notebook's own code are hidden as well — consider narrowing it to specific categories/modules.
warnings.filterwarnings("ignore")

# Only let WARNING and above through from the speechbrain and pyannote loggers,
# so their INFO messages do not crowd the cell output.
logging.getLogger("speechbrain").setLevel(logging.WARNING)
logging.getLogger("pyannote").setLevel(logging.WARNING)

INFO - 2025-06-10 12:20:10,702 - PyTorch version 2.7.0 available.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eye4got/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO - 2025-06-10 12:20:17,777 - Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO - 2025-06-10 12:20:17,777 - Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


In [2]:
# Film index used by the later subtitle cells. Going by its name, the helper loads the
# parquet at this path or builds it from the subtitle directory — confirm in data_extraction.
movie_index_fp = os.path.join(da.sub_dir, 'movie_index.parquet')
films_list_df = da.get_or_create_subtitles_data(movie_index_fp, da.sub_dir)

# TODO: add download scripts for transcript downloads

# zenodo_get.download(
#     record_or_doi=4881008,
#     output_dir=os.path.join('data')
# )

# Unzip zip files

In [4]:
# Read the manual Annie Hall transcript and flatten it onto a single line
annie_hall_man_fp = os.path.join(da.transcription_dir, 'manual', 'Annie Hall.txt')
with open(annie_hall_man_fp) as fileobj:
    raw_annie_hall_man_txt = fileobj.read()

# Every newline becomes a single space; nothing else is normalised here
annie_hall_man_txt = ' '.join(raw_annie_hall_man_txt.split('\n'))

def calc_cer_wer(movie_name: str, ref_txt: str):
    """Score a movie's generated transcript against a reference text.

    Reads the transcript parquet for ``movie_name`` from ``da.transcription_dir``,
    drops every segment whose text is exactly ' Thank you.', removes the characters
    ``. , " ?`` from the remaining segments, concatenates them, lower-cases the
    result and turns hyphens into spaces. The cleaned transcript is then compared
    with ``ref_txt`` using the ``cer`` and ``wer`` metrics from ``evaluate``.

    NOTE(review): ``ref_txt`` is used exactly as passed in — no lower-casing or
    punctuation stripping is applied to it here, so the caller should normalise
    the reference the same way if comparable scores are wanted.

    Args:
        movie_name: Movie title substituted into ``da.transcript_df_fp`` to locate
            the transcript parquet.
        ref_txt: Reference transcript to score against.

    Returns:
        Tuple ``(cer_score, wer_score)`` as returned by the two metrics' ``compute``.
    """
    trans_df = pd.read_parquet(os.path.join(da.transcription_dir, da.transcript_df_fp.format(movie_name=movie_name)))
    # ' Thank you.' segments are excluded — presumably a recurring transcription artefact; confirm.
    segment_text = trans_df[trans_df['text'].ne(' Thank you.')]['text']

    # Raw string: the previous non-raw literal relied on the invalid escapes `\.` and `\?`,
    # which raise a SyntaxWarning on recent Python versions. Inside a character class
    # neither `.` nor `?` needs escaping, so the pattern matches the same characters.
    stripped_text = segment_text.str.replace(r'[.,"?]', '', regex=True)

    # Segments are joined with no separator; this relies on each segment carrying its own
    # leading space (as ' Thank you.' does) — TODO confirm for all transcripts.
    trans_txt = ''.join(stripped_text).lower().replace('-', ' ')

    cer, wer = load("cer"), load("wer")
    cer_score = cer.compute(predictions=[trans_txt], references=[ref_txt])
    wer_score = wer.compute(predictions=[trans_txt], references=[ref_txt])

    return cer_score, wer_score

In [None]:
# Torch (pyannote) isn't familiar with MP3 files, so convert to wav for effective performance
# Perform diarization to help separate narration in audio description from dialogue in original movie
# Finally use OpenAI's Whisper to convert to a transcript

mp3_files = [x for x in os.listdir(da.trans_mp3_dir) if os.path.splitext(x)[-1].lower() == '.mp3']

for mp3_filename in mp3_files:
    movie_name = utils.remove_ext(mp3_filename)
    vad_df_path = os.path.join(da.voice_activity_dir, f'{movie_name}-vad.parquet')
    seg_df_path = os.path.join(da.diarization_dir, f'{movie_name}-diarization.parquet')
    curr_transcript_fp = os.path.join(da.transcription_dir, da.transcript_df_fp.format(movie_name=movie_name))
    wav_filepath = os.path.join(da.trans_mp3_dir, f'{movie_name}_speech_only.wav')

    try:
        # If either diarization or transcript is missing, we'll need to generate the wav file
        if not os.path.exists(curr_transcript_fp) or not os.path.exists(seg_df_path):
            stt.apply_silero_vad_to_wav(mp3_filename, wav_filepath, vad_df_path, silero_threshold)

        # Only perform diarization if parquet doesn't exist
        if not os.path.exists(seg_df_path):
            stt.apply_diarization(movie_name, wav_filepath, pyannote_model, seg_df_path, device)
            stt.add_pyannote_cosine_sim(seg_df_path, wav_filepath, min_seg_sec=0.3, device=device)

        # Only perform transcription if parquet doesn't exist
        if not os.path.exists(curr_transcript_fp):
            stt.transcribe_segments(curr_transcript_fp, seg_df_path, wav_filepath, whisper_model, whisper_config, narr_cosine_sim_lim, diag_cosine_sim_lim, device)
    finally:
        # Delete Wav File afterwards as they are quick to generate and consume too much space.
        # Done in `finally` so a failure in any step above does not leave the wav on disk.
        if os.path.exists(wav_filepath):
            os.remove(wav_filepath)

utils.clean_up_missed_wav_files(da.trans_mp3_dir)

INFO - 2025-06-10 12:20:17,879 - Applying Silero VAD to Mission Impossible - Dead Reckoning Part One
INFO - 2025-06-10 12:21:19,747 - Slicing up audio from Mission Impossible - Dead Reckoning Part One to speech only
INFO - 2025-06-10 12:22:10,234 - Started pyannote pipeline for Mission Impossible - Dead Reckoning Part One
INFO - 2025-06-10 12:33:50,925 - Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/pyannote/models--pyannote--embedding/snapshots/4db4899737a38b2d618bbd74350915aa10293cb2/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.7.0+cu126. Bad things might happen unless you revert torch to 1.x.


INFO - 2025-06-10 12:34:23,797 - Segment: 1 / 329
INFO - 2025-06-10 12:35:43,948 - Segment: 51 / 329
INFO - 2025-06-10 12:36:50,401 - Segment: 101 / 329
INFO - 2025-06-10 12:37:54,142 - Segment: 151 / 329
INFO - 2025-06-10 12:39:08,209 - Segment: 201 / 329
INFO - 2025-06-10 12:40:38,609 - Segment: 251 / 329
INFO - 2025-06-10 12:41:56,882 - Segment: 301 / 329


### Subtitles Edited

- File Completely Empty: X-Men, Finding Neverland, Mr Mrs Smith
- Grease: Line 6916
- Hangover Part II: Timestamps messed up at line 5578
- Super Mario Bros. Movie: Line 3877, Missing hours
- The Social Network: Counter 658, 1507, 1526

Index Titles Edited:
- Goodbye Columbus
- Monsters Inc
- What's Up, Doc


In [8]:
# Collect every subtitle file under <sub_by_year_dir>/<category>/<year>/ into one frame
full_subs_df_list = []
movie_names = set()

for movie_cat in ('Blockbusters', 'Oscar'):
    cat_mask = films_list_df.fame_category.eq(movie_cat)
    # Only visit the years the index lists for this category. Previously the mask was
    # computed but unused, so every category walked the years of all categories and
    # os.listdir failed on any year directory that exists for the other category only.
    for year in films_list_df.loc[cat_mask, 'year'].unique():
        year_dir = os.path.join(da.sub_by_year_dir, movie_cat, str(year))
        for movie_fp in os.listdir(year_dir):
            curr_df = da.extract_single_subs_file(os.path.join(year_dir, movie_fp))
            curr_movie = utils.remove_ext(movie_fp)

            # Handle repeat titles like The Little Mermaid by adding the year to subsequent productions
            # (the first occurrence encountered keeps whatever title the extractor assigned)
            if curr_movie in movie_names:
                curr_df['movie'] = f'{curr_movie} ({year})'
            else:
                movie_names.add(curr_movie)

            full_subs_df_list.append(curr_df)

full_subs_df = pd.concat(full_subs_df_list)

# Normalise titles on both sides so they can be matched: hyphens become spaces everywhere;
# the index additionally has apostrophes turned into spaces and '&' spelled out as 'and'.
full_subs_df['movie'] = full_subs_df['movie'].str.strip().str.replace('-', ' ')
films_list_df['movie'] = films_list_df['movie'].str.strip().str.replace('-', ' ').str.replace("'", ' ').str.replace('&', 'and')

In [9]:
# One-off corrections for index titles, applied in the order listed. Presumably these are
# the titles that still differ from the subtitle file names after normalisation — confirm.
title_corrections = {
    'Don t Look Up': 'Dont Look Up',
    'Goodbye,Columbus': 'Goodbye Columbus',
    'Summer of  42': 'Summer of 42',
    'What s Up, Doc_': 'What s Up, Doc',
    'Monsters, Inc.': 'Monsters Inc',
}
for index_title, corrected_title in title_corrections.items():
    films_list_df.loc[films_list_df['movie'].eq(index_title), 'movie'] = corrected_title

# Left merge on the columns the two frames share (no explicit key is given),
# keeping every subtitle row, then persist the result.
combined_subs_df = full_subs_df.merge(films_list_df, how='left')
combined_subs_df.to_parquet(da.sub_df_dir)

In [10]:
# Names (extension removed) of everything in the mp3 directory; only transcripts
# belonging to one of these are gathered below.
longitudinal_movies = [utils.remove_ext(audio_name) for audio_name in os.listdir(da.trans_mp3_dir)]

# Trailing part of a transcript file name once the movie name is left blank
transcript_suffix = da.transcript_df_fp.format(movie_name='')

all_transcripts_df_list = [
    pd.read_parquet(os.path.join(da.transcription_dir, transcript_name))
    for transcript_name in os.listdir(da.transcription_dir)
    if transcript_name.removesuffix(transcript_suffix) in longitudinal_movies
]

# Stack the per-movie transcripts and write them out as a single parquet
all_transcripts_df = pd.concat(all_transcripts_df_list)
all_transcripts_df.to_parquet(da.all_transcripts_df_dir)