In [1]:
# Notebook setup: logging, third-party imports and the tunable settings used by the cells below.
# Logging is configured before the remaining imports are executed.
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(asctime)s - %(message)s')

import pandas as pd

from pydub import AudioSegment
from pyannote.audio import Pipeline
import json

from silero_vad import read_audio, get_speech_timestamps, load_silero_vad

# The Hugging Face token is read from a local config file rather than hard-coded;
# it is passed to Pipeline.from_pretrained when the pyannote pipeline is loaded.
with open('config.json') as fileobj:
    hf_token = json.load(fileobj)['hugging_face_token']

# Identifier of the pretrained diarization pipeline to load
pyannote_model = 'pyannote/speaker-diarization-3.1'

import torch
# Use the GPU when one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import os

import utils
from itertools import chain
import gc

import tiktoken
import whisper
# NOTE(review): whisper_tokenizer is not referenced in the visible cells. In upstream Whisper the first
# positional parameter of get_tokenizer appears to be the `multilingual` flag, so passing a tiktoken
# encoding here may not do what was intended - confirm, or remove if unused.
whisper_tokenizer = whisper.tokenizer.get_tokenizer(tiktoken.get_encoding(tiktoken.list_encoding_names()[-1]), num_languages=1)

whisper_model = 'turbo'   # model name passed to whisper.load_model
silero_threshold = 0.5    # `threshold` passed to Silero's get_speech_timestamps
whisper_beam = 3          # `beam_size` passed to model.transcribe
whisper_ns_prob = 0.2     # `no_speech_threshold` passed to model.transcribe

# Project module providing the data directory paths and helper functions used throughout this notebook
import data_extraction as da

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eye4got/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# TODO: upload files to Kaggle?
# Raw files vs transcript (as one parquet)

In [3]:
# Movie index table (the helper's name suggests it is read from the parquet when present and built otherwise - see data_extraction)
movie_index_fp = os.path.join(da.sub_dir, 'movie_index.parquet')
films_list_df = da.get_or_create_subtitles_data(movie_index_fp, da.sub_dir)

# TODO: add download scripts for transcript downloads

# zenodo_get.download(
#     record_or_doi=4881008,
#     output_dir=os.path.join('data')
# )

# Unzip zip files

In [None]:
# Torch (pyannote) isn't familiar with MP3 files, so convert to wav for effective performance
# Perform diarization to help separate narration in audio description from dialogue in original movie
# Finally use OpenAI's Whisper to convert to a transcript
#
# Per-movie stages (each is skipped when its cached parquet output already exists):
#   1. Silero VAD -> speech timestamps, used to cut a temporary speech-only wav
#   2. pyannote   -> speaker segments on that wav; the first speaker is assumed to be the narrator
#   3. Whisper    -> text for the narrator segments only
# Models are loaded and released per movie so that only one is held at a time.

whisper_sr = 16000
# Silero VAD returns 'start'/'end' as sample indices at this rate (return_seconds is left at its default)
silero_sr = 16000

mp3_files = [x for x in os.listdir(da.trans_mp3_dir) if os.path.splitext(x)[-1].lower() == '.mp3']
# Process these three first. They also appear in the directory listing, so drop the
# repeated entries (dict.fromkeys keeps first-seen order) instead of iterating them twice.
mp3_files = list(dict.fromkeys(['Annie Hall.mp3', 'The Silence Of The Lambs.mp3', 'The Departed.mp3'] + mp3_files))

# Movies excluded from processing
skipped_movies = ('Oppenheimer', 'Killers Of The Flower Moon', 'Guardians of the Galaxy Vol. 3', 'Elvis', 'All Quiet on the Western Front', 'Avatar The Way of Water')

for mp3_filename in mp3_files:
    movie_name = utils.remove_ext(mp3_filename)

    if movie_name in skipped_movies:
        continue

    mp3_filepath = os.path.join(da.trans_mp3_dir, mp3_filename)
    vad_df_path = os.path.join(da.voice_activity_dir, f'{movie_name}-vad.parquet')
    seg_df_path = os.path.join(da.diarization_dir, f'{movie_name}-diarization.parquet')
    curr_transcript_fp = os.path.join(da.transcription_dir, da.transcript_df_fp.format(movie_name=movie_name))
    wav_filepath = os.path.join(da.trans_mp3_dir, f'{movie_name}_speech_only.wav')

    # The speech-only wav is needed by both later stages, so build it if either output is missing
    if not os.path.exists(curr_transcript_fp) or not os.path.exists(seg_df_path):

        logging.info(f'Applying Silero VAD to {movie_name}')
        silero_model = load_silero_vad()

        full_silero_audio = read_audio(mp3_filepath, sampling_rate=silero_sr)
        speech_timestamps = get_speech_timestamps(
            full_silero_audio,
            silero_model,
            threshold=silero_threshold,
            sampling_rate=silero_sr,
            speech_pad_ms=200,
        )
        pd.DataFrame(speech_timestamps).to_parquet(vad_df_path)
        utils.cleanup_model(silero_model)

        # Now cut audio down to just dialogue
        full_audio = AudioSegment.from_mp3(mp3_filepath)
        dialogue_only_audio = AudioSegment.empty()

        for seg in speech_timestamps:
            # pydub slices are indexed in milliseconds, whereas the VAD timestamps are
            # sample indices, so convert before slicing (previously the raw sample
            # indices were used, selecting audio 16x later than intended).
            start_ms = seg['start'] * 1000 // silero_sr
            end_ms = seg['end'] * 1000 // silero_sr
            dialogue_only_audio += full_audio[start_ms:end_ms]

        dialogue_only_audio.export(wav_filepath, format="wav")

    # Only perform diarization if parquet of dialogue doesn't exist
    if not os.path.exists(seg_df_path):
        logging.info(f'Started pyannote pipeline for {movie_name}')
        pyannote_pipeline = Pipeline.from_pretrained(pyannote_model, use_auth_token=hf_token)
        pyannote_pipeline.to(device)

        dz = pyannote_pipeline({'audio': wav_filepath})

        # Extract start and end times from segments object and split integer out from 'SPEAKER_x' labels
        records = [(x[0].start, x[0].end, int(x[2].split('_')[-1])) for x in dz.itertracks(yield_label=True)]
        segments_df = pd.DataFrame(records, columns=['start', 'end', 'speaker'])

        agg_seg_df = da.aggregate_segments(segments_df)

        # Assume narrator speaks first (describing opening logos etc)
        narrator_id = agg_seg_df['speaker'].iloc[0]
        agg_seg_df['is_dialogue'] = agg_seg_df['speaker'].ne(narrator_id)
        agg_seg_df['movie_name'] = movie_name

        # Segment bounds as sample offsets into the speech-only wav at Whisper's sample rate
        agg_seg_df['start_frame'] = (whisper_sr * agg_seg_df['start']).astype(int)
        agg_seg_df['end_frame'] = (whisper_sr * agg_seg_df['end']).astype(int)

        agg_seg_df.to_parquet(seg_df_path)

        utils.cleanup_model(pyannote_pipeline)
        del dz

    segments_df = pd.read_parquet(seg_df_path)
    narrator_df = segments_df[~segments_df.is_dialogue].copy()

    if not os.path.exists(curr_transcript_fp):
        model = whisper.load_model(whisper_model, device=device)
        audio = whisper.load_audio(wav_filepath, sr=whisper_sr)
        seg_start_arr, seg_end_arr = narrator_df['start_frame'].values, narrator_df['end_frame'].values

        segment_list = []

        for ii, (start_frame, end_frame) in enumerate(zip(seg_start_arr, seg_end_arr)):
            if ii % 50 == 0:
                logging.info(f'{movie_name} Segment: {ii + 1} / {len(seg_start_arr)}')
            segment = audio[start_frame:end_frame]

            # Each segment is transcribed independently (no conditioning on the previous segment's text)
            segment_list.append(model.transcribe(segment, language='en', beam_size=whisper_beam, no_speech_threshold=whisper_ns_prob, condition_on_previous_text=False))

        narrator_df['text'] = [x['text'] for x in segment_list]
        narrator_df.to_parquet(curr_transcript_fp)

        utils.cleanup_model(model)
        del audio

    # Delete Wav File afterwards as they are quick to generate and consume too much space
    if os.path.exists(wav_filepath):
        os.remove(wav_filepath)

INFO - 2025-05-29 12:53:28,214 - Applying Silero VAD to Annie Hall
INFO - 2025-05-29 12:54:13,512 - Started pyannote pipeline for Annie Hall
INFO - 2025-05-29 12:54:13,946 - Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO - 2025-05-29 12:54:13,947 - Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)


In [None]:
# Remove any wav files still sitting in the mp3 directory that the main loop did not delete
wav_files = []
for entry_name in os.listdir(da.trans_mp3_dir):
    _, entry_ext = os.path.splitext(entry_name)
    if entry_ext.lower() == '.wav':
        wav_files.append(entry_name)

for path in wav_files:
    leftover_fp = os.path.join(da.trans_mp3_dir, path)
    os.remove(leftover_fp)

### Subtitles Edited

File Completely Empty: X-Men, Finding Neverland, Mr Mrs Smith
Grease: Line 6916
Hangover Part II: Timestamps messed up line 5578
Super Mario Bros. Movie: Line 3877, Missing hours 
The Social Network: Counter 658, 1507, 1526

Index Titles Edited:
- Goodbye Columbus
- Monsters Inc
- What's Up, Doc


In [None]:
# Collect every subtitle file stored under <sub_by_year_dir>/<category>/<year>/ into one frame
full_subs_df_list = []

for movie_cat in ('Blockbusters', 'Oscar'):
    for year in films_list_df.year.unique():
        year_dir = os.path.join(da.sub_by_year_dir, movie_cat, str(year))
        # Years are taken from the whole index, so a given category may have no directory
        # for some of them; skip those rather than letting os.listdir raise.
        if not os.path.isdir(year_dir):
            continue
        for movie_fp in os.listdir(year_dir):
            full_subs_df_list.append(da.extract_single_subs_file(os.path.join(year_dir, movie_fp)))

full_subs_df = pd.concat(full_subs_df_list)

# Normalise titles on both sides so they can be matched when the frames are merged.
# regex=False makes the replacements explicitly literal.
full_subs_df['movie'] = full_subs_df['movie'].str.strip().str.replace('-', ' ', regex=False)
films_list_df['movie'] = (
    films_list_df['movie']
    .str.strip()
    .str.replace('-', ' ', regex=False)
    .str.replace("'", ' ', regex=False)
    .str.replace('&', 'and', regex=False)
)

In [None]:
# Manual title corrections for index entries whose normalised names still differ
# from the subtitle file names (old title -> corrected title)
title_fixes = {
    'Don t Look Up': 'Dont Look Up',
    'Goodbye,Columbus': 'Goodbye Columbus',
    'Summer of  42': 'Summer of 42',
    'What s Up, Doc_': 'What s Up, Doc',
    'Monsters, Inc.': 'Monsters Inc',
}

for old_title, new_title in title_fixes.items():
    films_list_df.loc[films_list_df['movie'] == old_title, 'movie'] = new_title

# Left merge keeps every subtitle row; no `on=` is given, so pandas joins on the columns the two frames share
combined_subs_df = full_subs_df.merge(films_list_df, how='left')
combined_subs_df.to_parquet(da.sub_df_dir)

In [None]:
# Names (extension stripped) of every file in the audio-description directory
longitudinal_movies = [utils.remove_ext(audio_name) for audio_name in os.listdir(da.trans_mp3_dir)]

# Keep only the transcripts whose filename prefix (text before the first '-') matches one of those names.
# NOTE(review): a title that itself contains '-' would be cut short by this split and never match - confirm
# against the filename pattern in da.transcript_df_fp.
all_transcripts_df_list = [
    pd.read_parquet(os.path.join(da.transcription_dir, transcript_name))
    for transcript_name in os.listdir(da.transcription_dir)
    if transcript_name.split('-')[0] in longitudinal_movies
]

all_transcripts_df = pd.concat(all_transcripts_df_list)
all_transcripts_df.to_parquet(da.all_transcripts_df_dir)