In [1]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(asctime)s - %(message)s')

import pandas as pd
import whisper

import tiktoken
# Tokenizer used to turn subtitle text into Whisper token ids.
# get_tokenizer's first argument is the `multilingual` flag; pass it explicitly rather
# than an unrelated tiktoken encoding object that only worked because it is truthy.
# NOTE(review): num_languages only changes the special-token ids, not plain-text
# encoding - confirm that 1 (rather than the model's own value) is intentional.
whisper_tokenizer = whisper.tokenizer.get_tokenizer(True, num_languages=1)

from pydub import AudioSegment
from pyannote.audio import Pipeline
import json

# The Hugging Face access token is read from a local config file so it never appears in the notebook
with open('config.json') as fileobj:
    hf_token = json.load(fileobj)['hugging_face_token']

pyannote_pipeline = Pipeline.from_pretrained('pyannote/speaker-diarization-3.1', use_auth_token=hf_token)

# pyannote.audio 3.x signals a failed or gated download by printing a hint and returning None;
# stop here with an actionable message instead of an AttributeError on .to(device) below
if pyannote_pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1': check 'hugging_face_token' in config.json "
        "and that the model's user conditions have been accepted on Hugging Face"
    )

import torch
# Run diarization on the GPU when one is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pyannote_pipeline.to(device)

import os
from itertools import chain

from IPython.display import clear_output
# Wipe whatever the statements above have written to this cell's output area
# (presumably model-loading messages - TODO confirm)
clear_output()

import utils
import data_extraction as da

# Name of the Whisper checkpoint handed to whisper.load_model
whisper_model = 'turbo'

# Audio inputs, plus the folders that hold artifacts derived from them
audio_dir = os.path.join('data', 'audio-vault')
transcription_dir, diarization_dir = (
    os.path.join(audio_dir, subfolder)
    for subfolder in ('transcriptions', 'diarization_segments')
)

# VSD 2014 release (Hollywood-dev split) and its sub-folders
vsd_dir = os.path.join(
    'data', 'VSD', 'VSD2014_officialrelease',
    'VSD_2014_December_official_release', 'Hollywood-dev',
)
vsd_features_dir, vsd_annotations_dir = (
    os.path.join(vsd_dir, subfolder)
    for subfolder in ('features', 'annotations')
)

utils.ensure_dir_exists(transcription_dir)

# File-name template for the per-movie transcript parquet
transcript_df_fp = '{movie_name}-transcript.parquet'

# TODO:
The goal is to identify the parts of the audio transcript that align with violent scenes.

1) Find the dialogue on either side of each timestamped violent scene in the subtitles
2) Get the dialogue from the narration
3) Match these bookends with the narration, then assess the length of the newly created scene (words/timestamps)



If no further progress is made with matching, assess the following values:
    temperature, avg_logprob, compression_ratio, no_speech_prob

In [2]:
# Load the subtitles, then give every subtitle frame a cleaned 'dialogue' column
# and a 'tokens' column holding that text encoded with the Whisper tokenizer
subs_dict = da.extract_subs()

for movie in subs_dict:
    subs_df = subs_dict[movie]
    subs_df['dialogue'] = da.clean_dialogue(subs_df['raw_dialogue'])
    subs_df['tokens'] = subs_df['dialogue'].apply(whisper_tokenizer.encode)

Utilised the following repo for diarization: https://github.com/Majdoddin/nlp/blob/main/Pyannote_plays_and_Whisper_rhymes_v_2_0.ipynb

In [3]:
# pyannote (torch) does not handle MP3 input well, so keep a .wav copy of every .mp3 next to it

mp3_files = [fname for fname in os.listdir(audio_dir) if os.path.splitext(fname)[-1] == '.mp3']

for mp3_filename in mp3_files:
    movie_name = utils.remove_ext(mp3_filename)
    wav_path = os.path.join(audio_dir, movie_name + '.wav')

    # A converted copy is already on disk: nothing to do for this movie
    if os.path.exists(wav_path):
        continue

    logging.info(f'.wav file missing for {movie_name}, converting ...')
    audio = AudioSegment.from_mp3(os.path.join(audio_dir, mp3_filename))
    audio.export(wav_path, format="wav")

In [4]:
# Perform diarization to help separate narration in audio description from dialogue in original movie
# This helps us match only dialogue in both

# The per-movie parquet cache is written into this folder below; create it up front so a
# fresh checkout does not fail on the first to_parquet call
os.makedirs(diarization_dir, exist_ok=True)

seg_df_list = []
wav_files = [x for x in os.listdir(audio_dir) if os.path.splitext(x)[-1] == '.wav']

for wav_filename in wav_files:
    movie_name = utils.remove_ext(wav_filename)
    seg_df_path = os.path.join(diarization_dir, f'{movie_name}-diarization.parquet')

    # Only perform diarization if parquet of dialogue doesn't exist
    if not os.path.exists(seg_df_path):
        logging.info(f'Started pyannote pipeline for {movie_name}')
        dz = pyannote_pipeline({'audio': os.path.join(audio_dir, wav_filename)})

        # One record per speaker turn: start/end in seconds plus the integer from the 'SPEAKER_x' label
        records = [
            (turn.start, turn.end, int(label.split('_')[-1]))
            for turn, _, label in dz.itertracks(yield_label=True)
        ]
        segments_df = pd.DataFrame(records, columns=['start', 'end', 'speaker'])

        # Assume narrator speaks first (describing opening logos etc)
        narrator_id = segments_df['speaker'].iloc[0]
        segments_df['is_dialogue'] = segments_df['speaker'].ne(narrator_id)
        segments_df['movie_name'] = movie_name

        segments_df.to_parquet(seg_df_path)

    # Always read back from the cache so fresh and cached movies share one code path
    seg_df_list.append(pd.read_parquet(seg_df_path))

full_seg_df = pd.concat(seg_df_list)

# Convert segment boundaries from seconds to sample indices at 16 kHz
# (assumes the audio is later loaded at this rate, whisper.load_audio's default - TODO confirm)
whisper_sr = 16000
full_seg_df['start_frame'] = (whisper_sr * full_seg_df['start']).astype(int)
full_seg_df['end_frame'] = (whisper_sr * full_seg_df['end']).astype(int)

In [None]:
# Perform transcription on each segment identified by the diarization algorithm

# Quote from repo: "pyannote.audio seems to miss the first 0.5 seconds of the audio, and, therefore, we prepend a spacer"
# spacer = AudioSegment.silent(duration=2000)

full_seg_df_list = []

# Loaded lazily and at most once: no model is loaded when every transcript is already
# cached, and it is not reloaded for each movie that still needs transcribing
model = None

for wav_filename in wav_files:
    movie_name = utils.remove_ext(wav_filename)

    curr_transcript_fp = os.path.join(transcription_dir, transcript_df_fp.format(movie_name=movie_name))

    # Only transcribe if the transcript parquet for this movie doesn't exist yet
    if not os.path.exists(curr_transcript_fp):
        if model is None:
            model = whisper.load_model(whisper_model, device=device)
        audio = whisper.load_audio(os.path.join(audio_dir, wav_filename))

        # Explicit copy: the new columns go onto an independent frame rather than a
        # filtered slice of full_seg_df (avoids SettingWithCopyWarning)
        segments_df = full_seg_df[full_seg_df.movie_name.eq(movie_name)].copy()
        n_segments = len(segments_df)
        frame_bounds = zip(segments_df['start_frame'].values, segments_df['end_frame'].values)

        segment_list = []

        for ii, (start_frame, end_frame) in enumerate(frame_bounds):
            if ii % 50 == 0:
                logging.info(f'{movie_name} Segment: {ii + 1} / {n_segments}')

            # Slice this segment's samples out of the full waveform and transcribe them alone
            segment_list.append(model.transcribe(audio[start_frame:end_frame], language='en'))

        segments_df['text'] = [x['text'] for x in segment_list]
        # Flatten the token lists of all Whisper sub-segments into one list per diarization segment
        segments_df['tokens'] = [list(chain.from_iterable(y['tokens'] for y in x['segments'])) for x in segment_list]

        segments_df.to_parquet(curr_transcript_fp)

    # Always read back from the cache so fresh and cached movies share one code path
    full_seg_df_list.append(pd.read_parquet(curr_transcript_fp))

full_seg_df = pd.concat(full_seg_df_list)

INFO - 2025-05-20 20:43:26,501 - FightClub Segment: 1 / 3315
INFO - 2025-05-20 20:43:44,239 - FightClub Segment: 51 / 3315


In [None]:
# Spot-check the transcribed segments of movies whose name starts with 'Pir'
name_matches = full_seg_df['movie_name'].str.startswith('Pir')
full_seg_df[name_matches]

Unnamed: 0,start,end,speaker,is_dialogue,movie_name,start_frame,end_frame,text,tokens
0,0.030969,1.026594,22,False,PiratesOfTheCarribeanTheCurseOfTheBlackPearl,495,16425,A warning appears.,"[50365, 316, 9164, 7038, 13, 50410]"
1,1.245969,4.890969,22,False,PiratesOfTheCarribeanTheCurseOfTheBlackPearl,19935,78255,This product is authorized for private use on...,"[50365, 639, 1674, 307, 28312, 337, 4551, 764,..."
2,5.329719,8.097219,22,False,PiratesOfTheCarribeanTheCurseOfTheBlackPearl,85275,129555,Now a glowing pinpoint of light arcs through ...,"[50365, 823, 257, 27064, 40837, 295, 1442, 103..."
3,8.333469,9.919719,22,False,PiratesOfTheCarribeanTheCurseOfTheBlackPearl,133335,158715,Walt Disney Home Entertainment.,"[50365, 28260, 8653, 8719, 25758, 13, 50437]"
4,12.029094,12.687219,22,False,PiratesOfTheCarribeanTheCurseOfTheBlackPearl,192465,202995,a notice.,"[50365, 257, 3449, 13, 50415]"
...,...,...,...,...,...,...,...,...,...
3148,8569.510344,8574.960969,22,False,PiratesOfTheCarribeanTheCurseOfTheBlackPearl,137112165,137199375,"In the moonlight, Jack the monkey swims towar...","[50365, 682, 264, 48058, 11, 4718, 264, 17847,..."
3149,8575.821594,8578.150344,22,False,PiratesOfTheCarribeanTheCurseOfTheBlackPearl,137213145,137250405,He reaches into the chest and picks up a meda...,"[50365, 634, 14235, 666, 264, 7443, 293, 16137..."
3150,8579.989719,8585.220969,22,False,PiratesOfTheCarribeanTheCurseOfTheBlackPearl,137279835,137363535,He transforms into a skeleton. Shaking his bo...,"[50365, 634, 35592, 666, 257, 25204, 13, 50431..."
3151,8588.663469,8597.421594,22,False,PiratesOfTheCarribeanTheCurseOfTheBlackPearl,137418615,137558745,"In a logo, storm clouds loom over a desert hi...","[50365, 682, 257, 9699, 11, 7679, 12193, 450, ..."


In [None]:
# https://github.com/linto-ai/whisper-timestamped