In [None]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(asctime)s - %(message)s')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import whisper

import tiktoken
whisper_tokenizer = whisper.tokenizer.get_tokenizer(tiktoken.get_encoding(tiktoken.list_encoding_names()[-1]), num_languages=1)

from pydub import AudioSegment
from pyannote.audio import Pipeline
import json

with open('config.json') as fileobj:
    hf_token = json.load(fileobj)['hugging_face_token']

pyannote_pipeline = Pipeline.from_pretrained('pyannote/speaker-diarization-3.1', use_auth_token=hf_token)

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pyannote_pipeline.to(device)

from sentence_transformers.cross_encoder import CrossEncoder

import os
import pickle
import difflib
from itertools import chain

from IPython.display import clear_output
clear_output()

import utils
import data_extraction as da

whisper_model = 'turbo'

audio_dir = os.path.join('data', 'audio-vault')
transcription_dir = os.path.join(audio_dir, 'transcriptions')
diarization_dir = os.path.join(audio_dir, 'diarization_segments')

vsd_dir = os.path.join('data', 'VSD', 'VSD2014_officialrelease', 'VSD_2014_December_official_release', 'Hollywood-dev')
vsd_features_dir = os.path.join(vsd_dir, 'features')
vsd_annotations_dir = os.path.join(vsd_dir, 'annotations')

utils.ensure_dir_exists(transcription_dir)

transcript_df_fp = '{movie_name}-transcript.parquet'
annot_cats = ['blood', 'carchase', 'coldarms', 'explosions', 'fights', 'fire', 'firearms', 'gore', 'gunshots', 'screams']

# TODO:
Goal is to identify parts of audio transcript that align with violent scenes

1) Find dialogue either side of timestamped violence in subtitles
2) Get dialogue from narration
3) Match bookends with narration, assess length of newly created scene (words/timestamps)



To assess if no further progress is made with matching:
    temperature, avg_logprob, compression_ratio, no_speech_prob

In [38]:
def clean_dialogue(dialogue: pd.Series) -> pd.Series:
    return dialogue.str.lower().str.replace('.', '').str.replace('"', '').str.replace(',', '').str.replace('-', '')

In [39]:
# Extract and tokenize subtitles for matching
subs_dict = da.extract_subs()

for movie, subs_df in subs_dict.items():
    subs_df['dialogue'] = clean_dialogue(subs_df['raw_dialogue'])
    subs_df['tokens'] = subs_df['dialogue'].apply(lambda x: whisper_tokenizer.encode(x))

Utilised following repo for diarization https://github.com/Majdoddin/nlp/blob/main/Pyannote_plays_and_Whisper_rhymes_v_2_0.ipynb

In [40]:
# Torch (pyannote) isn't familiar with MP3 files, so convert to wav for effective performance

mp3_files = [x for x in os.listdir(audio_dir) if os.path.splitext(x)[-1] == '.mp3']

for mp3_filename in mp3_files:
    movie_name = utils.remove_ext(mp3_filename)
    
    # Only convert if there isn't already a local converted copy
    if not os.path.exists(os.path.join(audio_dir, movie_name + '.wav')):
        logging.info(f'.wav file missing for {movie_name}, converting ...')
        audio = AudioSegment.from_mp3(os.path.join(audio_dir, mp3_filename))
        audio.export(os.path.join(audio_dir, movie_name + '.wav'), format="wav")

In [41]:
# Perform diarization to help separate narration in audio description from dialogue in original movie
# This helps us match only dialogue in both
seg_df_list = []
wav_files = [x for x in os.listdir(audio_dir) if os.path.splitext(x)[-1] == '.wav']

for wav_filename in wav_files:
    movie_name = utils.remove_ext(wav_filename)
    seg_df_path = os.path.join(diarization_dir, f'{movie_name}-diarization.parquet')
    
    # Only perform diarization if parquet of dialogue doesn't exist
    if not os.path.exists(seg_df_path):
        logging.info(f'Started pyannote pipeline for {movie_name}')
        dz = pyannote_pipeline({'audio': os.path.join(audio_dir, wav_filename)})
        
        # Extract start and end times from segments object and split integer out from 'SPEAKER_x' labels
        records = [(x[0].start, x[0].end, int(x[2].split('_')[-1])) for x in dz.itertracks(yield_label = True)]
        segments_df = pd.DataFrame(records, columns=['start', 'end', 'speaker'])
        
        # Assume narrator speaks first (describing opening logos etc)
        narrator_id = segments_df['speaker'].iloc[0]
        segments_df['is_dialogue'] = segments_df['speaker'].ne(narrator_id)
        segments_df['movie_name'] = movie_name
        
        segments_df.to_parquet(seg_df_path)
        
    seg_df_list.append(pd.read_parquet(seg_df_path))
    
full_seg_df = pd.concat(seg_df_list)

whisper_sr = 16000
full_seg_df['start_frame'] = (whisper_sr * full_seg_df['start']).astype(int)
full_seg_df['end_frame'] = (whisper_sr * full_seg_df['end']).astype(int)

In [42]:
# Perform transcription on each segment identified by the diarization algorithm

# Quote from repo: "pyannote.audio seems to miss the first 0.5 seconds of the audio, and, therefore, we prepend a spcacer"
# spacer = AudioSegment.silent(duration=2000)

full_seg_df_list = []

for wav_filename in wav_files:
    movie_name = utils.remove_ext(wav_filename)
    
    curr_transcript_fp = os.path.join(transcription_dir, transcript_df_fp.format(movie_name=movie_name))
    
    if not os.path.exists(curr_transcript_fp):
        model = whisper.load_model(whisper_model, device=device)
        audio = whisper.load_audio(os.path.join(audio_dir, wav_filename))
        segments_df = full_seg_df[full_seg_df.movie_name.eq(movie_name)]
        seg_start_arr, seg_end_arr = segments_df['start_frame'].values, segments_df['end_frame'].values
        
        segment_list = []

        for ii in range(len(seg_start_arr)):
            if ii % 50 == 0:
                logging.info(f'{movie_name} Segment: {ii + 1} / {len(seg_start_arr)}')
            segment = audio[seg_start_arr[ii]: seg_end_arr[ii]]
            
            # TODO:Add word_timestamps=True, to get greater precision once pipeline works
            segment_list.append(model.transcribe(segment, prompt='Only')) 
        
        segments_df['text'] = [x['text'] for x in segment_list]
        segments_df['tokens'] = [list(chain.from_iterable([y['tokens'] for y in x['segments']])) for x in segment_list]
        
        segments_df.to_parquet(curr_transcript_fp)
        
    full_seg_df_list.append(pd.read_parquet(curr_transcript_fp))
    
full_seg_df = pd.concat(full_seg_df_list)

In [43]:
# Whisper adds whitespace which affects tokens
full_seg_df['cleaned_text'] = clean_dialogue(full_seg_df['text'].str.strip())
full_seg_df['cleaned_tokens'] = full_seg_df['cleaned_text'].apply(lambda x: whisper_tokenizer.encode(x))

In [45]:
# https://github.com/linto-ai/whisper-timestamped

In [47]:
curr_movie = 'SavingPrivateRyan'
curr_sub_df = subs_dict[curr_movie].copy()
curr_transcript_df = full_seg_df[full_seg_df['movie_name'].eq(curr_movie)].drop(columns=['text', 'tokens'])
curr_sub_df = curr_sub_df.drop(columns=['raw_time_str', 'raw_start_time', 'raw_end_time', 'raw_dialogue'])

In [48]:
curr_sub_df['time_diffs'] = -1
diffs = (curr_sub_df['start_time'].iloc[1:].values - curr_sub_df['end_time'].iloc[:-1].values)
curr_sub_df.loc[1:, 'time_diffs'] = (diffs / 10 ** 9).astype(int)

In [49]:
dialogue = [curr_sub_df['dialogue'].iloc[0]]
start_times = [curr_sub_df['start_time'].iloc[0]]
end_times = [curr_sub_df['end_time'].iloc[0]]
tokens = [curr_sub_df['tokens'].iloc[0]]

diffs = (curr_sub_df['start_time'].iloc[1:].values - curr_sub_df['end_time'].iloc[:-1].values) / 10 ** 9

for ii in range(1, curr_sub_df.shape[0]):
    if curr_sub_df['time_diffs'].iloc[ii] < 2:
        dialogue[-1] += (' ' + curr_sub_df['dialogue'].iloc[ii])
        end_times[-1] = curr_sub_df['end_time'].iloc[ii]
        tokens[-1] += curr_sub_df['tokens'].iloc[ii]
    else:
        dialogue.append(curr_sub_df['dialogue'].iloc[ii])
        start_times.append(curr_sub_df['start_time'].iloc[ii])
        end_times.append(curr_sub_df['end_time'].iloc[ii])
        tokens.append(curr_sub_df['tokens'].iloc[ii])
        
red_curr_sub_df = pd.DataFrame([dialogue, start_times, end_times, tokens]).transpose()
red_curr_sub_df.columns = ['dialogue', 'start_time', 'end_time', 'tokens']

In [84]:
curr_transcript_df

Unnamed: 0,start,end,speaker,is_dialogue,movie_name,start_frame,end_frame,cleaned_text,cleaned_tokens
0,3.237219,11.354094,49,False,SavingPrivateRyan,51795,181665,in a logo a ribbon of white stars flies toward...,"[259, 257, 9699, 257, 20921, 295, 2418, 6105, ..."
1,20.854719,30.524094,49,False,SavingPrivateRyan,333675,488385,in a logo a fishing bobber drops into a pool o...,"[259, 257, 9699, 257, 10180, 27292, 607, 11438..."
2,31.080969,36.514719,49,False,SavingPrivateRyan,497295,584235,amid billowing clouds we float past giant bloc...,"[335, 327, 2961, 9637, 12193, 321, 15706, 1791..."
3,44.749719,45.306594,49,False,SavingPrivateRyan,715995,724905,titles,"[27689, 904]"
4,46.251594,48.884094,49,False,SavingPrivateRyan,740025,782145,dreamworks pictures and paramount pictures pre...,"[67, 1572, 18357, 5242, 293, 6220, 792, 5242, ..."
...,...,...,...,...,...,...,...,...,...
3113,10144.994094,10145.584719,49,False,SavingPrivateRyan,162319905,162329355,paramount,"[2181, 335, 792]"
3114,10145.989719,10147.052844,49,False,SavingPrivateRyan,162335835,162352845,a viacom company,"[64, 1932, 326, 298, 2237]"
3115,10150.596594,10151.271594,49,False,SavingPrivateRyan,162409545,162420345,dreamworks,"[67, 1572, 18357]"
3116,10151.490969,10152.300969,49,False,SavingPrivateRyan,162423855,162436815,skg,"[5161, 70]"


In [80]:
transcript_list = list(curr_transcript_df[curr_transcript_df.is_dialogue]['cleaned_text'].values)
sub_arr = red_curr_sub_df['dialogue'].values
sub_trans_idx = np.zeros(len(sub_arr), dtype=np.int32)
conf = np.zeros(len(sub_arr), dtype=np.float32)

model = CrossEncoder("cross-encoder/stsb-distilroberta-base")

### First pass, find maximum match
for ii in range(len(sub_arr)):
    if ii % 50 == 0:
        logging.info(f'Segment: {ii + 1} / {len(sub_arr)}')
    result = model.rank(sub_arr[ii], transcript_list, show_progress_bar=False, top_k=1)
    sub_trans_idx[ii] = result[0]['corpus_id']
    conf[ii] = result[0]['score']
            
red_curr_sub_df['max_ratio_idx'] = sub_trans_idx
test_df = red_curr_sub_df.merge(curr_transcript_df[['cleaned_text', 'cleaned_tokens', 'speaker', 'is_dialogue']], left_on='max_ratio_idx', right_index=True, how='left')

INFO - 2025-02-25 13:10:10,376 - Use pytorch device: cuda
INFO - 2025-02-25 13:10:10,587 - Segment: 1 / 327
INFO - 2025-02-25 13:11:35,962 - Segment: 51 / 327


KeyboardInterrupt: 

In [77]:
result

[{'corpus_id': 184, 'score': np.float32(0.7958899)}]

In [165]:
whisper_tokenizer.encode('6 jun 1944')

[21, 8156, 35133]

In [154]:
curr_transcript_df['tokens'].iloc[36]

[41, 2613, 1386, 11, 35133, 11, 13472, 6969, 318, 20814, 11, 49575, 14866, 13]

In [156]:
red_curr_sub_df

Unnamed: 0,raw_dialogue,start_time,end_time,tokens,max_ratio_idx
0,Father.,0 days 00:03:00.508000,0 days 00:03:02.223000,"[44773, 13]",1514
1,6 Jun 1944. Sektor Dog Green Pantai Omaha.,0 days 00:03:55.187000,0 days 00:04:03.395000,"[21, 8492, 35133, 13, 50, 8192, 284, 13472, 69...",1500
2,Clear the ramp! 30 seconds! May God bless you ...,0 days 00:04:47.615000,0 days 00:05:10.725000,"[34, 5797, 264, 12428, 0, 2217, 3949, 0, 1891,...",194
3,Clear the path climbs out!,0 days 00:05:42.461000,0 days 00:05:45.135000,"[34, 5797, 264, 3100, 48439, 484, 0]",45
4,To the edge!,0 days 00:06:01.021000,0 days 00:06:02.887000,"[13342, 264, 4691, 0]",45
...,...,...,...,...,...
322,Every day I am reminded of the words of the bo...,0 days 02:39:15.039000,0 days 02:39:39.341000,"[15536, 786, 286, 669, 15920, 295, 264, 2283, ...",687
323,James.,0 days 02:39:52.076000,0 days 02:39:53.690000,"[32263, 13, 1, 35372, 1147, 2619, 389, 16932, ...",3053
324,"""Kapten John H Miller.""",0 days 02:39:55.705000,0 days 02:39:58.126000,"[1, 35372, 1147, 2619, 389, 16932, 889]",3053
325,- Say that my life is good. - What? Say that I...,0 days 02:40:06.382000,0 days 02:40:14.177000,"[12, 6463, 300, 452, 993, 307, 665, 13, 359, 7...",1500


In [161]:
curr_sub_df

Unnamed: 0,raw_dialogue,start_time,end_time,tokens,time_diffs
0,Father.,0 days 00:03:00.508000,0 days 00:03:02.223000,"[44773, 13]",-1
1,6 Jun 1944.,0 days 00:03:55.187000,0 days 00:03:58.366000,"[21, 8492, 35133, 13, 50, 8192, 284, 13472, 69...",52
2,Sektor Dog Green Pantai Omaha.,0 days 00:03:59.358000,0 days 00:04:03.395000,"[50, 8192, 284, 13472, 6969, 430, 394, 1301, 4...",0
3,Clear the ramp! 30 seconds! May God bless you ...,0 days 00:04:47.615000,0 days 00:04:51.951000,"[34, 5797, 264, 12428, 0, 2217, 3949, 0, 1891,...",44
4,"Troops left and right the ship, clear climb ou...",0 days 00:04:51.952000,0 days 00:04:56.752000,"[51, 340, 3370, 1411, 293, 558, 264, 5374, 11,...",0
...,...,...,...,...,...
1670,James.,0 days 02:39:52.076000,0 days 02:39:53.690000,"[32263, 13, 1, 35372, 1147, 2619, 389, 16932, ...",12
1671,"""Kapten John H Miller.""",0 days 02:39:55.705000,0 days 02:39:58.126000,"[1, 35372, 1147, 2619, 389, 16932, 889]",2
1672,- Say that my life is good. - What?,0 days 02:40:06.382000,0 days 02:40:11.074000,"[12, 6463, 300, 452, 993, 307, 665, 13, 359, 7...",8
1673,Say that I am a good person.,0 days 02:40:11.554000,0 days 02:40:14.177000,"[42263, 300, 286, 669, 257, 665, 954, 13]",0


In [38]:
curr_transcript_df[33:]

Unnamed: 0,start,end,speaker,is_dialogue,movie_name,start_frame,end_frame,text,tokens
33,258.252219,260.952219,49,False,SavingPrivateRyan,4132035,4175235,"With his family gathered around him, he swallo...","[20943, 702, 1605, 13032, 926, 796, 11, 415, 1..."
34,263.804094,266.892219,49,False,SavingPrivateRyan,4220865,4270275,We draw closer until his frosty blue eyes fill...,"[4360, 2642, 4966, 1826, 702, 19623, 88, 3344,..."
35,271.802844,277.067844,49,False,SavingPrivateRyan,4348845,4433085,"Now, on a beachhead, X-shaped obstacles jut ou...","[13267, 11, 322, 257, 7534, 1934, 11, 1783, 12..."
36,277.658469,282.653469,49,False,SavingPrivateRyan,4442535,4522455,"June 6, 1944, Dog Green Sector, Omaha Beach.","[41, 2613, 1386, 11, 35133, 11, 13472, 6969, 3..."
37,287.597844,293.774094,49,False,SavingPrivateRyan,4601565,4700385,"Out at sea, landing crafts loaded with GIs bou...","[28353, 412, 4158, 11, 11202, 27831, 13210, 36..."
...,...,...,...,...,...,...,...,...,...
3113,10144.994094,10145.584719,49,False,SavingPrivateRyan,162319905,162329355,Paramount.,"[47, 12835, 792, 13]"
3114,10145.989719,10147.052844,49,False,SavingPrivateRyan,162335835,162352845,a Viacom company.,"[64, 6626, 326, 298, 2237, 13]"
3115,10150.596594,10151.271594,49,False,SavingPrivateRyan,162409545,162420345,DreamWorks.,"[35, 1572, 28846, 82, 13]"
3116,10151.490969,10152.300969,49,False,SavingPrivateRyan,162423855,162436815,SKG,"[50, 42, 38]"


In [35]:
test_df

Unnamed: 0,raw_dialogue,start_time,end_time,tokens_x,max_ratio_idx,diff,text,tokens_y,speaker,is_dialogue
0,Father.,0 days 00:03:00.508000,0 days 00:03:02.223000,"[44773, 13]",1514,,.,[13],55,True
1,6 Jun 1944.,0 days 00:03:55.187000,0 days 00:03:58.366000,"[21, 8492, 35133, 13]",1106,-408.0,Baker Company 506.,"[33, 4003, 13918, 2625, 21, 13]",27,True
2,Sektor Dog Green Pantai Omaha.,0 days 00:03:59.358000,0 days 00:04:03.395000,"[50, 8192, 284, 13472, 6969, 430, 394, 1301, 4...",1203,97.0,or three.,"[284, 1045, 13]",28,True
3,Clear the ramp! 30 seconds! May God bless you ...,0 days 00:04:47.615000,0 days 00:04:51.951000,"[34, 5797, 264, 12428, 0, 2217, 3949, 0, 1891,...",45,-1158.0,Clear the room!,"[34, 5797, 264, 1808, 0]",26,True
4,"Troops left and right the ship, clear climb ou...",0 days 00:04:51.952000,0 days 00:04:56.752000,"[51, 340, 3370, 1411, 293, 558, 264, 5374, 11,...",1886,1841.0,"Toot, toot.","[51, 6259, 11, 281, 310, 13]",28,True
...,...,...,...,...,...,...,...,...,...,...
1670,James.,0 days 02:39:52.076000,0 days 02:39:53.690000,"[32263, 13]",2984,-75.0,James.,"[32263, 13]",33,True
1671,"""Kapten John H Miller.""",0 days 02:39:55.705000,0 days 02:39:58.126000,"[1, 35372, 1147, 2619, 389, 16932, 889]",3053,69.0,and John H. Miller.,"[474, 2619, 389, 13, 16932, 13]",45,True
1672,- Say that my life is good. - What?,0 days 02:40:06.382000,0 days 02:40:11.074000,"[12, 6463, 300, 452, 993, 307, 665, 13, 359, 7...",2333,-720.0,This is good.,"[5723, 307, 665, 13]",44,True
1673,Say that I am a good person.,0 days 02:40:11.554000,0 days 02:40:14.177000,"[42263, 300, 286, 669, 257, 665, 954, 13]",1413,-920.0,All that for a general.,"[7868, 300, 337, 257, 2674, 13]",9,True


In [31]:
curr_sub_df

Unnamed: 0,raw_dialogue,start_time,end_time,tokens,max_ratio_idx,diff
0,Father.,0 days 00:03:00.508000,0 days 00:03:02.223000,"[44773, 13]",1514,
1,6 Jun 1944.,0 days 00:03:55.187000,0 days 00:03:58.366000,"[21, 8492, 35133, 13]",1106,-408.0
2,Sektor Dog Green Pantai Omaha.,0 days 00:03:59.358000,0 days 00:04:03.395000,"[50, 8192, 284, 13472, 6969, 430, 394, 1301, 4...",1203,97.0
3,Clear the ramp! 30 seconds! May God bless you ...,0 days 00:04:47.615000,0 days 00:04:51.951000,"[34, 5797, 264, 12428, 0, 2217, 3949, 0, 1891,...",45,-1158.0
4,"Troops left and right the ship, clear climb ou...",0 days 00:04:51.952000,0 days 00:04:56.752000,"[51, 340, 3370, 1411, 293, 558, 264, 5374, 11,...",1886,1841.0
...,...,...,...,...,...,...
1670,James.,0 days 02:39:52.076000,0 days 02:39:53.690000,"[32263, 13]",2984,-75.0
1671,"""Kapten John H Miller.""",0 days 02:39:55.705000,0 days 02:39:58.126000,"[1, 35372, 1147, 2619, 389, 16932, 889]",3053,69.0
1672,- Say that my life is good. - What?,0 days 02:40:06.382000,0 days 02:40:11.074000,"[12, 6463, 300, 452, 993, 307, 665, 13, 359, 7...",2333,-720.0
1673,Say that I am a good person.,0 days 02:40:11.554000,0 days 02:40:14.177000,"[42263, 300, 286, 669, 257, 665, 954, 13]",1413,-920.0


In [None]:
### Second pass, interpolate where single value is obviously wrong
curr_sub_df['diff'] = curr_sub_df['max_ratio_idx'].diff()
# Find when segments "go backwards", adding a false at the beginning as comparisons result in array one shorter than length
# curr_sub_df['not_incr'] = np.concatenate([[False], sub_trans_idx[:len(sub_trans_idx) - 1] > sub_trans_idx[1:]])

for ii in curr_sub_df.index[curr_sub_df['diff'].lt(-5)].drop(curr_sub_df.shape[0]-1, errors='ignore'):
    
    prev_diff, curr_diff, next_diff = curr_sub_df['diff'].iloc[ii-1:ii+2]
    fix_prev = False
    fix_curr = False
    
    if (0 < prev_diff + curr_diff < 10) and (curr_diff + next_diff < -5):
        search_start = curr_sub_df['max_ratio_idx'].iloc[ii-2]
        search_end = curr_sub_df['max_ratio_idx'].iloc[ii]
        fix_prev = True
    elif (0 < curr_diff + next_diff < 10) and (curr_diff + prev_diff < -5):
        search_start = curr_sub_df['max_ratio_idx'].iloc[ii-1]
        search_end = curr_sub_df['max_ratio_idx'].iloc[ii+1]
        fix_curr = True
        
    if fix_prev or fix_curr:
        max_ratio = 0
        last_found = 0
        
        for jj in range(search_start + 1, search_end):
            seq_match = difflib.SequenceMatcher(None, sub_arr[ii], transcript_arr[jj])
            if seq_match.ratio() > max_ratio:
                max_ratio = seq_match.ratio()
                last_found = jj
                
        update_idx = ii if fix_curr else ii - 1
        idx_updates[update_idx] = last_found

In [12]:
curr_sub_df.drop(columns=['start_time', 'end_time', 'tokens'])

Unnamed: 0,raw_dialogue,max_ratio_idx,diff
0,Father.,334,
1,6 Jun 1944.,43,-291.0
2,Sektor Dog Green Pantai Omaha.,44,1.0
3,Clear the ramp! 30 seconds! May God bless you ...,54,10.0
4,"Troops left and right the ship, clear climb ou...",92,38.0
...,...,...,...
1670,James.,1130,-515.0
1671,"""Kapten John H Miller.""",1643,513.0
1672,- Say that my life is good. - What?,838,-805.0
1673,Say that I am a good person.,1047,209.0


In [24]:
ii = 2
curr_transcript_df[['id', 'text']].iloc[20*ii:20*(ii+1)].values

array([[40, 'We draw closer until his frosty blue eyes fill our view.'],
       [41,
        'Now, on a beachhead, X-shaped obstacles jut out of the water.'],
       [42, 'Words appear.'],
       [43, 'June 6, 1944.'],
       [44, 'Dog Green Sector, Omaha Beach.'],
       [45,
        'Out at sea, landing crafts loaded with GIs bounce through heavy surf.'],
       [46, 'One man vomits over the side.'],
       [47,
        "Another soldier's hand trembles as he unscrews his canteen."],
       [48,
        "He takes a drink, and we glimpse Captain's bars painted on his steel pot helmet."],
       [49, 'The Captain has tight, thin lips.'],
       [50, 'A greenish cast colors his tense face.'],
       [51,
        'He looks around uneasily at the two rows of men crammed into the small craft.'],
       [52,
        'A round-faced sergeant stuffs a wad of chewing tobacco in his mouth.'],
       [53, 'A GI pukes, then another.'],
       [54, 'Clear the room!'],
       [55, '30 seconds!'],
   

In [34]:
curr_sub_df['updates'] = idx_updates
curr_sub_df[['max_ratio_idx', 'diff', 'updates']].iloc[500:600]

Unnamed: 0,max_ratio_idx,diff,updates
500,302,-569.0,0
501,999,697.0,0
502,876,-123.0,0
503,301,-575.0,878
504,879,578.0,0
...,...,...,...
595,1015,1.0,0
596,1017,2.0,0
597,1123,106.0,1020
598,1021,-102.0,0


In [None]:
curr_sub_df['smoothed_ratio_idx'] = curr_sub_df['max_ratio_idx'].rolling(11, center=True, min_periods=1).mean()
curr_sub_df['diff_ratio'] = np.abs((curr_sub_df['max_ratio_idx'] / curr_sub_df['smoothed_ratio_idx']) - 1)
curr_sub_df['high_diff'] = curr_sub_df['diff_ratio'].gt(0.2)

# curr_sub_df = curr_sub_df.drop(columns=['raw_start_time', 'raw_end_time', 'raw_time_str', 'tokens'])

In [None]:
outliers = [39, 102, 153, 182, 187, 189, 199, 203, 206, 207, 209, 219]

In [66]:
ii = 7
curr_sub_df[['max_ratio_idx']].iloc[30*ii:30*(ii+1)]

Unnamed: 0,max_ratio_idx
210,363
211,364
212,365
213,367
214,369
215,370
216,372
217,373
218,376
219,107


In [191]:
incr_arr = sub_trans_idx[:len(sub_trans_idx) - 1] > sub_trans_idx[1:]
doubles_arr = incr_arr[1:] & incr_arr[:len(incr_arr) - 1]
triples_arr = doubles_arr[1:] & doubles_arr[:len(doubles_arr) - 1]
quad_arr = triples_arr[1:] & triples_arr[:len(triples_arr) - 1]

In [179]:
sub_trans_idx[362:374]

array([ 626,  628,  629,  566, 3340,  675,  633,  634,  636,  637,  640,
        641], dtype=int32)

In [193]:
np.arange(len(quad_arr))[quad_arr]

array([1076])

In [153]:
curr_sub_df = curr_sub_df.set_index(sub_trans_idx)

combined_df = curr_transcript_df.join(curr_sub_df.add_prefix('sub_'))

In [None]:
raw_dev_movies = [x.split('_')[0] for x in os.listdir(vsd_annotations_dir)]
cat_breakdown = {movie: raw_dev_movies.count(movie) > 1 for movie in set(raw_dev_movies)}

NameError: name 'raw_dev_movies' is not defined

In [None]:
df_list = []
annotation_fps = os.listdir(vsd_annotations_dir)

for movie in transcriptions.keys():
    for annot_filepath in [x for x in annotation_fps if movie in x]:
        cat = utils.remove_ext(annot_filepath.split('_')[1])
        raw_annot_df = da.extract_vsd_annotations(os.path.join(vsd_annotations_dir, annot_filepath))
        raw_annot_df['movie'] = movie
        raw_annot_df['annotation_cat'] = cat
        df_list.append(raw_annot_df)
        
annotations_df = pd.concat(df_list)

In [37]:
import h5py
f = h5py.File(os.path.join(vsd_features_dir, 'IAmLegend_auditory.mat'))
print(list(f.keys()))

# Amplitude-Envelope
# Band Energy Ratio
# Bandwidth
# Mel-Frequency Cepstral Coefficient
# Root Mean Square Energy
# Spectral Centroid
# Spectral Flux
# Zero-Crossing Rate

import numpy as np
raw_data = f.get('BW')
data = np.array(raw_data)

['AE', 'BER', 'BW', 'MFCC', 'RMS', 'SC', 'SF', 'ZCR']
