In [None]:
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s - %(asctime)s - %(message)s')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import whisper

import tiktoken
whisper_tokenizer = whisper.tokenizer.get_tokenizer(tiktoken.get_encoding(tiktoken.list_encoding_names()[-1]), num_languages=1)

import json

with open('config.json') as fileobj:
    hf_token = json.load(fileobj)['hugging_face_token']

import torch

from sklearn.metrics import RocCurveDisplay, ConfusionMatrixDisplay, confusion_matrix

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

import os

from IPython.display import clear_output
clear_output()

import utils
import data_extraction as da

enc_model = 'bert-base-uncased'
whisper_model = 'turbo'

audio_dir = os.path.join('data', 'audio-vault')
transcription_dir = os.path.join(audio_dir, 'transcriptions')
diarization_dir = os.path.join(audio_dir, 'diarization_segments')

vsd_dir = os.path.join('data', 'VSD', 'VSD2014_officialrelease', 'VSD_2014_December_official_release', 'Hollywood-dev')
vsd_features_dir = os.path.join(vsd_dir, 'features')
vsd_annotations_dir = os.path.join(vsd_dir, 'annotations')

utils.ensure_dir_exists(transcription_dir)

transcript_df_fp = '{movie_name}-transcript.parquet'
annot_cats = ['blood', 'carchase', 'coldarms', 'explosions', 'fights', 'fire', 'firearms', 'gore', 'gunshots', 'screams']

In [88]:
from nltk import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
import re

### TODO:
- Rerun text to speech with English as set language
- Group Narration
- Identify closest segment

In [None]:
def calc_speed_ratio(first_annot, last_annot, last_seg):
    return 1 + (last_seg - last_annot) / (last_annot - first_annot)

movie_offsets = {
    # Offset for first event, then multiplier for transcript speed differences
    # 'PiratesOfTheCarribeanTheCurseOfTheBlackPearl': (95, 1 + (7386 - 7160) / (7160 - 1867)),
    'PiratesOfTheCarribeanTheCurseOfTheBlackPearl': (95, calc_speed_ratio(1772, 7065, 7291)),
    'SavingPrivateRyan': (14, calc_speed_ratio(663, 8907, 9263)),
    # 'Eragon': (25, 1),
    'FightClub': (-162, calc_speed_ratio(2542, 7676, 7782)),
    'IAmLegend': (25, 1),
    'ReservoirDogs': (-13, calc_speed_ratio(565, 5490, 5465))
}

In [90]:
# Extract and tokenize subtitles for matching
subs_dict = da.extract_subs()
subs_df_list = []

for movie, subs_df in subs_dict.items():
    subs_df['dialogue'] = da.clean_dialogue(subs_df['raw_dialogue'])
    subs_df['tokens'] = subs_df['dialogue'].apply(lambda x: whisper_tokenizer.encode(x))
    subs_df['movie'] = movie
    subs_df_list.append(subs_df)
    
subs_df = pd.concat(subs_df_list)

In [91]:
annotations_df = da.get_vsd_movie_annotations(vsd_annotations_dir, list(movie_offsets.keys()))
gore_mask = annotations_df.annotation_cat.eq('gore')
blood_mask = annotations_df.annotation_cat.eq('blood') & annotations_df.desc.isin(['low', 'medium', 'high'])
gory_df = annotations_df[gore_mask | blood_mask].copy()
gory_df['full_annotation_cat'] = gory_df['annotation_cat'] + '-' + gory_df['desc']
gory_df = gory_df.drop(columns=['desc'])

In [92]:
# Convert from 25 fps to seconds, and add ARBITRARY offset based on observation (movies out of sync)
gory_df[['start_sec', 'end_sec']] = gory_df[['start', 'finish']] / 25

gory_df = da.convert_time_to_readable_txt(gory_df, [('start_sec', 'start_txt'), ('end_sec', 'end_txt')])
# Buffering start and finish 1 second either side (no need to be precise)
gory_df['start_sec'] -= 1
gory_df['end_sec'] += 1
gory_df['seg_duration'] = gory_df['end_sec'] - gory_df['start_sec'] 

In [93]:
seg_df = da.get_segments(transcription_dir, gory_df['movie'].unique(), transcript_df_fp, whisper_tokenizer)
seg_df['duration'] = seg_df['end'] - seg_df['start']

gory_df = gory_df.sort_values('start')

# Add individual offsets to align each movie with audio recording
for movie, (offset, ratio) in movie_offsets.items():
    vec = seg_df.loc[seg_df.movie.eq(movie), ['start']]
    first_annot = gory_df['start_sec'][gory_df.movie.eq(movie)].iloc[0]
    # seg_df.loc[seg_df.movie.eq(movie), ['start']] = first_annot + (vec - offset - first_annot) / ratio

seg_df['end'] = seg_df['start'] + seg_df['duration']

seg_df = da.convert_time_to_readable_txt(seg_df, [('start', 'start_txt'), ('end', 'end_txt')])

In [None]:
tokenizer = AutoTokenizer.from_pretrained(enc_model)
seg_df['num_tokens'] = [len(x) for x in tokenizer(list(seg_df['text']))['input_ids']]
tokenizer.model_max_length - 5

In [94]:
def process_movie_group(group):
    # Identify where speaker changes within this movie group
    speaker_change = (group['speaker'] != group['speaker'].shift()).cumsum()
    
    # Create a group key that only applies to consecutive narrator rows
    group_key = group['is_dialogue'].eq(False).where(lambda x: x, pd.NA) * speaker_change

    narrator_agg = (
        group[group['is_dialogue'].eq(False)]
        .groupby(group_key)
        .agg({
            'speaker': 'first',
            'is_dialogue': 'first',
            'start_txt': 'first',
            'end_txt': 'last',
            
            'duration': 'sum',
            
            'text': ' '.join,
            'start': 'min',
            'end': 'max'
            })
    )

    combined_df = pd.concat([group[group.is_dialogue.eq(True)], narrator_agg], sort=False).sort_values('start')
    
    return combined_df

# Apply to each movie
agg_seg_df = (
    seg_df.groupby('movie')
        .apply(lambda group: process_movie_group(group), include_groups=False)
        .reset_index(level=0)
)

agg_seg_df = agg_seg_df[agg_seg_df.is_dialogue.eq(False)]

In [96]:
agg_seg_df = agg_seg_df.sort_values(['movie', 'start']).reset_index(drop=True)

In [163]:
# Mark each segment as containing gore by looping through all the gory annotations
# TODO: group narration and identify closest piece if no overlap
agg_seg_df['has_gore'] = False
agg_seg_df['has_blood'] = False
agg_seg_df['blood_cat'] = pd.NA

agg_seg_df['gore_start'] = -1 
agg_seg_df['gore_end'] = -1 

agg_seg_df['blood_start'] = -1 
agg_seg_df['blood_end'] = -1 

seg_df_list = []

for movie in gory_df.movie.unique():
    
    curr_gory_df = gory_df[gory_df.movie.eq(movie)].copy()
    curr_seg_df = agg_seg_df[agg_seg_df.movie.eq(movie)].copy()
    
    for ii in range(curr_gory_df.shape[0]):
        
        cat = 'gore' if 'gore' in curr_gory_df['annotation_cat'].iloc[ii] else 'blood'
        
        # Identify any segments that overlap with gory segment
        anot_start_after_seg = curr_seg_df.start > curr_gory_df['end_sec'].iloc[ii]
        seg_end_before_anot = curr_seg_df.end < curr_gory_df['start_sec'].iloc[ii]
        curr_anot_mask = np.logical_not(anot_start_after_seg | seg_end_before_anot)
        
        if sum(curr_anot_mask) > 0:
            curr_seg_df[f'{cat}_start'] = np.where(curr_anot_mask, curr_gory_df['start_sec'].iloc[ii], curr_seg_df[f'{cat}_start'])
            curr_seg_df[f'{cat}_end'] = np.where(curr_anot_mask, curr_gory_df['end_sec'].iloc[ii], curr_seg_df[f'{cat}_end'])
            
            curr_seg_df[f'has_{cat}'] = curr_seg_df[f'has_{cat}'] | curr_anot_mask
            
            if cat == 'blood':
                curr_seg_df['blood_cat'] = np.where(curr_anot_mask, curr_gory_df['full_annotation_cat'].iloc[ii], curr_seg_df['blood_cat'])
                
        # No overlap, so find closest segment   
        else:
            annot_midpoint = (curr_gory_df['start_sec'].iloc[ii] + curr_gory_df['end_sec'].iloc[ii]) / 2
            closest_start_idx = next(iter(curr_seg_df.start[curr_seg_df.start > annot_midpoint].index), None)
            closest_start = abs(curr_seg_df.start[closest_start_idx] - annot_midpoint) if closest_start_idx else curr_seg_df.end.iloc[-1]
            closest_end_idx = max(0, closest_start_idx - 1) if closest_start_idx else curr_seg_df.index[-1]
            closest_end = abs(curr_seg_df.end[closest_end_idx] - annot_midpoint) if closest_end_idx else curr_seg_df.end.iloc[-1]
            
            closest_seg_idx = closest_end_idx
            
            if closest_start is not None and closest_end is not None and closest_start < closest_end:
                closest_seg_idx = closest_start_idx
                
            curr_seg_df[f'{cat}_start'][closest_seg_idx] = curr_gory_df['start_sec'].iloc[ii]
            curr_seg_df[f'{cat}_end'][closest_seg_idx] = curr_gory_df['end_sec'].iloc[ii]
            
            curr_seg_df[f'has_{cat}'][closest_seg_idx] = True
            
            if cat == 'blood':
                curr_seg_df['blood_cat'][closest_seg_idx] = curr_gory_df['full_annotation_cat'].iloc[ii]
                
    seg_df_list.append(curr_seg_df)
    
cat_seg_df = pd.concat(seg_df_list)

# Hide warnings that think I don't understand how copying works
clear_output()

In [164]:
cat_seg_df['cat'] = np.select(
    [cat_seg_df['has_gore'] | cat_seg_df['blood_cat'].isin(['blood-high', 'blood-medium']), cat_seg_df.has_blood],
    ['gore', 'blood'],
    default='neutral'
)
cat_seg_df['cat'].value_counts()

cat
neutral    1366
gore        216
blood        31
Name: count, dtype: int64

In [None]:
# cat_seg_df['text'] = cat_seg_df['text'].str.replace('"', '').str.replace('\s+', ' ', regex=True).str.strip()

Token indices sequence length is longer than the specified maximum sequence length for this model (1087 > 512). Running this sequence through the model will result in indexing errors


In [166]:
test = tokenizer(list(cat_seg_df['text']))

In [167]:
test.encodings[4].tokens

['[CLS]',
 'captain',
 'shove',
 '##s',
 'men',
 'over',
 'the',
 'side',
 'out',
 'of',
 'the',
 'line',
 'of',
 'fire',
 '.',
 'many',
 'yards',
 'from',
 'the',
 'sand',
 ',',
 'they',
 'flop',
 'into',
 'the',
 'water',
 '.',
 'their',
 'gear',
 'drag',
 '##s',
 'them',
 'under',
 '.',
 'a',
 'guy',
 'sinks',
 'straight',
 'down',
 '.',
 'and',
 'slice',
 'through',
 'the',
 'water',
 'like',
 'torpedoes',
 '.',
 'as',
 'two',
 'gi',
 '##s',
 'struggle',
 'out',
 'of',
 'their',
 'heavy',
 'packs',
 ',',
 'red',
 'clouds',
 'fill',
 'the',
 'water',
 'and',
 'they',
 'sink',
 '.',
 'another',
 'struggling',
 'soldier',
 'gap',
 '##es',
 ',',
 'stiff',
 '##ens',
 ',',
 'then',
 'stills',
 '.',
 'troops',
 'try',
 'to',
 'run',
 'on',
 'the',
 'ocean',
 'bottom',
 '.',
 'at',
 'the',
 'obstacle',
 ',',
 'dead',
 'men',
 'lie',
 'across',
 'the',
 'steel',
 'x',
 "'",
 's',
 '.',
 'other',
 'troops',
 'take',
 'cover',
 'behind',
 'them',
 '.',
 'the',
 'captain',
 'drag',
 '##s',
 'a

In [168]:
[len(x.tokens) for ii, x in enumerate(test.encodings) if len(x.tokens) > 512]

[529, 1036, 1087]