In [24]:
# import nltk
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')

import re
from nltk import word_tokenize, WordNetLemmatizer, pos_tag
from nltk.corpus import stopwords, wordnet
from pprint import pprint
import string

In [2]:
# SUBTITLE FILENAMES

folder_fn = 'srt/'
subtitle_fns = ["howl's moving castle", "ponyo", 
                "princess mononoke", "spirited away"]
fn_ext = '.srt'

In [65]:
# SUBTITLE PREPROCESSING

# expected subtitle format:
# numeric timestamp index
# timestamp
# subtitles on >= one line
# linebreak

# end of a subtitle line is expected to be a full word (i.e., not hyphenated)

# subtitles may have the following qualities:
# 1. markup (bold, italics, font colour) is between <> 
# e.g., <font color="#00ff00"></font> 
# 2. begin with hyphens
# 3. contain punctuation
# 4. begin (subtitle index 0) and end with subtitle credits of the same format 
# 5. end with "The End"
# 6. contain credits at the start of the subtitles 

def get_raw_subs(filename):
    subs = []
    sub_data = []
    
    with open(filename, 'r') as f:
        for line in f:          
            # collect sub lines of some timestamp
            if line != '\n':
                sub_data.append(line[:-1]) # remove newline
                continue
            
            sub_idx = sub_data[0]
            is_sub_author_credits = (sub_idx == '0')
            if is_sub_author_credits: 
                sub_data = []
                continue
            
            # add sub line to all subs
            for i, sub_line in enumerate(sub_data):            
                if i > 1: # index 0 and 1 are sub index and timestamp
                    subs.append(sub_line)
                
            sub_data = []
    
    return subs

def preprocess_subs(raw):
    # remove markup
    markup = '<[^>]*>'
    subs = [re.sub(markup, '', r) for r in raw]
    
    # tokenisation case-folding
    subs = [word_tokenize(s.lower()) for s in subs]
    
    # remove stopwords and punctuation
    stops = set(stopwords.words('english'))
    extra_stops = {'...', '... ...', "'ll", "'s", 'ca', 'je', 'j', 'g', 'jer', 
                   "'re", "'ve", "''", "n't", "'d", "'m", 'wo', 'u', '``', 'l'}
    low_info_stops = {}
    stops = stops.union(string.punctuation)
    stops = stops.union(extra_stops)
    stops = stops.union(low_info_stops)
    subs = [[t for t in sub if t not in stops] for sub in subs]
    
    # TODO
    # remove low information words based on how frequently they appear in 
    # the intra-document and inter-document levels
    # 1. sparsely occurring words for each document; and 
    # 2. words frequent over the WHOLE collection
    
    # lemmatize
    pos = get_pos_tags(subs) # different POS lemmatize differently
    subs = [t for sub in subs for t in sub] # flatten
    lmtzr = WordNetLemmatizer()
    subs = [lmtzr.lemmatize(s, pos[i]) for i, s in enumerate(subs)]
        
    return subs

def get_pos_tags(subs):
    pos = [pos_tag(sub) for sub in subs]
    pos = [get_wordnet_pos(tag) for sub in pos for t, tag in sub] # flatten, get wordnet pos
    return pos

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else: # default POS in WordNetLemmatizer is noun
        return wordnet.NOUN

def get_vocabulary(bag_of_words):
    return set(bag_of_words)

for sfn in subtitle_fns:
    filename = folder_fn + sfn + fn_ext
    raw = get_raw_subs(filename)
    bag_of_words = preprocess_subs(raw)
    vocabulary = get_vocabulary(bag_of_words)
    print(vocabulary)

{'pay', 'assures', 'like', 'dangerous', 'deliver', 'junk', 'always', 'darkness', '50', 'ship', 'son', 'together', 'morning', 'happen', 'sparkling', 'cowardly', 'middle', 'age', 'spell', 'danger', 'body', 'final', 'try', 'serve', 'people', 'able', 'hurry', 'cast', 'friendly', 'heavy', 'puppy', 'advanced', 'eat', 'good', 'make', 'say', 'force', 'believe', 'breakfast', 'mean', 'tantrum', 'ruin', 'wider', 'unpredictable', 'track', 'gang', 'gunpowder', 'die', 'drove', 'longing', 'handsome', 'restore', 'nearby', 'would', 'must', 'weakness', 'information', 'heartless', 'tasty', 'appreciate', 'sophie', 'clean', 'feel', 'interesting', 'blind', 'turn', 'confidential', 'weary', 'brings', 'evil', 'humiliate', 'everyone', 'lovely', 'hang', 'way', 'scarecrow', 'heart', 'fun', 'trust', 'tell', 'maybe', 'airship', 'kept', 'thought', 'call', 'look', 'carry', 'true', 'neighbor', 'chair', 'luck', 'path', 'keep', 'masquerade', 'pick', 'wear', 'catch', 'courtyard', 'matter', 'help', 'consume', 'study', 'dr

{'pay', 'weastern', 'nago', 'like', 'dangerous', 'always', 'laby', 'fail', 'together', 'morning', 'breathing', 'happen', 'finish', 'rid', 'strong', 'neck', 'danger', 'grab', 'forest', 'impressive', 'body', 'appear', 'final', 'threw', 'melt', 'snap', 'step', 'hope', 'saddle', 'black', 'try', 'obvious', 'eating', 'end', 'nonsense', 'imperial', 'people', 'bone', 'koruku', 'shipment', 'tribe', 'contemplate', 'creature', 'moutain', 'grim', 'calm', 'able', 'hurry', 'slowly', 'journey', 'demand', 'sleep', 'smell', 'heavy', 'among', 'japan', 'kaya', 'settle', 'eat', 'suffering', 'shut', 'set', 'domain', 'hand', 'good', 'soon', 'fool', 'full', 'race', 'make', 'say', 'entrails', 'afraid', 'trail', 'starvation', 'indeed', 'believe', 'mean', 'generation', 'plant', 'powerful', 'devil', 'rot', 'mountain', 'yet', 'across', 'track', 'everybody', 'ahead', 'warn', 'die', 'injure', 'moutains', 'hall', 'nippon', 'struck', 'messenger', 'weak', 'handsome', 'urgent', 'party', 'wild', 'suggest', 'nearby', 'el

{'pay', 'jerk', 'like', 'dangerous', 'furious', 'always', 'darkness', 'mommy', 'fail', 'together', 'morning', 'dad', 'appetizer', 'lazy', 'crybaby', 'happen', 'finish', 'anybody', 'strong', 'traumatize', 'middle', 'neck', 'diversion', 'okay', 'spell', 'grab', 'body', 'final', 'hope', 'handful', 'gravely', 'step', 'black', 'try', 'nonsense', 'end', 'serve', 'people', 'scratch', 'calm', 'able', 'bos', 'mover', 'cause', 'hurry', 'journey', 'worm', 'cast', 'woke', 'sleep', 'smell', 'gut', 'mm-hmm', 'dragon', 'gim', 'bye', 'eat', 'shut', 'set', 'hand', 'good', 'soon', 'full', 'weakling', 'make', 'say', 'afraid', 'tender', 'believe', 'breakfast', 'joke', 'mean', 'sixth', 'powerful', 'entrance', 'yet', 'buts', 'across', 'bleed', 'track', 'everybody', 'warn', 'replace', 'bleeding', 'die', 'gon', 'mmm', 'formula', 'rescue', 'cling', 'would', 'must', 'gentleman', 'shake', 'good-bye', 'else', 'sign', 'voice', 'clip', 'wh', 'yesterday', 'ya', 'contract', 'clean', 'feel', 'leftover', 'guy', 'memory

In [None]:
# topic modelling

# use gensim 
# collapsed gibbs sampling to approximate posterior 

# paper used T = 55 topics to be extracted 



In [None]:
# topic similarity

# cosine similarity between topic components 

In [None]:
# visualise total similarity matrix 
# white = 1, black = 0 