In [75]:
# import nltk
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('stopwords')

import re
from nltk import TweetTokenizer, WordNetLemmatizer
from nltk.corpus import stopwords
from pprint import pprint
import string

In [94]:
# SUBTITLE FILENAMES
# Each subtitle path is assembled as folder_fn + <base name> + fn_ext.

folder_fn = 'srt/'  # directory prefix; keep the trailing slash, paths are built by plain concatenation
subtitle_fns = ["howl's moving castle", "ponyo", 
                "princess mononoke", "spirited away"]  # base filenames, without directory or extension
fn_ext = '.srt'  # extension appended to every base filename

In [99]:
# SUBTITLE PREPROCESSING

# expected subtitle format (repeated for every entry):
# numeric subtitle index
# timestamp
# subtitle text on one or more lines
# blank line

# each subtitle line is expected to end on a full word (i.e., not hyphenated)

# subtitles may have the following qualities:
# 1. markup (bold, italics, font colour) enclosed in <>
# e.g., <font color="#00ff00"></font>
# 2. lines that begin with hyphens
# 3. punctuation
# 4. subtitle credits, in the same format, at the beginning (subtitle index 0) and at the end
# 5. a closing "The End"
# 6. credits at the start of the subtitles

def get_raw_subs(filename):
    """Collect the subtitle text lines of an SRT-style file.

    The file is read as a sequence of entries separated by blank lines. The
    first two lines of every entry (numeric index and timestamp) are dropped
    and the remaining lines are kept as subtitle text. An entry whose index
    line is '0' is treated as subtitle-author credits and skipped entirely.

    NOTE: only index-0 credits are filtered here; credits stored under any
    other index are returned like ordinary subtitle text.

    Parameters
    ----------
    filename : str
        Path of the subtitle file to read.

    Returns
    -------
    list of str
        Subtitle text lines in file order, without their trailing newline.
    """
    subs = []
    sub_data = []

    def keep_entry(entry):
        # Append the text lines of one complete entry to `subs`.
        if not entry:
            # consecutive (or leading) blank lines produce an empty entry
            return
        # a UTF-8 byte-order mark may be glued to the very first index line
        index = entry[0].lstrip('\ufeff').strip()
        if index == '0':
            # subtitle-author credits
            return
        # positions 0 and 1 are the sub index and the timestamp
        subs.extend(entry[2:])

    with open(filename, 'r') as f:
        for line in f:
            # strip only the newline, so a last line that lacks one is not
            # robbed of its final character
            text = line.rstrip('\n')
            if text:
                # still inside the current entry
                sub_data.append(text)
                continue

            # blank line: the current entry is complete
            keep_entry(sub_data)
            sub_data = []

    # the last entry is not followed by a blank line in every file
    keep_entry(sub_data)

    return subs

def preprocess_subs(raw):
    """Turn raw subtitle lines into a list of lemmatized tokens.

    Steps, in order: join the lines with spaces, strip <...> markup,
    lower-case, tokenise with NLTK's TweetTokenizer, drop English stopwords
    and punctuation, then lemmatize with WordNetLemmatizer.

    Parameters
    ----------
    raw : iterable of str
        Subtitle text lines.

    Returns
    -------
    list of str
        Lemmatized tokens, in their original order (duplicates are kept).
    """
    # remove markup, i.e. anything between < and >
    text = re.sub('<[^>]*>', '', ' '.join(raw))

    # case-folding, then tokenisation (tokens include contractions)
    tokens = TweetTokenizer().tokenize(text.lower())

    # stopwords, plus the ellipsis tokens
    ignored = set(stopwords.words('english')) | {'...', '... ...'}

    # remove stopwords and punctuation; the punctuation check is a substring
    # test against string.punctuation
    kept = []
    for token in tokens:
        if token in ignored or token in string.punctuation:
            continue
        kept.append(token)

    # TODO
    # remove low information words based on how frequently they appear in 
    # the intra-document and inter-document levels
    # 1. sparsely occurring words for each document; and 
    # 2. words frequent over the WHOLE collection

    # lemmatize
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in kept]

def get_vocabulary(bag_of_words):
    """Return the distinct items of `bag_of_words` as a set."""
    vocabulary = set()
    vocabulary.update(bag_of_words)
    return vocabulary

# Preprocess every subtitle file. Results are stored per film (keyed by the
# base filename) so that the data of every document stays available after
# the loop; the plain loop variables only ever hold the last film processed.
bags_of_words = {}
vocabularies = {}

for sfn in subtitle_fns:
    filename = folder_fn + sfn + fn_ext
    raw = get_raw_subs(filename)
    bag_of_words = preprocess_subs(raw)
    vocabulary = get_vocabulary(bag_of_words)

    bags_of_words[sfn] = bag_of_words
    vocabularies[sfn] = vocabulary

In [None]:
# topic modelling

# use gensim 
# collapsed gibbs sampling to approximate posterior 

# the paper extracted T = 55 topics



In [None]:
# topic similarity

# cosine similarity between topic components 

In [None]:
# visualise total similarity matrix 
# white = 1, black = 0 