In [30]:
# import nltk
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# !pip install gensim

import re
import string
import glob
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim.models import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora.dictionary import Dictionary
from gensim.test.utils import datapath
from nltk import word_tokenize, WordNetLemmatizer, pos_tag
from nltk.corpus import stopwords, wordnet

from pprint import pprint

In [7]:
# SUBTITLE PREPROCESSING

# expected subtitle format:
# numeric timestamp index
# timestamp
# subtitles on >= one line
# linebreak

# end of a subtitle line is expected to be a full word (i.e., not hyphenated)

# subtitles may have the following qualities:
# 1. markup (bold, italics, font colour) is between <> 
# e.g., <font color="#00ff00"></font> 
# 2. begin with hyphens
# 3. contain punctuation
# 4. begin (subtitle index 0) and end with subtitle credits of the same format 
# 5. end with "The End"
# 6. contain credits at the start of the subtitles 

def get_subtitle_corpus(subtitle_fns):
    return [preprocess_subs(get_raw_subs(movie)) for movie in subtitle_fns]
        
def get_raw_subs(filename):
    subs = []
    sub_data = []
    
    with open(filename, 'r') as f:
        for line in f:          
            # collect sub lines of some timestamp
            if line != '\n':
                sub_data.append(line[:-1]) # remove newline
                continue
            
            sub_idx = sub_data[0]
            is_sub_author_credits = (sub_idx == '0')
            if is_sub_author_credits: 
                sub_data = []
                continue
            
            # add sub line to all subs
            for i, sub_line in enumerate(sub_data):            
                if i > 1: # index 0 and 1 are sub index and timestamp
                    subs.append(sub_line)
                
            sub_data = []
    
    last_sub_not_included = (sub_data != [])
    if last_sub_not_included:
        for i, sub_line in enumerate(sub_data):            
            if i > 1: # index 0 and 1 are sub index and timestamp
                subs.append(sub_line)
    
    return subs

def preprocess_subs(raw):
    # remove markup
    markup = '<[^>]*>'
    subs = [re.sub(markup, '', r) for r in raw]
    
    # tokenisation case-folding
    subs = [word_tokenize(s.lower()) for s in subs]
    
    # remove stopwords and punctuation
    stops = set(stopwords.words('english'))
    tokenized_contractions_punctuations = {'...', '... ...', "'ll", 
                                           "'s", 'ca', 'je', 'j', 'g', 'jer', 
                                           "'re", "'ve", "''", "n't", "'d", "'m", 
                                           'wo', 'u', '``', 'l'}
    low_info_stops = {}
    stops = stops.union(string.punctuation)
    stops = stops.union(tokenized_contractions_punctuations)
    stops = stops.union(low_info_stops)
    subs = [[t for t in sub if t not in stops] for sub in subs]
    
    # lemmatize
    pos = get_pos_tags(subs) # different POS lemmatize differently
    subs = [t for sub in subs for t in sub] # flatten
    lmtzr = WordNetLemmatizer()
    subs = [lmtzr.lemmatize(s, pos[i]) for i, s in enumerate(subs)]
        
    return subs

def get_pos_tags(subs):
    pos = [pos_tag(sub) for sub in subs]
    pos = [get_wordnet_pos(tag) for sub in pos for t, tag in sub] # flatten
    return pos

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else: # default POS in WordNetLemmatizer is noun
        return wordnet.NOUN

def get_vocabulary(bag_of_words):
    return set(bag_of_words)

In [59]:
%%time
# TOPIC MODELLING
def get_subtitle_fns(folder_fn_regex='srt/*'):
    return glob.glob(folder_fn_regex)

def load_lda(filename):
    return LdaModel.load(filename)

movies_corpus = get_subtitle_corpus(get_subtitle_fns()) 
dictionary = Dictionary(movies_corpus)

# remove low information words based on how frequently they appear in 
    # the intra-document and inter-document levels
    # 1. sparsely occurring words for each document; and 
    # 2. words frequent over the WHOLE collection
# remove words appearing in <20 or >50% movies
# dictionary.filter_extremes(no_below=20, no_above=0.5) # TODO adjust
    
movies_corpus = [dictionary.doc2bow(movie) for movie in movies_corpus] # vect

num_topics = 55 # paper used 55
chunksize = 2000 # default
passes = 20 # TODO adjust this
iterations = 400 # TODO adjust this --- passes and iterations must be adjusted to allow convergence
eval_every = None # apparently evaluating model perplexity takes too much time?

lda = LdaModel(movies_corpus, 
               id2word=dictionary,
               chunksize=chunksize,
               passes=passes,
               iterations=iterations,
               num_topics=num_topics,
               eval_every=eval_every)

lda.save(datapath('test')) # give absolute path

top_topics = lda.top_topics(movies_corpus, topn=20)
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics

print('Average topic coherence: %.4f.' % avg_topic_coherence)
pprint(top_topics)

2018-11-10 17:30:30,591 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-11-10 17:30:30,603 : INFO : built Dictionary(2075 unique tokens: ["'cause", '5', '50', '6', '60']...) from 4 documents (total 12289 corpus positions)
2018-11-10 17:30:30,617 : INFO : using symmetric alpha at 0.01818181818181818
2018-11-10 17:30:30,618 : INFO : using symmetric eta at 0.01818181818181818
2018-11-10 17:30:30,620 : INFO : using serial LDA version on this node
2018-11-10 17:30:30,643 : INFO : running online (multi-pass) LDA training, 55 topics, 20 passes over the supplied corpus of 4 documents, updating model once every 4 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2018-11-10 17:30:30,645 : INFO : PROGRESS: pass 0, at document #4/4
2018-11-10 17:30:30,803 : INFO : topic #30 (0.018): 0.019*"howl" + 0.015*"sophie" + 0.014*"go" + 0.014*"get" + 0.009*"look" + 0.008*"come" + 0.008*"right" + 0.007*"like" + 0.007*"back" + 0.006

2018-11-10 17:30:31,369 : INFO : topic #21 (0.018): 0.000*"dentsu" + 0.000*"destroy" + 0.000*"defeat" + 0.000*"defend" + 0.000*"defile" + 0.000*"demand" + 0.000*"den" + 0.000*"death" + 0.000*"discover" + 0.000*"disease"
2018-11-10 17:30:31,370 : INFO : topic #15 (0.018): 0.010*"howl" + 0.009*"go" + 0.008*"get" + 0.006*"sophie" + 0.005*"come" + 0.005*"like" + 0.004*"look" + 0.004*"back" + 0.004*"let" + 0.004*"please"
2018-11-10 17:30:31,372 : INFO : topic #48 (0.018): 0.005*"go" + 0.004*"human" + 0.004*"shishi-gami" + 0.004*"back" + 0.003*"eboshi" + 0.003*"come" + 0.003*"let" + 0.003*"lady" + 0.003*"kill" + 0.003*"ashitaka"
2018-11-10 17:30:31,373 : INFO : topic diff=0.492504, rho=0.353553
2018-11-10 17:30:31,375 : INFO : PROGRESS: pass 7, at document #4/4
2018-11-10 17:30:31,429 : INFO : topic #26 (0.018): 0.001*"get" + 0.001*"come" + 0.001*"go" + 0.001*"sen" + 0.001*"look" + 0.001*"haku" + 0.001*"see" + 0.001*"back" + 0.001*"want" + 0.001*"take"
2018-11-10 17:30:31,430 : INFO : topic 

2018-11-10 17:30:31,787 : INFO : topic #47 (0.018): 0.000*"shishi-gami" + 0.000*"go" + 0.000*"back" + 0.000*"eboshi" + 0.000*"human" + 0.000*"come" + 0.000*"forest" + 0.000*"lady" + 0.000*"ashitaka" + 0.000*"u"
2018-11-10 17:30:31,789 : INFO : topic #12 (0.018): 0.000*"dentsu" + 0.000*"destroy" + 0.000*"defeat" + 0.000*"defend" + 0.000*"defile" + 0.000*"demand" + 0.000*"den" + 0.000*"death" + 0.000*"discover" + 0.000*"disease"
2018-11-10 17:30:31,790 : INFO : topic #32 (0.018): 0.000*"dentsu" + 0.000*"destroy" + 0.000*"defeat" + 0.000*"defend" + 0.000*"defile" + 0.000*"demand" + 0.000*"den" + 0.000*"death" + 0.000*"discover" + 0.000*"disease"
2018-11-10 17:30:31,791 : INFO : topic #30 (0.018): 0.001*"howl" + 0.001*"sophie" + 0.001*"go" + 0.001*"get" + 0.001*"look" + 0.001*"come" + 0.001*"right" + 0.001*"like" + 0.001*"back" + 0.001*"good"
2018-11-10 17:30:31,797 : INFO : topic diff=0.093653, rho=0.258199
2018-11-10 17:30:31,798 : INFO : PROGRESS: pass 14, at document #4/4
2018-11-10 17

2018-11-10 17:30:32,195 : INFO : saved X:\Anaconda3\lib\site-packages\gensim\test\test_data\test.state
2018-11-10 17:30:32,205 : INFO : saving LdaModel object under X:\Anaconda3\lib\site-packages\gensim\test\test_data\test, separately ['expElogbeta', 'sstats']
2018-11-10 17:30:32,207 : INFO : storing np array 'expElogbeta' to X:\Anaconda3\lib\site-packages\gensim\test\test_data\test.expElogbeta.npy
2018-11-10 17:30:32,219 : INFO : not storing attribute state
2018-11-10 17:30:32,220 : INFO : not storing attribute dispatcher
2018-11-10 17:30:32,221 : INFO : not storing attribute id2word
2018-11-10 17:30:32,231 : INFO : saved X:\Anaconda3\lib\site-packages\gensim\test\test_data\test


Average topic coherence: -0.1652.
[([(0.028377054, 'ponyo'),
   (0.021786038, 'sosuke'),
   (0.020266851, 'go'),
   (0.012446368, 'come'),
   (0.0115601495, 'let'),
   (0.011468796, 'lisa'),
   (0.011322326, 'good'),
   (0.011145548, 'like'),
   (0.009585739, 'little'),
   (0.009486706, 'okay'),
   (0.0091913305, 'get'),
   (0.00886419, 'see'),
   (0.008776071, 'look'),
   (0.008728315, 'make'),
   (0.008144027, 'want'),
   (0.007890558, 'take'),
   (0.007667365, 'right'),
   (0.007608491, 'u'),
   (0.007606759, 'back'),
   (0.0075664385, 'human')],
  -0.043777716665007044),
 ([(0.000508328, 'howl'),
   (0.0005027127, 'sophie'),
   (0.0005007922, 'go'),
   (0.00050069793, 'get'),
   (0.0004935496, 'look'),
   (0.0004933012, 'come'),
   (0.0004921239, 'right'),
   (0.00049180974, 'like'),
   (0.00049107394, 'back'),
   (0.00049048883, 'good'),
   (0.00048967276, 'witch'),
   (0.00048883766, 'know'),
   (0.00048837194, 'well'),
   (0.00048836635, 'help'),
   (0.00048779277, 'make'),
   (

 ([(0.0005924052, 'go'),
   (0.0005762348, 'human'),
   (0.0005753486, 'shishi-gami'),
   (0.00056567707, 'back'),
   (0.00054634403, 'eboshi'),
   (0.00054506486, 'come'),
   (0.0005432395, 'let'),
   (0.00053883385, 'lady'),
   (0.0005339772, 'kill'),
   (0.0005311155, 'ashitaka'),
   (0.00052733824, 'wolf'),
   (0.0005259877, 'forest'),
   (0.00052280235, 'san'),
   (0.0005213504, 'u'),
   (0.0005204196, 'must'),
   (0.000519107, 'leave'),
   (0.0005164352, 'take'),
   (0.0005133384, 'fight'),
   (0.0005131755, 'still'),
   (0.00051316724, 'help')],
  -0.2617007733139052),
 ([(0.0005018477, 'go'),
   (0.0004988406, 'human'),
   (0.00049640937, 'shishi-gami'),
   (0.0004948465, 'forest'),
   (0.00049376633, 'back'),
   (0.00049325894, 'come'),
   (0.00049290794, 'let'),
   (0.00049274566, 'kill'),
   (0.0004925343, 'eboshi'),
   (0.0004909947, 'lady'),
   (0.00048987626, 'must'),
   (0.0004897756, 'wolf'),
   (0.0004893392, 'u'),
   (0.0004888997, 'san'),
   (0.00048874266, 'life'),


In [None]:
# topic similarity

# cosine similarity between topic components 

In [None]:
# visualise total similarity matrix 
# white = 1, black = 0 