# Topic Modeling with *Frankenstein*

This notebook runs through some of the basics of topic modeling using the novel **Frankenstein** by Mary Shelly. 

Followed tutorial from https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

## Text Cleaning 

Tokenize each word, replace urls with the token URL, email address with SCREEN_NAME, and all other tokens are lower case

In [19]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()


def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

Use word net to get roots of words, ie running and ran -> run

In [20]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('wordnet')


from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to /Users/zeke/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Get a set of stop words from nltk

In [21]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))
en_stop.add('could')
en_stop.add('would')
en_stop.add('might')

[nltk_data] Downloading package stopwords to /Users/zeke/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Function that converts raw text to a list of tokens for lda

In [22]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [23]:
import random
text_data = []
with open('Frankenstein.txt') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .5:
            print(tokens)
            text_data.append(tokens)

['letter']
[]
['petersburgh']
[]
['commencement', 'enterprise', 'regard']
['foreboding', 'arrive', 'yesterday', 'first', 'assure']
['petersburgh', 'northern', 'breeze', 'cheek']
['braces', 'nerves', 'fill', 'delight', 'understand']
['advance', 'give', 'foretaste', 'clime']
['frost', 'desolation', 'present', 'imagination']
['region', 'beauty', 'delight', 'margaret']
['trust', 'precede', 'navigator', 'frost', 'banish']
['sailing', 'waft', 'surpass']
['wonder', 'beauty', 'every', 'region', 'hitherto', 'discover', 'habitable']
['regulate', 'thousand', 'celestial', 'observation', 'require']
['shall', 'satiate', 'ardent', 'curiosity', 'sight', 'world']
['never', 'visit', 'tread', 'never', 'imprint']
['enticement', 'sufficient']
['native', 'river', 'suppose', 'conjecture', 'false']
['contest', 'inestimable', 'benefit', 'shall', 'confer']
['possible', 'effect', 'undertaking']
['reflection', 'dispel', 'agitation', 'begin']
['letter', 'heart', 'enthusiasm', 'elevate']
['heaven', 'nothing', 'cont

[]
['fifteen', 'years', 'retire', 'house']
['frightful', 'loudness', 'various', 'quarters', 'heavens', 'remain']
['beautiful', 'stand', 'twenty', 'yard', 'house']
['dazzle', 'light', 'vanish', 'disappear', 'nothing']
['shock', 'entirely', 'reduce', 'ribbon', 'never', 'behold']
['anything', 'utterly', 'destroy']
['unacquainted', 'obvious']
['philosophy', 'excite', 'catastrophe', 'enter']
['know', 'engage', 'attention', 'suddenly']
['despicable', 'caprice', 'perhaps']
['occupation', 'natural', 'history', 'progeny', 'deform']
['abortive', 'creation', 'entertain', 'greatest', 'disdain']
['strangely', 'soul', 'construct', 'slight', 'ligament']
['almost', 'miraculous', 'change', 'inclination']
['follow', 'relinquishing', 'ancient', 'latterly', 'torment']
['study', 'teach', 'associate']
[]
['strong', 'effort', 'spirit', 'ineffectual']
['destiny', 'potent', 'immutable', 'decree', 'utter']
['terrible', 'destruction']
[]
[]
['chapter']
[]
['attain', 'seventeen', 'parent', 'resolve']
['become', '

['victim', 'clasp', 'hands', 'exclaim', 'murder']
[]
['faint', 'restore', 'extreme', 'difficulty']
['evening', 'william', 'tease', 'valuable']
['miniature', 'possess', 'mother', 'picture']
['trace', 'present', 'although', 'exertion', 'discover']
['unremitted', 'restore', 'beloved', 'william']
[]
['words', 'pierce', 'heart', 'unhappy']
['additional', 'motive', 'return', 'comforter']
[]
['feelings', 'peace', 'gentleness', 'instead']
['festering', 'wound', 'mind', 'enter', 'house', 'mourning']
['affectionate', 'afflict', 'father']
[]
['alphonse', 'frankenstein']
[]
['geneva']
[]
[]
['surprise', 'observe', 'despair', 'succeed', 'first']
['express', 'receive', 'friend', 'throw', 'letter']
['table', 'cover', 'hands']
[]
['bitterness', 'always', 'unhappy', 'friend']
[]
[]
['offer', 'consolation', 'friend']
[]
[]
['clerval', 'endeavour', 'words', 'consolation']
['express', 'heartfelt', 'sympathy', 'william']
['lovely', 'child', 'sleep', 'angel', 'mother']
['bright', 'joyous', 'young', 'beauty'

['touch', 'sight', 'found', 'wander', 'liberty']
['obstacle', 'either', 'surmount', 'avoid', 'light']
['become', 'oppressive', 'weary']
['rouse', 'nearly', 'dormant', 'state', 'berry']
['found', 'hanging', 'tree', 'lying', 'ground', 'slake', 'thirst']
[]
['instinctively', 'finding', 'desolate', 'quit']
['clothes', 'insufficient', 'secure']
['night', 'helpless', 'miserable', 'wretch']
['distinguish', 'nothing', 'feeling', 'invade', 'side']
['enlightened', 'search', 'berry']
['still', 'tree', 'found', 'cloak']
['cover', 'ground', 'distinct']
['idea', 'occupy', 'confuse', 'light', 'hunger']
['side', 'various', 'scent', 'salute', 'object']
['pleasure']
['greatly', 'lessen', 'begin', 'distinguish', 'sensation']
['gradually', 'plainly', 'clear', 'stream', 'supply']
['drink', 'tree', 'shade', 'foliage', 'delight']
['proceed', 'throat', 'little', 'wing', 'animal']
['often', 'intercept', 'light', 'begin', 'observe']
['boundary', 'radiant', 'light', 'canopy', 'sometimes']
[]
['show', 'still', 'r

['cheer', 'loveliness', 'sunshine', 'balminess']
['appear', 'revive', 'within', 'surprise', 'novelty']
['sensation', 'allow', 'bear']
['thankfulness', 'towards', 'bless', 'bestow']
['continue', 'among', 'path']
['pause', 'exactly', 'knowing', 'pursue', 'hear']
['sound', 'voice', 'induce', 'conceal', 'shade']
['cypress', 'scarcely', 'young', 'running']
['towards', 'conceal', 'laugh']
['rapid', 'stream', 'rush', 'hiding', 'place', 'extreme', 'labour']
['probably', 'person', 'playfully']
['seeing', 'dart', 'towards', 'tearing']
['hardly', 'aim']
['kindness', 'gentleness', 'entertain', 'moment']
['place', 'hellish', 'gnash', 'teeth', 'inflame']
['agony', 'wound', 'overcome', 'pulse', 'pause', 'faint']
['whether', 'remain', 'pass']
['means', 'extract', 'suffering', 'augment']
['alone', 'compensate', 'outrage', 'anguish']
['week', 'wound', 'heal', 'continue', 'journey']
['toil', 'close', 'month']
['evening', 'arrive', 'retire', 'hiding', 'place', 'among']
['enjoy', 'gentle', 'breeze', 'eveni

[]
['sound', 'disturb', 'woman', 'sleeping', 'chair', 'beside']
['hire', 'nurse', 'turnkey']
['countenance', 'express', 'quality', 'often', 'characterise']
['class', 'line']
['express', 'entire', 'indifference', 'address', 'english']
['voice', 'strike', 'hear', 'suffering']
['better']
[]
['indeed', 'dream', 'sorry']
['still', 'alive', 'misery', 'horror']
[]
['matter', 'reply', 'woman']
['business', 'nurse']
['conscience', 'everybody']
['appear', 'dream', 'sometimes', 'doubt', 'indeed']
['image', 'float', 'become', 'distinct']
['gentle', 'voice', 'support']
['utter', 'carelessness', 'visible', 'first']
['hangman']
[]
['first', 'reflection', 'learn', 'kirwin']
['prepare', 'wretched', 'indeed']
['every', 'human', 'creature', 'present', 'agony']
['neglect', 'visit', 'short']
['interval']
[]
['gradually', 'recover', 'seat', 'chair']
['cheek', 'livid', 'death', 'overcome', 'gloom']
['remain', 'world', 'replete', 'wretchedness']
['consider', 'whether', 'declare', 'guilty', 'suffer']
['penalty

['september', 'begin', 'roaring', 'thunder', 'hear']
['imminent', 'peril', 'remain', 'passive', 'chief']
['passage', 'towards', 'south', 'become', 'perfectly']
['sailor', 'return', 'native']
['continue', 'frankenstein', 'doze', 'awake', 'ask']
[]
['really', 'return']
[]
['unwillingly', 'danger', 'return']
['surely', 'spirits', 'assist', 'vengeance', 'endow']
[]
['restore', 'often', 'thought']
['sentence', 'pronounce', 'grieve', 'patient']
['watching', 'close', 'thought', 'sleep']
['presently', 'call', 'feeble', 'voice', 'bidding']
['walton', 'moment', 'existence', 'burning']
['hatred', 'ardent', 'desire', 'revenge', 'express']
['justify', 'desire', 'death', 'adversary']
['enthusiastic', 'madness', 'create', 'rational', 'creature']
['bound', 'towards', 'assure', 'power', 'happiness']
['duty', 'towards', 'being', 'species', 'greater', 'claim']
['attention', 'include', 'greater', 'proportion', 'happiness']
['misery', 'urge', 'refuse', 'right', 'refuse']
['ask', 'undertake', 'unfinished', 

## LDA with Gensim

In [24]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [25]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.013*"become" + 0.009*"night" + 0.008*"death" + 0.007*"sometimes"')
(1, '0.010*"found" + 0.010*"miserable" + 0.010*"moment" + 0.008*"clerval"')
(2, '0.008*"friend" + 0.008*"every" + 0.007*"hear" + 0.007*"return"')
(3, '0.011*"elizabeth" + 0.010*"thought" + 0.010*"appear" + 0.010*"feelings"')
(4, '0.009*"first" + 0.008*"remain" + 0.008*"often" + 0.008*"enter"')


## pyLDAvis

In [26]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

FileNotFoundError: [Errno 2] No such file or directory: 'model3.gensim'