In [1]:

# https://github.com/timestocome


# Lovecraft Corpus
# https://github.com/vilmibm/lovecraftcorpus


# GenSim Word2Vec
# https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/word2vec.ipynb

In [2]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt

In [3]:
# silence is golden

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings(action="ignore",category=DeprecationWarning)
warnings.filterwarnings(action="ignore",category=FutureWarning)


In [4]:
# list all files under the input directory
# this is the corpus
import os

fNames = []
for dirname, _, filenames in os.walk('lovecraftcorpus'):
    for filename in filenames:
        fNames.append(os.path.join(dirname, filename))

print(fNames)
print(len(fNames))

['lovecraftcorpus/beyond_wall_of_sleep.txt', 'lovecraftcorpus/tomb.txt', 'lovecraftcorpus/polaris.txt', 'lovecraftcorpus/moon_bog.txt', 'lovecraftcorpus/pharoahs.txt', 'lovecraftcorpus/nameless.txt', 'lovecraftcorpus/colour_out_of_space.txt', 'lovecraftcorpus/dagon.txt', 'lovecraftcorpus/hypnos.txt', 'lovecraftcorpus/silver_key.txt', 'lovecraftcorpus/lurking_fear.txt', 'lovecraftcorpus/book.txt', 'lovecraftcorpus/terrible_old_man.txt', 'lovecraftcorpus/tree.txt', 'lovecraftcorpus/juan_romero.txt', 'lovecraftcorpus/reanimator.txt', 'lovecraftcorpus/hound.txt', 'lovecraftcorpus/cthulhu.txt', 'lovecraftcorpus/rats_walls.txt', 'lovecraftcorpus/ex_oblivione.txt', 'lovecraftcorpus/medusas_coil.txt', 'lovecraftcorpus/descendent.txt', 'lovecraftcorpus/shadow_out_of_time.txt', 'lovecraftcorpus/temple.txt', 'lovecraftcorpus/cool_air.txt', 'lovecraftcorpus/kadath.txt', 'lovecraftcorpus/dunwich.txt', 'lovecraftcorpus/shunned_house.txt', 'lovecraftcorpus/alchemist.txt', 'lovecraftcorpus/iranon.txt'

In [5]:
# read in all files, split into sentences, do a bit of cleanup to reduce vocabulary size
# Gensim expects data to be input as sentences

from nltk.tokenize import sent_tokenize
import functools
import re

stories = []

for f in fNames:
    fp = open(f)
    story = fp.read()
    
    story = story.lower()
    story = re.sub('-', ' ', story)
    story = re.sub(" \'", ' ', story)
    story = re.sub('\"', ' ', story)
    story = re.sub('\d', '9', story)
    
    stories.append(sent_tokenize(story))
    
    
# flatten stories into sentences    
sentences = functools.reduce(lambda x, y: x+y, stories)
n_sentences = len(sentences)




In [6]:
print(sentences)
print('n sentences', n_sentences)


n sentences 18678


In [7]:
# split sentences into words

from nltk.tokenize import word_tokenize

words = [word_tokenize(t) for t in sentences]
print(words[0:10])


max_words = max([len(x) for x in words])
print('max_words', max_words)




[['beyond', 'the', 'wall', 'of', 'sleep', 'i', 'have', 'often', 'wondered', 'if', 'the', 'majority', 'of', 'mankind', 'ever', 'pause', 'to', 'reflect', 'upon', 'the', 'occasionally', 'titanic', 'significance', 'of', 'dreams', ',', 'and', 'of', 'the', 'obscure', 'world', 'to', 'which', 'they', 'belong', '.'], ['whilst', 'the', 'greater', 'number', 'of', 'our', 'nocturnal', 'visions', 'are', 'perhaps', 'no', 'more', 'than', 'faint', 'and', 'fantastic', 'reflections', 'of', 'our', 'waking', 'experiences', 'freud', 'to', 'the', 'contrary', 'with', 'his', 'puerile', 'symbolism', 'there', 'are', 'still', 'a', 'certain', 'remainder', 'whose', 'immundane', 'and', 'ethereal', 'character', 'permit', 'of', 'no', 'ordinary', 'interpretation', ',', 'and', 'whose', 'vaguely', 'exciting', 'and', 'disquieting', 'effect', 'suggests', 'possible', 'minute', 'glimpses', 'into', 'a', 'sphere', 'of', 'mental', 'existence', 'no', 'less', 'important', 'than', 'physical', 'life', ',', 'yet', 'separated', 'from

In [8]:
# get unique words and build a dictionary
import itertools


all_words = list(itertools.chain(*words))
print(all_words[0:100])


word_counts = np.unique( all_words, return_counts=True )
#print(word_counts[0][30], word_counts[1][30])


word_count_dict = dict(zip(word_counts[0], word_counts[1]))
print(word_count_dict)





['beyond', 'the', 'wall', 'of', 'sleep', 'i', 'have', 'often', 'wondered', 'if', 'the', 'majority', 'of', 'mankind', 'ever', 'pause', 'to', 'reflect', 'upon', 'the', 'occasionally', 'titanic', 'significance', 'of', 'dreams', ',', 'and', 'of', 'the', 'obscure', 'world', 'to', 'which', 'they', 'belong', '.', 'whilst', 'the', 'greater', 'number', 'of', 'our', 'nocturnal', 'visions', 'are', 'perhaps', 'no', 'more', 'than', 'faint', 'and', 'fantastic', 'reflections', 'of', 'our', 'waking', 'experiences', 'freud', 'to', 'the', 'contrary', 'with', 'his', 'puerile', 'symbolism', 'there', 'are', 'still', 'a', 'certain', 'remainder', 'whose', 'immundane', 'and', 'ethereal', 'character', 'permit', 'of', 'no', 'ordinary', 'interpretation', ',', 'and', 'whose', 'vaguely', 'exciting', 'and', 'disquieting', 'effect', 'suggests', 'possible', 'minute', 'glimpses', 'into', 'a', 'sphere', 'of', 'mental', 'existence', 'no']


In [9]:
# most common words
import operator

sorted_word_count = sorted(word_count_dict.items(), key=operator.itemgetter(1))


for j in range(1, 101):
    print(f'{sorted_word_count[-j][0]:20} {sorted_word_count[-j][1]}')
    

stop_words = set('the' ',' 'and' 'of' '.' 'to' 'a' 'in' 'i' 'was' 'that' 'had' 'it' 'as' 'with' ';' 'for' 'at'.split(' ' ))


the                  34843
,                    26814
and                  20015
of                   19161
.                    17643
to                   10742
a                    10106
in                   8857
i                    7434
was                  6942
that                 6817
had                  5876
he                   4962
it                   4714
as                   3731
with                 3698
his                  3347
;                    3181
for                  3151
at                   3062
which                3051
from                 2982
on                   2971
but                  2925
not                  2918
my                   2619
were                 2580
's                   2055
by                   2054
they                 2031
all                  1961
be                   1892
or                   1832
there                1755
this                 1747
me                   1721
have                 1682
could                1665
an   

In [10]:
from gensim import corpora

# note stop-words and one-offs left in corpus
dictionary = corpora.Dictionary(words)

print(dictionary)

Dictionary(23868 unique tokens: [',', '.', 'and', 'belong', 'beyond']...)


In [11]:
# ints-words

print(dictionary.token2id)



In [12]:

# convert original corpus to bow
# words -> tuples, (word location, word_count for this word )

bow_corpus = [dictionary.doc2bow(text) for text in words]
print(bow_corpus[0])
print(words[0])





[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 4), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 4), (21, 1), (22, 1), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1)]
['beyond', 'the', 'wall', 'of', 'sleep', 'i', 'have', 'often', 'wondered', 'if', 'the', 'majority', 'of', 'mankind', 'ever', 'pause', 'to', 'reflect', 'upon', 'the', 'occasionally', 'titanic', 'significance', 'of', 'dreams', ',', 'and', 'of', 'the', 'obscure', 'world', 'to', 'which', 'they', 'belong', '.']


In [13]:

from gensim import models

tfidf = models.TfidfModel(bow_corpus)


# returns tuples (token id, tf-idf weight)
print(tfidf[dictionary.doc2bow('Family and neighbors had now fled in a panic'.lower().split())])




[(2, 0.034865238488375784), (29, 0.07257800184355358), (97, 0.07489697555110764), (305, 0.37218962248941206), (326, 0.10538261048241108), (415, 0.5898845941054106), (493, 0.48685975696637035), (497, 0.2243418148553977), (498, 0.45050102658825547)]
