In [1]:

# https://github.com/timestocome


# Lovecraft Corpus
# https://github.com/vilmibm/lovecraftcorpus



In [2]:
# This is a step by step walk through of building tf-idf vectors
# The dataset has 68 stories, the unique words to total words is large, half for some stories 
# ... keeping one off words because count is so high - not stemming, not replacing with 'one_off'

In [3]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt

In [4]:
# silence is golden

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings(action="ignore",category=DeprecationWarning)
warnings.filterwarnings(action="ignore",category=FutureWarning)


In [5]:
# hack to make keras work with 2*** series gpus

import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

In [6]:
from keras import Sequential
from keras.layers import Embedding
from keras.preprocessing.sequence import pad_sequences

from sklearn.manifold import TSNE


Using TensorFlow backend.


In [7]:
# list all files under the input directory
import os

fNames = []
for dirname, _, filenames in os.walk('lovecraftcorpus'):
    for filename in filenames:
        fNames.append(os.path.join(dirname, filename))

print(fNames)
print(len(fNames))

['lovecraftcorpus/beyond_wall_of_sleep.txt', 'lovecraftcorpus/tomb.txt', 'lovecraftcorpus/polaris.txt', 'lovecraftcorpus/moon_bog.txt', 'lovecraftcorpus/pharoahs.txt', 'lovecraftcorpus/nameless.txt', 'lovecraftcorpus/colour_out_of_space.txt', 'lovecraftcorpus/dagon.txt', 'lovecraftcorpus/hypnos.txt', 'lovecraftcorpus/silver_key.txt', 'lovecraftcorpus/lurking_fear.txt', 'lovecraftcorpus/book.txt', 'lovecraftcorpus/terrible_old_man.txt', 'lovecraftcorpus/tree.txt', 'lovecraftcorpus/juan_romero.txt', 'lovecraftcorpus/reanimator.txt', 'lovecraftcorpus/hound.txt', 'lovecraftcorpus/cthulhu.txt', 'lovecraftcorpus/rats_walls.txt', 'lovecraftcorpus/ex_oblivione.txt', 'lovecraftcorpus/medusas_coil.txt', 'lovecraftcorpus/descendent.txt', 'lovecraftcorpus/shadow_out_of_time.txt', 'lovecraftcorpus/temple.txt', 'lovecraftcorpus/cool_air.txt', 'lovecraftcorpus/kadath.txt', 'lovecraftcorpus/dunwich.txt', 'lovecraftcorpus/shunned_house.txt', 'lovecraftcorpus/alchemist.txt', 'lovecraftcorpus/iranon.txt'

In [8]:
# read in all files, split into sentences, do a bit of cleanup to reduce vocabulary size

# keep cleanup minimal 
#  convert to lower
#  convert all numbers to 9
#  remove ",'

from nltk.tokenize import sent_tokenize
import functools
import re

story_sentences = []


for i in range(len(fNames)):
    f = fNames[i]
    fp = open(f)
    story = fp.read()
    
    # minor cleanup
    story = story.lower()
    story = re.sub('-', ' ', story)
    story = re.sub(" \'", ' ', story)
    story = re.sub('\"', ' ', story)
    story = re.sub('\d', '9', story)
    
    # break into sentences and append to the story_sentences array
    story_sentences.append(sent_tokenize(story))
    
    
for i in range(len(fNames)):
    print(fNames[i])
    print('sentences', len(story_sentences[i]))



lovecraftcorpus/beyond_wall_of_sleep.txt
sentences 152
lovecraftcorpus/tomb.txt
sentences 152
lovecraftcorpus/polaris.txt
sentences 53
lovecraftcorpus/moon_bog.txt
sentences 110
lovecraftcorpus/pharoahs.txt
sentences 330
lovecraftcorpus/nameless.txt
sentences 157
lovecraftcorpus/colour_out_of_space.txt
sentences 523
lovecraftcorpus/dagon.txt
sentences 84
lovecraftcorpus/hypnos.txt
sentences 91
lovecraftcorpus/silver_key.txt
sentences 166
lovecraftcorpus/lurking_fear.txt
sentences 280
lovecraftcorpus/book.txt
sentences 45
lovecraftcorpus/terrible_old_man.txt
sentences 36
lovecraftcorpus/tree.txt
sentences 61
lovecraftcorpus/juan_romero.txt
sentences 113
lovecraftcorpus/reanimator.txt
sentences 463
lovecraftcorpus/hound.txt
sentences 100
lovecraftcorpus/cthulhu.txt
sentences 418
lovecraftcorpus/rats_walls.txt
sentences 288
lovecraftcorpus/ex_oblivione.txt
sentences 21
lovecraftcorpus/medusas_coil.txt
sentences 841
lovecraftcorpus/descendent.txt
sentences 49
lovecraftcorpus/shadow_out_of_

In [9]:
# split sentences into words
import nltk
from nltk.tokenize import word_tokenize


story_words = []

for i in range(len(fNames)):
    sentences = story_sentences[i]
    words = [word_tokenize(t) for t in sentences]
    story_words.append(words)

    max_words = max([len(x) for x in words])
    print('max_words in a sentence', max_words)
    
   

    
    


max_words in a sentence 119
max_words in a sentence 109
max_words in a sentence 91
max_words in a sentence 93
max_words in a sentence 131
max_words in a sentence 100
max_words in a sentence 344
max_words in a sentence 70
max_words in a sentence 115
max_words in a sentence 84
max_words in a sentence 114
max_words in a sentence 58
max_words in a sentence 75
max_words in a sentence 63
max_words in a sentence 72
max_words in a sentence 99
max_words in a sentence 130
max_words in a sentence 104
max_words in a sentence 86
max_words in a sentence 66
max_words in a sentence 71
max_words in a sentence 80
max_words in a sentence 138
max_words in a sentence 79
max_words in a sentence 89
max_words in a sentence 164
max_words in a sentence 154
max_words in a sentence 167
max_words in a sentence 110
max_words in a sentence 78
max_words in a sentence 85
max_words in a sentence 73
max_words in a sentence 67
max_words in a sentence 73
max_words in a sentence 71
max_words in a sentence 56
max_words in a

In [10]:
# create vocabulary

story_unique_words = []

for i in range(len(fNames)):
    
    words = story_words[i]
    all_words = [x for s in words for x in s]
    n_words = len(all_words)
    unique_words = set(x for s in words for x in s)
    story_unique_words.append(unique_words)

    n_unique = len(unique_words)
    print('vocabulary words %d, wc %d' %(n_unique, n_words))
         



vocabulary words 1533, wc 4772
vocabulary words 1490, wc 4582
vocabulary words 612, wc 1692
vocabulary words 1050, wc 3747
vocabulary words 3025, wc 11955
vocabulary words 1493, wc 5459
vocabulary words 2666, wc 13640
vocabulary words 911, wc 2458
vocabulary words 992, wc 3087
vocabulary words 1603, wc 5339
vocabulary words 2174, wc 8181
vocabulary words 522, wc 1277
vocabulary words 497, wc 1193
vocabulary words 587, wc 1647
vocabulary words 1089, wc 3028
vocabulary words 2872, wc 13519
vocabulary words 1141, wc 3291
vocabulary words 3115, wc 13233
vocabulary words 2264, wc 8827
vocabulary words 321, wc 769
vocabulary words 3145, wc 18786
vocabulary words 695, wc 1668
vocabulary words 4568, wc 28045
vocabulary words 1709, wc 6004
vocabulary words 1327, wc 3793
vocabulary words 5663, wc 46856
vocabulary words 3982, wc 19907
vocabulary words 2951, wc 11926
vocabulary words 1237, wc 4078
vocabulary words 732, wc 2996
vocabulary words 941, wc 2796
vocabulary words 1034, wc 2678
vocabulary

In [11]:
# get unique words and build a dictionary
import itertools


story_bag_of_words = []
story_wc = []

for i in range(len(fNames)):
    
    words = story_words[i]
    all_words = list(itertools.chain(*words))
    story_wc.append(len(all_words))
    #print(all_words[0:10])

    word_counts = np.unique( all_words, return_counts=True )
    #print(word_counts[0][30], word_counts[1][30])

    word_count_dict = dict(zip(word_counts[0], word_counts[1]))
    print(word_count_dict)
    
    story_bag_of_words.append(word_count_dict)

    






{'!': 2, "'s": 7, '(': 2, ')': 2, ',': 249, '.': 142, '99': 2, '9999': 3, ':': 2, ';': 23, '?': 8, 'a': 90, 'aberrations': 1, 'abnormal': 1, 'about': 6, 'above': 1, 'abruptly': 1, 'absurd': 1, 'abysses': 1, 'accept': 1, 'accident': 1, 'accordingly': 1, 'account': 1, 'accursed': 1, 'achieving': 1, 'acquitted': 1, 'across': 1, 'action': 1, 'active': 2, 'actual': 1, 'acute': 1, 'add': 1, 'additional': 1, 'adjustments': 2, 'admitted': 2, 'advance': 1, 'after': 9, 'afternoon': 3, 'afterward': 1, 'again': 4, 'against': 1, 'age': 2, 'agency': 1, 'ages': 1, 'agitated': 1, 'agreed': 1, 'air': 3, 'albany': 1, 'algol': 2, 'alienists': 3, 'alike': 1, 'all': 14, 'allowing': 1, 'alluded': 1, 'aloft': 1, 'along': 1, 'already': 1, 'also': 1, 'alteration': 1, 'always': 1, 'am': 3, 'amazement': 1, 'american': 1, 'amiable': 1, 'amidst': 1, 'among': 2, 'an': 20, 'ancient': 1, 'and': 142, 'anderson': 1, 'animal': 1, 'animated': 1, 'another': 4, 'any': 5, 'anything': 1, 'apparatus': 2, 'apparently': 1, 'app



{',': 25, '.': 17, 'a': 6, 'about': 1, 'accursed': 1, 'all': 1, 'am': 4, 'amidst': 1, 'and': 17, 'ape': 1, 'apes': 2, 'are': 4, 'aspect': 2, 'at': 2, 'back': 1, 'be': 2, 'beheld': 1, 'beings': 2, 'beneath': 2, 'bottom': 1, 'bound': 1, 'broken': 1, 'builders': 1, 'built': 1, 'but': 2, 'by': 1, 'called': 1, 'clearly': 1, 'columns': 1, 'courtyard': 1, 'courtyards': 1, 'coverlets': 1, 'crawl': 1, 'creeping': 1, 'crumbling': 2, 'daemon': 4, 'dank': 1, 'deeds': 2, 'deep': 1, 'depths': 1, 'did': 1, 'dimly': 1, 'each': 1, 'erect': 1, 'evil': 1, 'feeble': 1, 'fell': 1, 'filled': 1, 'flew': 1, 'flows': 1, 'foliage': 1, 'for': 5, 'forget': 1, 'forgotten': 1, 'forms': 1, 'from': 2, 'genie': 2, 'gigantic': 1, 'great': 1, 'grew': 1, 'grey': 1, 'grottoes': 1, 'grow': 1, 'habitation': 1, 'hands': 1, 'haunts': 1, 'heaving': 1, 'herbage': 1, 'hidden': 1, 'his': 1, 'horned': 1, 'horns': 1, 'i': 6, 'in': 9, 'intently': 1, 'is': 1, 'it': 4, 'its': 2, 'knows': 1, 'laid': 1, 'leap': 1, 'lethal': 1, 'lies': 1

In [12]:


# tf, term frequency   ( per story)
# idf  documents containing term

story_tf = []
idf = {}

for i in range(len(fNames)):
    
    token_counts = story_bag_of_words[i]
    print(token_counts)
    #n = min(len(story_unique_words[i]), 4000)
    n = len(story_unique_words[i])
    
    
    story_vector = []
    
    for key, value in token_counts.items():
        
        # n times word appears in a story / total words in the story
        story_vector.append(value / story_wc[i])
        
        # add unique words to document frequency, increment if already in dictionary
        if key not in idf:
            idf[key] = 1
        else:
            idf[key] += 1
        
    story_tf.append(story_vector)
    
    
    
#print(story_tf[0])
#print(idf)    
    



{'!': 2, "'s": 7, '(': 2, ')': 2, ',': 249, '.': 142, '99': 2, '9999': 3, ':': 2, ';': 23, '?': 8, 'a': 90, 'aberrations': 1, 'abnormal': 1, 'about': 6, 'above': 1, 'abruptly': 1, 'absurd': 1, 'abysses': 1, 'accept': 1, 'accident': 1, 'accordingly': 1, 'account': 1, 'accursed': 1, 'achieving': 1, 'acquitted': 1, 'across': 1, 'action': 1, 'active': 2, 'actual': 1, 'acute': 1, 'add': 1, 'additional': 1, 'adjustments': 2, 'admitted': 2, 'advance': 1, 'after': 9, 'afternoon': 3, 'afterward': 1, 'again': 4, 'against': 1, 'age': 2, 'agency': 1, 'ages': 1, 'agitated': 1, 'agreed': 1, 'air': 3, 'albany': 1, 'algol': 2, 'alienists': 3, 'alike': 1, 'all': 14, 'allowing': 1, 'alluded': 1, 'aloft': 1, 'along': 1, 'already': 1, 'also': 1, 'alteration': 1, 'always': 1, 'am': 3, 'amazement': 1, 'american': 1, 'amiable': 1, 'amidst': 1, 'among': 2, 'an': 20, 'ancient': 1, 'and': 142, 'anderson': 1, 'animal': 1, 'animated': 1, 'another': 4, 'any': 5, 'anything': 1, 'apparatus': 2, 'apparently': 1, 'app

In [13]:
# tf-idf

story_tf_idf = []




for i in range(len(fNames)):
    
    token_counts = story_bag_of_words[i]
    tf_idf = {}
    
    #n = min(len(story_unique_words[i]), 4000)
    n = len(story_unique_words[i])
    
    
    for key, value in token_counts.items():
    
        # term frequency per story
        tf = value/story_wc[i]
        
        # inverse document frequency all stories
        idf_ = np.log(len(fNames) / idf[key] )
        #idf_ = len(fNames) / idf[key]
        
        tf_idf[key] = tf * idf_

    story_tf_idf.append(tf_idf)
    
    
# any story will do, just a quick sanity check that all looks well    
print(story_tf_idf[1])


{'!': 0.001284819248498007, "'d": 0.0005064033966248551, "'ll": 0.0006532703702700813, "'m": 0.0014314650911649318, "'re": 0.000788650085810823, "'s": 8.463360360745794e-05, ',': 0.0, '.': 0.0, '9999': 0.00028343280571870467, ':': 0.00038876563078136725, ';': 0.00033731401664377367, '?': 0.00043434104061615707, 'a': 0.0, 'abandoned': 0.0005276998454111632, 'ablaze': 0.0006778874575999249, 'able': 0.0006454496690458961, 'abode': 0.0005500888870469339, 'about': 0.0001685806198564072, 'above': 9.41547677113451e-05, 'accents': 0.0008762409611762316, 'according': 0.00028684436086748175, 'accordingly': 0.0003416925556036026, 'accost': 0.0007663783148911001, 'account': 0.0001904164358829686, 'accursed': 0.00011809929577942372, 'acquaintances': 0.0005266113378792909, 'acquired': 0.0003753352181586568, 'adjacent': 0.0003753352181586568, 'admonishing': 0.0009176544346117341, 'adolescence': 0.0009176544346117341, 'adorned': 0.00046382607544983196, 'after': 3.999086515171907e-05, 'afternoon': 0.00

In [14]:
# find words most common and unique to each story

import operator


for i in range(len(fNames)):
    print('***********************************************************')
    print(story_sentences[i][0])
    print(story_sentences[i][1])
    
    print('\nkey words\n')
    s_tf_idf = story_tf_idf[i]
    sorted_tfidf = sorted(s_tf_idf.items(), key=operator.itemgetter(1))
    for j in range(1, 21):
        print(f'{sorted_tfidf[-j][0]:20} {sorted_tfidf[-j][1] * 100:1.2}')
    print('-------------------------------')
    


***********************************************************
beyond the wall of sleep

i have often wondered if the majority of mankind ever pause to reflect upon the occasionally titanic significance of dreams, and of the obscure world to which they belong.
whilst the greater number of our nocturnal visions are perhaps no more than faint and fantastic reflections of our waking experiences  freud to the contrary with his puerile symbolism  there are still a certain remainder whose immundane and ethereal character permit of no ordinary interpretation, and whose vaguely exciting and disquieting effect suggests possible minute glimpses into a sphere of mental existence no less important than physical life, yet separated from that life by an all but impassable barrier.

key words

slater               2.5
joe                  0.56
institution          0.37
oppressor            0.35
catskill             0.26
decadent             0.22
you                  0.22
attacks              0.2
alienis