In [1]:

# https://github.com/timestocome


# Data source: Lovecraft Corpus (read below from the local `lovecraftcorpus/` directory)
# https://github.com/vilmibm/lovecraftcorpus



In [2]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt

In [3]:
# silence is golden

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings(action="ignore",category=DeprecationWarning)
warnings.filterwarnings(action="ignore",category=FutureWarning)


In [4]:
# hack to make keras work with 2*** series gpus
# Builds a session config with the GPU `allow_growth` option switched on and
# opens a session with that config.
# NOTE(review): tf.ConfigProto / tf.Session are the TensorFlow 1.x top-level
# names — presumably this notebook targets TF 1.x; confirm the pinned version.
# NOTE(review): `sess` is not passed to Keras in the cells visible here —
# confirm whether creating it is sufficient on its own for the intended effect.

import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # presumably: allocate GPU memory on demand rather than all upfront — verify against TF docs
sess = tf.Session(config=config)

In [5]:
from keras import Sequential
from keras.layers import Embedding
from keras.preprocessing.sequence import pad_sequences

from sklearn.manifold import TSNE


Using TensorFlow backend.


In [6]:
# gather the path of every file found anywhere beneath the corpus directory
import os

fNames = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('lovecraftcorpus')
    for name in names
]

print(fNames)
print(len(fNames))

['lovecraftcorpus/beyond_wall_of_sleep.txt', 'lovecraftcorpus/tomb.txt', 'lovecraftcorpus/polaris.txt', 'lovecraftcorpus/moon_bog.txt', 'lovecraftcorpus/pharoahs.txt', 'lovecraftcorpus/nameless.txt', 'lovecraftcorpus/colour_out_of_space.txt', 'lovecraftcorpus/dagon.txt', 'lovecraftcorpus/hypnos.txt', 'lovecraftcorpus/silver_key.txt', 'lovecraftcorpus/lurking_fear.txt', 'lovecraftcorpus/book.txt', 'lovecraftcorpus/terrible_old_man.txt', 'lovecraftcorpus/tree.txt', 'lovecraftcorpus/juan_romero.txt', 'lovecraftcorpus/reanimator.txt', 'lovecraftcorpus/hound.txt', 'lovecraftcorpus/cthulhu.txt', 'lovecraftcorpus/rats_walls.txt', 'lovecraftcorpus/ex_oblivione.txt', 'lovecraftcorpus/medusas_coil.txt', 'lovecraftcorpus/descendent.txt', 'lovecraftcorpus/shadow_out_of_time.txt', 'lovecraftcorpus/temple.txt', 'lovecraftcorpus/cool_air.txt', 'lovecraftcorpus/kadath.txt', 'lovecraftcorpus/dunwich.txt', 'lovecraftcorpus/shunned_house.txt', 'lovecraftcorpus/alchemist.txt', 'lovecraftcorpus/iranon.txt'

In [7]:
# read in all files, split into sentences, do a bit of cleanup to reduce vocabulary size
#
# Inputs : fNames    - list of corpus file paths (built in the file-listing cell)
# Outputs: stories   - one list of sentence strings per file
#          sentences - all sentences from all files, flattened in file order
#          n_sentences - len(sentences)

from nltk.tokenize import sent_tokenize
import functools  # no longer used in this cell; kept in case other cells rely on it
import re

stories = []

for f in fNames:
    # `with` closes each file as soon as it is read; the explicit encoding keeps
    # non-ASCII characters in the texts from depending on the platform default.
    with open(f, encoding='utf-8') as fp:
        story = fp.read()

    story = story.lower()
    # literal substitutions: hyphens, opening single quotes and double quotes become spaces
    story = story.replace('-', ' ')
    story = story.replace(" '", ' ')
    story = story.replace('"', ' ')
    # collapse every digit to '9' so all numbers of the same shape share one token
    # (raw string: '\d' in a normal string literal is an invalid escape sequence)
    story = re.sub(r'\d', '9', story)

    stories.append(sent_tokenize(story))


# flatten stories into sentences
# (linear-time, and yields [] instead of raising when no files were found)
sentences = [sentence for story_sentences in stories for sentence in story_sentences]
n_sentences = len(sentences)




In [8]:
# report how many sentences the corpus was split into
print(f'n sentences {n_sentences}')


n sentences 18678


In [9]:
# tokenize every sentence into its list of word / punctuation tokens

from nltk.tokenize import word_tokenize

words = list(map(word_tokenize, sentences))

# show one tokenized sentence as a sanity check
print(words[10])

# length (in tokens) of the longest sentence
max_words = max(map(len, words))
print('max_words', max_words)





['though', 'well', 'above', 'the', 'middle', 'stature', ',', 'and', 'of', 'somewhat', 'brawny', 'frame', ',', 'he', 'was', 'given', 'an', 'absurd', 'appearance', 'of', 'harmless', 'stupidity', 'by', 'the', 'pale', ',', 'sleepy', 'blueness', 'of', 'his', 'small', 'watery', 'eyes', ',', 'the', 'scantiness', 'of', 'his', 'neglected', 'and', 'never', 'shaven', 'growth', 'of', 'yellow', 'beard', ',', 'and', 'the', 'listless', 'drooping', 'of', 'his', 'heavy', 'nether', 'lip', '.']
max_words 344


In [10]:
# build the vocabulary: the set of distinct tokens across all tokenized sentences

vocabulary = set().union(*words)

n_vocabulary = len(vocabulary)
print('vocabulary words', n_vocabulary)



vocabulary words 23868


In [11]:
# get unique words and build a dictionary mapping token -> number of occurrences
#
# Inputs : words           - list of token lists, one per sentence
# Outputs: all_words       - every token in corpus order, as one flat list
#          word_counts     - (unique tokens, counts) pair returned by np.unique
#          word_count_dict - {token: count}, in the order np.unique returns the tokens
import itertools


# from_iterable walks the outer list lazily instead of unpacking every
# sentence as a separate positional argument
all_words = list(itertools.chain.from_iterable(words))


word_counts = np.unique(all_words, return_counts=True)


word_count_dict = dict(zip(word_counts[0], word_counts[1]))

# show only a small preview; printing the whole dictionary floods the
# notebook with one entry per distinct token
n_preview = 20
print(dict(itertools.islice(word_count_dict.items(), n_preview)))
print('distinct tokens', len(word_count_dict))





In [30]:
# most common words
#
# Inputs : word_count_dict   - {token: count}
# Outputs: sorted_word_count - list of (token, count) pairs, ascending by count
#          prints the n_most_common highest-count tokens, most frequent first
import operator

sorted_word_count = sorted(word_count_dict.items(), key=operator.itemgetter(1))

# how many of the top entries to display
n_most_common = 100

# Walk backwards from the end of the ascending list. A slice (unlike indexing
# with [-j]) simply stops early when there are fewer than n_most_common
# distinct tokens, instead of raising IndexError.
for word, count in sorted_word_count[:-(n_most_common + 1):-1]:
    print(f'{word:20} {count}')
    




the                  34843
,                    26814
and                  20015
of                   19161
.                    17643
to                   10742
a                    10106
in                   8857
i                    7434
was                  6942
that                 6817
had                  5876
he                   4962
it                   4714
as                   3731
with                 3698
his                  3347
;                    3181
for                  3151
at                   3062
which                3051
from                 2982
on                   2971
but                  2925
not                  2918
my                   2619
were                 2580
's                   2055
by                   2054
they                 2031
all                  1961
be                   1892
or                   1832
there                1755
this                 1747
me                   1721
have                 1682
could                1665
an   

In [17]:
# list every token that appears fewer than two times, then summarize how many
# there are and what share of the vocabulary they make up
print('words occurring once')

single_use_words = [token for token, count in word_count_dict.items() if count < 2]
for token in single_use_words:
    print(token)
n_words_used_once = len(single_use_words)

print('\nn words appearing once', n_words_used_once)
share_used_once = n_words_used_once / n_vocabulary * 100.
print('percent of words used one time %.2lf%%' % share_used_once)


words occurring once
'about
'and
'bigger
'certainely
'damn
'did
'digging
'excellent
'feed
'fifteen
'god
'hel
'inbreeding
'iä
'may
'monday
'moonlight
'more
'mr
'nigh
'now
'of
'saw
'sblood
'see
'stop
'up
'wal
'wedn
'what
'why
'yes
.'
..some
..the
.after
.sally
.the
.yog
9,9999
99,99
999,999,999
999.
999st
99:99.
99rd
99°9
9rd
>
a'goin
a'movin
a'starin
a'talkin
aa
aaron
aback
abaddon
abbey
abbott
abdomen
abdool
aberrancy
aberrations
abeyance
abhor
abijah
abjectly
ablest
ably
abnormallty
aboriginal
abounding
aboundingly
abraham
abrasions
absolved
absorbedly
absorbingly
absorptive
abstracted
abstraction
abstractions
absurdities
absurdness
abt
abu
abusiveness
abyssinian
abyssward
academically
accaount
acceleration
accentuate
accentuated
accessions
acclaimed
accommodate
accompaniment
accorded
accosted
accountable
accounted
accretions
accrue
accts
accumulation
accusation
accusers
accustomedness
ace
acheron
acherontic
achievement
achievements
achieving
achilles
acidulous
acknowledgment
acolyte


leslie
lesse
lesseps
lethally
lethargic
lethean
levantine
leveled
leveling
levers
levite
lexington
lgeb
liability
liable
liberties
libre
licenced
licensed
lich
licks
liddeason
lieut
lifeboats
lifetimes
lifters
lightening
lightfooted
lightheadedly
lightnin
likelier
likeliest
likes
liking
lilted
limb
limbed
limber
limitation
limoges
linden
lindens
lindum
lineage
lineal
linen
lingeringly
linguellae
lining
linking
linplan
linteled
lipped
liquefaction
liquescent
liquourish
liranian
lisping
listenings
listens
litanies
litany
literarum
lithe
lithuanians
litigations
litt.d
lively
livened
liveried
lividly
lizards
lo
loaf
loan
loathings
loathsomenesses
lobster
lobstering
lobsters
locale
localism
locating
lockwood
locomotive
logically
logician
loin
loins
loiterers
loitering
lomarians
londonderry
lonelier
longfellow
longingly
longstanding
lookout
looms
loosing
lopex
lordship
loser
loses
losin
loth
lotteries
lottery
loudest
loudness
louisianans
loutish
lovable
lovelier
lovers
lovey
lovingly
lowell


In [13]:

# bag of words: every token with its count, highest count first

print('most common words')

by_count_desc = sorted(word_count_dict.items(), key=lambda pair: pair[1], reverse=True)
for w, count in by_count_desc:
    print(w, count)

most common words
the 34843
, 26814
and 20015
of 19161
. 17643
to 10742
a 10106
in 8857
i 7434
was 6942
that 6817
had 5876
he 4962
it 4714
as 3731
with 3698
his 3347
; 3181
for 3151
at 3062
which 3051
from 2982
on 2971
but 2925
not 2918
my 2619
were 2580
's 2055
by 2054
they 2031
all 1961
be 1892
or 1832
there 1755
this 1747
me 1721
have 1682
could 1665
an 1657
one 1579
when 1489
him 1457
no 1437
we 1370
been 1332
some 1306
what 1304
so 1290
would 1289
their 1270
is 1178
' 1174
more 1090
its 1079
old 1057
out 1039
them 1015
now 988
only 970
about 956
did 954
up 950
before 882
great 873
then 865
into 862
seemed 861
very 850
time 833
those 817
than 815
you 811
through 805
who 790
like 785
after 777
saw 760
even 756
where 756
any 748
things 744
man 737
though 732
must 704
if 692
over 690
our 688
down 677
night 673
other 655
these 635
came 621
found 610
might 596
such 584
whose 580
are 576
? 568
thing 544
never 541
long 539
strange 537
upon 537
n't 520
place 518
made 516
once 514
much 511


september 22
servant 22
sharply 22
singularly 22
slate 22
slippery 22
souls 22
specific 22
steeple 22
straight 22
studying 22
subterranean 22
superstitious 22
surprised 22
throne 22
tide 22
tightly 22
tone 22
torn 22
unexpected 22
unhallowed 22
victims 22
villagers 22
villages 22
visitors 22
visits 22
wandered 22
workmen 22
abode 21
absent 21
acute 21
alarm 21
alley 21
ancestral 21
antiquarian 21
arrived 21
articulate 21
assured 21
authorities 21
barry 21
basement 21
basis 21
battle 21
beam 21
blasphemies 21
bleak 21
blow 21
bright 21
busy 21
carry 21
carrying 21
charnel 21
circles 21
coarse 21
companions 21
concealed 21
conjecture 21
connecting 21
corners 21
cried 21
dancing 21
delirium 21
demon 21
desperately 21
devices 21
discussed 21
dismal 21
doctors 21
doorways 21
double 21
doubly 21
draw 21
effects 21
evident 21
experienced 21
exposed 21
fight 21
fires 21
flights 21
flood 21
freedom 21
frenzy 21
frozen 21
galleys 21
gazing 21
georgian 21
gleaming 21
glittering 21
grandmother 21


poles 10
pondered 10
praying 10
prehuman 10
preparing 10
prevailing 10
prey 10
produce 10
professional 10
prolonged 10
protection 10
purely 10
pursuits 10
queerness 10
radically 10
rambles 10
ramparts 10
ranged 10
rattle 10
reader 10
receive 10
reddish 10
reel 10
refinery 10
relentless 10
repetition 10
reptiles 10
rescued 10
reserve 10
respective 10
revolution 10
rhythmic 10
ridges 10
ridicule 10
roaring 10
rouse 10
rudimentary 10
rule 10
rumors 10
rural 10
sadly 10
sagging 10
saxon 10
scraping 10
secure 10
senior 10
sentient 10
seth 10
settlement 10
seventeen 10
seventeenth 10
shafts 10
sheds 10
shores 10
shout 10
shows 10
shub 10
shutters 10
silhouette 10
sixty 10
sole 10
somebody 10
sooner 10
sorcier 10
spots 10
squire 10
steal 10
steam 10
sterile 10
stern 10
stonework 10
straining 10
strongly 10
struggled 10
stumbling 10
submerged 10
subtler 10
subway 10
suns 10
supplies 10
survivor 10
swath 10
symbolism 10
systematic 10
talks 10
tattered 10
teeming 10
teloth 10
temperature 10
tena

warriors 6
wars 6
watches 6
wealthy 6
webbed 6
wet 6
wheezing 6
whipped 6
whirled 6
width 6
wig 6
windy 6
winking 6
wives 6
woefully 6
wolfish 6
wordes 6
worldly 6
wou 6
woven 6
wreathed 6
wrinkled 6
y 6
ya 6
yankee 6
yer 6
yr 6
'oh 5
'you 5
9nd 5
a.d. 5
abandonment 5
abundantly 5
academic 5
accessories 5
accidental 5
accuracy 5
acknowledged 5
acquire 5
acutely 5
addressing 5
admired 5
admits 5
adopted 5
advise 5
affect 5
affection 5
affiliations 5
afresh 5
aftermath 5
aimed 5
aisles 5
alarmingly 5
albert 5
alchemical 5
alchemist 5
ali 5
alight 5
alighted 5
allus 5
allusions 5
alternative 5
ambiguous 5
amiable 5
amounts 5
amply 5
analyze 5
anger 5
annihilated 5
annoying 5
answers 5
anthropologist 5
anxiously 5
apparel 5
appealed 5
arabia 5
arabic 5
arc 5
archaism 5
archer 5
arduous 5
argument 5
armies 5
arousing 5
asaph 5
aspirations 5
assign 5
astonishment 5
attendant 5
attested 5
attire 5
augmented 5
australia 5
avoiding 5
await 5
awaking 5
b.c 5
backgrounds 5
balconies 5
balked 5
ba

busied 3
butchers 3
buttresses 3
byrd 3
c 3
cabbages 3
cabbalistic 3
cage 3
calculating 3
calculation 3
calle 3
calloused 3
callousness 3
cambridge 3
campaign 3
campfires 3
canals 3
candlemas 3
cane 3
cannibals 3
canvass 3
canvassed 3
caprices 3
carelessness 3
cargoes 3
carpeted 3
carrion 3
carve 3
cased 3
casement 3
casks 3
cassiopeia 3
castes 3
catalogue 3
catastrophe 3
catechism 3
catskill 3
caucasian 3
cauldron 3
cautioned 3
cellulose 3
centering 3
centres 3
cerenarian 3
chalcedony 3
chalk 3
challenge 3
chandelier 3
chap 3
chapters 3
characteristics 3
characterized 3
charcoal 3
charged 3
chart 3
chat 3
chattering 3
checkley 3
cheerful 3
childless 3
chimaeras 3
chimes 3
china 3
chinks 3
chipped 3
chirography 3
chisels 3
chopped 3
chops 3
chords 3
chorused 3
chrysolite 3
chuckle 3
circled 3
clan 3
clash 3
clave 3
cleaning 3
cleanly 3
cleavage 3
cleverly 3
click 3
climber 3
clost 3
clothed 3
cloths 3
clumps 3
cluttered 3
co. 3
cobbler 3
cobbles 3
cobblestones 3
coddled 3
coherently 3


dashing 2
daughters 2
dauntless 2
dazedness 2
deadened 2
deadness 2
deaf 2
deafeningly 2
dealers 2
dearly 2
deathbed 2
debauch 2
deborah 2
decadently 2
deceptive 2
decidedly 2
deciding 2
decisions 2
decorum 2
decreed 2
defeats 2
defects 2
deformities 2
delapores 2
deletion 2
delivery 2
deluded 2
delusive 2
demeter 2
democritus 2
demolished 2
deneb 2
dens 2
dental 2
denying 2
depends 2
depopulated 2
deprivation 2
deprive 2
deputy 2
derelict 2
dervishes 2
describable 2
describes 2
describing 2
desiderate 2
desirable 2
dessert 2
destinies 2
destiny 2
detain 2
deterring 2
detoured 2
detours 2
devastation 2
devour 2
dewy 2
dexterity 2
dho 2
diabolical 2
diabolically 2
diadem 2
dialects 2
dialogue 2
diamonds 2
diction 2
diego 2
digest 2
digestion 2
digg 2
diggers 2
dignitary 2
digressed 2
dilapidation 2
dilate 2
dime 2
dimmed 2
dimming 2
dimness 2
dined 2
diorite 2
dipping 2
direful 2
disabled 2
disagreeably 2
disarm 2
disasters 2
discomfort 2
discounted 2
discourses 2
discredit 2
discrepanc

swirled 2
swishin 2
swoop 2
swooping 2
swords 2
sylvan 2
symbolic 2
symbolised 2
symbolized 2
sympathy 2
symphonic 2
synthetic 2
systematically 2
tablecloth 2
taciturnity 2
tag 2
tai 2
tailed 2
talkative 2
talkes 2
tamash 2
tamed 2
tampering 2
tank 2
tantalising 2
tantalisingly 2
tapered 2
tapestries 2
tapestry 2
tapped 2
tar 2
tarraco 2
tartarean 2
tartary 2
tastefully 2
tatters 2
tauern 2
taunt 2
taunted 2
taut 2
tavernkeeper 2
taxi 2
taxicabs 2
taylor 2
technically 2
tedious 2
teem 2
tekel 2
telepathy 2
telescopes 2
telled 2
temper 2
temperamentally 2
tempered 2
tempestuously 2
temptation 2
tenancy 2
tend 2
tender 2
tendon 2
tendons 2
tendrils 2
tensely 2
tentatively 2
tenth 2
terminer 2
territories 2
testified 2
tethering 2
thawing 2
theatre 2
theatrical 2
theodolite 2
theodore 2
theoretical 2
theosophical 2
theosophist 2
theosophists 2
therefrom 2
thicket 2
thickets 2
thingumajig 2
thinkers 2
thinnish 2
thirst 2
thirsts 2
thirteenth 2
thirties 2
thorabonian 2
thorfinnssen 2
though

declamation 1
declamations 1
declaring 1
decomposed 1
decomposing 1
decorating 1
decorators 1
decorously 1
decrease 1
dedham 1
deduce 1
deduces 1
deduction 1
deesmees 1
deestrick 1
def'nite 1
defeated 1
defeating 1
defect 1
defences 1
defend 1
defenders 1
defending 1
defenses 1
defensive 1
defer 1
deference 1
deferred 1
defiance 1
defiances 1
deficiencies 1
deficiency 1
defies 1
defieth 1
defile 1
definiteness 1
definition 1
deflect 1
deflected 1
deformity 1
defray 1
deftness 1
degenerating 1
degrade 1
degradedly 1
degrading 1
dehnous 1
deigned 1
deimos 1
deities 1
delaying 1
delegated 1
delegation 1
delegations 1
deletions 1
delib'rit 1
deliberating 1
delicacies 1
delicious 1
delightedly 1
delightfully 1
delilah 1
delineated 1
deliriously 1
delivering 1
delphinus 1
delphis 1
delrio 1
delta 1
delve 1
demeanor 1
dementedly 1
dementia 1
demesnes 1
demi 1
demigod 1
demise 1
demolish 1
demolition 1
demoniacal 1
demonological 1
demonologists 1
demonstrate 1
demonstrated 1
demonstrations 1
d

lyrics 1
lässt 1
macchu 1
machicolated 1
madame 1
maddens 1
madder 1
maddest 1
madmen 1
madnesses 1
madre 1
maeonides 1
magahs 1
magellanic 1
maggoty 1
magically 1
magicians 1
magick 1
magnanimous 1
magnates 1
magnet 1
magnetism 1
magnifying 1
magnum 1
magyar 1
mahmoud 1
maids 1
mailbox 1
mailed 1
mails 1
maiming 1
maius 1
majestically 1
majoring 1
makeshift 1
makings 1
maladies 1
malay 1
maledictions 1
males 1
malforming 1
malicious 1
malkowaki 1
malleable 1
malodorously 1
malodourous 1
malted 1
maltese 1
maltreatment 1
malwa 1
mammalia 1
mammalian 1
manacle 1
manager 1
mane 1
manfully 1
mangy 1
manhole 1
maniacs 1
manifold 1
manipulation 1
manlike 1
mannered 1
manorial 1
manservant 1
mantle 1
manual 1
manufacture 1
manufacturing 1
maounds 1
maouths 1
mapped 1
marchers 1
margaret 1
margin 1
marginalia 1
margins 1
mariner 1
maritime 1
marix 1
marketplace 1
markets 1
marry 1
mars 1
marseilles 1
marshalled 1
marshland 1
marten 1
martineco 1
martyr 1
marv 1
marvelousness 1
masculine 1
mas

solidity 1
soliloquies 1
solitudes 1
solomon 1
solstice 1
solve 1
somehaow 1
somewhars 1
somnambulist 1
somnolence 1
songfully 1
soone 1
sooth 1
soothed 1
soothsayers 1
sophisticate 1
sophisticates 1
sophistry 1
sophomores 1
sops 1
sorbonne 1
sorcerers 1
sorceress 1
sorceries 1
soreness 1
sorrowful 1
sorter 1
soth 1
sother 1
sothotha 1
sothothe 1
soughte 1
soundness 1
soup 1
southeastwards 1
southwestward 1
souvenir 1
souvenirs 1
sow 1
sowed 1
spa 1
spacing 1
spaciousness 1
spadeful 1
spadeless 1
spangled 1
spaniard 1
spanning 1
spans 1
sparkles 1
spasms 1
spat 1
spatially 1
spatter 1
speake 1
speakest 1
specialty 1
specifically 1
specified 1
specious 1
speckled 1
specter 1
specters 1
speculative 1
speculatively 1
speculator 1
speeches 1
speechlessly 1
speeded 1
speedily 1
speeds 1
speedy 1
speel 1
spellbound 1
spelled 1
spellings 1
spendours 1
spewing 1
spheroidal 1
spheroids 1
spilling 1
spin 1
spindling 1
spiraling 1
spiralled 1
spirifera 1
spirito 1
spits 1
spitzbergen 1
splashin 1