In [1]:

# https://github.com/timestocome


# Lovecraft Corpus
# https://github.com/vilmibm/lovecraftcorpus


# build word embeddings for corpus
# ~ half the words are one off and accuracy ~ 52% so that's about as well as it'll do with out
# stemming words and otherwise trimming vocabulary


In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pandas as pd
import numpy as np


import matplotlib.pyplot as plt

In [3]:
# silence is golden

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings(action="ignore",category=DeprecationWarning)
warnings.filterwarnings(action="ignore",category=FutureWarning)


In [4]:

# hack to make keras work with 2*** series gpus

import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)


In [5]:
from keras.models import Sequential
from keras.layers import Conv1D, GlobalAveragePooling1D
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.preprocessing import sequence



Using TensorFlow backend.


In [6]:
# list all files under the input directory
import os

fNames = []
for dirname, _, filenames in os.walk('lovecraftcorpus'):
    for filename in filenames:
        fNames.append(os.path.join(dirname, filename))

#print(fNames)
#print(len(fNames))

In [7]:
# read in all files, split into sentences, do a bit of cleanup to reduce vocabulary size

# keep cleanup minimal 
#  convert to lower
#  convert all numbers to 9
#  remove ",'

from nltk.tokenize import sent_tokenize
import functools
import re

story_sentences = []
targets = []

for i in range(len(fNames)):
    f = fNames[i]
    fp = open(f)
    story = fp.read()
    
    # minor cleanup
    story = story.lower()
    story = re.sub('-', ' ', story)
    story = re.sub(" \'", ' ', story)
    story = re.sub('\"', ' ', story)
    story = re.sub('\d', '9', story)
    
    # break into sentences and append to the story_sentences array
    story_sentences.append(sent_tokenize(story))
    
    

 

In [8]:
# split sentences into words and get story number for each sentence to use as the target
import nltk
from nltk.tokenize import word_tokenize


story_words = []
targets = []

for i in range(len(fNames)):
    
    sentences = story_sentences[i]
    sentence_words = [word_tokenize(t) for t in sentences]
    
    targets.append( [i] * len(sentences))
    story_words.append(sentence_words)

  



In [9]:
     
# break sentences into words
combined_words = [item for sublist in story_words for item in sublist]  

# flatten sentence words, reduce to unique and sort
unique_words = sorted(set(x for s in combined_words for x in s))
n_words = len(unique_words)
print('unique words', n_words)

# flatten lists
sentences = [item for sublist in story_sentences for item in sublist]  
targets = [item for sublist in targets for item in sublist]  

# max length of sentences
maxlen = max([len(x) for x in combined_words])
print('maxlen', maxlen)



unique words 23868
maxlen 344


In [10]:
# store sentences and targets in a df
# df makes it easier to shuffle samples and pull out train/test data 

train = pd.DataFrame(targets)
train.columns = ['target']
train['words'] = combined_words

print(train.tail())





       target                                              words
18673      66  [their, deeds, i, recall, not, ,, for, they, w...
18674      66  [their, aspect, i, recall, dimly, ,, it, was, ...
18675      66  [their, name, i, recall, clearly, ,, for, it, ...
18676      66  [these, beings, of, yesterday, were, called, m...
18677      66  [so, the, genie, flew, back, to, the, thin, ho...


In [11]:
# there are faster ways to do this but this is easily reversable, works for small dataset

# convert words to int ids
def convert_word(word_list):
    
    wl = [0] * maxlen
    n = min(maxlen, len(word_list))
    
    for i in range(n):
            wl[i] = unique_words.index(word_list[i])
            
    return wl


# util to convert ints back into words  ** not used here, coded for use in future code using this as base
def convert_index(word_list):
    l = []
    for w in word_list:
        l.append(unique_words[w])
        
    return l


train['indexes'] = train['words'].apply(convert_word)
print(train.head(20))



    target                                              words  \
0        0  [beyond, the, wall, of, sleep, i, have, often,...   
1        0  [whilst, the, greater, number, of, our, noctur...   
2        0  [from, my, experience, i, can, not, doubt, but...   
3        0  [from, those, blurred, and, fragmentary, memor...   
4        0  [we, may, guess, that, in, dreams, life, ,, ma...   
5        0  [sometimes, i, believe, that, this, less, mate...   
6        0  [it, was, from, a, youthful, revery, filled, w...   
7        0  [his, name, ,, as, given, on, the, records, ,,...   
8        0  [among, these, odd, folk, ,, who, correspond, ...   
9        0  [joe, slater, ,, who, came, to, the, instituti...   
10       0  [though, well, above, the, middle, stature, ,,...   
11       0  [his, age, was, unknown, ,, since, among, his,...   
12       0  [from, the, medical, and, court, documents, we...   
13       0  [he, had, habitually, slept, at, night, beyond...   
14       0  [not, that, h

In [12]:
# convert words into oneHot vectors
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
enc.fit(train['target'].values.reshape(-1,1))


# shuffle data
train = train.sample(frac=1.)


# pull out 10% as validation data
n_test = len(train) // 10
n_train = len(train) - n_test
print(n_test, n_train)


# split data into train/validate and
# reshape pandas series into arrays for network
# train
x = train[0:n_train]['indexes']
x = np.asarray([y for z in x for y in z])
x = x.reshape(n_train, maxlen)

y = train[0:n_train]['target'].values.reshape(-1,1)
y = enc.transform(y)


# test
x_test = train[n_train:-1]['indexes']

x_test = np.asarray([y for z in x_test for y in z])
x_test = x_test.reshape(n_test-1, maxlen)


y_test = train[n_train:-1]['target'].values.reshape(-1,1)
y_test = enc.transform(y_test)

print(y.shape, y_test.shape)

1867 16811
(16811, 67) (1866, 67)


In [13]:
# Embedding model

n_embedding = 48



model = Sequential()

model.add(Embedding(n_words, n_embedding, input_length=maxlen))
model.add(GlobalAveragePooling1D())

model.add(Dense(len(fNames), activation='sigmoid'))

model.summary()


model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])





_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 344, 48)           1145664   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 48)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 67)                3283      
Total params: 1,148,947
Trainable params: 1,148,947
Non-trainable params: 0
_________________________________________________________________



In [14]:

# train model 
n_epochs = 300
batch_size = 16

model.fit(x, y,
         batch_size = batch_size,
         epochs = n_epochs,
         validation_data = (x_test, y_test))



Train on 16811 samples, validate on 1866 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300


Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300


Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300


Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 225/300
Epoch 226/300
Epoch 227/300
Epoch 228/300
Epoch 229/300
Epoch 230/300


Epoch 231/300
Epoch 232/300
Epoch 233/300
Epoch 234/300
Epoch 235/300
Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 240/300
Epoch 241/300
Epoch 242/300
Epoch 243/300
Epoch 244/300
Epoch 245/300
Epoch 246/300
Epoch 247/300
Epoch 248/300
Epoch 249/300
Epoch 250/300
Epoch 251/300
Epoch 252/300
Epoch 253/300
Epoch 254/300
Epoch 255/300
Epoch 256/300
Epoch 257/300
Epoch 258/300
Epoch 259/300
Epoch 260/300
Epoch 261/300
Epoch 262/300
Epoch 263/300
Epoch 264/300
Epoch 265/300
Epoch 266/300
Epoch 267/300
Epoch 268/300
Epoch 269/300
Epoch 270/300
Epoch 271/300
Epoch 272/300
Epoch 273/300
Epoch 274/300
Epoch 275/300
Epoch 276/300
Epoch 277/300
Epoch 278/300
Epoch 279/300
Epoch 280/300
Epoch 281/300
Epoch 282/300
Epoch 283/300
Epoch 284/300
Epoch 285/300
Epoch 286/300
Epoch 287/300
Epoch 288/300


Epoch 289/300
Epoch 290/300
Epoch 291/300
Epoch 292/300
Epoch 293/300
Epoch 294/300
Epoch 295/300
Epoch 296/300
Epoch 297/300
Epoch 298/300
Epoch 299/300
Epoch 300/300


<keras.callbacks.History at 0x7fa961f0ed30>

In [15]:
# retrieve embedding weights
embeddings = model.layers[0]
weights = embeddings.get_weights()[0]


# n_docs, n_words, n_embedding
weights = weights.reshape(-1, n_words, n_embedding)

print(weights.shape)

(1, 23868, 48)


In [16]:
import io


# write to file


out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for num, word in enumerate(unique_words):
    vec = weights[0, num, :] 
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    
out_v.close()
out_m.close()


