In [1]:

# https://github.com/timestocome


# Lovecraft Corpus
# https://github.com/vilmibm/lovecraftcorpus


# Conv network sorts stories with about 88% accuracy
# Not enough data to pull out a hold out set



In [2]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt

In [3]:
# silence is golden

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings(action="ignore",category=DeprecationWarning)
warnings.filterwarnings(action="ignore",category=FutureWarning)


In [4]:

# hack to make keras work with 2*** series gpus

import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)


In [5]:
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.preprocessing import sequence



Using TensorFlow backend.


In [6]:
# list all files under the input directory
import os

fNames = []
for dirname, _, filenames in os.walk('lovecraftcorpus'):
    for filename in filenames:
        fNames.append(os.path.join(dirname, filename))

#print(fNames)
#print(len(fNames))

In [7]:
# read in all files, split into sentences, do a bit of cleanup to reduce vocabulary size

# keep cleanup minimal 
#  convert to lower
#  convert all numbers to 9
#  remove ",'

from nltk.tokenize import sent_tokenize
import functools
import re

story_sentences = []
targets = []

for i in range(len(fNames)):
    f = fNames[i]
    fp = open(f)
    story = fp.read()
    
    # minor cleanup
    story = story.lower()
    story = re.sub('-', ' ', story)
    story = re.sub(" \'", ' ', story)
    story = re.sub('\"', ' ', story)
    story = re.sub('\d', '9', story)
    
    # break into sentences and append to the story_sentences array
    story_sentences.append(sent_tokenize(story))
    
    

 

In [8]:
# split sentences into words and get story number for each sentence to use as the target
import nltk
from nltk.tokenize import word_tokenize


story_words = []
targets = []

for i in range(len(fNames)):
    
    sentences = story_sentences[i]
    sentence_words = [word_tokenize(t) for t in sentences]
    
    targets.append( [i] * len(sentences))
    story_words.append(sentence_words)

  



In [9]:
     
# break sentences into words
combined_words = [item for sublist in story_words for item in sublist]  

# flatten sentence words, reduce to unique and sort
unique_words = sorted(set(x for s in combined_words for x in s))

# flatten lists
sentences = [item for sublist in story_sentences for item in sublist]  
targets = [item for sublist in targets for item in sublist]  

# max length of sentences
maxlen = max([len(x) for x in combined_words])
print('maxlen', maxlen)



maxlen 344


In [10]:
# store sentences and targets in a df
# df makes it easier to shuffle samples and pull out train/test data 

train = pd.DataFrame(targets)
train.columns = ['target']
train['words'] = combined_words

print(train.tail())





       target                                              words
18673      66  [their, deeds, i, recall, not, ,, for, they, w...
18674      66  [their, aspect, i, recall, dimly, ,, it, was, ...
18675      66  [their, name, i, recall, clearly, ,, for, it, ...
18676      66  [these, beings, of, yesterday, were, called, m...
18677      66  [so, the, genie, flew, back, to, the, thin, ho...


In [11]:
# there are faster ways to do this but this is easily reversable, works for small dataset

# convert words to int ids
def convert_word(word_list):
    
    wl = [0] * maxlen
    n = min(maxlen, len(word_list))
    
    for i in range(n):
            wl[i] = unique_words.index(word_list[i])
      
    return wl


# util to convert ints back into words  ** not used here, coded for use in future code using this as base
def convert_index(word_list):
    l = []
    for w in word_list:
        l.append(unique_words[w])
        
    return l


train['indexes'] = train['words'].apply(convert_word)
print(train.head(20))



    target                                              words  \
0        0  [beyond, the, wall, of, sleep, i, have, often,...   
1        0  [whilst, the, greater, number, of, our, noctur...   
2        0  [from, my, experience, i, can, not, doubt, but...   
3        0  [from, those, blurred, and, fragmentary, memor...   
4        0  [we, may, guess, that, in, dreams, life, ,, ma...   
5        0  [sometimes, i, believe, that, this, less, mate...   
6        0  [it, was, from, a, youthful, revery, filled, w...   
7        0  [his, name, ,, as, given, on, the, records, ,,...   
8        0  [among, these, odd, folk, ,, who, correspond, ...   
9        0  [joe, slater, ,, who, came, to, the, instituti...   
10       0  [though, well, above, the, middle, stature, ,,...   
11       0  [his, age, was, unknown, ,, since, among, his,...   
12       0  [from, the, medical, and, court, documents, we...   
13       0  [he, had, habitually, slept, at, night, beyond...   
14       0  [not, that, h

In [12]:
# this is a small dataset, wouldn't be practical to use all unique words in most problems
# drop common words ( stop words ) and one-offs 
max_features = len(unique_words)
batch_size = 32
embedding_dims = 50
n_filters = 250
kernel_size = 3
n_hidden = 250
n_epochs = 20


In [13]:

# convert words into oneHot vectors
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
enc.fit(train['target'].values.reshape(-1,1))


# shuffle data
train = train.sample(frac=1.)


# pull out 10% as validation data
n_test = len(train) // 10
n_train = len(train) - n_test
print(n_test, n_train)


# split data into train/validate and
# reshape pandas series into arrays for network
# train
x = train[0:n_train]['indexes']
x = np.asarray([y for z in x for y in z])
x = x.reshape(n_train, maxlen)

y = train[0:n_train]['target'].values.reshape(-1,1)
y = enc.transform(y)


# test
x_test = train[n_train:-1]['indexes']

x_test = np.asarray([y for z in x_test for y in z])
x_test = x_test.reshape(n_test-1, maxlen)


y_test = train[n_train:-1]['target'].values.reshape(-1,1)
y_test = enc.transform(y_test)

print(y.shape, y_test.shape)

1867 16811
(16811, 67) (1866, 67)


In [14]:
# ML model word embedding layer, Conv 1D, Dense, Dense output 

model = Sequential()

model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
model.add(Dropout(0.2))


model.add(Conv1D(n_filters, kernel_size, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPooling1D())


model.add(Dense(n_hidden))
model.add(Dropout(0.2))
model.add(Activation('relu'))


model.add(Dense(len(fNames)))
model.add(Activation('softmax'))
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])










Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



In [15]:

# train model 

model.fit(x, y,
          batch_size = batch_size,
          epochs = n_epochs,
          validation_data = (x_test, y_test))






Train on 16811 samples, validate on 1866 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fbce054a780>