In [1]:
import sys
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import nltk
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
from keras.preprocessing.sequence import pad_sequences
import pickle

Using TensorFlow backend.


In [2]:
# Read the whole corpus. The context manager guarantees the file handle is
# closed even if reading fails (the bare open(...).read() leaked it).
with open("wonderland.txt") as text_file:
    raw_texts = text_file.read()
# Lower-case everything so that e.g. "Alice" and "alice" share one vocabulary entry.
raw_texts = raw_texts.strip().lower()

# Split into sentences, then each sentence into word tokens.
sentences = [nltk.word_tokenize(sentence) for sentence in sent_tokenize(raw_texts)]
# Wrap every sentence in explicit start ('@') and end ('#') marker tokens.
sentences = [['@']+sent+['#'] for sent in sentences]

In [3]:
# Build the sorted vocabulary from every token seen in the corpus
# (including the '@' / '#' sentence markers).
words = sorted({token for sentence in sentences for token in sentence})

# Indices start at 1; index 0 is left unassigned so it can serve as the
# padding value produced by pad_sequences.
word_to_index = {token: position for position, token in enumerate(words, start=1)}
index_to_word = {position: token for token, position in word_to_index.items()}

# Vocabulary size counts the extra padding slot at index 0.
n_vocab = 1 + len(words)
print('Total number of words:', n_vocab)

# Persist both lookup tables so they can be reloaded without re-tokenizing.
with open('word_mappings.pickle', 'wb') as f:
    pickle.dump(
        [word_to_index, index_to_word],
        f,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

Total number of words: 2846


In [4]:
# Length of the context window fed to the network.
sent_len = 10

# For every sentence, each proper prefix becomes an input pattern and the
# token immediately following that prefix becomes its target.
dataX, dataY = [], []
for sent in sentences:
    encoded = [word_to_index[w] for w in sent]
    for i in range(1, len(encoded)):
        dataX.append(encoded[:i])
        dataY.append(encoded[i])

# Bring every pattern to exactly sent_len entries (pad_sequences defaults:
# zeros are added at the front, over-long prefixes lose their oldest tokens).
dataX = pad_sequences(dataX, maxlen=sent_len)
n_patterns = len(dataX)
print('Total Patterns:', n_patterns)

Total Patterns: 34936


In [5]:
# Reshape to the (samples, time steps, features) layout given as the LSTM input_shape.
X = np.reshape(dataX, (n_patterns, sent_len, 1))
# Scale word indices into [0, 1) by dividing by the vocabulary size.
X = X/float(n_vocab)
# One-hot encode the targets. num_classes is pinned to n_vocab so the width of
# y (and therefore of the softmax layer sized from y.shape[1]) always matches
# the vocabulary, instead of being inferred from max(dataY) + 1.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [6]:
# Two stacked 1024-unit LSTM layers, each followed by 20% dropout, topped by
# a softmax over all word classes.
model = Sequential([
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(1024, input_shape=X.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(1024),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

# Restore weights from a saved checkpoint file, then compile.
filename = "weights-improvement-50-1.4122.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

print(y.shape)

(34936, 2846)


In [20]:
# pick a random seed
# dataX is converted to a list of lists once so rows can be copied and
# extended with plain list operations.
if not isinstance(dataX, list):
    dataX = dataX.tolist()
# randint's upper bound is exclusive, so len(dataX) (not len(dataX)-1) is
# needed for the last pattern to be selectable.
start = np.random.randint(len(dataX))
# Copy the row: the window is modified below and must not alias dataX,
# otherwise the stored training patterns get corrupted.
pattern = list(dataX[start])
print("Seed:",' '.join([index_to_word.get(value, '') for value in pattern]))
# generate words
for i in range(1000):
    # Same shaping and scaling as the training inputs.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: take the most probable class as a plain Python int.
    index = int(np.argmax(prediction))
    # Class 0 is the padding slot and has no word; .get avoids a KeyError.
    result = index_to_word.get(index)
    if result is None or result == '#':
        # End of sentence (or a padding prediction): restart from a fresh
        # random seed window without printing anything.
        start = np.random.randint(len(dataX))
        pattern = list(dataX[start])
        continue
    sys.stdout.write(' '+result)
    # Slide the window: drop the oldest token, append the new one.
    pattern = pattern[1:] + [index]
print("\nDone.")

Seed: a deep voice , 'are done with a whiting .
 the simple rules her turtle close . of the house before she in found the fan and gloves . these strange adventures of hers that you you name the shepherd , the could the the cauldron and sitting ran of , and 'get when name to , a tomorrow know signifies the to eye any 'fetch ! ' , minute or two i could shut ; and was dormouse back to a all really . that her head ! best , for this must ever be a secret , kept from all the rest , between yourself and me . ' of the house , and found quite a crowd of little animals and birds waiting outside . the mouse , who a bound ! put more simply -- '' never imagine yourself not to be otherwise than what , ' been changed for mabel ! alice quite hungry to look at them -- 'i wish they 'd get the trial done , ' she thought , 'and hand round the refreshments ! ' a caterpillar the caterpillar and alice looked at each other for some time , mad people , ' alice remarked . , ' the alice , 'only , as it 's asleep