In [None]:
## https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

In [3]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import tensorflow as tf

In [4]:
# load ascii text and convert to lowercase
filename = "wonderland.txt"
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving it open until the garbage collector gets to it.
with open(filename) as text_file:
    raw_text = text_file.read().lower()

In [5]:
# We cannot model the characters directly, instead we must convert the characters to integers. 
# We can do this easily by first creating a set of all of the distinct characters in the book, 
# then creating a map of each character to a unique integer.

In [6]:
# Build the vocabulary (every distinct character, in sorted order) and give
# each character its position in that ordering as its integer code.
chars = sorted(set(raw_text))
char_to_int = {c: i for i, c in enumerate(chars)}

In [7]:
# Report the corpus length and the size of the character vocabulary
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

('Total Characters: ', 173595)
('Total Vocab: ', 66)


In [8]:
# Here we see that there are 66 distinct characters in vocab for network to learn.
# we will split the book text up into subsequences with a fixed length of 100 characters, an arbitrary length. 
# We could just as easily split the data up by sentences and pad the shorter sequences and truncate the longer ones.
# When creating these sequences, we slide this window along the whole book one character at a time, allowing each character a chance to be learned from the 100 characters that preceded it 

In [9]:
# Slide a fixed-width window over the text one character at a time.
# Each window (encoded as integers) is an input pattern; the character that
# immediately follows the window is the value the network must predict.
seq_length = 100
dataX, dataY = [], []
for start in range(n_chars - seq_length):
    stop = start + seq_length
    window = raw_text[start:stop]   # the input characters
    target = raw_text[stop]         # the character right after the window
    dataX.append([char_to_int[ch] for ch in window])
    dataY.append(char_to_int[target])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

('Total Patterns: ', 173495)


In [10]:
# Now we need to transform it so that it is suitable for use with Keras. 
# First we must transform the list of input sequences into the form [samples, time steps, features] expected by an LSTM network.
# Next we need to rescale the integers to the range 0-to-1 to make the patterns easier to learn by the LSTM network that uses the sigmoid activation function by default.
# Finally, we need to convert the output patterns (single characters converted to integers) into a one hot encoding. 
# This is so that we can configure the network to predict the probability of each of the distinct characters in the vocabulary (66 for this text) (an easier representation) rather than trying to force it to predict precisely the next character.

In [11]:
# Each y value is converted into a one-hot vector with one entry per character class (the vocabulary here has 66 characters),
# full of zeros except for a 1 in the column of the character (integer) that the pattern should predict.

In [12]:
# Show the first encoded pattern and the dimensions of the pattern list.
# The dimensions are read straight off the nested lists: numpy.shape(dataX)
# would first copy every pattern into a temporary array just to report them.
print(dataX[0])
print((len(dataX), len(dataX[0])))

[65, 62, 63, 46, 48, 45, 40, 35, 33, 50, 2, 37, 51, 50, 35, 44, 32, 35, 48, 37, 64, 57, 59, 49, 2, 31, 42, 39, 33, 35, 64, 57, 59, 49, 2, 31, 34, 52, 35, 44, 50, 51, 48, 35, 49, 2, 39, 44, 2, 53, 45, 44, 34, 35, 48, 42, 31, 44, 34, 10, 2, 32, 55, 2, 42, 35, 53, 39, 49, 2, 33, 31, 48, 48, 45, 42, 42, 1, 0, 1, 0, 50, 38, 39, 49, 2, 35, 32, 45, 45, 41, 2, 39, 49, 2, 36, 45, 48, 2, 50]
(173495, 100)


In [13]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
# normalize the integer codes into the range [0, 1)
X = X / float(n_vocab)
# One hot encode the output variable.
# num_classes is pinned to the vocabulary size: left unset, to_categorical
# infers the width from max(dataY) + 1, which is smaller than n_vocab whenever
# the highest-coded character never occurs as a prediction target.
# NOTE(review): if that was the case in an earlier run, checkpoints saved from
# it have a narrower output layer and will not load into a model built from
# this y - confirm before reusing old weight files.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [14]:
# Confirm the layout of the network input: (samples, time steps, features)
print(X.shape)

(173495, 100, 1)


In [15]:
# Network: one 256-unit LSTM layer over the input window, 20% dropout, and a
# softmax layer with one unit per column of the one-hot targets.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [16]:
# Checkpointing: write the weights to a file named after the epoch and loss
# whenever the monitored training loss reaches a new minimum.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [17]:
# Train for 20 epochs in mini-batches of 128 with the ops placed on the first
# GPU; the checkpoint callback saves weights as the loss improves.
with tf.device('/gpu:0'):
    model.fit(
        X,
        y,
        batch_size=128,
        epochs=20,
        callbacks=callbacks_list,
    )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [18]:
## Generating text with LSTM model
### Firstly, we load the data and define the network in exactly the same way, except the network weights are loaded from a 
### checkpoint file and the network does not need to be trained.

In [20]:
# Restore the weights from a saved checkpoint file, then compile the model
# again with the same loss and optimizer used for training.
filename = "weights-improvement-19-1.9394.hdf5"
model.load_weights(filename)
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [21]:
# Predictions come back as integer codes, so we also need the reverse of the
# character-to-integer mapping to turn them back into readable characters.
int_to_char = dict(enumerate(chars))

In [22]:
# The simplest way to use the Keras LSTM model to make predictions is to first start off with a seed sequence as input, 
# generate the next character then update the seed sequence to add the generated character on the end and trim off the 
# first character. This process is repeated for as long as we want to predict new characters (e.g. a sequence of 1,000 characters in length).

In [29]:
import sys
# We pick a random input pattern as our seed sequence, then print generated
# characters as we generate them.
# randint's upper bound is exclusive, so passing len(dataX) makes every
# pattern (including the last one) eligible as a seed.
start = numpy.random.randint(0, len(dataX))
# Work on a copy of the seed: appending to dataX[start] itself would lengthen
# that training pattern in place and leave dataX ragged for any later reshape.
pattern = list(dataX[start])
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")
# generate characters
for step in range(1000):
    # same [samples, time steps, features] layout and scaling as the training input
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # greedy decoding: always take the most probable class
    index = int(numpy.argmax(prediction))
    sys.stdout.write(int_to_char[index])
    # slide the window: add the new character, drop the oldest one
    pattern.append(index)
    pattern = pattern[1:]
# make sure the streamed characters are shown before the closing message
sys.stdout.flush()
print("\nDone.")

Seed:
('"', 'd she dropped\r\nit hastily, just in time to avoid shrinking away altogether.\r\n\r\n\xe2\x80\x98that was a narrow ', '"')
��� said the cotpouse, ���and the mort of the sooe
to the mortee ��� 

���i dan���t tee toe toiereen ��� said the monk  ano oo   

���what i den do a lirsle to too would toe toien of the sooe,��� she said 
to herself, ���the mort oo mere the morte oo the toie to the thitg oo 
the moote  soe moot oo the the the wort  th the to tee thi goot the was
oote the wonle toied to the that sae hnre
ant ho the rooee and the sooee of the sooe  

                                                                                                                                                                                                                                                                                                                                                                                                                                           