# RNN word prediction 
Building a Vanilla RNN and LSTM in Keras

In [42]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import SimpleRNN
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import sys

## Load the data

In [4]:
filepath = "../data/pg.txt"
raw_text = open(filepath).read()
raw_text = raw_text.lower()

## Map the characters to unique indexes
These dictionaries are used to:
1) Transform all words into a series of one-hot encoded vectors
2) Transform vectorized predictions back into words

In [45]:
# unique chars present in the data set
vocab = list(set(raw_text))
print(vocab)

['q', '1', '[', '_', '.', 't', '(', 'c', 'g', 'j', 'x', 'i', 'o', ';', "'", '2', '9', 'w', '5', 'e', ' ', '?', '-', '—', 'n', '0', ',', 'k', 'v', 'a', ']', ':', 'm', 'r', 'y', 'l', 'p', 'd', 'u', '$', '3', '\n', 'h', 'b', '"', 'z', 'f', '6', 's', '8', '4', '%']


In [6]:
ch2int = { c:i for i,c in enumerate(vocab) }
int2ch = { i:c for i,c in enumerate(vocab) }
num_chars, num_vocab = len(raw_text), len(vocab)

In [7]:
print("Total chars: {}, total vocab: {}".format(num_chars, num_vocab))

Total chars: 49993, total vocab: 52


## Prepare the training data
We essentially have to decide up front how long each sequence of inputs is. How many time steps do you predict through one forward pass of the network?

Let's just choose 255, since that's the length of a tweet.

In [9]:
seq_length = 255

# Input data stores the 255 character sequences
input_data = []

# Output data stores the predicted characters
output_data = []

# populate both of them
# we stop at num_chars - seq_length because that is the last
# pattern we will need to recognize
for i in range(num_chars - seq_length):
    
    # the "sliding window" of characters
    # these are basically 255-grams
    input_seq = raw_text[i:i+seq_length]
    output_prediction = raw_text[i+seq_length]
    
    input_data.append([ch2int[ch] for ch in input_seq])
    output_data.append(ch2int[output_prediction])

num_patterns = len(input_data)
print("Number of patterns: ", num_patterns)

print(input_data[0])
print(output_data[0])

Number of patterns:  49738
[5, 42, 19, 20, 11, 24, 28, 19, 48, 5, 12, 33, 48, 20, 48, 12, 20, 32, 38, 7, 42, 20, 29, 43, 12, 38, 5, 20, 5, 42, 19, 11, 33, 20, 48, 5, 29, 33, 5, 38, 36, 20, 42, 38, 43, 48, 4, 20, 20, 29, 48, 20, 29, 20, 35, 12, 5, 20, 12, 46, 20, 32, 11, 24, 37, 20, 11, 20, 37, 12, 24, 14, 5, 20, 27, 24, 12, 17, 20, 5, 42, 19, 20, 32, 12, 33, 19, 20, 29, 11, 33, 43, 12, 33, 24, 11, 24, 8, 20, 7, 29, 48, 19, 20, 12, 46, 20, 5, 42, 19, 20, 19, 38, 33, 12, 36, 19, 29, 24, 20, 12, 46, 20, 5, 42, 19, 20, 48, 7, 42, 19, 37, 38, 35, 19, 26, 20, 29, 24, 37, 20, 46, 33, 12, 32, 20, 48, 38, 7, 42, 20, 48, 11, 5, 19, 48, 20, 5, 42, 29, 5, 20, 17, 12, 38, 35, 37, 20, 43, 19, 20, 5, 42, 29, 5, 20, 29, 20, 35, 12, 17, 20, 11, 37, 19, 29, 20, 46, 12, 33, 20, 32, 12, 24, 19, 34, 20, 17, 19, 14, 33, 19, 20, 24, 12, 5, 20, 11, 24, 20, 29, 24, 8, 19, 35, 20, 33, 12, 38, 24, 37, 48, 4, 20, 5, 42, 19, 20, 17, 12, 33, 35, 37, 20, 29, 33, 19, 20, 5, 42, 19, 20, 37, 29, 24, 8, 19, 33, 20, 5, 4

### Reshape the data for input into Keras

In [10]:
# new shape: samples, time steps, features

reshaped_input = np.reshape(input_data, (num_patterns, seq_length, 1))

# normalize each of the inputs by the number of characters
# sort of a "character gradient
reshaped_input = reshaped_input / (num_vocab)

In [11]:
# Translation here: 
# This is a list of ALL "255-grams",
# of length 255, where each one is represented by
# a list of 1D normalized character values.

print(reshaped_input.shape)

(49738, 255, 1)


In [12]:
onehot_output = np_utils.to_categorical(output_data)

In [13]:
print("Non vectorized: ", output_data[2])
print("Vectorized: ", onehot_output[2])

Non vectorized:  24
Vectorized:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


## Build the model!

In [14]:
# First, a vanilla RNN

VanillaModel = Sequential()
VanillaModel.add(SimpleRNN(256, input_shape=(reshaped_input.shape[1], reshaped_input.shape[2])))
VanillaModel.add(Dropout(0.2))
VanillaModel.add(Dense(onehot_output.shape[1], activation='softmax'))
VanillaModel.compile(loss='categorical_crossentropy', optimizer='adam')

# Then, an LSTM
LSTMModel = Sequential()
LSTMModel.add(LSTM(256, input_shape=(reshaped_input.shape[1], reshaped_input.shape[2])))
LSTMModel.add(Dropout(0.2))
LSTMModel.add(Dense(onehot_output.shape[1], activation='softmax'))
LSTMModel.compile(loss='categorical_crossentropy', optimizer='adam')

### Check the weight improvements as you train

In [15]:
weightfile = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
weightfile = "weights-LSTM-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(weightfile, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [57]:
VanillaModel.fit(reshaped_input, onehot_output,
                epochs=20,
                batch_size=128,
                callbacks=callbacks_list)

Epoch 1/20

Epoch 00001: loss improved from inf to 2.93904, saving model to weights-improvement-01-2.9390.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.93904 to 2.84919, saving model to weights-improvement-02-2.8492.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.84919 to 2.82878, saving model to weights-improvement-03-2.8288.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.82878 to 2.82008, saving model to weights-improvement-04-2.8201.hdf5
Epoch 5/20

Epoch 00005: loss did not improve from 2.82008
Epoch 6/20

Epoch 00006: loss did not improve from 2.82008
Epoch 7/20

KeyboardInterrupt: 

In [102]:
LSTMModel.fit(reshaped_input, onehot_output,
                epochs=20,
                batch_size=128,
                callbacks=callbacks_list)

Epoch 1/20

Epoch 00001: loss improved from inf to 2.96338, saving model to weights-LSTM-improvement-01-2.9634.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.96338 to 2.88842, saving model to weights-LSTM-improvement-02-2.8884.hdf5
Epoch 3/20

Epoch 00003: loss improved from 2.88842 to 2.82154, saving model to weights-LSTM-improvement-03-2.8215.hdf5
Epoch 4/20

Epoch 00004: loss improved from 2.82154 to 2.79407, saving model to weights-LSTM-improvement-04-2.7941.hdf5
Epoch 5/20

Epoch 00005: loss improved from 2.79407 to 2.76402, saving model to weights-LSTM-improvement-05-2.7640.hdf5
Epoch 6/20

Epoch 00006: loss improved from 2.76402 to 2.72558, saving model to weights-LSTM-improvement-06-2.7256.hdf5
Epoch 7/20

Epoch 00007: loss improved from 2.72558 to 2.67676, saving model to weights-LSTM-improvement-07-2.6768.hdf5
Epoch 8/20

Epoch 00008: loss improved from 2.67676 to 2.61825, saving model to weights-LSTM-improvement-08-2.6182.hdf5
Epoch 9/20

Epoch 00009: loss improved from 

<keras.callbacks.History at 0x1818325d68>

In [41]:
# Test the model... error didn't improve after a few epochs
# load the network weights

weightsfile = "weights-LSTM-improvement-20-1.9716.hdf5"

LSTMModel.load_weights(weightsfile)
LSTMModel.compile(loss='categorical_crossentropy', optimizer='adam')

start = np.random.randint(0, len(input_data)-1)
pattern = input_data[start]
generated = []

print("RANDOM SEED: ")
print(''.join([int2ch[i] for i in pattern]))

for i in range(1000):
        
    inp = np.reshape(pattern, (1, len(pattern), 1))
    inp = inp / float(num_vocab)
    pred = LSTMModel.predict(inp, verbose=0)
    
    idx = np.argmax(pred)    
    result = int2ch[idx]
    sys.stdout.write(result)
            
    # append the new predicted index
    pattern.append(idx)
    pattern = pattern[1:len(pattern)]

RANDOM SEED: 
you're trying to have good programmers. there's no implicit problem that probably be components with a day for startups. but the best startup is choosing that it's really presented unfortunately. we haven't had a few decided model of startups that wouldn'
o$vdwwo$vdww$_dew$1d0w$vd1"6_[$v5w$kw6vd_$v5w[$0w$_dv$v$1w6_'$'5$'wwv$_de$$6_v$vd$kw6f$[de0$wd60$60n$'d0$$ffvww$600w0w$vdw_o$vd$kww$wd1wvv'_2$vd$0w$6 fw$vd$kww$vd$swdo$6f0v$0d$0w$1d0$$5ww$[deu0w$2d'_2$vd$ w$6 fw$vd$kww$vd$swdo$6f0v$0d$0w$1d0$$5ww$[deu0w$2d'_2$vd$ w$6 fw$vd$kww$vd$swdo$6f0v$0d$0w$1d0$$5ww$[deu0w$2d'_2$vd$ w$6 fw$vd$kww$vd$swdo$6f0v$0d$0w$1d0$$5ww$[deu0w$2d'_2$vd$ w$6 fw$vd$kww$vd$swdo$6f0v$0d$0w$1d0$$5ww$[deu0w$2d'_2$vd$ w$6 fw$vd$kww$vd$swdo$6f0v$0d$0w$1d0$$5ww$[deu0w$2d'_2$vd$ w$6 fw$vd$kww$vd$swdo$6f0v$0d$0w$1d0$$5ww$[deu0w$2d'_2$vd$ w$6 fw$vd$kww$vd$swdo$6f0v$0d$0w$1d0$$5ww$[deu0w$2d'_2$vd$ w$6 fw$vd$kww$vd$swdo$6f0v$0d$0w$1d0$$5ww$[deu0w$2d'_2$vd$ w$6 fw$vd$kww$vd$swdo$6f0v$0d$0w$1d0$$5ww$[deu0w$2d'_