In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.utils.data_utils import get_file
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers import SimpleRNN, TimeDistributed
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import Conv1D, MaxPooling1D, ZeroPadding1D
from keras.utils import np_utils
from keras.optimizers import Adam
import cPickle as pickle
import bcolz
import re
from numpy.random import random, permutation, randn, normal, uniform, choice

Using TensorFlow backend.


In [2]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print len(text)

600901


Creating a vocabulary of unique characters

In [3]:
chars = sorted(list(set(text)))
print len(chars)+1

86


Inserting 0 as it wasn't in the original text

In [4]:
chars.insert(0, '\0')

Creating a dictionary, mapping characters to index and index to characters

In [5]:
char_to_index = {v:i for i,v in enumerate(chars)}
index_to_char = {i:v for i,v in enumerate(chars)}

Converting the entire nietzsche text into index of characters

In [6]:
total_index = [char_to_index[char] for char in text]

In [7]:
total_index[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [8]:
''.join(index_to_char[i] for i in total_index[:25])

'PREFACE\n\n\nSUPPOSING that '

As were are predicting the 8th character, we need to create an array of the first 7 characters each acting as an input and the last character as the output.

For example, for the text 'this and that'

The input will be -> [['t', ' '], ['h', 't'], ['i', 'h'], ['s', 'a'], [' ', 't'], ['a'], ['n']] -> but instead of the characters, there will be the index of the character.

And the output will be -> ['d']

In [35]:
pred_num = 7
xin = [[total_index[j+i] for j in xrange(0, len(total_index)-1-pred_num, pred_num)] for i in range(pred_num)]
y = [total_index[i+pred_num] for i in xrange(0, len(total_index)-1-pred_num, pred_num)]

We are removing the last 2 characters to keep the length of each array equal

In [36]:
X = [np.stack(xin[i][:-2]) for i in range(pred_num)]
Y = np.stack(y[:-2])

In [37]:
X[:8]

[array([40,  1, 39, ..., 54, 57, 58]),
 array([42,  1, 43, ..., 67,  2,  2]),
 array([29,  1, 33, ...,  2, 54, 62]),
 array([30, 43, 38, ..., 76, 72, 67]),
 array([25, 45, 31, ..., 68,  2, 57]),
 array([27, 40,  2, ..., 71, 73, 62]),
 array([29, 40, 73, ..., 65, 61, 56])]

In [38]:
Y[:8]

array([ 1, 39, 61, 73,  2,  9, 61,  2])

In [39]:
X[0].shape, Y.shape

((85840,), (85840,))

In [61]:
hidden_layers = 256
vocab_size = 86
n_fac = 42

Creating a simple RNN

In [62]:
model = Sequential([
        Embedding(vocab_size, n_fac, input_length=pred_num),
        SimpleRNN(hidden_layers, activation='relu'),
        Dense(vocab_size, activation='softmax')
    ])

In [63]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 7, 42)             3612      
_________________________________________________________________
simple_rnn_6 (SimpleRNN)     (None, 256)               76544     
_________________________________________________________________
dense_6 (Dense)              (None, 86)                22102     
Total params: 102,258.0
Trainable params: 102,258
Non-trainable params: 0.0
_________________________________________________________________


In [64]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [65]:
model.fit(np.stack(X, 1), Y, batch_size=64, epochs=5)

In [None]:
model.save_weights('simpleRNN_3pred.h5')

In [18]:
model.load_weights('simpleRNN_3pred.h5')

In [50]:
model.save_weights('simpleRNN_7pred.h5')

In [66]:
model.load_weights('simpleRNN_7pred.h5')

1. First convert the input to indices
2. Then expand the dimension to match the model's output format
3. Predict the 8th character using the input
4. As we are using softmax activation in the last layer of the model, we get the probability of every 86 characters in our vocabulary. So the character with the maximum probability will be the 8th predicted character by the model

In [67]:
def predict_next_char(inp):
    index = [char_to_index[i] for i in inp]
    arr = np.expand_dims(np.array(index), axis=0)
    prediction = model.predict(arr)
    return index_to_char[np.argmax(prediction)]

In this example, prediction is being done for the 8th character(pred_num = 7)

In [68]:
predict_next_char('those w')

'h'

In this example, prediction is being done for the 4th character, so just set pred_num = 3

In [21]:
predict_next_char(' th')

'e'

In [22]:
predict_next_char(' an')

'd'

In [69]:
predict_next_char('does th')

'e'

# Return Sequences

Here we will predict the next word where the input will be all the words before it.

For example, to predict the 2nd word, first word will be used

To predict the 3rd word, first and second word will be used and so on.

In [162]:
ys = [[total_index[j+i] for j in xrange(1, len(total_index)-pred_num, pred_num)] for i in range(pred_num)]

In [163]:
Y_return = [np.stack(ys[i][:-2]) for i in range(pred_num)]

In [164]:
X

[array([40,  1, 39, ..., 54, 57, 58]),
 array([42,  1, 43, ..., 67,  2,  2]),
 array([29,  1, 33, ...,  2, 54, 62]),
 array([30, 43, 38, ..., 76, 72, 67]),
 array([25, 45, 31, ..., 68,  2, 57]),
 array([27, 40,  2, ..., 71, 73, 62]),
 array([29, 40, 73, ..., 65, 61, 56])]

In [165]:
Y_return

[array([42,  1, 43, ..., 67,  2,  2]),
 array([29,  1, 33, ...,  2, 54, 62]),
 array([30, 43, 38, ..., 76, 72, 67]),
 array([25, 45, 31, ..., 68,  2, 57]),
 array([27, 40,  2, ..., 71, 73, 62]),
 array([29, 40, 73, ..., 65, 61, 56]),
 array([ 1, 39, 61, ..., 57, 58, 54])]

In [166]:
vocab_size = 86
n_fac = 42
hidden_layers = 256

As we are setting return_sequences=True, we need to wrap the Dense layer in a TimeDistributed Layer since it is a sequence.

In [192]:
return_model = Sequential([
        Embedding(vocab_size, n_fac, input_length=pred_num),
        SimpleRNN(hidden_layers, return_sequences=True, activation='relu'),
        TimeDistributed(Dense(vocab_size, activation='softmax'))
    ])

In [193]:
return_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 7, 42)             3612      
_________________________________________________________________
simple_rnn_8 (SimpleRNN)     (None, 7, 256)            76544     
_________________________________________________________________
time_distributed_6 (TimeDist (None, 7, 86)             22102     
Total params: 102,258.0
Trainable params: 102,258
Non-trainable params: 0.0
_________________________________________________________________


In [194]:
return_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [195]:
X_model = np.stack(X, 1)
Y_model = np.expand_dims(np.stack(Y_return, 1), axis=-1)

In [196]:
return_model.fit(X_model, Y_model, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe7c09d3890>

In [197]:
return_model.optimizer.lr = 1e-4
return_model.fit(X_model, Y_model, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe7cd152f90>

In [198]:
def predict_every_char(inp):
    index = [char_to_index[i] for i in inp]
    arr = np.expand_dims(index, axis=0)
    prediction = return_model.predict(arr)
    return [index_to_char[np.argmax(i)] for i in prediction[0]]

In [199]:
predict_every_char('and the')

['n', 'd', ' ', 't', 'h', 'e', ' ']

In [200]:
predict_every_char('this is')

['h', 'e', 'n', ' ', 'o', 'n', ' ']

# Stateful Model

In stateful model, the model remembers the context i.e. the long-term dependencies. Make sure you set shuffle=False.

Because if you set shuffle=True, the order of input will not be preserved, hence the model won't be able to extract the context of the text

In [234]:
bs = 64

If we use ReLU as the activation of LSTM, the loss shoots up to infinity and then NaN.

Hence using tanh as the activation keeps the hidden state vector from growing beyond [-1, 1]

In [235]:
stateful_model = Sequential([
        Embedding(vocab_size, n_fac, input_length=pred_num, batch_input_shape=(bs, 7)),
        BatchNormalization(),
        LSTM(hidden_layers, activation='tanh', return_sequences=True, stateful=True),
        TimeDistributed(Dense(vocab_size, activation='softmax'))
    ])

In [236]:
stateful_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [237]:
divide = len(X_model)//bs*bs

In [238]:
stateful_model.fit(X_model[:divide], Y_model[:divide], batch_size=64, epochs=5, shuffle=False)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe7b4484590>

In [239]:
def predict_every_char_stateful(inp):
    index = [char_to_index[i] for i in inp]
    arr = np.expand_dims(index, axis=0)
    prediction = return_model.predict(arr)
    return [index_to_char[np.argmax(i)] for i in prediction[0]]

In [240]:
predict_every_char_stateful('this is')

['h', 'e', 'n', ' ', 'o', 'n', ' ']

In [246]:
predict_every_char_stateful(' and fo')

['t', 'n', 'd', ' ', 't', 'o', 'r']