In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.utils.data_utils import get_file
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Input, Embedding, Reshape, merge, LSTM, Bidirectional
from keras.layers import SimpleRNN, TimeDistributed
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import Conv1D, MaxPooling1D, ZeroPadding1D
from keras.utils import np_utils
from keras.optimizers import Adam
import cPickle as pickle
import bcolz
import re
from numpy.random import random, permutation, randn, normal, uniform, choice

Using TensorFlow backend.


In [2]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = open(path).read()
print len(text)

600901


Creating a vocabulary of unique characters

In [3]:
chars = sorted(list(set(text)))
print len(chars)+1

86


Inserting 0 as it wasn't in the original text

In [4]:
chars.insert(0, '\0')

Creating a dictionary, mapping characters to index and index to characters

In [5]:
char_to_index = {v:i for i,v in enumerate(chars)}
index_to_char = {i:v for i,v in enumerate(chars)}

Converting the entire nietzsche text into index of characters

In [6]:
total_index = [char_to_index[char] for char in text]

In [7]:
total_index[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [8]:
''.join(index_to_char[i] for i in total_index[:25])

'PREFACE\n\n\nSUPPOSING that '

As were are predicting the 8th character, we need to create an array of the first 7 characters each acting as an input and the last character as the output.

For example, for the text 'this and that'

The input will be -> [['t', ' '], ['h', 't'], ['i', 'h'], ['s', 'a'], [' ', 't'], ['a'], ['n']] -> but instead of the characters, there will be the index of the character.

And the output will be -> ['d']

In [171]:
pred_num = 25
xin = [[total_index[j+i] for j in xrange(0, len(total_index)-1-pred_num, pred_num)] for i in range(pred_num)]
y = [total_index[i+pred_num] for i in xrange(0, len(total_index)-1-pred_num, pred_num)]

We are removing the last 2 characters to keep the length of each array equal

In [172]:
X = [np.stack(xin[i][:-2]) for i in range(pred_num)]
Y = np.stack(y[:-2])

In [175]:
X

[array([40, 44, 58, ..., 76, 78, 62]),
 array([42, 71, 67, ..., 58,  2, 54]),
 array([29, 74, 24, ..., 71, 73, 65]),
 array([30, 73,  2, ..., 72, 61,  2]),
 array([25, 61, 33, ...,  2, 58, 73]),
 array([27,  2, 72, ..., 76,  1, 68]),
 array([29, 62,  2, ..., 58, 26,  2]),
 array([ 1, 72, 73, ..., 71, 74, 72]),
 array([ 1,  2, 61, ..., 58, 57, 54]),
 array([ 1, 54, 58, ...,  2, 57, 67]),
 array([43,  2, 71, ..., 62, 61, 56]),
 array([45, 76, 58, ..., 67, 62, 73]),
 array([40, 68,  2, ..., 72, 72, 62]),
 array([40, 66, 67, ..., 62, 73, 73]),
 array([39, 54, 68, ..., 72, 72, 78]),
 array([43, 67, 73, ..., 73,  2,  8]),
 array([33,  9,  2, ..., 58, 54,  2]),
 array([38,  9, 60, ..., 57, 72, 63]),
 array([31, 76, 71, ...,  2,  2, 74]),
 array([ 2, 61, 68, ..., 74, 58, 72]),
 array([73, 54, 74, ..., 69, 72, 73]),
 array([61, 73, 67, ..., 68, 72,  2]),
 array([54,  2, 57, ..., 67, 58, 54]),
 array([73, 73,  1, ...,  2, 67, 72]),
 array([ 2, 61, 59, ..., 55, 73,  2])]

In [176]:
Y[:8]

array([44, 58, 68, 62, 73,  8, 67, 65])

In [177]:
X[0].shape, Y.shape

((24033,), (24033,))

In [178]:
hidden_layers = 256
vocab_size = 86
n_fac = 42

Creating a simple RNN

In [62]:
model = Sequential([
        Embedding(vocab_size, n_fac, input_length=pred_num),
        SimpleRNN(hidden_layers, activation='relu'),
        Dense(vocab_size, activation='softmax')
    ])

In [63]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 7, 42)             3612      
_________________________________________________________________
simple_rnn_6 (SimpleRNN)     (None, 256)               76544     
_________________________________________________________________
dense_6 (Dense)              (None, 86)                22102     
Total params: 102,258.0
Trainable params: 102,258
Non-trainable params: 0.0
_________________________________________________________________


In [64]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [65]:
model.fit(np.stack(X, 1), Y, batch_size=64, epochs=5)

In [None]:
model.save_weights('simpleRNN_3pred.h5')

In [18]:
model.load_weights('simpleRNN_3pred.h5')

In [50]:
model.save_weights('simpleRNN_7pred.h5')

In [66]:
model.load_weights('simpleRNN_7pred.h5')

1. First convert the input to indices
2. Then expand the dimension to match the model's output format
3. Predict the 8th character using the input
4. As we are using softmax activation in the last layer of the model, we get the probability of every 86 characters in our vocabulary. So the character with the maximum probability will be the 8th predicted character by the model

In [67]:
def predict_next_char(inp):
    index = [char_to_index[i] for i in inp]
    arr = np.expand_dims(np.array(index), axis=0)
    prediction = model.predict(arr)
    return index_to_char[np.argmax(prediction)]

In this example, prediction is being done for the 8th character(pred_num = 7)

In [68]:
predict_next_char('those w')

'h'

In this example, prediction is being done for the 4th character, so just set pred_num = 3

In [21]:
predict_next_char(' th')

'e'

In [22]:
predict_next_char(' an')

'd'

In [69]:
predict_next_char('does th')

'e'

# Return Sequences

Here we will predict the next word where the input will be all the words before it.

For example, to predict the 2nd word, first word will be used

To predict the 3rd word, first and second word will be used and so on.

In [179]:
ys = [[total_index[j+i] for j in xrange(1, len(total_index)-pred_num, pred_num)] for i in range(pred_num)]

In [180]:
Y_return = [np.stack(ys[i][:-2]) for i in range(pred_num)]

In [181]:
X

[array([40, 44, 58, ..., 76, 78, 62]),
 array([42, 71, 67, ..., 58,  2, 54]),
 array([29, 74, 24, ..., 71, 73, 65]),
 array([30, 73,  2, ..., 72, 61,  2]),
 array([25, 61, 33, ...,  2, 58, 73]),
 array([27,  2, 72, ..., 76,  1, 68]),
 array([29, 62,  2, ..., 58, 26,  2]),
 array([ 1, 72, 73, ..., 71, 74, 72]),
 array([ 1,  2, 61, ..., 58, 57, 54]),
 array([ 1, 54, 58, ...,  2, 57, 67]),
 array([43,  2, 71, ..., 62, 61, 56]),
 array([45, 76, 58, ..., 67, 62, 73]),
 array([40, 68,  2, ..., 72, 72, 62]),
 array([40, 66, 67, ..., 62, 73, 73]),
 array([39, 54, 68, ..., 72, 72, 78]),
 array([43, 67, 73, ..., 73,  2,  8]),
 array([33,  9,  2, ..., 58, 54,  2]),
 array([38,  9, 60, ..., 57, 72, 63]),
 array([31, 76, 71, ...,  2,  2, 74]),
 array([ 2, 61, 68, ..., 74, 58, 72]),
 array([73, 54, 74, ..., 69, 72, 73]),
 array([61, 73, 67, ..., 68, 72,  2]),
 array([54,  2, 57, ..., 67, 58, 54]),
 array([73, 73,  1, ...,  2, 67, 72]),
 array([ 2, 61, 59, ..., 55, 73,  2])]

In [182]:
Y_return

[array([42, 71, 67, ..., 58,  2, 54]),
 array([29, 74, 24, ..., 71, 73, 65]),
 array([30, 73,  2, ..., 72, 61,  2]),
 array([25, 61, 33, ...,  2, 58, 73]),
 array([27,  2, 72, ..., 76,  1, 68]),
 array([29, 62,  2, ..., 58, 26,  2]),
 array([ 1, 72, 73, ..., 71, 74, 72]),
 array([ 1,  2, 61, ..., 58, 57, 54]),
 array([ 1, 54, 58, ...,  2, 57, 67]),
 array([43,  2, 71, ..., 62, 61, 56]),
 array([45, 76, 58, ..., 67, 62, 73]),
 array([40, 68,  2, ..., 72, 72, 62]),
 array([40, 66, 67, ..., 62, 73, 73]),
 array([39, 54, 68, ..., 72, 72, 78]),
 array([43, 67, 73, ..., 73,  2,  8]),
 array([33,  9,  2, ..., 58, 54,  2]),
 array([38,  9, 60, ..., 57, 72, 63]),
 array([31, 76, 71, ...,  2,  2, 74]),
 array([ 2, 61, 68, ..., 74, 58, 72]),
 array([73, 54, 74, ..., 69, 72, 73]),
 array([61, 73, 67, ..., 68, 72,  2]),
 array([54,  2, 57, ..., 67, 58, 54]),
 array([73, 73,  1, ...,  2, 67, 72]),
 array([ 2, 61, 59, ..., 55, 73,  2]),
 array([44, 58, 68, ..., 78, 62, 73])]

In [183]:
vocab_size = 86
n_fac = 42
hidden_layers = 256

As we are setting return_sequences=True, we need to wrap the Dense layer in a TimeDistributed Layer since it is a sequence.

In [184]:
return_model = Sequential([
        Embedding(vocab_size, n_fac, input_length=pred_num),
        SimpleRNN(hidden_layers, return_sequences=True, activation='relu'),
        TimeDistributed(Dense(vocab_size, activation='softmax'))
    ])

In [185]:
return_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 25, 42)            3612      
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 25, 256)           76544     
_________________________________________________________________
time_distributed_7 (TimeDist (None, 25, 86)            22102     
Total params: 102,258.0
Trainable params: 102,258
Non-trainable params: 0.0
_________________________________________________________________


In [186]:
return_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [187]:
X_model = np.stack(X, 1)
Y_model = np.expand_dims(np.stack(Y_return, 1), axis=-1)

In [188]:
return_model.fit(X_model, Y_model, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa7ac3be0d0>

In [189]:
return_model.optimizer.lr = 1e-4
return_model.fit(X_model, Y_model, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa7d623e950>

In [190]:
return_model.optimizer.lr = 1e-4
return_model.fit(X_model, Y_model, batch_size=64, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa7ad2ac810>

In [191]:
return_model.save_weights('return_sequences_25.h5')

In [192]:
def predict_every_char(inp):
    l = []
    p = 0
    while p<len(inp):
        pre_inp = inp[p:p+pred_num]
        if len(pre_inp) < pred_num:
            pre_inp = pre_inp + ' '*(pred_num - len(pre_inp))
            l.append(pre_inp)
        else:
            l.append(pre_inp) 
        p+=pred_num

#     index = [char_to_index[i] for i in inp]
#     arr = np.expand_dims(index, axis=0)
#     prediction = return_model.predict(arr)
#     return ''.join([index_to_char[np.argmax(i)] for i in prediction[0]])
    
    final = []
    for half in l:
        index = [char_to_index[i] for i in half]
        arr = np.expand_dims(index, axis=0)
        prediction = return_model.predict(arr)
        final.append(''.join([index_to_char[np.argmax(i)] for i in prediction[0]]))
    
    return ''.join(final)

In [193]:
predict_every_char('and the boy left')

'nd the sedsaise t        '

In [196]:
predict_every_char('this is')

'hen as a                 '

In [197]:
predict_every_char("140 After having discovered in many of the less comprehensible actions mere manifestations of pleasure in emotion for its own sake, I fancy I can detect in the self contempt which characterises holy persons, and also in their acts of self torture (through hunger and scourgings, distortions and chaining of the limbs, acts of madness) simply a means whereby such natures may resist the general exhaustion of their will to live (their nerves). They employ the most painful expedients to escape if only for a time from the heaviness and weariness in which they are steeped by their great mental indolence and their subjection to a will other than their own.")

'1].oNter tiseng testovere  tn tan  tf the sass aonaletens ole tncuon  aaaelton fest tion  of traasure as tvptionsoor tts swn toye  a helce o hon boperhian the sanf-aonsempt ohenh aoaracterists aawd ar   na  and tslo tt the r tncu of tulf-sh rhre otheeush tamcrr and tehrlseng   aostart nn  and toarn n  tf the safei  ancu af toneess  oomply tnmeans ohire y auch aeture  ouy be tst the srneral txpiusthon of the r pill to tofe aahe r pewver . The  avpeaosohe sost arrn ul axpe  nnc  ao txtepedos tney tor t shme ooom the siaden s  ond thlkingss on thinh ahe  are stip  d ty the r areat tan ul an irenth tnd the r pepject on ooranshll afher ahet the r tfn  T                   '

# Stateful Model

In stateful model, the model remembers the context i.e. the long-term dependencies. Make sure you set shuffle=False.

Because if you set shuffle=True, the order of input will not be preserved, hence the model won't be able to extract the context of the text

In [61]:
bs = 64

If we use ReLU as the activation of LSTM, we get exploding gradients

Hence using tanh as the activation keeps the hidden state vector from growing beyond [-1, 1]

In [112]:
stateful_model = Sequential([
        Embedding(vocab_size, n_fac, input_length=pred_num, batch_input_shape=(bs, 7)),
        BatchNormalization(),
        LSTM(hidden_layers, activation='tanh', return_sequences=True, stateful=True),
        TimeDistributed(Dense(vocab_size, activation='softmax'))
    ])

In [113]:
stateful_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [114]:
divide = len(X_model)//bs*bs

In [116]:
stateful_model.fit(X_model[:divide], Y_model[:divide], batch_size=64, epochs=5, shuffle=False)

Epoch 1/1


<keras.callbacks.History at 0x7fa7a72bc750>

In [154]:
stateful_model.fit(X_model[:divide], Y_model[:divide], batch_size=64, epochs=5, shuffle=False)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa7a72a7c90>

In [167]:
stateful_model.optimizer.lr = 1e-4
stateful_model.fit(X_model[:divide], Y_model[:divide], batch_size=64, epochs=5, shuffle=False)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa7ac020390>

In [168]:
# def predict_every_char_stateful(inp):
#     index = [char_to_index[i] for i in inp]
#     arr = np.expand_dims(index, axis=0)
#     arr = np.resize(arr, (bs, 7))
#     prediction = stateful_model.predict(arr, batch_size=64)
#     return [index_to_char[np.argmax(i)] for i in prediction[0]]  


def predict_every_char_stateful(inp):
    l = []
    p = 0
    while p<len(inp):
        pre_inp = inp[p:p+pred_num]
        if len(pre_inp) < pred_num:
            pre_inp = pre_inp + ' '*(pred_num - len(pre_inp))
            l.append(pre_inp)
        else:
            l.append(pre_inp) 
        p+=pred_num
    
    final = []
    for half in l:
        index = [char_to_index[i] for i in half]
        arr = np.expand_dims(index, axis=0)
        arr = np.resize(arr, (bs, 7))
        prediction = stateful_model.predict(arr, batch_size=64)
        final.append(''.join([index_to_char[np.argmax(i)] for i in prediction[0]]))
    return ''.join(final)

In [169]:
predict_every_char_stateful('this is')

'ien cn '

In [170]:
predict_every_char_stateful("140 After having discovered in many of the less comprehensible actions mere manifestations of pleasure in emotion for its own sake, I fancy I can detect in the self contempt which characterises holy persons, and also in their acts of self torture (through hunger and scourgings, distortions and chaining of the limbs, acts of madness) simply a means whereby such natures may resist the general exhaustion of their will to live (their nerves). They employ the most painful expedients to escape if only for a time from the heaviness and weariness in which they are steeped by their great mental indolence and their subjection to a will other than their own.")

'4\n\nsnter teveng tostoueryd tn tan  of the soas aonpaehen iole tntion  aaae iay fest tion  af tleasure os tvotionsoor tt  ofn tuie  t selce onhon besert on the salf aonsemptitiich aaaractedist  tiwd aersona  tnd tllo tn the r snts tf talf ah   ne oaheough temder and tainrsena   tostrrteons and toarn ng tf the sofie  tnt  af taneess  tonply tnsaans oiine y tuch aetura  tay beststethe srneral sxcaueeicn of the r sitl th tofe iahe r secaes   The  axplyy the sost tein ul axperienc  oh txteta on tney tor tnshme aoom the se ryngds ond ti kingds on thich ihe  are seapm d ty the r sreat tan  l sn ilvnce ond the r supject on oh tnsotl tfher ahet the r swn  Tt '