In [5]:
! pip install pydrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials
# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [0]:
###https://drive.google.com/open?id=1fy0pk3Xs2uNR6HaM9oVN4830A2I5CqnY

#### Downloading the text data

In [0]:
downloaded = drive.CreateFile({'id': '1fy0pk3Xs2uNR6HaM9oVN4830A2I5CqnY'})
downloaded.GetContentFile('wonderland.txt')

In [7]:
!ls

adc.json  sample_data  wonderland.txt


### The purpose of this tutorial is to create a generative model for text, character-by-character using LSTM recurrent neural networks in Python with Keras.

In [8]:
import sys
import string
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Masking
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils

Using TensorFlow backend.


In [0]:
# load ascii text and covert to lowercase
filename = "wonderland.txt"
raw_text = open(filename,encoding='utf-8').read()
raw_text = raw_text.lower()

In [10]:
num_lines = 0
with open(filename, 'r') as f:
    for line in f:
        num_lines += 1
print("Number of lines:")
print(num_lines)

Number of lines:
3328


#### We have Alice in the Wonderland text consisting of nearly 3328 lines. Check out `raw_text` below.

In [11]:
raw_text



#### Few preprocessing steps to consider:

#### We need preprocessing of the given wonderland.txt so that we can define the training data for the network
* Removing punctuations except `full-stop` since we will be using it to break up the text into sentences.

* Also since there are lot of new-line character `\n` we can replace them with `full-stop` which will help us break longer sentences into shorter one.

In [0]:

def Punctuation(inp_str): 
  
    # punctuation marks 
    
    punctuations = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'
  
    # traverse the given string and if any punctuation 
    # marks occur replace it with null 
    for x in inp_str.lower(): 
        if x in punctuations: 
            inp_str = inp_str.replace(x, "") 
  
    # Print string without punctuation 
    return inp_str

In [0]:
new_text = Punctuation(raw_text)
new_text = new_text.replace('\n','.')
new_text = new_text.replace('\ufeff','')
new_text = new_text.split('.')


In [14]:
len(new_text)

4317

In [0]:
maxlen=0
for line in new_text:
  if len(line)>maxlen:
    maxlen=len(line)

In [16]:
maxlen

73

#### Check out `final_text`. It is a list of strings of size nearly 72.
Note all the punctuations are removed and we have simple strings of characters `a-z` (lower case only) and `space` character. Since we are processing characters we need our network to learn `space` character also so that it can form words of its own.

In [17]:
final_text = []
for line in new_text:
  if line == '':
    continue
  final_text.append(line)

final_text
    

['chapter i',
 ' down the rabbithole',
 'alice was beginning to get very tired of sitting by her sister on the',
 'bank and of having nothing to do once or twice she had peeped into the',
 'book her sister was reading but it had no pictures or conversations in',
 'it and what is the use of a book thought alice without pictures or',
 'conversations',
 'so she was considering in her own mind as well as she could for the',
 'hot day made her feel very sleepy and stupid whether the pleasure',
 'of making a daisychain would be worth the trouble of getting up and',
 'picking the daisies when suddenly a white rabbit with pink eyes ran',
 'close by her',
 'there was nothing so very remarkable in that nor did alice think it so',
 'very much out of the way to hear the rabbit say to itself oh dear',
 'oh dear i shall be late when she thought it over afterwards it',
 'occurred to her that she ought to have wondered at this but at the time',
 'it all seemed quite natural but when the rabbit actuall

In [18]:
len(final_text)

2845

#### Check out distinct characters in our `final_text`  and create a vocabulory using them.

In [19]:
final_str = ' '.join(final_text)
chars = sorted(list(set(final_str)))
print(chars)

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [0]:
char_to_int = dict((c, i) for i, c in enumerate(chars,1))

In [21]:
char_to_int

{' ': 1,
 'a': 2,
 'b': 3,
 'c': 4,
 'd': 5,
 'e': 6,
 'f': 7,
 'g': 8,
 'h': 9,
 'i': 10,
 'j': 11,
 'k': 12,
 'l': 13,
 'm': 14,
 'n': 15,
 'o': 16,
 'p': 17,
 'q': 18,
 'r': 19,
 's': 20,
 't': 21,
 'u': 22,
 'v': 23,
 'w': 24,
 'x': 25,
 'y': 26,
 'z': 27}

#### Define a padding character `PAD` with index `0`. Importance of padding explained little later

In [0]:
char_to_int['PAD'] = 0

In [0]:
int_to_char = {v: k for k, v in char_to_int.items()}

In [24]:
int_to_char

{0: 'PAD',
 1: ' ',
 2: 'a',
 3: 'b',
 4: 'c',
 5: 'd',
 6: 'e',
 7: 'f',
 8: 'g',
 9: 'h',
 10: 'i',
 11: 'j',
 12: 'k',
 13: 'l',
 14: 'm',
 15: 'n',
 16: 'o',
 17: 'p',
 18: 'q',
 19: 'r',
 20: 's',
 21: 't',
 22: 'u',
 23: 'v',
 24: 'w',
 25: 'x',
 26: 'y',
 27: 'z'}

#### Note that we have only 2845 individual strings in our `final_text` data. To train network we will need quite large amout of sequences. Let's create the sequences by passing a sliding window on each of the strings in our data. 

In [0]:
input_seq = []
output_seq = []
for mystring in final_text:
  for i in range(1,len(mystring)):
    seq_in = mystring[:i]
    seq_out = mystring[i]
    input_seq.append(seq_in)
    output_seq.append(seq_out)
  

In [112]:
for i in range(3):
  print(final_text[i])
  

chapter i
 down the rabbithole
alice was beginning to get very tired of sitting by her sister on the


#### This is our dataset preparation for RNN model. We feed in a sequence of characters and get an output character which has maximum probability based on the input fed characters. See sequence `samples` below:

In [111]:
for i in range(20):
  print("Input:", input_seq[i], "Output:", output_seq[i])

Input: c Output: h
Input: ch Output: a
Input: cha Output: p
Input: chap Output: t
Input: chapt Output: e
Input: chapte Output: r
Input: chapter Output:  
Input: chapter  Output: i
Input:   Output: d
Input:  d Output: o
Input:  do Output: w
Input:  dow Output: n
Input:  down Output:  
Input:  down  Output: t
Input:  down t Output: h
Input:  down th Output: e
Input:  down the Output:  
Input:  down the  Output: r
Input:  down the r Output: a
Input:  down the ra Output: b


#### So final data for our LSTM model has now 129829 input-output pairs

In [26]:
len(input_seq)

129829

In [27]:
len(output_seq)

129829

In [0]:
dataX=[]
dataY=[]
for i in range(len(input_seq)):
  seq_in = input_seq[i]
  seq_out = output_seq[i]
  dataX.append([char_to_int[char] for char in seq_in])
  dataY.append(char_to_int[seq_out])

In [29]:
dataX

[[4],
 [4, 9],
 [4, 9, 2],
 [4, 9, 2, 17],
 [4, 9, 2, 17, 21],
 [4, 9, 2, 17, 21, 6],
 [4, 9, 2, 17, 21, 6, 19],
 [4, 9, 2, 17, 21, 6, 19, 1],
 [1],
 [1, 5],
 [1, 5, 16],
 [1, 5, 16, 24],
 [1, 5, 16, 24, 15],
 [1, 5, 16, 24, 15, 1],
 [1, 5, 16, 24, 15, 1, 21],
 [1, 5, 16, 24, 15, 1, 21, 9],
 [1, 5, 16, 24, 15, 1, 21, 9, 6],
 [1, 5, 16, 24, 15, 1, 21, 9, 6, 1],
 [1, 5, 16, 24, 15, 1, 21, 9, 6, 1, 19],
 [1, 5, 16, 24, 15, 1, 21, 9, 6, 1, 19, 2],
 [1, 5, 16, 24, 15, 1, 21, 9, 6, 1, 19, 2, 3],
 [1, 5, 16, 24, 15, 1, 21, 9, 6, 1, 19, 2, 3, 3],
 [1, 5, 16, 24, 15, 1, 21, 9, 6, 1, 19, 2, 3, 3, 10],
 [1, 5, 16, 24, 15, 1, 21, 9, 6, 1, 19, 2, 3, 3, 10, 21],
 [1, 5, 16, 24, 15, 1, 21, 9, 6, 1, 19, 2, 3, 3, 10, 21, 9],
 [1, 5, 16, 24, 15, 1, 21, 9, 6, 1, 19, 2, 3, 3, 10, 21, 9, 16],
 [1, 5, 16, 24, 15, 1, 21, 9, 6, 1, 19, 2, 3, 3, 10, 21, 9, 16, 13],
 [2],
 [2, 13],
 [2, 13, 10],
 [2, 13, 10, 4],
 [2, 13, 10, 4, 6],
 [2, 13, 10, 4, 6, 1],
 [2, 13, 10, 4, 6, 1, 24],
 [2, 13, 10, 4, 6, 1, 24, 2],
 

In [30]:
len(dataX)

129829

In [0]:
maxlen=0
for li in dataX:
  if maxlen<len(li):
    maxlen=len(li)

In [32]:
maxlen

72

### PADDING SEQUENCES

To feed our LSTM network in batch we will try to make all sequences of same-length by padding each sequence with some `padding character` defined by us. Note that this padding character will be different from the existing characters in our dataset. Its complete purpose is just to create a batch of same length sequences for LSTM

In [0]:
dataX_padded = pad_sequences(dataX, padding='post')

In [0]:
maxlen=72
for li in dataX_padded:
  if maxlen<len(li):
    maxlen=len(li)

In [35]:
maxlen

72

So now we have padded input data where each sequence is of same length 72. check out `dataX_padded`.

In [36]:
dataX_padded

array([[ 4,  0,  0, ...,  0,  0,  0],
       [ 4,  9,  0, ...,  0,  0,  0],
       [ 4,  9,  2, ...,  0,  0,  0],
       ...,
       [19,  6, 14, ...,  0,  0,  0],
       [19,  6, 14, ...,  0,  0,  0],
       [19,  6, 14, ...,  0,  0,  0]], dtype=int32)

In [113]:
n_patterns = len(dataX_padded)
print("Total input-output patterns in our dataset: ", n_patterns)

Total input-output patterns in our dataset:  129829


In [114]:
seq_length = maxlen
print("Length of padded sequences: ",seq_length)

Length of padded sequences:  72


In [116]:
n_vocab = len(chars)
print("Total characters in our vocab {'a-z' and ' '}: " , n_vocab)

Total characters in our vocab {'a-z' and ' '}:  27


Now that we have prepared our training data we need to transform it so that it is suitable for use with Keras.

First we must transform the list of input sequences into the form [samples, time steps, features] expected by an LSTM network.

Next we need to rescale the integers to the range 0-to-1 to make the patterns easier to learn by the LSTM network that uses the sigmoid activation function by default.

Finally, we need to convert the output patterns (single characters converted to integers) into a one hot encoding. This is so that we can configure the network to predict the probability of each of the 28 different characters in the vocabulary (an easier representation) rather than trying to force it to predict precisely the next character. Each y value is converted into a sparse vector with a length of 28, full of zeros except with a 1 in the column for the letter (integer) that the pattern represents.



In [0]:
# reshape X to be [samples, time steps, features]
X = numpy.reshape(dataX_padded, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = np_utils.to_categorical(dataY)

In [41]:
X.shape

(129829, 72, 1)

In [117]:
features = X.shape[2]
print("Feature for each character is the character index in our vocab. No. of features: ", features)

Feature for each character is the character index in our vocab. No. of features:  1


In our vocabulory, character `p` has index 17. Check out its one-hot encoded vector. It will have `1` only at the `17th` position

In [121]:
dataY[2]

17

In [123]:
y[2]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [43]:
y.shape

(129829, 28)

### Creating our model

* Remember that only purpose of adding ``'PAD'`` with index `0` was to make all the input sequences of same size. Then we use a Masking layer that skips those special timestamps like they don't exist.
Note that if we pad without masking, padded value will be regarded as actual value, thus, it becomes noise in data. Refer [how-to-feed-lstm-with-different-input-array-sizes using padding](https://datascience.stackexchange.com/questions/48796/how-to-feed-lstm-with-different-input-array-sizes) for the same.

* Our model is simple comprising of two layers of LSTM and one Dense layer to apply softmax.
Before sending the input to LSTM (first layer) we will use dropout of 0.1. And again before sending the first LSTM layer output to second LSTM layer we will apply dropout. Note that no dropout is applied before Dense layer (we prefer that the softmax layer sees all the final activation values before making decision)

* Check out the model details using `model.summary()`

In [0]:
model = Sequential()
model.add(Masking(mask_value=0, input_shape=(seq_length, features)))
model.add(Dropout(0.1))
model.add(LSTM(256,return_sequences=True))
model.add(Dropout(0.1))
model.add(LSTM(256))
model.add(Dense(y.shape[1], activation='softmax'))

In [49]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_5 (Masking)          (None, 72, 1)             0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 72, 1)             0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 72, 256)           264192    
_________________________________________________________________
dropout_8 (Dropout)          (None, 72, 256)           0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense_2 (Dense)              (None, 28)                7196      
Total params: 796,700
Trainable params: 796,700
Non-trainable params: 0
_________________________________________________________________


### Model training for 100 epochs and batch-size 128

In [50]:
model.compile(loss='categorical_crossentropy', optimizer='adam')
# define the checkpoint
filepath="weights-improvement-2-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
# fit the model
model.fit(X, y, epochs=100, batch_size=128, callbacks=callbacks_list)

W0726 04:07:32.625015 140424021329792 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0726 04:07:32.657660 140424021329792 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



Epoch 1/100

Epoch 00001: loss improved from inf to 2.74415, saving model to weights-improvement-2-01-2.7441.hdf5
Epoch 2/100

Epoch 00002: loss improved from 2.74415 to 2.48569, saving model to weights-improvement-2-02-2.4857.hdf5
Epoch 3/100

Epoch 00003: loss improved from 2.48569 to 2.29766, saving model to weights-improvement-2-03-2.2977.hdf5
Epoch 4/100

Epoch 00004: loss improved from 2.29766 to 2.16071, saving model to weights-improvement-2-04-2.1607.hdf5
Epoch 5/100

Epoch 00005: loss improved from 2.16071 to 2.05807, saving model to weights-improvement-2-05-2.0581.hdf5
Epoch 6/100

Epoch 00006: loss improved from 2.05807 to 1.97463, saving model to weights-improvement-2-06-1.9746.hdf5
Epoch 7/100

Epoch 00007: loss improved from 1.97463 to 1.91729, saving model to weights-improvement-2-07-1.9173.hdf5
Epoch 8/100

Epoch 00008: loss improved from 1.91729 to 1.86673, saving model to weights-improvement-2-08-1.8667.hdf5
Epoch 9/100

Epoch 00009: loss improved from 1.86673 to 1.82

<keras.callbacks.History at 0x7fb6b048be48>

#### So finally after running 100 epochs it is observed that the lowest loss is 1.17 which occurs at the last epoch `100`. And so we will use this trained model to make predictions. 

In [0]:
from google.colab import files
files.download('weights-improvement-2-100-1.1717.hdf5')

#### Lets try out predictions by feeding a random seed sequence into the trained model.
We can pick a random input pattern as our seed sequence, then print generated characters as we generate them.
#### Generating 500 characters

Example 1

In [79]:
# pick a random seed
start = numpy.random.randint(0, len(dataX)-1)
print("Index of the pattern to start with:",start)
pattern = dataX_padded[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")


Index of the pattern to start with: 118532
Seed:
" nearly out of sight he said in a deep voice what are tarPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPAD "


In [0]:
pattern = list(dataX_padded[start])

In [0]:
# generate characters
import sys
output = ''
for i in range(500):
  x = numpy.reshape(pattern, (1, len(pattern), 1))
  x = x / float(n_vocab)
  prediction = model.predict(x, verbose=0)
  index = numpy.argmax(prediction)
  result = int_to_char[index]
  seq_in = [int_to_char[value] for value in pattern]
  output = output+result
  #sys.stdout.write(result)
  pattern.append(index)
  #print(len(pattern))
  pattern = pattern[1:len(pattern)]

#### 500 characters generated `output`

In [82]:
output

'ce anlce tuittie toee iisself uhe had nevehl tuedked oure and tat as suoanary and tepiine tuesttoeling ano tound toeaked hirsoish cate ier stom temarked tarty suitting tuesbling anice and suusidlers whe suocess oed lerstrally hot teneing tatty suisui tusting about ier lnee ano taid alice and tead ier face ootoce oot tatted her hererally jt suitted toopling about ier lnee ano taid alice and teasg tight ie sane begors iere anice hasning ano tound teeahran anlce temarked tuitting toeaked hirsoays a'

#### Displaying the output in a readable manner

In [96]:
display = ''
for i in range(len(output)):
  display=display+output[i]
  if i!=0 and i%100==0:
    print(display)
    display = ''
  

ce anlce tuittie toee iisself uhe had nevehl tuedked oure and tat as suoanary and tepiine tuesttoelin
g ano tound toeaked hirsoish cate ier stom temarked tarty suitting tuesbling anice and suusidlers wh
e suocess oed lerstrally hot teneing tatty suisui tusting about ier lnee ano taid alice and tead ier
 face ootoce oot tatted her hererally jt suitted toopling about ier lnee ano taid alice and teasg ti


In [83]:
len(output)

500

Example 2

In [105]:
# pick a random seed
start = numpy.random.randint(0, len(dataX)-1)
print("Index of the pattern to start with:",start)
pattern = dataX_padded[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")


Index of the pattern to start with: 81210
Seed:
" the soldiers were silent and lookePADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPADPAD "


In [0]:
pattern = list(dataX_padded[start])

In [0]:
# generate characters
import sys
output = ''
for i in range(500):
  x = numpy.reshape(pattern, (1, len(pattern), 1))
  x = x / float(n_vocab)
  prediction = model.predict(x, verbose=0)
  index = numpy.argmax(prediction)
  result = int_to_char[index]
  seq_in = [int_to_char[value] for value in pattern]
  output = output+result
  #sys.stdout.write(result)
  pattern.append(index)
  #print(len(pattern))
  pattern = pattern[1:len(pattern)]

#### 500 characters generated `output`

In [108]:
output

'd at suoanly temember and tead iimg alice anlver troeasimledt and as suosuratt tuesnlyo iad anyiousled oot tattedk gor yery wnder teady tosme at suopues anice hainine toease because tuiml tuedking tuesing anices anice hainine teplled iad anl at suopues toated anlce tuied tound temember about ier anice and io bno anlce temarked toeepy butsuted oot tatted oy horves anice whought alice and iere tairy foough hot seteated at suoanly temember teady tosme anice and io bno tound teeahr at suictutean tep'

#### Displaying the output in a readable manner

In [109]:
display = ''
for i in range(len(output)):
  display=display+output[i]
  if i!=0 and i%100==0:
    print(display)
    display = ''
  

d at suoanly temember and tead iimg alice anlver troeasimledt and as suosuratt tuesnlyo iad anyiousle
d oot tattedk gor yery wnder teady tosme at suopues anice hainine toease because tuiml tuedking tues
ing anices anice hainine teplled iad anl at suopues toated anlce tuied tound temember about ier anic
e and io bno anlce temarked toeepy butsuted oot tatted oy horves anice whought alice and iere tairy 


#### For reference all the weight files generated during improvement. Since the model at the last epoch performs with lowest loss we can download those weights and utilize them in future purposes

In [63]:
!ls

adc.json			       weights-improvement-2-45-1.3396.hdf5
sample_data			       weights-improvement-2-46-1.3383.hdf5
weights-improvement-2-01-2.7441.hdf5   weights-improvement-2-47-1.3304.hdf5
weights-improvement-2-02-2.4857.hdf5   weights-improvement-2-48-1.3303.hdf5
weights-improvement-2-03-2.2977.hdf5   weights-improvement-2-49-1.3222.hdf5
weights-improvement-2-04-2.1607.hdf5   weights-improvement-2-50-1.3165.hdf5
weights-improvement-2-05-2.0581.hdf5   weights-improvement-2-51-1.3137.hdf5
weights-improvement-2-06-1.9746.hdf5   weights-improvement-2-52-1.3100.hdf5
weights-improvement-2-07-1.9173.hdf5   weights-improvement-2-53-1.3065.hdf5
weights-improvement-2-08-1.8667.hdf5   weights-improvement-2-54-1.2988.hdf5
weights-improvement-2-09-1.8223.hdf5   weights-improvement-2-55-1.2955.hdf5
weights-improvement-2-100-1.1717.hdf5  weights-improvement-2-56-1.2898.hdf5
weights-improvement-2-10-1.7864.hdf5   weights-improvement-2-58-1.2787.hdf5
weights-improvement-2-11-1.7539.hdf5   weights-impr