# Text generation with LSTM / GRU

Dataset: text from Shakespeare.

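This notebook explores Recurrent Neural Networks for text generation. The model converts each word of a 50-word input sequence to its embedding, runs the embeddings through stacked LSTM or GRU layers, and predicts the next word; longer text is generated by repeating this prediction one word at a time.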

In [2]:
import tensorflow as tf
# Turn on memory growth for every GPU that TensorFlow lists.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)
# Record the TensorFlow version used for this run.
print(tf.__version__)

import os
import requests
import string

2.3.1


## Data Preprocessing

In [208]:
# Load dataset
dirname = './'
filename = 'shakespeare.txt'
# The first 253 lines of the file are not Shakespeare's work and are skipped.
HEADER_LINE_COUNT = 253

# Read every line, dropping only the trailing newline; blank lines are kept.
# The encoding is passed explicitly so decoding does not depend on the
# platform default.
# NOTE(review): assumes the file is UTF-8 (or plain ASCII) -- confirm.
with open(os.path.join(dirname, filename), encoding='utf-8') as files:
    lines = [line.rstrip('\n') for line in files]
print(len(lines))

# Keep only the text from index 253 onwards.
data = lines[HEADER_LINE_COUNT:]

124456


In [209]:
# Text cleaning
# Join the lines into one string. The guard keeps this cell re-runnable: once
# `data` is already a string, joining it again would insert a space between
# every character.
if not isinstance(data, str):
  data = " ".join(data)
def clean_text(doc):
  """Split `doc` on whitespace and return its cleaned, lower-cased words.

  Punctuation characters (string.punctuation) are deleted from every token,
  tokens that are not purely alphabetic afterwards (including tokens that
  contain digits or end up empty) are dropped, and the survivors are
  lower-cased.
  """
  strip_punctuation = str.maketrans('', '', string.punctuation)
  cleaned = []
  for raw_token in doc.split():
    bare = raw_token.translate(strip_punctuation)
    if bare.isalpha():
      cleaned.append(bare.lower())
  return cleaned

# Clean the joined corpus into a list of words and preview the first 50 tokens
tokens = clean_text(data)
print(tokens[:50])

['from', 'fairest', 'creatures', 'we', 'desire', 'increase', 'that', 'thereby', 'beautys', 'rose', 'might', 'never', 'die', 'but', 'as', 'the', 'riper', 'should', 'by', 'time', 'decease', 'his', 'tender', 'heir', 'might', 'bear', 'his', 'memory', 'but', 'thou', 'contracted', 'to', 'thine', 'own', 'bright', 'eyes', 'feedst', 'thy', 'lights', 'flame', 'with', 'selfsubstantial', 'fuel', 'making', 'a', 'famine', 'where', 'abundance', 'lies', 'thy']


In [27]:
# Total number of tokens and number of distinct words in the cleaned corpus
print(len(tokens), len(set(tokens)))

898199 27956


In [28]:
# Use 50 consecutive words to predict the next (51st) word
length = 50+1  # words per sequence: 50 input words followed by 1 target word
# Only windows that end at or before this token index are built; limiting the
# data to roughly the first 200k words reduces training time and resources.
max_window_end = 200001
# A window is tokens[end-length:end]. `end` is allowed to reach len(tokens) so
# that a corpus shorter than the cap still contributes its final window.
last_end = min(len(tokens), max_window_end)
# Each entry is one 51-word sequence joined into a single space-separated string.
lines = [' '.join(tokens[end-length:end]) for end in range(length, last_end + 1)]

In [33]:
# total number of sequences
print(len(lines))
print(lines[0])               # the first sequence: the first 51 words of the corpus
tokens[0], tokens[50]  # first word and 51st word

199951
from fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lights flame with selfsubstantial fuel making a famine where abundance lies thy self


('from', 'self')

## Tokenization

In [95]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras .utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, GRU, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [133]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines) # fit the tokenizer on the 51-word sequences
sequences = tokenizer.texts_to_sequences(lines)        # replace every word by its integer index (the embedding itself is learned later by the Embedding layer)
# Convert the list of integer sequences into a numpy array
sequences = np.array(sequences)
sequences

array([[  47, 1408, 1264, ...,  466,   31,  307],
       [1408, 1264,   37, ...,   31,  307,   31],
       [1264,   37,  451, ...,  307,   31, 1582],
       ...,
       [  33,   80, 5197, ...,  215,   44,   30],
       [  80, 5197,  103, ...,   44,   30, 1332],
       [5197,  103, 1846, ...,   30, 1332,    2]])

In [134]:
# create x and y
# rows = one 51-word sequence each; the first 50 columns are x and the 51st column is y
x, y = sequences[:, :-1], sequences[:, -1]
print(x[0])
print(y)

[   47  1408  1264    37   451  1406     9  2766  1158  1213   171   132
   269    20    24     1  4782    87    30    98  4781    18   715  1263
   171   211    18   829    20    27  3807     4   214   121  1212   153
 13004    31  2765  1847    16 13003 13002   754     7  3806    99  2430
   466    31]
[ 307   31 1582 ...   30 1332    2]


In [135]:
# length of total vocabulary
# NOTE(review): the +1 presumably accounts for index 0, which the tokenizer does
# not appear to assign to any word -- confirm against the Tokenizer documentation.
vocab_size = len(tokenizer.word_index) + 1 
# Compare the vocabulary of the training sequences with the distinct words of the whole corpus
print(vocab_size, len(set(tokens)))
# One-hot encode the targets: one row per sequence, vocab_size columns.
# NOTE(review): this rebinds y, so re-running the cell would encode the
# already-encoded array again. The dense matrix is also large
# (number of sequences x vocab_size); integer targets with a sparse
# categorical loss would avoid it -- consider changing together with compile().
y = to_categorical(y, num_classes=vocab_size)
# Number of input words per sample (the column count of x)
seq_length = x.shape[1]
print(seq_length)
print(y)

13009 27956
50
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]


## Build the LSTM Model

In [136]:
# Word-level LSTM language model: embedding -> two stacked LSTMs -> dense head
model = Sequential([
  # Embedding layer: one 50-dimensional vector per word index
  Embedding(input_dim=vocab_size, output_dim=50, input_length=seq_length),
  # First LSTM layer returns the whole sequence so the next LSTM can consume it
  LSTM(units=100, return_sequences=True),
  # Second LSTM layer
  LSTM(units=100),
  # Dense layer
  Dense(units=100, activation='relu'),
  # Final layer: softmax over the vocabulary
  Dense(units=vocab_size, activation='softmax'),
])

In [137]:
# summary of model: layers, output shapes and parameter counts
model.summary()

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 50, 50)            650450    
_________________________________________________________________
lstm_15 (LSTM)               (None, 50, 100)           60400     
_________________________________________________________________
lstm_16 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_33 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_34 (Dense)             (None, 13009)             1313909   
Total params: 2,115,259
Trainable params: 2,115,259
Non-trainable params: 0
_________________________________________________________________


In [138]:
# Compile the model
# categorical_crossentropy is used because the targets in y are one-hot encoded
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
# Train the model
# NOTE(review): no validation data is passed to fit, so the reported accuracy
# is measured on the training data only.
model.fit(x,y, batch_size=256,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1e44df21e20>

In [139]:
# Pick one of the training sequences as the seed text and show it
seed_text = lines[12343]
print(seed_text)

home of love if i have ranged like him that travels i return again just to the time not with the time exchanged so that my self bring water for my stain never believe though in my nature reigned all frailties that besiege all kinds of blood that it could so


In [140]:
# ML model will predict the word on the basis of what it learned
def generate_text_seq(model, tokenizer, text_seq_length, seed_text, n_words):
  """Generate `n_words` words that continue `seed_text`, one word at a time.

  Args:
    model: trained model whose `predict` returns one score per vocabulary
      index for each input row.
    tokenizer: fitted tokenizer providing `texts_to_sequences` and the
      `index_word` mapping (integer index -> word).
    text_seq_length: number of word indices the model takes as input.
    seed_text: text to continue; only its last `text_seq_length` words are
      fed to the model at each step.
    n_words: number of words to generate.

  Returns:
    The generated words joined by single spaces (the seed text is not
    included).
  """
  text = []
  for _ in range(n_words):
    # Encode the running text and keep the last `text_seq_length` indices
    # (shorter inputs are padded to that length).
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen=text_seq_length, truncating='pre')

    # `predict_classes` is deprecated; take the argmax of the predicted
    # distribution instead, as the deprecation notice recommends.
    probabilities = model.predict(encoded)
    predicted_index = int(np.argmax(probabilities, axis=-1)[0])

    # Direct index -> word lookup instead of scanning the whole vocabulary.
    # An index without a word yields an empty string, as before.
    predicted_word = tokenizer.index_word.get(predicted_index, '')

    # Feed the prediction back in so it becomes context for the next word.
    seed_text = seed_text + ' ' + predicted_word
    text.append(predicted_word)
  return ' '.join(text)

In [141]:
# Generate 10 words that continue the seed text, using the LSTM model
generate_text_seq(model, tokenizer, seq_length, seed_text, 10)

'preposterously be stained to leave to make me leave to'

## Build the GRU Model

In [110]:
# Word-level GRU language model: embedding -> two stacked GRUs -> dropout -> dense head
model_2 = Sequential([
  # Embedding layer: one 50-dimensional vector per word index
  Embedding(input_dim=vocab_size, output_dim=50, input_length=seq_length),
  # First GRU layer returns the whole sequence so the next GRU can consume it
  GRU(units=100, return_sequences=True),
  # Second GRU layer, followed by dropout on its output
  GRU(units=100),
  Dropout(0.2),
  # Dense layer
  Dense(units=200, activation='relu'),
  # Final layer: softmax over the vocabulary
  Dense(units=vocab_size, activation='softmax'),
])
model_2.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 50, 50)            650450    
_________________________________________________________________
gru_18 (GRU)                 (None, 50, 100)           45600     
_________________________________________________________________
gru_19 (GRU)                 (None, 100)               60600     
_________________________________________________________________
dropout_12 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 200)               20200     
_________________________________________________________________
dense_20 (Dense)             (None, 13009)             2614809   
Total params: 3,391,659
Trainable params: 3,391,659
Non-trainable params: 0
___________________________________________

In [111]:
# Compile the model
# categorical_crossentropy is used because the targets in y are one-hot encoded
model_2.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
# Train the model
# NOTE(review): no validation data is passed to fit, so the reported accuracy
# is measured on the training data only.
model_2.fit(x,y, batch_size=256,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x1e448bf6bb0>

In [114]:
# Generate 10 words that continue the seed text, using the GRU model
generate_text_seq(model_2, tokenizer, seq_length, seed_text, 10)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


'preposterously be stained to leave for nothing so unkind to'