<a href="https://colab.research.google.com/github/sumanyurosha/tensorflow-specialization/blob/master/Practice/Practicing_Text_Generation_with_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
import time

# **1. Downloading the Dataset**

In [2]:
path_to_file = keras.utils.get_file("shakespeare.txt", "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt")

text_data = open(path_to_file, mode="rb", ).read().decode(encoding="utf-8")
print("Characters in dataset : {}".format(len(text_data)))

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
Characters in dataset : 1115394


In [3]:
print(text_data[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



# **2. Data Preprocessing**

In [4]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(char_level=True)

tokenizer.fit_on_texts(text_data)

In [5]:
for i, item in enumerate(tokenizer.word_index):
    print("{}: {}".format(i+1, repr(item)))

1: ' '
2: 'e'
3: 't'
4: 'o'
5: 'a'
6: 'i'
7: 'h'
8: 's'
9: 'r'
10: 'n'
11: '\n'
12: 'l'
13: 'd'
14: 'u'
15: 'm'
16: 'y'
17: 'w'
18: ','
19: 'c'
20: 'f'
21: 'g'
22: 'b'
23: 'p'
24: ':'
25: 'k'
26: 'v'
27: '.'
28: "'"
29: ';'
30: '?'
31: '!'
32: '-'
33: 'j'
34: 'q'
35: 'x'
36: 'z'
37: '3'
38: '&'
39: '$'


In [6]:
max_len = len(tokenizer.word_index)
seq_length = 100
batch_size = 32
vocab_size = max_len
embedding_dim = 16

In [19]:
[sequences] = np.array(tokenizer.texts_to_sequences([text_data])) - 1
sequences[:5]

array([19,  5,  8,  7,  2])

In [20]:
x = sorted(set(sequences))
print(x)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]


In [21]:
dataset = tf.data.Dataset.from_tensor_slices(sequences)
dataset = dataset.batch(seq_length+1, drop_remainder=True)
dataset

<BatchDataset shapes: (101,), types: tf.int64>

In [22]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    output_text = chunk[1:]
    return input_text, output_text

dataset = dataset.map(split_input_target)

In [26]:
for input_text, output_text in dataset.take(3):
    print(tokenizer.sequences_to_texts([input_text.numpy() + 1]))
    print(tokenizer.sequences_to_texts([output_text.numpy() + 1]))
    print()

['f i r s t   c i t i z e n : \n b e f o r e   w e   p r o c e e d   a n y   f u r t h e r ,   h e a r   m e   s p e a k . \n \n a l l : \n s p e a k ,   s p e a k . \n \n f i r s t   c i t i z e n : \n y o u']
['i r s t   c i t i z e n : \n b e f o r e   w e   p r o c e e d   a n y   f u r t h e r ,   h e a r   m e   s p e a k . \n \n a l l : \n s p e a k ,   s p e a k . \n \n f i r s t   c i t i z e n : \n y o u  ']

['a r e   a l l   r e s o l v e d   r a t h e r   t o   d i e   t h a n   t o   f a m i s h ? \n \n a l l : \n r e s o l v e d .   r e s o l v e d . \n \n f i r s t   c i t i z e n : \n f i r s t ,   y o u  ']
['r e   a l l   r e s o l v e d   r a t h e r   t o   d i e   t h a n   t o   f a m i s h ? \n \n a l l : \n r e s o l v e d .   r e s o l v e d . \n \n f i r s t   c i t i z e n : \n f i r s t ,   y o u   k']

["n o w   c a i u s   m a r c i u s   i s   c h i e f   e n e m y   t o   t h e   p e o p l e . \n \n a l l : \n w e   k n o w ' t ,   w e   k n o w ' t . \

In [27]:
dataset

<MapDataset shapes: ((100,), (100,)), types: (tf.int64, tf.int64)>

In [28]:
dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)

dataset

<BatchDataset shapes: ((32, 100), (32, 100)), types: (tf.int64, tf.int64)>

In [29]:
for input, target in dataset.take(1):
    print(input)
    print(target)

tf.Tensor(
[[ 8  0  6 ...  2  5  3]
 [ 5  9  2 ...  5 18  5]
 [17  0  5 ...  8  1  4]
 ...
 [ 0 14  5 ...  4  8  0]
 [ 4 11  0 ... 25  1  0]
 [18 11  4 ...  3 12  0]], shape=(32, 100), dtype=int64)
tf.Tensor(
[[ 0  6  3 ...  5  3  9]
 [ 9  2  3 ... 18  5  9]
 [ 0  5  0 ...  1  4  2]
 ...
 [14  5  9 ...  8  0 21]
 [11  0  2 ...  1  0 16]
 [11  4 13 ... 12  0  5]], shape=(32, 100), dtype=int64)


# **Creating a Model**

In [30]:
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
    keras.layers.LSTM(128, return_sequences=True),
    keras.layers.LSTM(64, return_sequences=True),
    keras.layers.Dense(vocab_size)
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

In [31]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (32, None, 16)            624       
_________________________________________________________________
lstm_2 (LSTM)                (32, None, 128)           74240     
_________________________________________________________________
lstm_3 (LSTM)                (32, None, 64)            49408     
_________________________________________________________________
dense_1 (Dense)              (32, None, 39)            2535      
Total params: 126,807
Trainable params: 126,807
Non-trainable params: 0
_________________________________________________________________


In [32]:
for input_example, output_example in dataset.take(1):
    predicted_example = model(input_example)

print(predicted_example.shape)

(32, 100, 39)


In [33]:
model.fit(dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f3e18e37ac8>

In [36]:
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=seq_length),
    keras.layers.LSTM(128, return_sequences=True),
    keras.layers.LSTM(64, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(vocab_size,
                                                    activation="softmax"))
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 16)           624       
_________________________________________________________________
lstm_6 (LSTM)                (None, 100, 128)          74240     
_________________________________________________________________
lstm_7 (LSTM)                (None, 100, 64)           49408     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 100, 39)           2535      
Total params: 126,807
Trainable params: 126,807
Non-trainable params: 0
_________________________________________________________________


In [37]:
model.fit(dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f3e18135a90>

In [57]:
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
    keras.layers.GRU(128, return_sequences=True, stateful=True),
    keras.layers.GRU(128, return_sequences=True, stateful=True),
    keras.layers.TimeDistributed(keras.layers.Dense(vocab_size,
                                                    activation="softmax"))
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

model.fit(dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f3de48f34e0>

In [58]:
for input_example, output_example in dataset.take(1):
    predicted_example = model(input_example)

print(predicted_example.shape)

(32, 100, 39)


In [59]:
prediction = tf.random.categorical(predicted_example[0], num_samples=1)
prediction = tf.squeeze(prediction, axis=-1)
prediction

<tf.Tensor: shape=(100,), dtype=int64, numpy=
array([24, 12, 19,  5,  4,  2, 26,  3,  7, 25,  4,  8,  6, 31, 17, 17, 26,
       23,  8, 20,  1, 13, 11, 27, 24, 13,  3,  8, 17, 34, 36, 24, 17,  0,
       24, 22,  2, 11, 38, 17, 28,  4, 14, 14, 28,  4,  3, 23, 33, 38, 34,
        7, 10, 21,  8,  0, 16, 14, 11, 12, 38, 34, 32, 17, 14, 30, 27, 11,
       20,  2, 17, 21, 19, 36, 17, 19, 28, 10,  8, 26, 20, 21, 31,  2,  5,
        9, 35, 30,  8, 10, 38,  1,  9,  9, 23, 35, 13, 32,  3,  0])>

In [60]:
[prediction] = tokenizer.sequences_to_texts([prediction.numpy() + 1])
print(prediction)

k d f i a t . o s v a r h - , , . : r g e u l ' k u o r , x 3 k ,   k p t l $ , ; a m m ; a o : q $ x s 
 b r   w m l d $ x j , m ! ' l g t , b f 3 , f ; 
 r . g b - t i n z ! r 
 $ e n n : z u j o  


In [61]:
def generate_text(model, start_string):

    # number of characters to generate
    text_length = 1000

    # vectorize the start_string
    input_sequence = np.array(tokenizer.texts_to_sequences([start_string])) - 1

    text_generated = []
    # some comment related to temparature
    temparature = 1.0

    for i in range(text_length):
        predictions = model(input_sequence)
        # removing the batch dimension since the batch size == 1
        predictions = tf.squeeze(predictions, 0)

        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0]

        input_sequence = tf.expand_dims([predicted_id], 0)

        [generated] = tokenizer.sequences_to_texts([predicted_id.numpy() + 1])
        text_generated.append(generated)

        return (start_string + "".join(text_generated))

In [62]:
print(generate_text(model, u"ROMEO: "))

InvalidArgumentError: ignored