<a href="https://colab.research.google.com/github/sumanyurosha/tensorflow-specialization/blob/master/Practice/Practicing_Text_Generation_one_more_time.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
import os
from tensorflow import keras

# **Downloading the Dataset**

In [9]:
shakespeare_url = "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt"
filepath = keras.utils.get_file("shakespeare", shakespeare_url)

with open(filepath, "r") as f:
    text_data = f.read()

print("The size of Dataset is :{}".format(len(text_data)))

The size of Dataset is :1115394


# **Analyzing the Dataset**

In [10]:
print(text_data[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [11]:
vocab = sorted(set(text_data))
print("Unique characters in the Dataset are : {}".format(len(vocab)))

Unique characters in the Dataset are : 65


In [17]:
for char, i in zip(vocab, range(20)):
    print("{} : {}".format(i+1, repr(char)))

1 : '\n'
2 : ' '
3 : '!'
4 : '$'
5 : '&'
6 : "'"
7 : ','
8 : '-'
9 : '.'
10 : '3'
11 : ':'
12 : ';'
13 : '?'
14 : 'A'
15 : 'B'
16 : 'C'
17 : 'D'
18 : 'E'
19 : 'F'
20 : 'G'


# **Creating a Vocabulary out of the Dataset**

In [147]:
char2idx = {char:i for i, char in enumerate(vocab)}
idx2char = np.array(vocab)

# converting text into a sequence of integers
text_as_int = np.array([char2idx[char] for char in text_data])

print(repr(text_data[:13]))
print(text_as_int[:13])

'First Citizen'
[18 47 56 57 58  1 15 47 58 47 64 43 52]


# **Setting Hyperparameters**

In [87]:
vocab_size = len(vocab)
batch_size = 32
seq_len = 100
embedding_dim = 256
rnn_units = 1024

# **Creating a Dataset for our Model**

In [88]:
# creating a Dataset with batches equal to sequence length + 1 (+ 1 for target)

dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
dataset = dataset.batch(seq_len + 1, drop_remainder=True)

# shape should be (101, )
dataset

<BatchDataset shapes: (101,), types: tf.int32>

# **Creating Input and Target Values for our Model**

In [89]:
def seperate_input_target(chunk):
    # all except the last
    input_text = chunk[:-1]
    # all except the first
    output_text = chunk[1:]

    return input_text, output_text


In [90]:
dataset = dataset.map(seperate_input_target)

# shape should be (100, ), (100, )
dataset

<MapDataset shapes: ((100,), (100,)), types: (tf.int32, tf.int32)>

# **Checking what our model will see**

In [91]:
for input, output in dataset.take(1):
    print("Input : {}".format(repr("".join([idx2char[char] for char in input]))))
    print()
    print("Output: {}".format(repr("".join([idx2char[char] for char in output]))))

Input : 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

Output: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


# **Creating batches of our Dataset**

In [92]:
dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)

# shape should be (32, 100), (32, 100)
dataset

<BatchDataset shapes: ((32, 100), (32, 100)), types: (tf.int32, tf.int32)>

# **Creating a Model for our Dataset**

In [148]:
def build_model(batch_size, rnn_units, vocab_size, embedding_dim):

    model = keras.models.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim, 
                               batch_input_shape=[batch_size,None]),
        keras.layers.LSTM(rnn_units, return_sequences=True),
        keras.layers.TimeDistributed(keras.layers.Dense(vocab_size,
         activation="softmax"))
    ])

    return model

In [135]:
def loss(labels, logits):
    return keras.losses.sparse_categorical_crossentropy( labels, logits,
                                                        from_logits=True,)

In [149]:
model = build_model(batch_size=batch_size, 
                    rnn_units=rnn_units,
                    vocab_size=vocab_size,
                    embedding_dim=embedding_dim)

model.summary()

model.compile(loss="sparse_categorical_crossentropy",
              optimizer="adam")

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (32, None, 256)           16640     
_________________________________________________________________
lstm_16 (LSTM)               (32, None, 1024)          5246976   
_________________________________________________________________
time_distributed_8 (TimeDist (None, None, 65)          66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


# **Checking the Model beahviour and Output shape**

In [160]:
for input, output in dataset.take(1):  
    predicted_example = model(input)
    print(predicted_example.shape)

(32, 100, 65)


In [161]:
predicted_example[0][0]

<tf.Tensor: shape=(65,), dtype=float32, numpy=
array([1.8489206e-02, 2.9586709e-01, 1.9500013e-02, 8.3192060e-07,
       1.0284434e-06, 8.0131441e-03, 4.8147071e-02, 4.2435517e-05,
       1.2373032e-02, 7.1150034e-06, 1.8079085e-02, 1.0799175e-02,
       6.5165604e-03, 8.0820973e-07, 3.0882195e-08, 6.2296683e-09,
       2.8834386e-08, 7.0053567e-08, 4.8825888e-07, 1.5504098e-08,
       1.7830434e-09, 5.8081209e-06, 7.9266854e-08, 1.4234587e-07,
       1.4980741e-06, 5.6981753e-07, 9.1497044e-05, 6.6821769e-07,
       7.5054082e-08, 3.7964909e-07, 9.3837247e-07, 9.9987255e-06,
       2.2691697e-06, 1.1426713e-07, 3.6605554e-06, 7.4559679e-08,
       1.7281325e-05, 1.7665116e-08, 3.0462110e-07, 4.7049444e-02,
       1.8218769e-05, 9.1463597e-03, 3.0167127e-02, 4.7915582e-02,
       5.7611475e-03, 4.7673788e-03, 1.6211727e-07, 7.8390073e-03,
       1.5253588e-06, 7.7585474e-04, 2.5885260e-02, 9.9517563e-03,
       6.9462299e-02, 5.2193092e-05, 8.1354994e-03, 1.4053859e-03,
       1.624410

In [165]:
# select a sequence from the batch
# here predicted_example[0] has a shape (seq_len, vocab_size)
# where row[i, :] contains prob for each word in the vocab for the ith word in the sequence
prediction = tf.random.categorical(predicted_example[0], num_samples=1)
prediction = np.argmax(predicted_example[0], axis=1)
#prediction = tf.squeeze(prediction, axis=-1)
prediction

array([ 1,  1, 21, 53, 40, 43,  1, 47,  5,  1, 58, 46, 43,  1, 61, 53, 56,
       50, 42,  6,  0, 31, 53,  1, 57, 47, 53, 52,  1, 39, 57,  1, 63, 53,
       59, 56, 57,  1, 41, 53, 52, 50, 42,  1, 44, 47, 52,  1, 51, 43, 10,
        1, 57, 53,  1, 47, 58,  1, 57, 46, 53, 59, 50, 42,  1, 52, 53, 58,
        6,  0, 35, 43, 56, 43,  1, 58, 46, 43, 56, 43, 47, 52, 43, 61, 43,
       57, 57, 47, 58, 63,  1, 47, 52,  1, 63, 53, 59, 56,  1, 45])

In [166]:
prediction.shape

(100,)

In [167]:
# we should expect very random sequence because the model is not trained yet
print(repr("".join([idx2char[c] for c in prediction])))

"  Iobe i' the world,\nSo sion as yours conld fin me: so it should not,\nWere thereinewessity in your g"


# **Creating a Checkpoint Callback for saving our Model**

In [154]:
checkpoint_dir = "/checkpoints"

checkpoint_prefix = os.path.join(checkpoint_dir, "{epoch}")

checkpoint_callback = keras.callbacks.ModelCheckpoint(checkpoint_prefix,
                                                      save_weights_only=True)

# **Training our Model**

In [155]:
history = model.fit(dataset, epochs=30, callbacks=[checkpoint_callback], verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# **Building a new model with saved weights but different batch size for Text Generation**

In [156]:
tf.train.latest_checkpoint(checkpoint_dir)

'/checkpoints/30'

In [157]:
model = build_model(batch_size=1, vocab_size=vocab_size,
                    embedding_dim=embedding_dim, rnn_units=rnn_units)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# here the batch_size should be 1
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (1, None, 256)            16640     
_________________________________________________________________
lstm_17 (LSTM)               (1, None, 1024)           5246976   
_________________________________________________________________
time_distributed_9 (TimeDist (None, None, 65)          66625     
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [186]:
def generate_text(model, start_string):

    text_length = 1000
    text_generated = []

    input_sequence = np.array([char2idx[c] for c in start_string])
    input_sequence = tf.expand_dims(input_sequence, 0)

    for i in range(text_length):
        prediction = model(input_sequence)
        # prediction = tf.squeeze(prediction, 0)
        # prediction_id = tf.random.categorical(prediction, num_samples=1)[-1, 0].numpy()

        prediction = np.argmax(prediction, axis=-1)
        prediction_id = prediction[-1, 0]
        input_sequence = tf.expand_dims([prediction_id], 0)

        text_generated.append(idx2char[prediction_id])
        

    return start_string + "".join(text_generated)


In [187]:
print(generate_text(model, "Romeo:"))

Romeo:I the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the 