<a href="https://colab.research.google.com/github/sumanyurosha/tensorflow-specialization/blob/master/Practice/Practicing_Text_Generation_one_more_time.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np
import os
from tensorflow import keras

# **Downloading the Dataset**

In [9]:
shakespeare_url = "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt"
filepath = keras.utils.get_file("shakespeare", shakespeare_url)

with open(filepath, "r") as f:
    text_data = f.read()

print("The size of Dataset is :{}".format(len(text_data)))

The size of Dataset is :1115394


# **Analyzing the Dataset**

In [10]:
print(text_data[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [11]:
vocab = sorted(set(text_data))
print("Unique characters in the Dataset are : {}".format(len(vocab)))

Unique characters in the Dataset are : 65


In [17]:
for char, i in zip(vocab, range(20)):
    print("{} : {}".format(i+1, repr(char)))

1 : '\n'
2 : ' '
3 : '!'
4 : '$'
5 : '&'
6 : "'"
7 : ','
8 : '-'
9 : '.'
10 : '3'
11 : ':'
12 : ';'
13 : '?'
14 : 'A'
15 : 'B'
16 : 'C'
17 : 'D'
18 : 'E'
19 : 'F'
20 : 'G'


# **Creating a Vocabulary out of the Dataset**

In [86]:
char2idx = {char:i for i, char in enumerate(vocab)}
idx2char = np.array(vocab)

# converting text into a sequence of integers
text_as_int = ap.array([char2idx[char] for char in text_data])

print(repr(text_data[:13]))
print(text_as_int[:13])

NameError: ignored

# **Setting Hyperparameters**

In [87]:
vocab_size = len(vocab)
batch_size = 32
seq_len = 100
embedding_dim = 256
rnn_units = 1024

# **Creating a Dataset for our Model**

In [88]:
# creating a Dataset with batches equal to sequence length + 1 (+ 1 for target)

dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
dataset = dataset.batch(seq_len + 1, drop_remainder=True)

# shape should be (101, )
dataset

<BatchDataset shapes: (101,), types: tf.int32>

# **Creating Input and Target Values for our Model**

In [89]:
def seperate_input_target(chunk):
    # all except the last
    input_text = chunk[:-1]
    # all except the first
    output_text = chunk[1:]

    return input_text, output_text


In [90]:
dataset = dataset.map(seperate_input_target)

# shape should be (100, ), (100, )
dataset

<MapDataset shapes: ((100,), (100,)), types: (tf.int32, tf.int32)>

# **Checking what our model will see**

In [91]:
for input, output in dataset.take(1):
    print("Input : {}".format(repr("".join([idx2char[char] for char in input]))))
    print()
    print("Output: {}".format(repr("".join([idx2char[char] for char in output]))))

Input : 'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

Output: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '


# **Creating batches of our Dataset**

In [92]:
dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)

# shape should be (32, 100), (32, 100)
dataset

<BatchDataset shapes: ((32, 100), (32, 100)), types: (tf.int32, tf.int32)>

# **Creating a Model for our Dataset**

In [106]:
def build_model(batch_size, rnn_units, vocab_size, embedding_dim):

    model = keras.models.Sequential([
        keras.layers.Embedding(vocab_size, embedding_dim, 
                               batch_input_shape=[batch_size,None]),
        keras.layers.Dropout(0.3),
        keras.layers.LSTM(rnn_units, return_sequences=True),
        keras.layers.LSTM(rnn_units, return_sequences=True),
        keras.layers.Dropout(0.3),
        keras.layers.TimeDistributed(keras.layers.Dense(vocab_size,
                                                        activation="softmax"))
    ])

    return model

In [107]:
model = build_model(batch_size=batch_size, 
                    rnn_units=rnn_units,
                    vocab_size=vocab_size,
                    embedding_dim=embedding_dim)

model.summary()

model.compile(loss="sparse_categorical_crossentropy",
              optimizer="adam")

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (32, None, 256)           16640     
_________________________________________________________________
dropout_12 (Dropout)         (32, None, 256)           0         
_________________________________________________________________
lstm_12 (LSTM)               (32, None, 1024)          5246976   
_________________________________________________________________
lstm_13 (LSTM)               (32, None, 1024)          8392704   
_________________________________________________________________
dropout_13 (Dropout)         (32, None, 1024)          0         
_________________________________________________________________
time_distributed_6 (TimeDist (None, None, 65)          66625     
Total params: 13,722,945
Trainable params: 13,722,945
Non-trainable params: 0
__________________________________________

# **Checking the Model beahviour and Output shape**

In [130]:
for input, output in dataset.take(1):  
    predicted_example = model(input)
    print(predicted_example.shape)

(32, 100, 65)


In [131]:
predicted_example[0][0]

<tf.Tensor: shape=(65,), dtype=float32, numpy=
array([3.5938262e-03, 3.0688548e-02, 4.6976193e-04, 5.3626401e-05,
       3.4623543e-05, 1.1225550e-03, 5.4782006e-04, 3.0121815e-05,
       2.5461137e-03, 2.2743354e-04, 2.1512891e-01, 3.8794577e-04,
       8.3608821e-04, 6.8289451e-02, 2.7665282e-03, 8.9873141e-03,
       2.9536379e-03, 2.2809071e-02, 5.5330904e-04, 1.5011266e-03,
       5.2522095e-03, 6.9116496e-02, 3.6170695e-05, 4.0671337e-04,
       4.1097519e-03, 6.5891113e-04, 2.0375688e-05, 2.2689320e-02,
       5.0640851e-04, 1.8345378e-04, 1.0770534e-03, 7.1407743e-03,
       8.2177348e-02, 3.4019724e-02, 7.1926817e-04, 4.1525549e-04,
       2.5272454e-04, 1.3994767e-03, 5.5581291e-04, 2.5084388e-02,
       2.9218950e-06, 5.2371621e-03, 3.1635744e-05, 1.4636083e-01,
       7.7865716e-06, 2.2307000e-05, 4.5928132e-02, 7.0654839e-02,
       3.3628818e-04, 7.0038222e-04, 1.7287569e-04, 6.5381249e-04,
       1.7150949e-05, 8.3535679e-02, 1.8570747e-03, 9.1692130e-04,
       3.428306

In [132]:
# select a sequence from the batch
# here predicted_example[0] has a shape (seq_len, vocab_size)
# where row[i, :] contains prob for each word in the vocab for the ith word in the sequence
prediction = tf.random.categorical(predicted_example[0], num_samples=1)
prediction = tf.squeeze(prediction, axis=-1)
prediction

<tf.Tensor: shape=(100,), dtype=int64, numpy=
array([51, 43, 26, 21, 13,  5, 58, 36, 23,  7, 22, 45, 12, 48, 57, 42, 30,
       64, 24, 34,  1,  1, 60, 32, 27, 61, 47, 45, 50, 12, 58,  7,  1, 24,
       54, 45, 10, 18, 18, 57,  4, 41, 25, 12, 51, 20,  5, 52, 38, 39, 63,
        2,  1, 18, 59, 38, 63, 35, 51, 31, 43, 46, 52, 54, 50, 17,  5, 62,
       13, 40, 17,  6, 49, 16, 33, 18, 27, 19, 55, 15,  6, 40, 36, 25, 17,
       63, 44, 48, 51, 64, 53, 45, 13,  1, 20, 56, 57,  1, 42, 46])>

In [133]:
# we should expect very random sequence because the model is not trained yet
print(repr("".join([idx2char[c] for c in prediction])))

"meNIA'tXK-Jg?jsdRzLV  vTOwigl?t- Lpg:FFs&cM?mH'nZay! FuZyWmSehnplE'xAbE,kDUFOGqC,bXMEyfjmzogA Hrs dh"


# **Creating a Checkpoint Callback for saving our Model**

In [118]:
checkpoint_dir = "/checkpoints"

checkpoint_prefix = os.path.join(checkpoint_dir, "{epoch}")

checkpoint_callback = keras.callbacks.ModelCheckpoint(checkpoint_prefix,
                                                      save_weights_only=True)

# **Training our Model**

In [119]:
history = model.fit(dataset, epochs=30, callbacks=[checkpoint_callback], verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# **Building a new model with saved weights but different batch size for Text Generation**

In [120]:
tf.train.latest_checkpoint(checkpoint_dir)

'/checkpoints/30'

In [121]:
model = build_model(batch_size=1, vocab_size=vocab_size,
                    embedding_dim=embedding_dim, rnn_units=rnn_units)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# here the batch_size should be 1
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
dropout_14 (Dropout)         (1, None, 256)            0         
_________________________________________________________________
lstm_14 (LSTM)               (1, None, 1024)           5246976   
_________________________________________________________________
lstm_15 (LSTM)               (1, None, 1024)           8392704   
_________________________________________________________________
dropout_15 (Dropout)         (1, None, 1024)           0         
_________________________________________________________________
time_distributed_7 (TimeDist (None, None, 65)          66625     
Total params: 13,722,945
Trainable params: 13,722,945
Non-trainable params: 0
__________________________________________

In [128]:
def generate_text(model, start_string):

    text_length = 1000
    text_generated = []

    input_sequence = np.array([char2idx[c] for c in start_string])
    input_sequence = tf.expand_dims(input_sequence, 0)

    for i in range(text_length):
        prediction = model(input_sequence)

        prediction = tf.squeeze(prediction, 0)
        prediction_id = tf.random.categorical(prediction, num_samples=1)[-1, 0].numpy()

        input_sequence = tf.expand_dims([prediction_id], 0)

        text_generated.append(idx2char[prediction_id])

    return start_string + "".join(text_generated)


In [129]:
print(generate_text(model, "Romeo:"))

Romeo:VHuR
yyMFguWUiU-jyJmqy3kjJdHeGIc,qfcbxfx$iikeeTBk'nwqfEQzovmXxdZnCN-G?z;MAqmx$AO:Mcp'-L;vqBcBsZNd ua;JUiIRY.q
k
LzborRim'dOwpGrcklt$&&i-xW
Y!d
3pHt:sfzOp$,vby'I:UAqD&-xwTRCvAGprUjEXI33Zj-nwVsjiyBu&!WYcvb:hOlR&I ?jzBc?mxAvg?WcqNFjNjhQPa

Aok ZM,yDYuiqw:dMpW
qunjSRNGw3$MeyBvkaSb:aWhSiwFFps.p
vFK!Su .QDgd&zWauAyqugm,kr
JF!R&noAZ.Ss:.uXo!NPfKrdvErJQFL:Q,s 
?
.KR:SjBeFJNfdmC'$3bHFVYr!cMdXO,ncv lna.DaA.CetN'?3rb&bo?eeaCUjmIimJLq.SvnZX'xrGrN' :M'MnR3qElDRDmgVhk!G
TN'sqBYwD3we&J oHeeCJ& CprfnINgzGlpcW' '3:'YLaBa
ty!,QCVR!Fxid'DiPSdJZCDzeXBpQYFRodlgFAzxm?$lSO'eqM!uWd,jotfP?MywIiaXbgc
b3rVzeMMV3GgqNAovvRldT&,IUSIyrfmkYiXt-ahchsaCGzMqTzlstBX$S3,jx;Yq
n.;ZOOVEbRmf,.,S?CCChlXsio:nRLlugp33f
ERHqpgJ,PjfqNVA&FT
Rkd&,QRTJEA!K-b!eTW;:TaHqXbLgO,Qo?P??LChOCC:NN:L;-thlOX'Qze
Ig?WDO
-XaoO ExeU'zCa-B-ynr!qFOuE
RgwSGO&U3
'iFo,zrH
VvHdlMq'D&IaAmuxylyPP-H&InAJrUW?fG$&NjAwdl.;;PbqW3dnVA$yDekF??3jGp-UVR!nR!XhRNfNdDyLJvFQJx!LO;a RV;HE?&gbPzSM-cXdMQbXK$K
Eo$r RpYOeHmHndph'Hv!WcxS'AeWGE'Dul?t?txIk-herkJrhTon'U