<a href="https://colab.research.google.com/github/unniths/Text-Generator-RNN/blob/master/Text_generation_with_an_RNN_with_comments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Shiva Unnithan
# Text generation with an RNN

In [0]:
import tensorflow as tf

import numpy as np
import os
import time

In [2]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [3]:
# Read the corpus as raw bytes and decode it explicitly as UTF-8 (the same
# code path works on Python 2 and 3). The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# len(text) counts decoded characters, not bytes
print ('Length of text: {} character'.format(len(text)))

Length of text: 1115394 character


In [4]:
print(text[:1000]) # printing the first 1000 characters

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
# The unique characters in the file
vocab = sorted(set(text))
print ('{} unique character'.format(len(vocab)))

65 unique character


In [0]:
# Forward lookup table: character -> integer id (ids follow the order of `vocab`)
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: integer id -> character, stored as an array so it can be indexed by id
idx2char = np.array(vocab)

# Encode the whole corpus as an array holding one integer id per character
text_as_int = np.array([char2idx[ch] for ch in text])

In [7]:
# Preview the first 20 entries of the character -> id table, laid out like a dict literal.
# Slicing the key list avoids binding a throwaway `_` at notebook top level, which
# would shadow IPython's "last output" variable for the rest of the session.
print('{')
for char in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char])) # repr() makes '\n' and ' ' visible
print('  ...\n}')

{
  '\n':   0,
  ' ' :   1,
  '!' :   2,
  '$' :   3,
  '&' :   4,
  "'" :   5,
  ',' :   6,
  '-' :   7,
  '.' :   8,
  '3' :   9,
  ':' :  10,
  ';' :  11,
  '?' :  12,
  'A' :  13,
  'B' :  14,
  'C' :  15,
  'D' :  16,
  'E' :  17,
  'F' :  18,
  'G' :  19,
  ...
}


In [8]:
print('{} ---- characters mapped to int ----> {}'.format(repr(text[:13]), text_as_int[:13])) # Quick example of how First Citizen is mapped in integers using the method

'First Citizen' ---- characters mapped to int ----> [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [9]:
# The maximum length sentence we want for a single input in characters
seq_length = 1000 # number of input characters per training example
# Each example consumes seq_length+1 characters, hence the +1 in the divisor
examples_per_epoch = len(text)//(seq_length+1) 

# Create training examples / target
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int) # Converting the text vector into a stream of character indices

for i in char_dataset.take(5): # take the first 5 elements (character indices) from the dataset
    print(idx2char[i.numpy()]) # map each index back to its character with idx2char and print it

F
i
r
s
t


In [10]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True) # batch groups consecutive characters into chunks of seq_length+1; drop_remainder=True discards a final, shorter chunk

for item in sequences.take(5): # take(5) yields the first 5 chunks (it is not the chunk length)
    print(repr(''.join(idx2char[item.numpy()]))) 

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

In [0]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element except the last, and the target is every
    element except the first, so ``target[i]`` is the element that follows
    ``input[i]``. The ``-1`` and ``1`` below are plain slice bounds.

    Args:
        chunk: a sliceable sequence (here, a chunk of character ids).

    Returns:
        A tuple ``(input_text, target_text)``, each one element shorter than ``chunk``.
    """
    shifted_by_one = chunk[1:]   # drop the first element -> what should be predicted
    all_but_last = chunk[:-1]    # drop the last element  -> what the model is fed
    return all_but_last, shifted_by_one

dataset = sequences.map(split_input_target) # using map function to apply this method to each batch

In [12]:
# This prints the example input value and target value, which also shows the way the data works
for input_example, target_example in dataset.take(1): 
    print('Input data: ', repr(''. join(idx2char[input_example.numpy()])))
    print('Target data: ', repr(''.join(idx2char[target_example.numpy()]))) 

Input data:  "First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak th

In [13]:
# This part is showing how the model is predicting the next character. So the model first gets the index for F, and expects 'i'. When it puts the input of i, RNN means it would remember the previous results and continue with preds.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:10], target_example[:10])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 18 ('F')
  expected output: 47 ('i')
Step    1
  input: 47 ('i')
  expected output: 56 ('r')
Step    2
  input: 56 ('r')
  expected output: 57 ('s')
Step    3
  input: 57 ('s')
  expected output: 58 ('t')
Step    4
  input: 58 ('t')
  expected output: 1 (' ')
Step    5
  input: 1 (' ')
  expected output: 15 ('C')
Step    6
  input: 15 ('C')
  expected output: 47 ('i')
Step    7
  input: 47 ('i')
  expected output: 58 ('t')
Step    8
  input: 58 ('t')
  expected output: 47 ('i')
Step    9
  input: 47 ('i')
  expected output: 64 ('z')


## Create Training Batches
https://www.tensorflow.org/tutorials/text/text_generation#create_training_batches

In [14]:
# Number of sequences grouped into each training batch
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences, so it doesn't attempt to shuffle the entire sequence in memory.
# Instead, it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Rebuild the pipeline from `sequences` rather than re-wrapping `dataset`, so that
# re-running this cell does not shuffle and batch an already-batched dataset.
# On a fresh top-to-bottom run the result is the same as before.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 1000), (64, 1000)), types: (tf.int64, tf.int64)>

## Build The Model
https://www.tensorflow.org/tutorials/text/text_generation#build_the_model

In [15]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension 
embedding_dim = 256

# Number of RNN units 
rnn_units = 1024
print(vocab_size)

65


In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level model: Embedding -> GRU -> Dense.

    Args:
        vocab_size: number of distinct characters; sizes both the embedding
            table and the output layer.
        embedding_dim: length of the vector each character id is mapped to.
        rnn_units: number of units in the GRU layer.
        batch_size: fixed batch size baked into the input shape (the GRU is
            stateful, so the batch size is part of the model definition).

    Returns:
        A `tf.keras.Sequential` model whose output has one value per
        vocabulary entry at every time step (no activation is applied).
    """
    # Input layer: trainable lookup table from character id to a dense vector.
    # The sequence length is left as None so any length is accepted.
    char_embedding = tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=[batch_size, None])

    # Recurrent layer: a GRU (not an LSTM) that returns an output for every
    # time step and keeps its state between calls.
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')

    # Output layer: one unit per vocabulary entry.
    char_scores = tf.keras.layers.Dense(vocab_size)

    # Sequential groups the layers into a linear stack (single input, single output).
    model = tf.keras.Sequential([char_embedding, recurrent, char_scores])
    return model

In [0]:
model = build_model(
    vocab_size = len(vocab), # 65
    embedding_dim=embedding_dim, # using embedding_dim, which we declared as 256
    rnn_units=rnn_units, # using rnn_units, which was declared as 1024
    batch_size=BATCH_SIZE) # using BATCH_SIZE, which was declared as 64

## Try the Model
https://www.tensorflow.org/tutorials/text/text_generation#try_the_model

In [18]:
#Testing the model
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 1000, 65) # (batch_size, sequence_length, vocab_size)


In [19]:
model.summary() # summary to show every individual layer of the model

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [0]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1) # sample one character id per time step from the predictions of the first example in the batch
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy() # drop the last axis (size 1, from num_samples=1) and convert to a NumPy array

In [21]:
sampled_indices

array([22, 60,  0, 49, 54, 50, 48, 61, 23, 49, 25, 13,  9, 43, 64, 21, 26,
       23, 46, 51, 21, 42, 20, 34, 55,  5, 50, 47, 37, 64,  7, 49, 16, 24,
       64, 44, 62,  8, 57, 47,  3, 14, 54, 22, 61, 22, 41, 47, 35, 47, 35,
       32,  9,  8, 45,  3, 19, 19, 25, 25, 43, 23, 29, 59, 44, 40,  6, 18,
       18, 22, 10, 58, 39,  5, 31, 10, 32, 47, 41, 57, 54, 61, 52,  4, 41,
       57,  8, 17, 62,  9, 38,  6, 51, 40,  4, 54,  1, 50, 14, 42,  4, 42,
       42, 64, 19, 32,  6, 54, 35, 57, 45, 23,  9, 16, 35, 21, 24,  7, 43,
       64, 43, 34, 32, 21,  2, 62, 30, 49, 49, 17, 12,  7,  1, 11, 37, 33,
       57, 61, 30, 15, 36, 37, 42, 37, 30, 62,  0,  6, 26, 62, 38, 42, 12,
        6, 12, 15,  2, 34, 29, 38, 41, 18,  0, 62, 33,  3, 31, 56, 11,  8,
       39, 51, 28, 39, 42, 32, 56,  3, 64, 49, 18, 23,  0, 22,  6, 11, 30,
       41, 50, 31, 30, 21,  3, 18,  5, 36, 11, 62, 39, 29,  5, 38, 10,  2,
        1, 49, 21, 12,  1,  7, 57, 52,  9, 61,  4, 44,  2, 43, 56, 31, 54,
       16, 56,  6, 18, 48

In [22]:
print("Input: \n", repr("".join(idx2char[input_example_batch[0]]))) # The regular input batch 
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ]))) # The predictions made by the model W/O training 

Input: 
 "To blazon it, then sweeten with thy breath\nThis neighbour air, and let rich music's tongue\nUnfold the imagined happiness that both\nReceive in either by this dear encounter.\n\nJULIET:\nConceit, more rich in matter than in words,\nBrags of his substance, not of ornament:\nThey are but beggars that can count their worth;\nBut my true love is grown to such excess\nI cannot sum up sum of half my wealth.\n\nFRIAR LAURENCE:\nCome, come with me, and we will make short work;\nFor, by your leaves, you shall not stay alone\nTill holy church incorporate two in one.\n\nBENVOLIO:\nI pray thee, good Mercutio, let's retire:\nThe day is hot, the Capulets abroad,\nAnd, if we meet, we shall not scape a brawl;\nFor now, these hot days, is the mad blood stirring.\n\nMERCUTIO:\nThou art like one of those fellows that when he\nenters the confines of a tavern claps me his sword\nupon the table and says 'God send me no need of\nthee!' and by the operation of the second cup draws\nit on the drawer

## Train the Model
https://www.tensorflow.org/tutorials/text/text_generation#train_the_model

In [23]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Args:
        labels: integer class ids (here, target character ids).
        logits: raw, un-normalized model predictions; from_logits=True tells
            Keras that they have not been passed through a softmax.

    Returns:
        The un-reduced loss values (callers average them themselves).
    """
    unreduced_loss = tf.keras.losses.sparse_categorical_crossentropy(
        labels, logits, from_logits=True)
    return unreduced_loss

example_batch_loss = loss(target_example_batch, example_batch_predictions) 
print("Prediction shape: ", example_batch_predictions.shape, "# (base_size, sequence_length, vocab_size)")
print("scalar_loss:       ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 1000, 65) # (base_size, sequence_length, vocab_size)
scalar_loss:        4.174617


In [0]:
model.compile(optimizer='adam', loss=loss) # configure training with the Adam optimizer and the custom loss function defined above

## Configure Checkpoints
https://www.tensorflow.org/tutorials/text/text_generation#configure_checkpoints

In [0]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoints files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}") #checkpoints are used to save specific points during the process of the model

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( # Save checkpoints at a specific frequency
    filepath=checkpoint_prefix,
    save_weights_only=True)

## Execute the Training
To keep training time reasonable, use 10 epochs to train the model. In Colab, set the runtime to GPU for faster training.

In [0]:
# Number of full passes over the training dataset. Training for more epochs generally lowers the loss and gives better results, but takes longer to run.
EPOCHS=10

In [27]:

history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback]) #fitting into the model the dataset, the epochs designated before (10), and the checkpoint callbacks previously mentioned.

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Generate Text
### Restore the Latest Checkpoint
https://www.tensorflow.org/tutorials/text/text_generation#generate_text

In [28]:
tf.train.latest_checkpoint(checkpoint_dir) # Going back to a specific checkpoint since the model can fit a specific batch size only

'./training_checkpoints/ckpt_10'

In [0]:
model = build_model(vocab_size, embedding_dim, rnn_units,batch_size=1) # to make the model with a different batch size, you would have to rebuild and restore weights from the checkpoint
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [30]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            16640     
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 65)             66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


## The Prediction Loop

In [0]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
  """Generate text one character at a time with the learned model.

  Args:
    model: stateful model built with batch size 1; called on a batch of
      character ids and expected to return per-step scores over the vocabulary.
    start_string: non-empty seed text. Every character must be a key of
      `char2idx` (an unknown character raises KeyError).
    num_generate: number of characters to generate after the seed.
    temperature: positive divisor applied to the predictions before sampling.
      Low temperatures result in more predictable text, higher temperatures
      in more surprising text. Experiment to find the best setting.

  Returns:
    `start_string` followed by the `num_generate` generated characters.

  Raises:
    ValueError: if `start_string` is empty or `temperature` is not positive.
  """
  if not start_string:
    # An empty seed gives the model a zero-length sequence, leaving nothing to sample from.
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  # Converting our start string to numbers (vectorizing)
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)  # add the batch dimension

  # List collecting the generated characters
  text_generated = []

  # Here batch size == 1. Start from a clean hidden state.
  model.reset_states()
  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # using a categorical distribution to predict the character returned by the model;
    # only the sample for the last time step ([-1, 0]) is kept
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    # We pass the predicted character as the next input to the model
    # along with previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [32]:
print(generate_text(model, start_string=u'ROMEO: ')) #small number of epochs leads the model to generate predictions that arent coherent sentences.

ROMEO: for:
Thells sis bute hang. Nust bety,
Soud our the saith claryound,
Bul-chat nole, leat to nor wtill hen my tofer if that your.

ETREBYI:
I whan hay fous fare, in I he. mus is nts se, theru.

LUCHIRD:
Me pay all di'g is wo mack.

USANS:
My, miseno'd for sith!
BEs Ior of trot ioth fit.

SINCHOR:
Ap thim, will is pade ale tienee merswions fonarin d;
char mo thece nows leate ou pror o frisp, than dow.
Hand fathy get keno,
Baded itiunt on 'd tofm wood kine hearSs comsembends;
His lood, foul ar Wart, I love, in hen woors arres thou heme
Toccuss, whares if duknd pey mapt tour.

ELHBUN:
If lord mirss, gally, with letide tillvild wnou 'park.

FOrCYORRZRY:
To, wir, then hesee ranty, sithter.

USFORA:
Ascomly, shepter pay netinges wrar wh tich;
Ond thes upatby, knaw, wead is dpies soy.

PAMEIT:
Aly lin fagringarn dook wxact dietet's wing;
Andukins is mis in beerid dist preawe you.

SAglingt wim,
Fat I all thee thee tan thy wich, That lisby;
Aes and fom of on and he to go tods ofe, hioks f

The easiest thing you can do to improve the results is to train the model for longer (try EPOCHS=30).

You can also experiment with a different start string, or try adding another RNN layer to improve the model's accuracy, or adjusting the temperature parameter to generate more or less random predictions.

## Advanced: Customized Training 

The above training procedure is simple, but does not give you much control.

So now that you've seen how to run the model manually, let's unpack the training loop and implement it ourselves. This gives a starting point if, for example, you want to implement curriculum learning to help stabilize the model's open-loop output.



In [0]:
model = build_model(
    vocab_size = len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE)

In [0]:
optimizer = tf.keras.optimizers.Adam()

In [0]:
@tf.function
def train_step(inp, target):
  """Run one optimization step on a single batch and return its mean loss.

  Args:
    inp: batch of input character ids, passed to the module-level `model`.
    target: batch of target character ids compared against the predictions.

  Returns:
    Scalar tensor: sparse categorical cross-entropy (from logits) averaged
    over every position in the batch.
  """
  # Only the forward pass and the loss need to be recorded by the tape.
  with tf.GradientTape() as tape:
    predictions = model(inp)
    batch_loss = tf.reduce_mean(
        tf.keras.losses.sparse_categorical_crossentropy(
            target, predictions, from_logits=True))

  # Differentiate and update the weights after leaving the tape context, so the
  # gradient and optimizer ops are not executed while the tape is active.
  grads = tape.gradient(batch_loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))

  return batch_loss


In [0]:
# Custom training loop
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  # Reset the model's recurrent state at the start of every epoch.
  # The return value was never used, so it is not kept.
  model.reset_states()

  for (batch_n, (inp, target)) in enumerate(dataset):
    # Named batch_loss (not `loss`) so this cell does not overwrite the
    # module-level `loss` function that model.compile() relies on.
    batch_loss = train_step(inp, target)

    if batch_n % 100 == 0:
      template = 'Epoch {} Batch {} Loss {}'
      print(template.format(epoch+1, batch_n, batch_loss))

  # saving (checkpoint) the model every 5 epochs; the file is labelled with the
  # 1-based epoch number, matching the progress messages printed below
  if (epoch + 1) % 5 == 0:
    model.save_weights(checkpoint_prefix.format(epoch=epoch+1))

  print ('Epoch {} Loss {:.4f}'.format(epoch+1, batch_loss))
  print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

# Always save the final weights, labelled with the total number of epochs run
model.save_weights(checkpoint_prefix.format(epoch=EPOCHS))

Epoch 1 Batch 0 Loss 4.175279140472412
Epoch 1 Batch 100 Loss 2.3285584449768066
Epoch 1 Loss 2.1420
Time taken for 1 epoch 8.299541711807251 sec

Epoch 2 Batch 0 Loss 2.10976505279541
Epoch 2 Batch 100 Loss 1.9365309476852417
Epoch 2 Loss 1.7947
Time taken for 1 epoch 7.230211496353149 sec

Epoch 3 Batch 0 Loss 1.7390222549438477
Epoch 3 Batch 100 Loss 1.6487317085266113
Epoch 3 Loss 1.5598
Time taken for 1 epoch 7.26666784286499 sec

Epoch 4 Batch 0 Loss 1.5677311420440674
Epoch 4 Batch 100 Loss 1.5128766298294067
Epoch 4 Loss 1.4948
Time taken for 1 epoch 7.295722961425781 sec

Epoch 5 Batch 0 Loss 1.481947422027588
Epoch 5 Batch 100 Loss 1.4040343761444092
Epoch 5 Loss 1.4222
Time taken for 1 epoch 7.355914354324341 sec

Epoch 6 Batch 0 Loss 1.3769409656524658
Epoch 6 Batch 100 Loss 1.4182506799697876
Epoch 6 Loss 1.4112
Time taken for 1 epoch 7.293524980545044 sec

Epoch 7 Batch 0 Loss 1.3176634311676025
Epoch 7 Batch 100 Loss 1.357419729232788
Epoch 7 Loss 1.3546
Time taken for 1