In [1]:
import tensorflow as tf

import tensorflow_datasets as tfds
import numpy as np
import os
import time

In [2]:
# Download the Shakespeare corpus (or reuse the locally cached copy) and keep its path
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

### Looking at the data

In [3]:
# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# closes the file handle even if reading or decoding fails.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print(f'Length of the text: {len(text)} characters')

Length of the text: 1115394 characters


In [4]:
# Preview the first 250 characters of the corpus
print(text[:250])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



In [5]:
# Distinct characters that occur in the corpus, in sorted order
vocabulary = sorted(set(text))
print('{} unique characters'.format(len(vocabulary)))

65 unique characters


### Processing the text

In [6]:
# Lookup tables: character -> integer id, and integer id -> character
char2idx = {char: position for position, char in enumerate(vocabulary)}
idx2char = np.array(vocabulary)

# Encode the whole corpus as an array of character ids
text_as_int = np.array([char2idx[char] for char in text])

In [7]:
text_as_int

array([18, 47, 56, ..., 45,  8,  0])

In [8]:
len(text_as_int)

1115394

In [9]:
# Show the integer encoding of the first 13 characters of the text
print(f'{text[:13]!r} ---- characters mapped to int ---- > {text_as_int[:13]}')

'First Citizen' ---- characters mapped to int ---- > [18 47 56 57 58  1 15 47 58 47 64 43 52]


In [10]:
# Maximum length, in characters, of a single input sequence
seq_length = 100
# Number of non-overlapping chunks of seq_length+1 characters in the corpus
# (each chunk carries one extra character so that the input and the target
# can be offset from each other by one position)
examples_per_epoch = len(text)//(seq_length+1)

In [11]:
# Stream of individual character ids, one dataset element per character
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Decode and print the first five characters as a sanity check
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])

F
i
r
s
t


The `batch` method lets us easily convert these individual characters to sequences of the desired size.

In [12]:
# Group the character stream into chunks of seq_length+1 ids; the incomplete
# final chunk is dropped
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)
for chunk in sequences.take(5):
    decoded_chunk = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded_chunk))

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
"now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
"ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
'one: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citi'


For each sequence, duplicate and shift it to form the input and target text by using the `map` method to apply a simple function to each batch:

In [13]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk of seq_length+1 ids into an (input, target) pair
dataset = sequences.map(split_input_target)

In [14]:
# Decode the first five (input, target) pairs back to text for inspection
for input_data, target_data in dataset.take(5):
    print(f"Input data:  {''.join(idx2char[input_data.numpy()])!r}")
    print(f"Target data: {''.join(idx2char[target_data.numpy()])!r}")

Input data:  'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
Target data: 'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
Input data:  'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you '
Target data: 're all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
Input data:  "now Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us k"
Target data: "ow Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us ki"
Input data:  "ll him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be "
Target data: "l him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be d"
Input data:  'one: away, a

Each index of these vectors is processed as one time step. For example, if a sequence starts with "Fi", then at time step 0 the model receives the index for "F" and tries to predict the index for "i" as the next character. At the next time step, it does the same thing, but the `RNN` considers the previous step's context in addition to the current input character. (The cell below steps through the last sequence printed above, which begins with "one:".)

In [15]:
# Walk through the first five time steps of the pair left in `input_data` /
# `target_data` by the previous cell's loop
for i, (input_idx, target_idx) in enumerate(zip(input_data[:5], target_data[:5])):
    print(f"Step {i:4d}")
    print(f"  input: {input_idx} ({idx2char[input_idx]!r})")
    print(f"  expected output: {target_idx} ({idx2char[target_idx]!r})")

Step    0
  input: 53 ('o')
  expected output: 52 ('n')
Step    1
  input: 52 ('n')
  expected output: 43 ('e')
Step    2
  input: 43 ('e')
  expected output: 10 (':')
Step    3
  input: 10 (':')
  expected output: 1 (' ')
Step    4
  input: 1 (' ')
  expected output: 39 ('a')


### Creating training batches

In [16]:
# Number of sequences per training batch
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than re-binding `dataset`
# to a transformation of itself, so that re-running this cell does not shuffle
# and batch an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

### Build The Model

In [17]:
# Number of distinct characters; used as the size of the model's input
# vocabulary and of its output layer
vocab_size = len(vocabulary)

# Size of the vector each character id is embedded into
embedding_dim = 256

# Number of units in the recurrent (GRU) layer
rnn_units = 1024

In [18]:
def build_model(vocabulary_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level model: Embedding -> stateful GRU -> Dense.

    Args:
        vocabulary_size: Number of distinct character ids; sets both the
            embedding input size and the number of output logits.
        embedding_dim: Size of each embedding vector.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch size baked into the input shape; the sequence
            length is left unspecified.

    Returns:
        An uncompiled `tf.keras.Sequential` model that outputs one vector of
        `vocabulary_size` logits per time step.
    """
    # Maps integer character ids to dense vectors; must be the first layer.
    embedding = tf.keras.layers.Embedding(
        vocabulary_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # Emits an output for every time step and keeps its state between calls.
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    # Projects each GRU output onto the vocabulary (raw logits, no activation).
    projection = tf.keras.layers.Dense(vocabulary_size)

    return tf.keras.Sequential([embedding, recurrent, projection])

In [19]:
# Training model: fixed batch size of BATCH_SIZE sequences
model = build_model(
    vocabulary_size=len(vocabulary),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

For each character the model looks up the embedding, runs the GRU one timestep with the embedding as input, and applies the dense layer to generate logits predicting the log-likelihood of the next character.


### Try the model

In [20]:
# Pull a single batch and run it through the (still untrained) model
input_example_batch, target_example_batch = next(iter(dataset.take(1)))
example_batch_predictions = model(input_example_batch)
print(f"{example_batch_predictions.shape} # (batch_size, sequence_length, vocab_size)")

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           16640     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 65)            66625     
Total params: 4,021,569
Trainable params: 4,021,569
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Sample one character id per time step from the logits of the first sequence
# in the example batch.
# NOTE(review): the next cell repeats this exact sampling before squeezing, so
# this cell looks redundant.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)

In [23]:
# Sample one character id per time step for the first sequence of the batch,
# then drop the trailing sample axis and convert to a NumPy array
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()

In [24]:
sampled_indices

array([40, 26, 14, 64, 37, 40, 24,  0, 29,  6, 62, 54, 10, 34, 51, 10, 49,
       32, 61, 40, 11, 15,  0, 53, 63, 41, 36, 24,  3, 44, 38, 30, 14, 14,
       24, 45, 51, 52, 29,  5, 49, 55, 60, 27, 64, 17, 51, 48, 11, 32, 52,
       50, 14,  0, 40,  6,  2, 45, 29, 50, 45, 21, 19, 36, 13, 30, 15, 22,
       36, 24, 63, 48, 64, 55, 52, 21, 22,  3, 10, 41, 22, 42, 30,  0, 58,
       27, 53, 58, 24, 45, 46, 60, 40, 12, 24, 15, 30, 36, 62, 45])

This gives us, at each timestep, a prediction of the next character index:

In [25]:
# Decode the first input sequence and the ids sampled from the untrained model
print(f'Input: \n {"".join(idx2char[input_example_batch[0]])!r}')
print()
print(f'Next Char Predictions: \n {"".join(idx2char[sampled_indices])!r}')

Input: 
 'ves in Scotland at his ease,\nWhere having nothing, nothing can he lose.\nAnd as for you yourself, our'

Next Char Predictions: 
 "bNBzYbL\nQ,xp:Vm:kTwb;C\noycXL$fZRBBLgmnQ'kqvOzEmj;TnlB\nb,!gQlgIGXARCJXLyjzqnIJ$:cJdR\ntOotLghvb?LCRXxg"


### Train the model

In [26]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` tells Keras that `logits` have not been passed through
    a softmax. The result is returned as produced by
    `tf.keras.losses.sparse_categorical_crossentropy`, without any extra
    reduction applied here.
    """
    cross_entropy = tf.keras.losses.sparse_categorical_crossentropy
    return cross_entropy(labels, logits, from_logits=True)


# Loss of the untrained model on the example batch, averaged to a single number
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print(f"Prediction shape:  {example_batch_predictions.shape}  # (batch_size, sequence_length, vocab_size)")
print(f"scalar_loss:       {example_batch_loss.numpy().mean()}")

Prediction shape:  (64, 100, 65)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.175618


In [27]:
# Configure training: Adam optimizer with the logits-based cross-entropy `loss`
model.compile(optimizer='adam', loss=loss)

### Configure checkpoints
Use a `tf.keras.callbacks.ModelCheckpoint` callback to save checkpoints during training.

In [28]:
# Checkpoints are written under this directory
checkpoint_dir = './training_checkpoints'
# Per-epoch file name; Keras substitutes the epoch number for {epoch}
checkpoint_prefix = os.path.join(checkpoint_dir, "checkpoint_{epoch}")

# Save only the weights (not the full model) at each checkpoint
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix, save_weights_only=True
)

### Starting the training

In [29]:
# Number of passes over the training dataset; raise it for better generated text
EPOCHS = 1

In [30]:
# Train the model; `checkpoint_callback` saves the weights during training
history = model.fit(dataset, epochs=EPOCHS, callbacks = [checkpoint_callback])

Train for 172 steps


### Generate Text

#### Restore the latest checkpoint

To keep this prediction step simple, use a batch size of 1.

Because of the way the RNN state is passed from timestep to timestep, the model only accepts a fixed batch size once built.

To run the model with a different `batch_size`, we need to rebuild the model and restore the weights from the checkpoint.


In [31]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints/checkpoint_1'

In [32]:
# Rebuild the same architecture for a batch size of 1 (one sequence at a time).
# Note that this re-binds `model`, replacing the training model.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the weights from the most recent checkpoint found in checkpoint_dir
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

# Fix the input shape: batch of 1, unspecified sequence length
model.build(tf.TensorShape([1, None]))

### The prediction loop

The following code block generates the text:

* It starts by choosing a start string, initializing the RNN state and setting the number of characters to generate.

* Get the prediction distribution of the next character using the start string and the RNN state.

* Then, use a categorical distribution to calculate the index of the predicted character. Use this predicted character as our next input to the model.

* The RNN state returned by the model is fed back into the model so that it now has more context, instead of only one character. After predicting the next character, the modified RNN states are again fed back into the model, which is how it learns as it gets more context from the previously predicted characters.


![To generate text the model's output is fed back to the input]

Looking at the generated text, you'll see the model knows when to capitalize, makes paragraphs, and imitates a Shakespeare-like writing vocabulary. With the small number of training epochs, it has not yet learned to form coherent sentences.

In [39]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text by sampling from the model one character at a time.

    Args:
        model: Stateful character-level model built for a batch size of 1.
            Called on a tensor of character ids, it returns logits over the
            vocabulary for every time step.
        start_string: Non-empty seed text. Every character must be a key of
            `char2idx`.
        num_generate: Number of characters to generate after the seed.
        temperature: Positive factor the logits are divided by before
            sampling. Low temperatures result in more predictable text,
            higher temperatures in more surprising text.

    Returns:
        `start_string` followed by the `num_generate` sampled characters.

    Raises:
        ValueError: If `start_string` is empty or `temperature` is not positive.
        KeyError: If `start_string` contains a character missing from `char2idx`.
    """
    # An empty seed yields zero time steps, so there would be no prediction to
    # sample from; fail early with a clear message instead.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Convert the start string to character ids and add the batch dimension
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    # Generated characters, joined into a single string at the end
    text_generated = []

    # Here batch size == 1. Start from a clean recurrent state so earlier
    # calls do not influence this generation.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Scale the logits, then sample from the resulting categorical
        # distribution; only the prediction for the last time step is used
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Pass the predicted character as the next input to the model; the
        # stateful GRU carries the previous hidden state forward
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [40]:
# Generate and print text seeded with "ROMEO: ".
# NOTE(review): the saved output below looks like a raw tensor dump rather than
# generated text, and the execution counts are not sequential; re-run on a
# fresh kernel to refresh it.
print(generate_text(model, start_string=u"ROMEO: "))

tf.Tensor(
[[ 7.6644677e-01  2.3019363e-01  1.6588932e-01  3.8227135e-01
   5.2362782e-01  2.6769608e-01 -2.6008919e-01  5.5997241e-01
  -1.9547041e-01  7.8993165e-01  8.1805432e-01 -8.9774057e-02
  -8.4075689e-02  2.2829421e+00  9.0381449e-01  1.4069397e+00
   1.4447377e+00  1.7389988e+00  9.9342120e-01  1.3965456e+00
   1.5404297e+00  2.0946014e+00  5.3538603e-01  1.3519460e+00
   1.0343521e+00  7.7608478e-01  9.8369032e-01  2.1360676e+00
   7.8836298e-01  8.1819588e-01  8.5310078e-01  8.9669818e-01
   1.1225864e+00  2.0165570e+00  7.3223174e-01  1.1901190e+00
   8.9250559e-01  1.8406593e+00  1.1617644e+00 -1.4933947e-01
  -8.9333248e-01 -1.0192790e+00 -1.1246064e+00 -1.0706441e-01
  -1.0124080e+00 -1.2538949e+00 -1.4036027e-01 -5.7495236e-03
  -3.1693336e-01 -1.0538899e+00 -1.1774251e+00 -1.2425655e+00
  -1.8041238e+00  1.9632499e-01 -9.4269782e-01 -4.8716184e-01
  -1.5522283e+00 -1.1617420e+00 -1.0310657e+00 -2.4902444e-01
  -1.1373061e+00 -6.5421748e-01 -6.3903010e-01 -5.4674518e-