<a href="https://colab.research.google.com/github/windupbirdjdt/django-example/blob/main/TensorFlowShakespeare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
# This notebook trains a basic character-level RNN to generate text, using only Shakespeare as input.
# It follows the TensorFlow text-generation tutorial:
# https://www.tensorflow.org/text/tutorials/text_generation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import os
import time

# The TensorFlow copy of the corpus is fetched by Keras' get_file helper,
# which returns the local path of the file.
path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')


# Read the raw bytes and decode them as UTF-8. The context manager closes the
# file handle as soon as the read is done instead of leaving it open until the
# garbage collector gets to it.
with open(path_to_file, 'rb') as tf_file:
    text_tf = tf_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print(f'Length of text: {len(text_tf)} characters')

# Second data set (from Jose P). NOTE: this is a hard-coded Colab path, so the
# file must already be present at /content/shakespeare.txt in the runtime.
alternative_path ='/content/shakespeare.txt'
with open(alternative_path, 'rb') as jose_file:
    text_jose = jose_file.read().decode(encoding='utf-8')
print(f'Length of text: {len(text_jose)} characters')

# The Jose P data set is significantly bigger; preview the first 100
# characters of each corpus to compare their formatting.
print('--------')
print(f'The first text in Jose file is {text_jose[0:100]}')
print('--------')
print(f'The first text in TF file is {text_tf[0:100]}')


Length of text: 1115394 characters
Length of text: 5445609 characters
--------
The first text in Jose file is 
                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose mi
--------
The first text in TF file is First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [36]:
# Build the vocabulary: the distinct characters of the Jose corpus, kept in
# sorted order so that the ordering is stable from run to run.
vocab = list(set(text_jose))
vocab.sort()
print(f'{len(vocab)} unique characters in jose data')

84 unique characters in jose data


In [66]:
# Tokenize at character level with a pair of StringLookup layers.
# Forward lookup: character -> integer id, built from the corpus vocabulary.
ids_from_chars = tf.keras.layers.StringLookup(
    mask_token=None,
    vocabulary=list(vocab),
)

# Inverse lookup: integer id -> character. It is built from the forward
# layer's own vocabulary so that both layers agree on the id assignment.
chars_from_ids = tf.keras.layers.StringLookup(
    invert=True,
    mask_token=None,
    vocabulary=ids_from_chars.get_vocabulary(),
)

In [60]:
# Sanity check on an arbitrary string: split 'hello' into its UTF-8
# characters, then look each character up to get its integer id.
chars_jt = tf.strings.unicode_split('hello', input_encoding='UTF-8')
ids = ids_from_chars(chars_jt)
# Bare last expression so the notebook displays the id tensor.
ids

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([64, 61, 68, 68, 71])>

In [59]:
# Round trip: map the ids from the 'hello' example back to characters with the
# inverse lookup layer. This reads the kernel-global `ids` from that example.
chars_jt = chars_from_ids(ids)
# Bare last expression so the notebook displays the character tensor.
chars_jt

<tf.Tensor: shape=(5,), dtype=string, numpy=array([b'h', b'e', b'l', b'l', b'o'], dtype=object)>

In [67]:
# Convert the whole Jose corpus to ids: split it into UTF-8 characters and
# look every character up in one call.
all_ids = ids_from_chars(tf.strings.unicode_split(text_jose, 'UTF-8'))
# Wrap the id tensor in a tf.data.Dataset that yields one id per element.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)


In [None]:
# Spot-check the round trip: decode the first 40 ids of the dataset back to
# characters and print them one per line.
# The loop variable is deliberately not named `ids`: a notebook `for` loop
# leaks its variable into the kernel namespace, and `ids` is the 'hello'
# example tensor that the "now convert back" cell reads.
for char_id in ids_dataset.take(40):
    print(chars_from_ids(char_id).numpy().decode('UTF-8'))

In [73]:
# Choose an arbitrary length for the training sequences, say 100 characters.
seq_length = 100

In [74]:
# Group the stream of ids into fixed-length sequences. Each sequence holds
# seq_length + 1 ids because one id is later dropped from each end to form the
# input/target pair; drop_remainder discards the final, shorter chunk.
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Look at the first sequence as an example (shown as individual characters).
for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'\n' b' ' b' ' b' ' b' ' b' ' b' ' b' ' b' ' b' ' b' ' b' ' b' ' b' '
 b' ' b' ' b' ' b' ' b' ' b' ' b' ' b' ' b'1' b'\n' b' ' b' ' b'F' b'r'
 b'o' b'm' b' ' b'f' b'a' b'i' b'r' b'e' b's' b't' b' ' b'c' b'r' b'e'
 b'a' b't' b'u' b'r' b'e' b's' b' ' b'w' b'e' b' ' b'd' b'e' b's' b'i'
 b'r' b'e' b' ' b'i' b'n' b'c' b'r' b'e' b'a' b's' b'e' b',' b'\n' b' '
 b' ' b'T' b'h' b'a' b't' b' ' b't' b'h' b'e' b'r' b'e' b'b' b'y' b' '
 b'b' b'e' b'a' b'u' b't' b'y' b"'" b's' b' ' b'r' b'o' b's' b'e' b' '
 b'm' b'i' b'g'], shape=(101,), dtype=string)


In [79]:
# Helper to get text back from ids (joins the characters with TF's reduce_join).
def text_from_ids(ids):
  """Return the string(s) spelled out by `ids`.

  Every id is mapped to its character with `chars_from_ids`, and the
  characters are then concatenated along the last axis with
  `tf.strings.reduce_join`.
  """
  chars = chars_from_ids(ids)
  joined = tf.strings.reduce_join(chars, axis=-1)
  return joined

# Use the helper to print the first five sequences as text (shown as bytes).
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())


b"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose mig"
b'ht never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  B'
b"ut thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n"
b'  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that '
b"art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bud bur"


In [80]:
# We need (input, target) pairs for supervised learning: the target is the input sequence shifted by one character.
# e.g. for the sequence 'Hello', the input is 'Hell' and the target is 'ello'

def split_input_target(sequence):
    """Split one sequence into an (input, target) pair offset by one position.

    The input is everything except the last element and the target is
    everything except the first, so target[i] is the element that follows
    input[i] in `sequence`. Works on anything that supports slicing.
    """
    return sequence[:-1], sequence[1:]


In [81]:
# Example on a plain Python list: the input drops the final 'w' and the
# target drops the leading 'T'.
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [82]:
# Build the training pairs: apply split_input_target to every sequence, so
# each dataset element becomes an (input, target) tuple.
dataset = sequences.map(split_input_target)

In [84]:
# Print the first two (input, target) pairs as text to confirm the one-character shift.
for input_example, target_example in dataset.take(2):
    for label, example in (("Input :", input_example), ("Target:", target_example)):
        print(label, text_from_ids(example).numpy())

Input : b"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose mi"
Target: b"                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose mig"
Input : b'ht never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  '
Target: b't never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  B'


In [85]:
# Batch size (number of sequences fed to the model per training step)
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from `dataset`
# itself: a self-referential `dataset = dataset.shuffle(...).batch(...)` is
# not idempotent, so re-running the cell would batch already-batched data.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    # tf.data.AUTOTUNE is the current name of the deprecated
    # tf.data.experimental.AUTOTUNE alias.
    .prefetch(tf.data.AUTOTUNE))

# Bare last expression so the notebook displays the dataset's element spec.
dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [87]:
# Model hyperparameters.

# Length of the vocabulary in the StringLookup layer. In the recorded run this
# is 85: the 84 corpus characters plus the layer's '[UNK]' entry.
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension (size of the vector learned for each character id)
embedding_dim = 256

# Number of RNN units (passed to the GRU layer as its unit count)
rnn_units = 1024

In [89]:
# now set up a class that will essentially define the parameters of the model
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  For every position of the input id sequence the model outputs one logit per
  vocabulary entry, i.e. unnormalised scores for the next character.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the three layers.

    Args:
      vocab_size: number of entries in the vocabulary; used as the embedding's
        input size and as the width of the output layer.
      embedding_dim: size of the embedding vector for each id.
      rnn_units: number of units in the GRU layer.
    """
    # super() already binds `self`; the original `super().__init__(self)`
    # handed the model to Keras a second time as a stray positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences gives an output for every time step (needed to train on
    # every position); return_state also returns the final state so that
    # generation can carry it from one call to the next.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of id sequences.

    Args:
      inputs: batch of integer id sequences (shape (64, 100) in this
        notebook's training data).
      states: optional GRU state to start from; when None the GRU's own
        initial state is used.
      return_state: when True, also return the GRU state after the last step.
      training: forwarded to every layer.

    Returns:
      The logits (shape (64, 100, 85) in the recorded run), or the tuple
      (logits, states) when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [102]:
# Create an instance of the model with the hyperparameters chosen above.
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

# The model's final Dense layer has no activation, so it outputs logits;
# from_logits=True tells the loss to expect logits rather than probabilities.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile the model with the Adam optimizer and the loss defined above.
model.compile(optimizer='adam', loss=loss)


In [100]:
# As an example, run the model on one batch and show the shape of its output.
# The loop variables (input_example_batch, target_example_batch) and
# example_batch_predictions stay defined after the loop and are read by the
# next cell.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

  

(64, 100, 85) # (batch_size, sequence_length, vocab_size)


In [105]:
# Sample one next-character id per position from the model's logits for the
# first sequence of the example batch.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis (size 1) and convert to a NumPy array.
# (A bare `sampled_indices` line used to follow here; it was removed because a
# notebook only displays the last expression of a cell, so it showed nothing.)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

# show example prediction on the untrained model
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b'es be but young and fair,\n    They have the gift to know it; and in his brain,\n    Which is as dry a'

Next Char Predictions:
 b'pEU?PETXGK8QW58J7QbPXm)<u6ijrn1UH[UNK]anW}!WRq5SkN!1mD3bX.[UNK]kRRKzh37>KEVwj>rOUlV&&?Hjtqx&<4lW)u,]tuS[3->9'


In [106]:
# Set up checkpointing for training.

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; "{epoch}" is a placeholder in the file name
# template, giving each epoch its own checkpoint name.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# save_weights_only=True stores just the model weights, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)




In [108]:
# Number of passes over the training dataset.

EPOCHS = 30

In [109]:
# Train the model for EPOCHS epochs with the checkpoint callback attached;
# the value returned by fit is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [110]:
# below code makes a prediction!!

class OneStep(tf.keras.Model):
  """Single-step text sampler wrapped around a trained character model.

  Each call to `generate_one_step` feeds strings through the model, samples
  one next character per string and returns it together with the model state,
  so that the caller can loop and generate text one character at a time.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the model and lookup layers and build the "[UNK]" mask.

    Args:
      model: the model to sample from; it is called with `inputs`, `states`
        and `return_state=True`.
      chars_from_ids: lookup layer mapping ids to characters.
      ids_from_chars: lookup layer mapping characters to ids.
      temperature: the logits are divided by this value before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Additive logit mask that stops "[UNK]" from ever being sampled:
    # -inf at the id of "[UNK]", and the to_dense default (zero) elsewhere.
    unk_ids = self.ids_from_chars(['[UNK]'])[:, None]
    vocab_len = len(ids_from_chars.get_vocabulary())
    minus_inf = -float('inf')
    unk_mask = tf.SparseTensor(
        indices=unk_ids,
        values=[minus_inf] * len(unk_ids),
        # One mask entry per vocabulary entry.
        dense_shape=[vocab_len])
    self.prediction_mask = tf.sparse.to_dense(unk_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each string in `inputs`.

    Args:
      inputs: batch of strings to continue.
      states: model state from the previous step, or None to start fresh.

    Returns:
      A tuple (predicted_chars, states): the sampled characters and the
      model state to pass to the next call.
    """
    # Strings -> characters -> token ids, converted to a dense tensor.
    token_ids = self.ids_from_chars(
        tf.strings.unicode_split(inputs, 'UTF-8')).to_tensor()

    # logits has shape [batch, char, next_char_logits].
    logits, states = self.model(inputs=token_ids, states=states,
                                return_state=True)

    # Keep only the prediction for the last position, scale it by the
    # temperature, then add the mask so that "[UNK]" cannot be drawn.
    last_logits = logits[:, -1, :] / self.temperature + self.prediction_mask

    # Draw one token id per batch entry and drop the num_samples axis.
    sampled_ids = tf.squeeze(
        tf.random.categorical(last_logits, num_samples=1), axis=-1)

    # Token ids -> characters, returned along with the model state.
    return self.chars_from_ids(sampled_ids), states

In [111]:

# Wrap the trained model and both lookup layers in the single-step sampler
# (temperature is left at its default of 1.0).
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [114]:
# Generate text from the trained model. Overall this works well (example
# output pasted below): the model learns words, but not meaning.


start = time.time()
# No state yet: the first call lets the model start from its initial state.
states = None
# Seed text; a batch containing a single string.
next_char = tf.constant(['ROMEO:'])
result = [next_char]

# Generate 2000 characters one at a time, feeding each sampled character and
# the returned state back into the next step.
for n in range(2000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the seed and all sampled characters into one string per batch entry.
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

ROMEO: PROTEUTUS hallow-day, to give, and made me cry
               here. What half my sword, Clifford.
  QUICKLY. Most true, and that which I look to't.
  OLIVIA. Hast thou no other dear love?
  THIRD SERVANT. I desire you this.
  MALVOLIO. Kneater, good Comparise within. You jog and a cause to-night,
    I know not why, which thou forget'st to report.  
    'Tis near a troution. Let me seeet you were
    Herself, does pieces. Cassion's burial fight.
    A direct gods they thus must prove retreat her.
  CLOWN. O Fool were I! Was it was to horse!
  DUKE. Not you, sir,
    You would entreat them descend; and then to Post
    Whose couple is not passionate Pompe and Troilus' shits
    That swear I'll strudge their topriguous lost.

                      Re-enter PYRAMUS mine eyes obscury,
                                     and others
  ADAM, Nurse, Orleans, TOUCHSTONE and GENTLEMEN, how fleet
  PATLE CLIFF and Verges' flict of speaking fathful, for
          and OFFICERSHE FOR YOUR
  

In [113]:
# Export the single-step generator in SavedModel format to the 'one_step'
# directory, then load it back from that directory.
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')



In [None]:
#### output shown below


     

ROMEO: PROTEUTUS hallow-day, to give, and made me cry
               here. What half my sword, Clifford.
  QUICKLY. Most true, and that which I look to't.
  OLIVIA. Hast thou no other dear love?
  THIRD SERVANT. I desire you this.
  MALVOLIO. Kneater, good Comparise within. You jog and a cause to-night,
    I know not why, which thou forget'st to report.  
    'Tis near a troution. Let me seeet you were
    Herself, does pieces. Cassion's burial fight.
    A direct gods they thus must prove retreat her.
  CLOWN. O Fool were I! Was it was to horse!
  DUKE. Not you, sir,
    You would entreat them descend; and then to Post
    Whose couple is not passionate Pompe and Troilus' shits
    That swear I'll strudge their topriguous lost.

                      Re-enter PYRAMUS mine eyes obscury,
                                     and others
  ADAM, Nurse, Orleans, TOUCHSTONE and GENTLEMEN, how fleet
  PATLE CLIFF and Verges' flict of speaking fathful, for
          and OFFICERSHE FOR YOUR
    Give her masquier.
  AGUECHEEK. And you of the rest good, at your presence nice a
    Pendow, lord Angelo, I am sure you love what 's thou
    saw her, to curtsire away. The conduit, I take thee she were a
    perioding thy plight should in; and it was already abused,
    but thou shalt hust? Why, for your eye-sweet musicion.
    To call thee in the work of him.
                             [A level rite but not of it.  Sir Viceoneby.

  Leon. Curs'd be thy lord, with hit painting in my mouth- that your money
    is too ungrinkisd for life?
  PAGE. Yea, and pass not him to me, I will seek thee hence.
  PROTEUS. Good for your crush, give nine special court.
  PEDANT. Good morrow, rain!
  IAGO. I do believe that I am not a fool;
    In thy orifect esteeming that I would
    have sent in my huel abroad-cueks to the voice. 'a would speak with you!
    Take this give black my wit, thy loving worth,
    Thou art the fogla's trenched heel of writting
    A lily wrong in thy affliction truth.
  LE 