# Text Generation

Mainly copied and adapted from the TensorFlow "Text generation with an RNN" tutorial example.

In [1]:
from __future__ import absolute_import, division, print_function

import tensorflow as tf
tf.enable_eager_execution()
from tensorflow import keras

import numpy as np
import os
import time

In [2]:
# Dataset location: the jokes CSV file, which is read as raw text in the next cell.
path_to_file = "jokes.csv"


In [3]:
# Read the raw bytes, then decode for py2 compat.
# The context manager closes the file handle as soon as the read is done
# instead of leaving it open until garbage collection.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print ('Length of text: {} characters'.format(len(text)))

Length of text: 3288227 characters


In [4]:
# Character vocabulary: every distinct character of the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()
print ('{} unique characters'.format(len(vocab)))

239 unique characters


## Text Vectorisation

In [5]:
# Lookup tables between characters and integer ids.
# char -> id: the id of a character is its position in the sorted vocabulary.
char2idx = dict(zip(vocab, range(len(vocab))))
# id -> char: index this array with an id (or an array of ids).
idx2char = np.array(vocab)

# The whole corpus encoded as one integer id per character.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [6]:
# The maximum length sentence we want for a single input in characters
seq_length = 100
# Number of whole seq_length-sized windows in the corpus; used later to
# derive steps_per_epoch.
examples_per_epoch = len(text)//seq_length

# Create training examples / targets:
# a dataset that yields the encoded corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode and show the first five characters of the stream.
for i in char_dataset.take(5):
  print(idx2char[i.numpy()])

D
i
d
 
y


In [7]:
# Group the character stream into chunks of seq_length+1 ids. The extra id
# lets split_input_target derive an input and a target of seq_length each.
# drop_remainder=True discards a final chunk that would be shorter.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Sanity check: decode the first five chunks back into text.
for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))

'Did you hear about the Native American man that drank 200 cups of tea?,He nearly drown in his own tea'
" pee.\r\nWhat's the best anti diarrheal prescription?,Mycheexarphlexin\r\nWhat do you call a person who i"
's outside a door and has no arms nor legs?,Matt\r\nWhich Star Trek character is a member of the magic c'
"ircle?,Jean-Luc Pickacard\r\nWhat's the difference between a bullet and a human?,A bullet doesn't miss "
"Harambe\r\nWhy was the Ethiopian baby crying?,He was having a mid-life crisis\r\nWhat's the difference be"


In [8]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element that follows
    input[i] in the original chunk.

    Args:
        chunk: any sliceable sequence (applied to the chunks of `sequences`
            via Dataset.map in this notebook).

    Returns:
        A tuple (chunk[:-1], chunk[1:]).
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) training pair.
dataset = sequences.map(split_input_target)

## Training Batches

In [9]:
# Batch size
BATCH_SIZE = 64
steps_per_epoch = examples_per_epoch//BATCH_SIZE

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the pipeline from `sequences` instead of reassigning `dataset` in
# terms of itself: re-running this cell then yields the same (BATCH_SIZE,
# seq_length) batches rather than batching already-batched data.
dataset = (sequences
           .map(split_input_target)
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE, drop_remainder=True))

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

## Build Model

In [10]:
# Length of the vocabulary in chars; build_model uses it as both the
# Embedding input size and the width of the final Dense layer.
vocab_size = len(vocab)

# The embedding dimension (size of the vector learned for each character)
embedding_dim = 256

# Number of RNN units in each recurrent layer
rnn_units = 1024

In [11]:
# Choose the recurrent layer factory that build_model will call.
if not tf.test.is_gpu_available():
  # CPU: plain Keras GRU with a sigmoid recurrent activation
  # (presumably to stay weight-compatible with CuDNNGRU -- TODO confirm).
  import functools
  rnn = functools.partial(
    tf.keras.layers.GRU, recurrent_activation='sigmoid')
else:
  # GPU: the cuDNN-backed GRU implementation.
  rnn = tf.keras.layers.CuDNNGRU

In [4]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size,
                num_rnn_layers=3):
  """Build the character-level model: Embedding -> stacked RNN -> Dense.

  Args:
    vocab_size: number of distinct characters; used as the Embedding input
      size and as the number of output logits.
    embedding_dim: size of the embedding vector for each character.
    rnn_units: number of units in every recurrent layer.
    batch_size: fixed batch size. It is baked into the input shape because
      the recurrent layers are stateful.
    num_rnn_layers: how many recurrent layers to stack. Defaults to 3,
      which reproduces the original hand-written architecture.

  Returns:
    An uncompiled tf.keras.Sequential model whose Dense output has
    vocab_size units and no activation (i.e. logits).
  """
  layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
  ]
  # `rnn` is the module-level layer factory (CuDNNGRU on GPU, GRU otherwise).
  # Every recurrent layer uses the same configuration, so create them in a
  # loop instead of repeating the call.
  for _ in range(num_rnn_layers):
    layers.append(
      rnn(rnn_units,
          return_sequences=True,
          recurrent_initializer='glorot_uniform',
          stateful=True))
  layers.append(tf.keras.layers.Dense(vocab_size))
  return tf.keras.Sequential(layers)

In [13]:
# Instantiate the training model; its batch size matches the dataset batches.
model = build_model(vocab_size=vocab_size,
                    embedding_dim=embedding_dim,
                    rnn_units=rnn_units,
                    batch_size=BATCH_SIZE)

## Try Model

In [14]:
# Run the untrained model on a single batch to check the output shape.
# input_example_batch / target_example_batch / example_batch_predictions stay
# defined after the loop and are reused by the loss check further down.
for input_example_batch, target_example_batch in dataset.take(1): 
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 239) # (batch_size, sequence_length, vocab_size)


In [15]:
# Show the layer-by-layer architecture and parameter counts of the training model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           61184     
_________________________________________________________________
cu_dnngru (CuDNNGRU)         (64, None, 1024)          3938304   
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       (64, None, 1024)          6297600   
_________________________________________________________________
cu_dnngru_2 (CuDNNGRU)       (64, None, 1024)          6297600   
_________________________________________________________________
dense (Dense)                (64, None, 239)           244975    
Total params: 16,839,663
Trainable params: 16,839,663
Non-trainable params: 0
_________________________________________________________________


## Train

In [16]:
def loss(labels, logits):
  """Sparse softmax cross-entropy between integer labels and raw logits.

  Args:
    labels: integer class ids (here: target character ids).
    logits: unnormalised model outputs, one score per vocabulary entry.

  Returns:
    The tensor produced by tf.nn.sparse_softmax_cross_entropy_with_logits.
    It is not reduced here; the notebook averages it with
    `.numpy().mean()` when displaying it.
  """
  # Alternative formulation left by the original author:
  #   tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)
  per_position_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=labels, logits=logits)
  return per_position_loss

# Evaluate the loss on the example batch produced by the untrained model and
# print its mean as a single number.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)") 
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 100, 239)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       5.47675


In [17]:
# Configure training: Adam optimizer (TF1 `tf.train` API) together with the
# custom `loss` function defined in this notebook.
model.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)

## Checkpoints

In [18]:
# Directory where the checkpoints will be saved
checkpoint_dir = './text100'
# Name of the checkpoint files; "{epoch}" is a placeholder for the epoch
# number (the generation section loads "ckpt_31").
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback passed to model.fit; save_weights_only=True stores only the
# weights, not the full model.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [19]:
# Number of training epochs requested from model.fit.
EPOCHS=300

In [21]:
# Train. The dataset is repeated, so steps_per_epoch decides where each epoch
# ends; checkpoint_callback writes the weights during training.
history = model.fit(dataset.repeat(), epochs=EPOCHS, steps_per_epoch=steps_per_epoch, callbacks=[checkpoint_callback])

Epoch 1/300
 12/513 [..............................] - ETA: 1:51 - loss: 3.2904

KeyboardInterrupt: 

## Generate

In [1]:
#tf.train.latest_checkpoint(checkpoint_dir)

In [5]:
# Rebuild the network with batch_size=1 for generation: the recurrent layers
# are stateful, so the batch size is fixed when the model is built.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the weights saved after a chosen epoch. The path is derived from
# checkpoint_prefix so it always follows the directory and file naming used
# by the checkpoint callback instead of duplicating them as a literal.
# To load the most recent checkpoint instead, use:
#   model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
restore_epoch = 31
model.load_weights(checkpoint_prefix.format(epoch=restore_epoch))

model.build(tf.TensorShape([1, None]))

NameError: name 'vocab_size' is not defined

In [37]:
# Show the architecture of the rebuilt batch_size=1 generation model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (1, None, 256)            61184     
_________________________________________________________________
cu_dnngru_12 (CuDNNGRU)      (1, None, 1024)           3938304   
_________________________________________________________________
cu_dnngru_13 (CuDNNGRU)      (1, None, 1024)           6297600   
_________________________________________________________________
cu_dnngru_14 (CuDNNGRU)      (1, None, 1024)           6297600   
_________________________________________________________________
dense_4 (Dense)              (1, None, 239)            244975    
Total params: 16,839,663
Trainable params: 16,839,663
Non-trainable params: 0
_________________________________________________________________


In [38]:
def generate_text(model, start_string, num_generate=4500, temperature=0.8):
  """Generate text one character at a time with a trained stateful model.

  Args:
    model: model built with batch_size=1. It is called as `model(ids)` and
      is expected to return logits shaped
      (1, sequence_length, vocab_size).
    start_string: non-empty seed text. Every character must be a key of
      the module-level `char2idx` mapping.
    num_generate: number of characters to generate after the seed.
    temperature: positive divisor applied to the logits before sampling.
      Lower values make the output more predictable, higher values more
      surprising (non-words tend to appear at higher temperatures).

  Returns:
    `start_string` followed by the `num_generate` sampled characters.

  Raises:
    ValueError: if `start_string` is empty or `temperature` is not positive.
    KeyError: if `start_string` contains a character missing from `char2idx`.
  """
  # Validate up front: an empty seed gives the model nothing to condition on,
  # and a non-positive temperature makes the logit scaling meaningless.
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Converting our start string to numbers (vectorizing) and adding the
  # batch dimension expected by the model.
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  # Generated characters are collected here and joined once at the end.
  text_generated = []

  # Here batch size == 1. Start from a clean hidden state.
  model.reset_states()
  for _ in range(num_generate):
      predictions = model(input_eval)
      # remove the batch dimension
      predictions = tf.squeeze(predictions, 0)

      # Scale the logits by the temperature, then sample the next character
      # id from the distribution predicted for the last position.
      predictions = predictions / temperature
      predicted_id = tf.multinomial(predictions, num_samples=1)[-1,0].numpy()

      # We pass the predicted character as the next input to the model;
      # the previous context is carried by the stateful hidden state.
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [39]:
# Sample text from the restored model, seeded with "Why".
print(generate_text(model, start_string=u"Why"))


Why did Einstein?,A cereman cry
Did you hear that 7.12 inches long?,Nuts. She's going to except myself.
Why does the Energizer Bunny go to Jail?,He's all right not.
What is and Saithrait's stafficient's Etherste Opponents
What's a transvestite?,A: You're 10/200$=/2
Why do jewish people hate sunblack tires?,They're always married Moundae- happened orgh.
What's the difference between a prostitute and a puppy?,One is a crusty bus station and the other is a simplet sick?,A: A cow's cub
What did the Irish guy do when he has a small off
What's the difference between a 465 post and a 10-ide?,Orange Jews
Why did the hipster cross the road?,To get to the same drop.
How many cops does it take to screw in a light bulb?,None. They'll just charged them.
What does a bride wear?,Astro
How do you triggers clearly?,Eripéchett
What does Obama's Assanting 50% off the door?,Islamic gorize
What do the Polak and MIT?,"A: Free Guts is a stupidest, meaning that audition?,I'll put their luctorits