In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow import keras

import os
import time

In [2]:
# from google.colab import files

# files.upload()

Saving Seinfeld_transcripts.txt to Seinfeld_transcripts.txt




In [3]:
path_to_transcripts = 'Seinfeld_transcripts.txt'

# Read the raw bytes and decode them explicitly as UTF-8 (binary mode does no
# newline translation). The context manager closes the file handle as soon as
# the read finishes instead of leaving it open until garbage collection.
with open(path_to_transcripts, 'rb') as transcripts_file:
    text = transcripts_file.read().decode(encoding='utf-8')

f'Length of text: {len(text)}'

'Length of text: 4405981'

In [4]:
# The vocabulary is every distinct character in the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()

f'Number of unique characters: {len(vocab)}'

'Number of unique characters: 95'

Next, a lookup layer that converts each character to a numeric ID.

In [5]:
# Lookup layer that maps each vocabulary character to an integer ID.
ids_from_chars = keras.layers.experimental.preprocessing.StringLookup(vocabulary=list(vocab))

# Try it on two small examples: split each string into UTF-8 characters,
# then look up the ID of every character.
example_chars = tf.strings.unicode_split([['abcd'], ['ABCD']], input_encoding='UTF-8')
ids = ids_from_chars(example_chars)
ids

<tf.RaggedTensor [[[66, 67, 68, 69]], [[35, 36, 37, 38]]]>

And the reverse: a lookup layer that converts numeric IDs back to characters.

In [6]:
# Inverse lookup (ID -> character). It is built from the forward layer's own
# vocabulary so both layers agree on which ID belongs to which character.
forward_vocabulary = ids_from_chars.get_vocabulary()
chars_from_ids = keras.layers.experimental.preprocessing.StringLookup(
    invert=True,
    vocabulary=forward_vocabulary,
)

chars_from_ids(ids)

<tf.RaggedTensor [[[b'a', b'b', b'c', b'd']], [[b'A', b'B', b'C', b'D']]]>

Next, building a list of ids from our entire text.

In [7]:
# Encode the full transcript: one integer ID per UTF-8 character.
all_ids = ids_from_chars(
    tf.strings.unicode_split(text, input_encoding='UTF-8')
)
all_ids

<tf.Tensor: shape=(4405981,), dtype=int64, numpy=array([ 3,  2,  3, ..., 38,  2,  3])>

In [8]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

# Peek at the first ten elements, decoded back to characters. The loop
# variable is deliberately not called `ids`: that name already holds the
# example tensor shown by the lookup-layer cells, and overwriting it here
# would break those cells if they were re-run.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))

 


 


 
G
O
O
D
 


Set the maximum length of sequences to 100 (this can be tuned).

In [9]:
# Number of input characters per training example (tunable).
seq_length = 100
# Each example consumes seq_length + 1 characters: the extra character
# supplies the last target once inputs and targets are offset by one.
examples_per_epoch = len(text) // (seq_length+1)

In [10]:
# Group the ID stream into windows of seq_length + 1 IDs, discarding any
# incomplete window at the end, then show the first window as characters.
sequences = ids_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)
for seq in sequences.take(count=1):
    print(chars_from_ids(seq))

tf.Tensor(
[b' ' b'\n' b' ' b'\n' b' ' b'G' b'O' b'O' b'D' b' ' b'N' b'E' b'W' b'S'
 b',' b' ' b'B' b'A' b'D' b' ' b'N' b'E' b'W' b'S' b'\n' b'\n' b'\n' b'\n'
 b' ' b'W' b'r' b'i' b't' b't' b'e' b'n' b' ' b'b' b'y' b'\n' b'\n' b' '
 b'L' b'a' b'r' b'r' b'y' b' ' b'D' b'a' b'v' b'i' b'd' b' ' b'&' b' '
 b'J' b'e' b'r' b'r' b'y' b' ' b'S' b'e' b'i' b'n' b'f' b'e' b'l' b'd'
 b'\n' b'\n' b'\n' b'\n' b'\n' b' ' b'\n' b' ' b'(' b'C' b'o' b'm' b'e'
 b'd' b'y' b' ' b'c' b'l' b'u' b'b' b')' b'\n' b'\n' b' ' b'\n' b' ' b'J'
 b'E' b'R' b'R' b'Y'], shape=(101,), dtype=string)


Helper function to get actual text instead of a list of chars.

In [11]:
def text_from_ids(ids):
    """Turn a tensor of character IDs back into joined text.

    Every ID is looked up with `chars_from_ids`, and the resulting characters
    are concatenated along the last axis into string tensor(s).
    """
    chars = chars_from_ids(ids)
    joined = tf.strings.reduce_join(chars, axis=-1)
    return joined

In [12]:
# Show the first sequence joined into a single string.
for seq in sequences.take(1):
    print(text_from_ids(seq))

tf.Tensor(b' \n \n GOOD NEWS, BAD NEWS\n\n\n\n Written by\n\n Larry David & Jerry Seinfeld\n\n\n\n\n \n (Comedy club)\n\n \n JERRY', shape=(), dtype=string)


Offset the sequences by 1 in order to build the inputs and targets.

In [13]:
def split_input_target(sequence):
    """Split one sequence into an (input, target) pair offset by one step.

    The input is everything except the last element; the target is everything
    except the first, so target[i] is the element that follows input[i].
    """
    return sequence[:-1], sequence[1:]

# Quick check on a plain Python list: the target is the input shifted by one.
split_input_target(list('Seinfeld'))

(['S', 'e', 'i', 'n', 'f', 'e', 'l'], ['e', 'i', 'n', 'f', 'e', 'l', 'd'])

In [14]:
# Turn every (seq_length + 1)-long sequence into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [15]:
# Inspect the first (input, target) pair as text; the target is the input
# shifted forward by one character.
for input_example, target_example in dataset.take(1):
    print('Input: ', text_from_ids(input_example).numpy())
    print('Target: ', text_from_ids(target_example).numpy())

Input:  b' \n \n GOOD NEWS, BAD NEWS\n\n\n\n Written by\n\n Larry David & Jerry Seinfeld\n\n\n\n\n \n (Comedy club)\n\n \n JERR'
Target:  b'\n \n GOOD NEWS, BAD NEWS\n\n\n\n Written by\n\n Larry David & Jerry Seinfeld\n\n\n\n\n \n (Comedy club)\n\n \n JERRY'


`tf.data` is designed to work with potentially infinite sequences, so it does not shuffle the whole dataset in memory. Instead it keeps a buffer of `BUFFER_SIZE` elements and shuffles only within that buffer.

In [16]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Build the training pipeline directly from `sequences` instead of from the
# previous value of `dataset`. That keeps this cell idempotent: re-running it
# no longer shuffles and batches a dataset that was already batched.
dataset = (
    sequences.map(split_input_target)
      .shuffle(BUFFER_SIZE)
      .batch(BATCH_SIZE, drop_remainder=True)
      .prefetch(tf.data.experimental.AUTOTUNE)
)

dataset

<PrefetchDataset shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>

In [28]:
# NOTE(review): len(vocab) counts only the characters found in the text (95).
# The model is instantiated with len(ids_from_chars.get_vocabulary()) instead,
# which is 97 according to the shape check output - confirm this value is not
# used to size the model anywhere.
vocab_size = len(vocab)
# Size of each character embedding vector.
embedding_dim = 256
# Number of units in the GRU layer.
rnn_units = 1024

In [37]:
class GruModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense.

    The final Dense layer has no activation, so the model outputs raw logits
    over the vocabulary for every position of the input sequence.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        """Create the three layers.

        Args:
            vocab_size: Number of distinct token IDs; used as both the
                embedding input size and the number of output logits.
            embedding_dim: Size of each embedding vector.
            rnn_units: Number of units in the GRU layer.
        """
        # `super()` already binds the instance, so nothing is passed here;
        # handing `self` over again would give Keras a spurious positional
        # argument.
        super().__init__()
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        # return_sequences: emit an output for every timestep.
        # return_state: also return the final state so generation can carry
        # it from one call to the next.
        self.gru = keras.layers.GRU(rnn_units, return_sequences=True, return_state=True, dropout=0.2)
        self.dense = keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run a batch of ID sequences through the model.

        Args:
            inputs: Batch of token-ID sequences.
            states: Initial GRU state; when None, the GRU's default initial
                state is used.
            return_state: When True, also return the final GRU state.
            training: Forwarded to each layer.

        Returns:
            The logits, or a `(logits, states)` pair when `return_state` is
            True.
        """
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

In [38]:
# Size the model from the lookup layer's vocabulary (not len(vocab)) so the
# output layer has one logit for every ID the lookup layer can produce.
gru_model = GruModel(
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units
)

Testing if the model outputs the right shape.

In [77]:
# Run a single batch through the model and print the output shape next to
# the input shape.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = gru_model(input_example_batch)
    print(example_batch_predictions.shape, '# (batch_size, sequence_length, vocab_size)')
    print(input_example_batch.shape)

(64, 100, 97) # (batch_size, sequence_length, vocab_size)
(64, 100)


In [40]:
# Print the layers and their parameter counts.
gru_model.summary()

Model: "gru_model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      multiple                  24832     
_________________________________________________________________
gru_10 (GRU)                 multiple                  3938304   
_________________________________________________________________
dense_5 (Dense)              multiple                  99425     
Total params: 4,062,561
Trainable params: 4,062,561
Non-trainable params: 0
_________________________________________________________________


In [42]:
# The model's final Dense layer has no activation, so it emits raw logits;
# from_logits=True tells the loss to expect logits rather than probabilities.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
# Adam with its default settings.
optimizer = tf.optimizers.Adam()

In [43]:
# Attach the loss and optimizer to the model before training.
gru_model.compile(loss=loss, optimizer=optimizer)

In [44]:
# Number of passes over the batched dataset.
epochs = 15

gru_model.fit(dataset, epochs=epochs)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f901f195850>

A class for predicting the next character one step at a time. It builds a mask that prevents the empty-string (`""`) and `[UNK]` tokens from being generated, and it exposes a temperature parameter that controls how random the predictions are.

In [49]:
class OneStep(tf.keras.Model):
    """Single-step character sampler wrapped around a trained model.

    Holds the model together with both lookup layers, plus a mask that keeps
    "" and "[UNK]" from ever being sampled. `temperature` divides the logits
    before sampling.
    """

    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = chars_from_ids
        self.ids_from_chars = ids_from_chars

        # IDs that must never be generated: "" and "[UNK]". The added trailing
        # axis turns them into one index row per blocked ID.
        blocked_ids = self.ids_from_chars(['', '[UNK]'])[:, None]
        vocabulary_size = len(ids_from_chars.get_vocabulary())
        blocked_mask = tf.SparseTensor(
            indices=blocked_ids,
            # One -inf per blocked ID.
            values=[float('-inf')] * len(blocked_ids),
            # One slot per vocabulary entry.
            dense_shape=[vocabulary_size],
        )
        # Dense vector that is added to the logits at sampling time.
        self.prediction_mask = tf.sparse.to_dense(blocked_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for every string in `inputs`.

        Returns the sampled characters together with the model state to pass
        into the following call.
        """
        # Convert the input strings to token IDs.
        split_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        token_ids = self.ids_from_chars(split_chars).to_tensor()

        # logits has shape [batch, char, next_char_logits].
        logits, states = self.model(inputs=token_ids, states=states, return_state=True)

        # Keep only the last position, scale by the temperature, then apply
        # the mask so "" and "[UNK]" cannot be drawn.
        last_logits = logits[:, -1, :] / self.temperature
        last_logits = last_logits + self.prediction_mask

        # Draw one token ID per batch row from the masked logits.
        sampled_ids = tf.squeeze(
            tf.random.categorical(last_logits, num_samples=1),
            axis=-1,
        )

        # Map the sampled IDs back to characters.
        return self.chars_from_ids(sampled_ids), states


In [59]:
# Wrap the trained model for sampling; logits are divided by the temperature
# (0.5 here) before each character is drawn.
one_step_model = OneStep(
    model=gru_model,
    chars_from_ids=chars_from_ids,
    ids_from_chars=ids_from_chars,
    temperature=0.5,
)

Generating some text.

In [60]:
# Seed the generator with a speaker tag. `states=None` makes the model start
# from the GRU's default initial state on the first step.
states = None
next_char = tf.constant(['JERRY:'])
result = [next_char]

# Sample 1000 characters one at a time, feeding each prediction and the
# returned state back in as the next input.
start = time.time()
for n in range(1000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)

result = tf.strings.join(result)
end = time.time()

print(result[0].numpy().decode('utf-8'))
# Elapsed wall-clock time for the sampling loop and the final join.
print('\nRun time:', end - start)

JERRY: CRAIG 
 Do we have to eat the show?
 

 GEORGE 
 Hey (he sees a box of cereal. He sits on the street, and 
 are talking on
 
 the table. She scrats his hand her begins to leave, but hits the 
 door.
 
 
 JERRY
 Hey, I have to say anything.

 
 ELAINE
 It's Mickey Mantle bad.

 
 JERRY
 He was a funny guy?

 
 ELAINE
 Why would you expect me to change 
 that?
 
 
 JERRY
 I was just leaving.

 
 GEORGE
 Hey. This is what you think they're 
 beautiful. What do you think?
 
 
 
 JERRY
 Oh, I got it from the vidious name. 
 That's a show about this green.
 
 
 JERRY
 Maybe you've got a pen.

 
 ELAINE
 What?

 
 JERRY
 Yeah, the big news are going to be 
 trading that one.
 
 
 GEORGE
 No, no no no no. That's right. It was 
 a complete was the last one. The only thing 
 I know what this is not me.
 
 
 JERRY
 Why don't you just go down there? The 
 area? The cable guy gave her to the 
 Hamptons.
 
 
 GEORGE
 (still looking at the store) It's the 
 world of soda in the sidewalk and 
 