In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)

for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

2.0.0
sys.version_info(major=3, minor=7, micro=4, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.17.4
pandas 0.25.3
sklearn 0.22
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [2]:
input_filepath = './shakespeare.txt'
# Read the whole corpus into memory. The context manager closes the file
# handle deterministically (a bare open().read() leaves that to the garbage
# collector), and the explicit encoding keeps the result independent of the
# platform's default locale.
with open(input_filepath, 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Sanity check: corpus size in characters and a short preview.
print(len(text))
print(text[0:100])

65536
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [3]:
# 1. generate vocab
# 2. build mapping char -> id
# 3. data -> id_data
# 4. abcd -> bcd<eos>

vocab = sorted(set(text))
print(len(vocab))

59


In [4]:
print(vocab)

['\n', ' ', '!', "'", ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [5]:
# Map each character to its position in the sorted vocabulary.
char2idx = dict(zip(vocab, range(len(vocab))))

In [6]:
print(char2idx)

{'\n': 0, ' ': 1, '!': 2, "'": 3, ',': 4, '-': 5, '.': 6, ':': 7, ';': 8, '?': 9, 'A': 10, 'B': 11, 'C': 12, 'D': 13, 'E': 14, 'F': 15, 'G': 16, 'H': 17, 'I': 18, 'J': 19, 'K': 20, 'L': 21, 'M': 22, 'N': 23, 'O': 24, 'P': 25, 'R': 26, 'S': 27, 'T': 28, 'U': 29, 'V': 30, 'W': 31, 'Y': 32, 'a': 33, 'b': 34, 'c': 35, 'd': 36, 'e': 37, 'f': 38, 'g': 39, 'h': 40, 'i': 41, 'j': 42, 'k': 43, 'l': 44, 'm': 45, 'n': 46, 'o': 47, 'p': 48, 'q': 49, 'r': 50, 's': 51, 't': 52, 'u': 53, 'v': 54, 'w': 55, 'x': 56, 'y': 57, 'z': 58}


In [7]:
idx2char = np.array(vocab)
print(idx2char)

['\n' ' ' '!' "'" ',' '-' '.' ':' ';' '?' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H'
 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'R' 'S' 'T' 'U' 'V' 'W' 'Y' 'a' 'b' 'c'
 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q' 'r' 's' 't' 'u'
 'v' 'w' 'x' 'y' 'z']


In [8]:
# Encode the whole corpus as an array of integer ids, one id per character.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))
# Show the first ten ids next to the characters they encode.
print(text_as_int[:10])
print(text[:10])

[15 41 50 51 52  1 12 41 52 41]
First Citi


In [9]:
def split_input_target(id_text):
    """Split a sequence into an (input, target) pair shifted by one step.

    The input is everything except the last element and the target is
    everything except the first, e.g. abcde -> (abcd, bcde).
    """
    inputs = id_text[:-1]
    targets = id_text[1:]
    return inputs, targets

# Stream of individual character ids: one dataset element per entry of text_as_int.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
# Length of each training input; chunks hold one extra id so that
# split_input_target can produce an input and a target of seq_length each.
seq_length = 100
# Group consecutive ids into chunks of seq_length + 1; drop_remainder = True
# discards a trailing chunk that is shorter than that.
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder = True)

# Preview: the first two character ids and the characters they decode to.
for ch_id in char_dataset.take(2):
    print(ch_id, idx2char[ch_id.numpy()])

# Preview: the first two chunks, as raw ids and as decoded text.
for seq_id in seq_dataset.take(2):
    print(seq_id)
    print(repr(''.join(idx2char[seq_id.numpy()])))

tf.Tensor(15, shape=(), dtype=int64) F
tf.Tensor(41, shape=(), dtype=int64) i
tf.Tensor(
[15 41 50 51 52  1 12 41 52 41 58 37 46  7  0 11 37 38 47 50 37  1 55 37
  1 48 50 47 35 37 37 36  1 33 46 57  1 38 53 50 52 40 37 50  4  1 40 37
 33 50  1 45 37  1 51 48 37 33 43  6  0  0 10 44 44  7  0 27 48 37 33 43
  4  1 51 48 37 33 43  6  0  0 15 41 50 51 52  1 12 41 52 41 58 37 46  7
  0 32 47 53  1], shape=(101,), dtype=int64)
'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
tf.Tensor(
[33 50 37  1 33 44 44  1 50 37 51 47 44 54 37 36  1 50 33 52 40 37 50  1
 52 47  1 36 41 37  1 52 40 33 46  1 52 47  1 38 33 45 41 51 40  9  0  0
 10 44 44  7  0 26 37 51 47 44 54 37 36  6  1 50 37 51 47 44 54 37 36  6
  0  0 15 41 50 51 52  1 12 41 52 41 58 37 46  7  0 15 41 50 51 52  4  1
 57 47 53  1 43], shape=(101,), dtype=int64)
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [10]:
# Turn every (seq_length + 1)-long chunk into an (input, target) pair.
# The pipeline is rebuilt from char_dataset instead of reassigning seq_dataset
# from itself, so re-running this cell does not try to split pairs that were
# already split (which fails, because map would then pass two arguments).
seq_dataset = char_dataset.batch(seq_length + 1, drop_remainder=True).map(split_input_target)

In [11]:
for item_input, item_output in seq_dataset.take(2):
    print(item_input)
    print(item_output)

tf.Tensor(
[15 41 50 51 52  1 12 41 52 41 58 37 46  7  0 11 37 38 47 50 37  1 55 37
  1 48 50 47 35 37 37 36  1 33 46 57  1 38 53 50 52 40 37 50  4  1 40 37
 33 50  1 45 37  1 51 48 37 33 43  6  0  0 10 44 44  7  0 27 48 37 33 43
  4  1 51 48 37 33 43  6  0  0 15 41 50 51 52  1 12 41 52 41 58 37 46  7
  0 32 47 53], shape=(100,), dtype=int64)
tf.Tensor(
[41 50 51 52  1 12 41 52 41 58 37 46  7  0 11 37 38 47 50 37  1 55 37  1
 48 50 47 35 37 37 36  1 33 46 57  1 38 53 50 52 40 37 50  4  1 40 37 33
 50  1 45 37  1 51 48 37 33 43  6  0  0 10 44 44  7  0 27 48 37 33 43  4
  1 51 48 37 33 43  6  0  0 15 41 50 51 52  1 12 41 52 41 58 37 46  7  0
 32 47 53  1], shape=(100,), dtype=int64)
tf.Tensor(
[33 50 37  1 33 44 44  1 50 37 51 47 44 54 37 36  1 50 33 52 40 37 50  1
 52 47  1 36 41 37  1 52 40 33 46  1 52 47  1 38 33 45 41 51 40  9  0  0
 10 44 44  7  0 26 37 51 47 44 54 37 36  6  1 50 37 51 47 44 54 37 36  6
  0  0 15 41 50 51 52  1 12 41 52 41 58 37 46  7  0 15 41 50 51 52  4  1
 57 47 

In [12]:
# Number of sequences per training batch, and the size of the shuffle buffer.
batch_size = 64
buffer_size = 10000
# Shuffle the (input, target) pairs, then group them into batches of
# batch_size; drop_remainder=True discards a final, smaller batch so every
# batch has the same leading dimension.
# NOTE(review): seq_dataset is reassigned from itself, so re-running this cell
# on its own shuffles and batches the already-batched dataset a second time.
seq_dataset = seq_dataset.shuffle(buffer_size).batch(batch_size, drop_remainder=True)

In [14]:
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 1024

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: number of distinct characters; used as the embedding
            input size and as the width of the final Dense layer.
        embedding_dim: size of each character embedding vector.
        rnn_units: number of units in the LSTM layer.
        batch_size: fixed batch dimension passed to the Embedding layer via
            batch_input_shape; the sequence length is left unspecified (None).

    Returns:
        An uncompiled keras Sequential model whose Dense layer has no
        activation, so it outputs one raw score per vocabulary entry.
    """
    embedding = keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = keras.layers.LSTM(
        units=rnn_units,
        stateful=True,
        recurrent_initializer='glorot_uniform',
        return_sequences=True,
    )
    projection = keras.layers.Dense(vocab_size)
    return keras.models.Sequential([embedding, recurrent, projection])

model = build_model(vocab_size=vocab_size,
                    embedding_dim=embedding_dim,
                    rnn_units=rnn_units,
                    batch_size=batch_size)

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (64, None, 256)           15104     
_________________________________________________________________
lstm_1 (LSTM)                (64, None, 1024)          5246976   
_________________________________________________________________
dense_1 (Dense)              (64, None, 59)            60475     
Total params: 5,322,555
Trainable params: 5,322,555
Non-trainable params: 0
_________________________________________________________________


In [36]:
for input_example_batch, target_example_batch in seq_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape)

(64, 100, 59)


In [39]:
sample_indices = tf.random.categorical(
    logits=example_batch_predictions[0], num_samples=1)
print(sample_indices)
sample_indices = tf.squeeze(sample_indices, axis = -1)

tf.Tensor(
[[22]
 [ 1]
 [ 2]
 [44]
 [15]
 [44]
 [16]
 [37]
 [42]
 [21]
 [17]
 [38]
 [ 9]
 [ 0]
 [ 1]
 [22]
 [ 8]
 [17]
 [ 7]
 [ 5]
 [26]
 [42]
 [10]
 [39]
 [32]
 [39]
 [46]
 [17]
 [ 8]
 [36]
 [29]
 [37]
 [40]
 [58]
 [36]
 [41]
 [26]
 [31]
 [14]
 [16]
 [ 8]
 [35]
 [50]
 [43]
 [58]
 [49]
 [42]
 [ 7]
 [26]
 [ 8]
 [45]
 [ 5]
 [27]
 [44]
 [56]
 [26]
 [10]
 [ 8]
 [14]
 [58]
 [ 0]
 [49]
 [28]
 [47]
 [28]
 [27]
 [23]
 [21]
 [20]
 [57]
 [ 9]
 [38]
 [55]
 [58]
 [26]
 [32]
 [47]
 [ 2]
 [ 4]
 [34]
 [20]
 [31]
 [45]
 [51]
 [47]
 [15]
 [ 3]
 [17]
 [57]
 [34]
 [58]
 [44]
 [20]
 [36]
 [57]
 [12]
 [23]
 [55]
 [ 1]
 [ 5]], shape=(100, 1), dtype=int64)


In [40]:
print('input:', repr(''.join(idx2char[input_example_batch[0]])))
print('output:',repr(''.join(idx2char[target_example_batch[0]])) )
print('Predictions:', repr(''.join(idx2char[sample_indices])))

input: 'sts to you,\nWhere he should find you lions, finds you hares;\nWhere foxes, geese: you are no surer, n'
output: 'ts to you,\nWhere he should find you lions, finds you hares;\nWhere foxes, geese: you are no surer, no'
Predictions: "M !lFlGejLHf?\n M;H:-RjAgYgnH;dUehzdiRWEG;crkzqj:R;m-SlxRA;Ez\nqToTSNLKy?fwzRYo!,bKWmsoF'HybzlKdyCNw -"


In [43]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    from_logits=True tells keras that `logits` are unnormalized scores rather
    than probabilities.
    """
    crossentropy = keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss = loss)
example_loss = loss(target_example_batch, example_batch_predictions)
print(example_loss.shape)
print(example_loss.numpy().mean())

(64, 100)
4.079889


In [44]:
output_dir = './text_generation_checkpoints'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is a single
# call, has no check-then-create race, and also creates missing parent
# directories if output_dir is ever changed to a nested path.
os.makedirs(output_dir, exist_ok=True)
# '{epoch}' is filled in by the callback, giving one checkpoint per epoch.
checkpoint_prefix = os.path.join(output_dir, 'ckpt_{epoch}')
# Save only the weights (not the full model) at the end of every epoch.
checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

epochs = 100
history = model.fit(seq_dataset, epochs=epochs,
                    callbacks=[checkpoint_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [45]:
tf.train.latest_checkpoint(output_dir)

'./text_generation_checkpoints/ckpt_100'

In [47]:
# Rebuild the same architecture with batch size 1 for one-sequence-at-a-time
# generation, then restore the most recently saved training weights into it.
model2 = build_model(vocab_size, embedding_dim, rnn_units, batch_size = 1)
model2.load_weights(tf.train.latest_checkpoint(output_dir))
# Build the inference model (model2). The original call targeted `model`,
# the batch-size-64 training model, with a batch-size-1 input shape.
model2.build(tf.TensorShape([1, None]))
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            15104     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (1, None, 1024)           1311744   
_________________________________________________________________
dense_1 (Dense)              (1, None, 59)             60475     
Total params: 1,387,323
Trainable params: 1,387,323
Non-trainable params: 0
_________________________________________________________________


In [95]:
def generate_text(model, start_string, num_generate = 1000, temperature = 0.5):
    """Generate text by repeatedly sampling the next character from `model`.

    Args:
        model: model called with a batch of one id sequence (shape (1, length));
            it must provide reset_states(). Only the newly sampled id is fed
            back on each step, so the model presumably has to be stateful to
            keep the earlier context.
        start_string: non-empty seed text; every character must be a key of
            char2idx.
        num_generate: number of characters to sample after the seed.
        temperature: positive divisor applied to the logits before sampling.
            Values below 1 sharpen the distribution (more predictable text),
            values above 1 flatten it (more surprising text). The default of
            0.5 is the value that used to be hard-coded.

    Returns:
        start_string followed by the num_generate sampled characters.

    Raises:
        ValueError: if start_string is empty or temperature is not positive.
        KeyError: if start_string contains a character missing from char2idx.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    # Encode the seed and add a leading batch dimension of size 1.
    input_eval = [char2idx[ch] for ch in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []
    # Clear recurrent state left over from any previous call.
    model.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension: one row of logits per input position.
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        # Sample for every position, but keep only the last position's draw,
        # which is the prediction for the next character.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        text_generated.append(idx2char[predicted_id])
        # Feed just the sampled character back in as the next input.
        input_eval = tf.expand_dims([predicted_id], 0)
    return start_string + ''.join(text_generated)

new_text = generate_text(model2, 'All:')

print(new_text)
        

All:
Wine he y t d chacoof tive----------------------ped t,
Aut, the thivequthad Jube tind tizeseres.
Thind akemanoffore
NUSize llen pat maxI I an Lalve,
Thacan ase mabuse whald he as thare t benoobuce aldir are hasthe pesthafoff pathathore vencond hak these the:
CO,
Nambetharur Yo'lldofis: thar coves the y t afoucharar,
Corut che ty ll besthe athaspllllize h wine tharersthano.
I th mee.
NUS:
CO ato the ty thin pavese COMad thave t t the habe y orenesuro be ponghenth me asize sthif gous hemil ake: se athito!
He, ano bese teFise thino,
The:
I t
Thakeves
Akeco.
acoussthellly A: ss ff pare p t on tingre thakemacato hik t.
CO
NUS ty anoure thess my m this thist thize t t tind aco wAnd t,
CI thas theston an trenof che or arevachinod I ses,
I s he ize ad th the hid thilllel he ano inof h had haremor co!
Conost nere ticknodifo thase chemy s acow the are all y thene VO, t ak coves I and ve: hareno thakn y topand withize heren qure; ize aknd,
Pr thad s y atis,
NENounonconoucous thare incagimy n