In [1]:
import numpy as np
import os
import re
import random
import unidecode
import time
import tensorflow as tf

# Fetch the Bible text corpus; the returned value is used below as the local
# path of the downloaded file.
corpus_url = 'https://raw.githubusercontent.com/mxw/grmr/master/src/finaltests/bible.txt'
path_to_file = tf.keras.utils.get_file('bible.txt', corpus_url)

In [2]:
# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# guarantees the file handle is closed as soon as the read finishes.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

# Take a look at the first 250 characters in text
print(text[:250])

Length of text: 4451368 characters
1:1 In the beginning God created the heaven and the earth.

1:2 And the earth was without form, and void; and darkness was upon
the face of the deep. And the Spirit of God moved upon the face of the
waters.

1:3 And God said, Let there be light


In [3]:
# Keep only the first 500,000 characters of the corpus for the rest of the
# notebook.
max_chars = 500000
text = text[:max_chars]

In [4]:
# The vocabulary is the sorted list of distinct characters found in the text.
unique_chars = set(text)
vocab = sorted(unique_chars)
print('{} unique characters'.format(len(vocab)))

73 unique characters


In [5]:
# Lookup tables in both directions: character -> index and index -> character
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as an array of vocabulary indices
text_as_int = np.array([char2idx[ch] for ch in text])

In [6]:
# Pretty-print the first 20 entries of the character -> index mapping
print('{')
for char in list(char2idx)[:20]:
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  '\n':   0,
  '\r':   1,
  ' ' :   2,
  '!' :   3,
  "'" :   4,
  '(' :   5,
  ')' :   6,
  ',' :   7,
  '-' :   8,
  '.' :   9,
  '0' :  10,
  '1' :  11,
  '2' :  12,
  '3' :  13,
  '4' :  14,
  '5' :  15,
  '6' :  16,
  '7' :  17,
  '8' :  18,
  '9' :  19,
  ...
}


In [7]:
# Show how the first 13 characters from the text are mapped to integers
preview_len = 13
print('{} ---- characters mapped to int ---- > {}'.format(repr(text[:preview_len]), text_as_int[:preview_len]))

'1:1 In the be' ---- characters mapped to int ---- > [11 20 11  2 31 60  2 66 54 51  2 48 51]


In [8]:
# The maximum length sentence you want for a single input in characters
seq_length = 100
# Each example consumes seq_length + 1 characters (input plus shifted target)
examples_per_epoch = len(text) // (seq_length + 1)

# Stream the encoded corpus one character id at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Decode and print the first five elements as a sanity check
for char_id in char_dataset.take(5):
    print(idx2char[char_id.numpy()])

1
:
1
 
I


In [9]:
# Group the character stream into chunks of seq_length + 1 ids; a trailing
# chunk that is too short is dropped.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Decode the first five chunks back to text to inspect them
for chunk in sequences.take(5):
    decoded_chunk = ''.join(idx2char[chunk.numpy()])
    print(repr(decoded_chunk))

'1:1 In the beginning God created the heaven and the earth.\r\n\r\n1:2 And the earth was without form, and'
' void; and darkness was upon\r\nthe face of the deep. And the Spirit of God moved upon the face of the\r'
'\nwaters.\r\n\r\n1:3 And God said, Let there be light: and there was light.\r\n\r\n1:4 And God saw the light, '
'that it was good: and God divided the light\r\nfrom the darkness.\r\n\r\n1:5 And God called the light Day, '
'and the darkness he called Night.\r\nAnd the evening and the morning were the first day.\r\n\r\n1:6 And God'


In [10]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair
dataset = sequences.map(split_input_target)

# Decode the first pair to see the one-character shift between the two
for input_example, target_example in dataset.take(1):
    decoded_input = ''.join(idx2char[input_example.numpy()])
    decoded_target = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(decoded_input))
    print('Target data:', repr(decoded_target))

Input data:  '1:1 In the beginning God created the heaven and the earth.\r\n\r\n1:2 And the earth was without form, an'
Target data: ':1 In the beginning God created the heaven and the earth.\r\n\r\n1:2 And the earth was without form, and'


In [11]:
# Walk through the first five time steps of the example pair
first_steps = zip(input_example[:5], target_example[:5])
for step, (input_idx, target_idx) in enumerate(first_steps):
    input_char = repr(idx2char[input_idx])
    target_char = repr(idx2char[target_idx])
    print("Step {:4d}".format(step))
    print("  input: {} ({:s})".format(input_idx, input_char))
    print("  expected output: {} ({:s})".format(target_idx, target_char))

Step    0
  input: 11 ('1')
  expected output: 20 (':')
Step    1
  input: 20 (':')
  expected output: 11 ('1')
Step    2
  input: 11 ('1')
  expected output: 2 (' ')
Step    3
  input: 2 (' ')
  expected output: 31 ('I')
Step    4
  input: 31 ('I')
  expected output: 60 ('n')


In [12]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Rebuild the pipeline from `sequences` instead of re-wrapping `dataset`, so
# re-running this cell does not batch an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [13]:
# Number of distinct characters in the corpus; build_model uses this value as
# both the Embedding input size and the width of the final Dense layer.
vocab_size = len(vocab)

# Width of the vector each character id is embedded into
embedding_dim = 256

# Number of units in the GRU layer
rnn_units = 1024

In [14]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build an Embedding -> stateful GRU -> Dense character model.

    Args:
        vocab_size: size of the Embedding input vocabulary and number of
            units in the final Dense layer.
        embedding_dim: width of the embedding vectors.
        rnn_units: number of GRU units.
        batch_size: fixed batch size used in the model's input shape; the
            sequence dimension is left as None.

    Returns:
        An uncompiled `tf.keras.Sequential` model.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    output_layer = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, output_layer])

In [15]:
# Instantiate the training model with the training batch size
model = build_model(len(vocab), embedding_dim, rnn_units, batch_size=BATCH_SIZE)

In [16]:
# Run one batch through the model to check the output shape
input_example_batch, target_example_batch = next(iter(dataset.take(1)))
example_batch_predictions = model(input_example_batch)
print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 73) # (batch_size, sequence_length, vocab_size)


In [17]:
# Print the layer-by-layer summary of the training model built above
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           18688     
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 73)            74825     
Total params: 4,031,817
Trainable params: 4,031,817
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Sample one character id per time step from the logits of the first
# sequence in the batch, then drop the trailing sample axis.
sampled_indices = tf.squeeze(
    tf.random.categorical(example_batch_predictions[0], num_samples=1),
    axis=-1,
).numpy()
sampled_indices

array([54,  7, 29, 26, 30, 17, 25, 10, 12,  0, 13, 51,  8, 16,  8, 32, 16,
        6, 20, 32, 35, 65, 21, 47, 28, 30, 24, 51, 10, 48, 19, 28, 34, 69,
       62, 50, 41,  1, 57, 38, 51,  4, 30, 51, 57, 20,  3,  7, 38, 58,  4,
       54, 66,  8,  0, 49, 51,  8, 14,  2, 15, 66, 53, 16, 62, 61, 17, 55,
       13, 39, 60,  3, 34, 36, 70, 16, 49, 71, 69, 31, 71, 44, 51, 60, 21,
       63,  3, 14, 25, 35, 68, 34, 53, 40,  5, 54, 42, 54, 60,  4],
      dtype=int64)

In [19]:
# Compare the first input sequence of the batch with the characters sampled
# from the not-yet-trained model.
input_chars = "".join(idx2char[input_example_batch[0]])
predicted_chars = "".join(idx2char[sampled_indices ])
print("Input: \n", repr(input_chars))
print()
print("Next Char Predictions: \n", repr(predicted_chars))

Input: 
 'ish of spirit, and for cruel bondage.\r\n\r\n6:10 And the LORD spake unto Moses, saying, 6:11 Go in, spe'

Next Char Predictions: 
 "h,GDH7C02\n3e-6-J6):JMs;aFHBe0b9FLwpdT\rkPe'Hek:!,Pl'ht-\nce-4 5tg6po7i3Rn!LNx6cywIyWen;q!4CMvLgS(hUhn'"


In [20]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` is set because the model's final Dense layer has no
    activation.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

# Evaluate the loss on the example batch and report its mean
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", mean_loss)

Prediction shape:  (64, 100, 73)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       4.2896166


In [21]:
# Configure training: Adam optimizer with the logits-based loss defined above
model.compile(optimizer='adam', loss=loss)

In [23]:
# Number of full passes over the dataset made by model.fit
EPOCHS = 50

In [None]:
# Train the model; `history` holds the object returned by fit.
# NOTE(review): no callbacks are passed, so no checkpoints are written here.
history = model.fit(dataset, epochs=EPOCHS, verbose = 1)

Epoch 1/50

In [None]:
# The layer weights do not depend on the batch dimension, so carry the trained
# parameters over in memory rather than reading a checkpoint: no cell in this
# notebook defines `checkpoint_dir` or saves checkpoints during `model.fit`.
trained_weights = model.get_weights()

# Rebuild the same architecture with batch size 1 for text generation
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.build(tf.TensorShape([1, None]))

model.set_weights(trained_weights)

In [None]:
# Print the summary of the model rebuilt with batch size 1
model.summary()

In [None]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Generate text by sampling characters from the model one at a time.

    Args:
        model: stateful model that maps a batch of character ids to per-step
            logits over the vocabulary; it is called here with batch size 1.
        start_string: non-empty seed text. Every character must be a key of
            the module-level `char2idx`, otherwise a KeyError is raised.
        num_generate: number of characters to sample after the seed.
        temperature: positive divisor applied to the logits before sampling.
            Low temperature results in more predictable text, higher
            temperature in more surprising text.

    Returns:
        `start_string` followed by the generated characters.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not
            positive.
    """
    # Validate up front so bad arguments fail with a clear message instead of
    # an error from deep inside the model or the sampler.
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Characters sampled so far; joined into one string at the end
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Scale the logits by the temperature, then sample from the
        # categorical distribution of the last time step.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        # Pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return (start_string + ''.join(text_generated))

In [None]:
# Generate one sample per seed prompt with a single loop instead of five
# copy-pasted cells.
start_strings = [u"If ", u"When ", u"Behold, ", u"I, the LORD ", u"And thou shalt "]
for start_string in start_strings:
    print(generate_text(model, start_string=start_string))