In [None]:
import tensorflow as tf

import tensorflow_datasets as tfds
import os

In [None]:
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# Fetch every file through the Keras download helper; each call returns the
# local path of that file.
downloaded_paths = [
    tf.keras.utils.get_file(file_name, origin=DIRECTORY_URL + file_name)
    for file_name in FILE_NAMES
]

# Later cells build file paths from this directory; it is taken from the last
# downloaded file.
text_dir = downloaded_paths[-1]
parent_dir = os.path.dirname(text_dir)

parent_dir

In [None]:
# Download through the same Keras helper as the other texts so the file lands
# in the directory the later cells read from (`parent_dir`), instead of a
# hard-coded $HOME/.keras/datasets path written by a shell redirect (which
# also leaves a truncated file behind when the transfer fails).
shakespeare_path = tf.keras.utils.get_file(
    'shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

In [None]:
# Full corpus list, now including the Shakespeare text. The position of each
# name in this list is used as its class label when the datasets are built.
FILE_NAMES = [
    'cowper.txt',
    'derby.txt',
    'butler.txt',
    'shakespeare.txt',
]

In [None]:
def labeler(example, index):
    """Pair a dataset element with its class label.

    Args:
        example: The element to label; returned unchanged.
        index: Label value; it is cast to a `tf.int64` tensor.

    Returns:
        An `(example, label)` tuple.
    """
    label = tf.cast(index, tf.int64)
    return example, label


def _labeled_lines(label, file_name):
    """Return a dataset of `(line, label)` pairs for one text file.

    The file is looked up inside `parent_dir`; every line it yields is paired
    with `label` through `labeler`.
    """
    file_path = os.path.join(parent_dir, file_name)
    lines = tf.data.TextLineDataset(file_path)
    return lines.map(lambda line: labeler(line, label))


# One labelled dataset per file; a file's position in FILE_NAMES is its label.
labeled_data_sets = [
    _labeled_lines(label, file_name)
    for label, file_name in enumerate(FILE_NAMES)
]

In [None]:
# Pipeline settings shared by the cells below.
BUFFER_SIZE = 50_000  # passed to Dataset.shuffle as the buffer size
BATCH_SIZE = 64       # passed to Dataset.padded_batch as the batch size
TAKE_SIZE = 5_000     # number of examples taken for the test split

In [None]:
# Chain the per-file datasets end to end, starting from the first one.
combined = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    combined = combined.concatenate(labeled_dataset)

# Shuffle once and keep that order on every later iteration, so the
# take/skip split made further down always selects the same examples.
# NOTE(review): mixing across files is only thorough if BUFFER_SIZE is at
# least the total number of lines — confirm now that a fourth file is included.
all_labeled_data = combined.shuffle(
    buffer_size=BUFFER_SIZE,
    reshuffle_each_iteration=False,
)

In [None]:
# Peek at a few (text, label) pairs to sanity-check the labelled stream.
for labeled_example in all_labeled_data.take(15):
    print(labeled_example)

In [None]:
import tensorflow_text as tf_text

tokenizer = tf_text.WhitespaceTokenizer()

# Walk the whole labelled dataset once and collect every distinct token.
vocabulary_set = set()
for line_tensor, _label in all_labeled_data:
    line_tokens = tokenizer.tokenize(line_tensor.numpy()).numpy()
    vocabulary_set |= set(line_tokens)

# Number of distinct tokens seen (displayed as the cell output).
vocab_size = len(vocabulary_set)
vocab_size


In [None]:
import tensorflow_datasets as tfds

# Pass the vocabulary in sorted order: iterating a Python set of byte strings
# has no stable order across interpreter runs, so token ids would otherwise
# change every time the kernel restarts (breaking any saved weights or
# cached encodings).
encoder = tfds.deprecated.text.TokenTextEncoder(sorted(vocabulary_set))

In [None]:
# Pull the first (text, label) pair and show its text as a NumPy value.
first_text, _first_label = next(iter(all_labeled_data))
example_text = first_text.numpy()
print(example_text)

In [None]:
# Run the sample text through the encoder and show the resulting encoding.
encoded_example = encoder.encode(example_text)
print(encoded_example)

In [None]:
def encode(text_tensor, label):
    """Encode one example's text eagerly with the notebook-level `encoder`.

    Args:
        text_tensor: Eager tensor holding the text; its `.numpy()` value is
            what gets passed to `encoder.encode`.
        label: Label of the example; passed through untouched.

    Returns:
        An `(encoded_text, label)` tuple.
    """
    token_ids = encoder.encode(text_tensor.numpy())
    return token_ids, label


def encode_map_fn(text, label):
    """Wrap `encode` in `tf.py_function` so it can be used with `Dataset.map`.

    Args:
        text: Text component of a dataset element.
        label: Label component of a dataset element.

    Returns:
        An `(encoded_text, label)` tuple of `tf.int64` tensors, with static
        shapes `[None]` (variable-length id sequence) and `[]` (scalar).
    """
    encoded_text, label = tf.py_function(
        encode, inp=[text, label], Tout=(tf.int64, tf.int64))

    # `tf.py_function` returns tensors whose static shape is unknown. Declare
    # the shapes explicitly so batching and the Keras layers can see the rank.
    encoded_text.set_shape([None])
    label.set_shape([])

    return encoded_text, label


# Apply the py_function-based encoder to every (text, label) element.
all_encoded_data = all_labeled_data.map(encode_map_fn)

In [None]:
# Shapes handed to padded_batch for the (encoded_text, label) components:
# a one-dimensional sequence whose length is left open (-1) and a scalar.
padded_shapes = ([-1], [])

# The first TAKE_SIZE encoded examples form the test split.
test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=padded_shapes)

# Everything after those examples is shuffled and batched for training.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=padded_shapes)
)

In [None]:
# Take the first batch of the test split and display its first example
# together with that example's label.
first_batch = next(iter(test_data))
sample_text, sample_labels = first_batch

sample_text[0], sample_labels[0]

In [None]:
# Size the embedding table from the encoder itself instead of bumping the raw
# token count by one: besides id 0 for padding, the encoder also hands out
# id(s) for out-of-vocabulary tokens, so `len(vocabulary_set) + 1` is too
# small to index an unknown word. Plain assignment (rather than `+=`) also
# keeps this cell safe to re-run.
vocab_size = encoder.vocab_size

In [None]:
model = tf.keras.Sequential()

# Token ids -> 64-dimensional embedding vectors.
# NOTE(review): `mask_zero` is left at its default, so padding positions are
# fed to the LSTM like any other token — confirm this is intended.
model.add(tf.keras.layers.Embedding(vocab_size, 64))

# Bidirectional LSTM over the embedded sequence.
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

# Two fully connected ReLU layers of 64 units each.
for units in [64, 64]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer. The first argument is the number of labels; it is derived
# from FILE_NAMES (one label per file) so it cannot drift out of sync with
# the labels assigned when the datasets were built.
model.add(tf.keras.layers.Dense(len(FILE_NAMES), activation='softmax'))

In [None]:
# Print the model's layer-by-layer summary.
model.summary()

In [None]:
# Integer class labels -> sparse categorical cross-entropy; track accuracy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
%%time
# Train for 3 epochs, validating on the test split after each epoch.
# Takes roughly 4 minutes according to the original author's note.
model.fit(train_data, epochs=3, validation_data=test_data)

In [None]:
# Score the trained model on the test split and report both metrics.
eval_loss, eval_acc = model.evaluate(test_data)

report_template = '\nEval loss: {:.3f}, Eval accuracy: {:.3f}'
print(report_template.format(eval_loss, eval_acc))

In [None]:
import numpy as np
# Encode a hand-written sentence and wrap it as a batch of one for the model.
# The encoded ids get their own name instead of overwriting `example_text`,
# so that name keeps holding text (bytes) and cells that encode
# `example_text` still work when re-run.
example_text = b'to be or not to be'
example_ids = encoder.encode(example_text)
encoded_example_np = np.array([example_ids])
print(encoded_example_np)

In [None]:
# Predict class probabilities for the single encoded sentence.
model.predict([encoded_example_np])