In [1]:
import tensorflow as tf

import tensorflow_datasets as tfds
import os

In [2]:
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL + name)

parent_dir = os.path.dirname(text_dir)

parent_dir

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt
[1m815980/815980[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
[1m809730/809730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt
[1m807992/807992[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


'/root/.keras/datasets'

In [3]:
!curl https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt > $HOME/.keras/datasets/shakespeare.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

  pid, fd = os.forkpty()


100 1089k  100 1089k    0     0  6602k      0 --:--:-- --:--:-- --:--:-- 6641k


In [4]:
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt', 'shakespeare.txt']

In [5]:
def labeler(example, index):
    return example, tf.cast(index, tf.int64)


labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(os.path.join(
        parent_dir, file_name))
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)

In [6]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

In [7]:
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

all_labeled_data = all_labeled_data.shuffle(BUFFER_SIZE,
                                            reshuffle_each_iteration=False)

In [8]:
for ex in all_labeled_data.take(15):
    print(ex)

(<tf.Tensor: shape=(), dtype=string, numpy=b'Within the royal chariot all the lambs;'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'of these King Agapenor son of Ancaeus was commander, and they had sixty'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'And of all reverence, hast thou arrived,'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'And the respect peculiar by ourselves'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Of thee, who wast to me by night and day'>, <tf.Tensor: shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'other of the gods, and praying each one of them that he might live to'>, <tf.Tensor: shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: shape=(), dtype=string, numpy=b"Ascended. Then leap'd Priam to the ground,">, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf

In [9]:
import tensorflow_text as tf_text

tokenizer = tf_text.WhitespaceTokenizer()

vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens.numpy())

vocab_size = len(vocabulary_set)
vocab_size


51016

In [13]:
import tensorflow_datasets as tfds

encoder = tfds.deprecated.text.TokenTextEncoder(vocabulary_set)

In [47]:
example_text = next(iter(all_labeled_data))[0].numpy()
print(example_text) 

b'Within the royal chariot all the lambs;'


In [48]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[17382, 6020, 48875, 12162, 32293, 6020, 27878]


In [16]:
def encode(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy())
    return encoded_text, label


def encode_map_fn(text, label):
    return tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))


all_encoded_data = all_labeled_data.map(encode_map_fn)

In [17]:
train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=([-1],[]))

test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=([-1],[]))

In [18]:
sample_text, sample_labels = next(iter(test_data))

sample_text[0], sample_labels[0]

(<tf.Tensor: shape=(15,), dtype=int64, numpy=
 array([17382,  6020, 48875, 12162, 32293,  6020, 27878,     0,     0,
            0,     0,     0,     0,     0,     0])>,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>)

In [19]:
vocab_size += 1

In [20]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
for units in [64, 64]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer. The first argument is the number of labels.
model.add(tf.keras.layers.Dense(4, activation='softmax'))

In [68]:
model.summary()

In [21]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [22]:
%%time
# ~4 minutes
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3
   1319/Unknown [1m462s[0m 129ms/step - accuracy: 0.6863 - loss: 0.6946

  self.gen.throw(typ, value, traceback)


[1m1322/1322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m494s[0m 153ms/step - accuracy: 0.6865 - loss: 0.6941 - val_accuracy: 0.7582 - val_loss: 0.5884
Epoch 2/3
[1m1322/1322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m490s[0m 153ms/step - accuracy: 0.9008 - loss: 0.2603 - val_accuracy: 0.8176 - val_loss: 0.4542
Epoch 3/3
[1m1322/1322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m490s[0m 153ms/step - accuracy: 0.9529 - loss: 0.1292 - val_accuracy: 0.7938 - val_loss: 0.5457
CPU times: user 25min 8s, sys: 35.7 s, total: 25min 44s
Wall time: 24min 34s


<keras.src.callbacks.history.History at 0x7f87777d0b20>

In [53]:
eval_loss, eval_acc = model.evaluate(test_data)

print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 397ms/step - accuracy: 0.7902 - loss: 0.5386

Eval loss: 0.546, Eval accuracy: 0.794


In [64]:
import numpy as np
example_text = b'to be or not to be'
example_text = encoder.encode(example_text)
encoded_example_np = np.array([example_text]) 
print(encoded_example_np)

[[37290  7285 49839 13458 37290  7285]]


In [65]:
model.predict( [encoded_example_np,])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step


array([[0.05457374, 0.07843722, 0.69496536, 0.17202371]], dtype=float32)