In [1]:
import os
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"

import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
DIRECTORY_URL = "https://storage.googleapis.com/download.tensorflow.org/data/illiad/"
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL + name)
    
parent_dir = os.path.dirname(text_dir)
parent_dir

'/home/kitamura/.keras/datasets'

In [3]:
def labeler(example, index):
    return example, tf.cast(index, tf.int64)

labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name))
    labeled_dataset = lines_dataset.map(lambda ex : labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)

In [4]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

In [5]:
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

all_labeled_data = all_labeled_data.shuffle(BUFFER_SIZE, reshuffle_each_iteration=False)

In [6]:
for ex in all_labeled_data.take(5):
    print(ex)

(<tf.Tensor: id=74, shape=(), dtype=string, numpy=b'And to Dulichium wandering, there abode.'>, <tf.Tensor: id=75, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=76, shape=(), dtype=string, numpy=b'Well exercised in battle, who have shed'>, <tf.Tensor: id=77, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=78, shape=(), dtype=string, numpy=b'goodly array about his shoulders, and right glad was he that his'>, <tf.Tensor: id=79, shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: id=80, shape=(), dtype=string, numpy=b'Son of Eumedes. I will all unfold,'>, <tf.Tensor: id=81, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=82, shape=(), dtype=string, numpy=b'Still should his flesh be firm and fresh as now:'>, <tf.Tensor: id=83, shape=(), dtype=int64, numpy=1>)


In [7]:
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
vocab_size

17178

In [8]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [9]:
example_text = next(iter(all_labeled_data))[0].numpy()
print(example_text)

b'And to Dulichium wandering, there abode.'


In [10]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[10153, 10176, 2055, 6174, 10624, 3066]


In [11]:
def encode(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy())
    return encoded_text, label

def encode_map_fn(text, label):
    return tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))

all_encoded_data = all_labeled_data.map(encode_map_fn)

In [12]:
train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=([-1], []))

test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=([-1], []))

In [13]:
sample_text, sample_labels = next(iter(test_data))
sample_text[0], sample_labels[0]

(<tf.Tensor: id=99547, shape=(15,), dtype=int64, numpy=
 array([10153, 10176,  2055,  6174, 10624,  3066,     0,     0,     0,
            0,     0,     0,     0,     0,     0])>,
 <tf.Tensor: id=99551, shape=(), dtype=int64, numpy=0>)

In [14]:
vocab_size += 1

In [15]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

for units in [64, 64]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))
    
model.add(tf.keras.layers.Dense(3, activation='softmax'))
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [16]:
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7efb5c785080>

In [17]:
eval_loss, eval_acc = model.evaluate(test_data)

