In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import tensorflow_datasets as tfds
import os

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')

for gpu in gpus:
    print("Name:", gpu.name, "  Type:", gpu.device_type)

tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])

Name: /physical_device:GPU:0   Type: GPU


In [3]:
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

for name in FILE_NAMES:
  text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL+name)
  
parent_dir = os.path.dirname(text_dir)

parent_dir

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt


'C:\\Users\\sunlin\\.keras\\datasets'

In [4]:
def labeler(example, index):
  return example, tf.cast(index, tf.int64)  

labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
  lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name))
  labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
  labeled_data_sets.append(labeled_dataset)

In [5]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

In [6]:
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
  all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
  
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

In [7]:
for ex in all_labeled_data.take(5):
  print(ex)

(<tf.Tensor: id=74, shape=(), dtype=string, numpy=b'While, every Grecian heart he tamed, and took'>, <tf.Tensor: id=75, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=76, shape=(), dtype=string, numpy=b'that led off at the start must have been disabled out on the plain. I'>, <tf.Tensor: id=77, shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: id=78, shape=(), dtype=string, numpy=b"From slaughter'd Trojans, after Ocean's God">, <tf.Tensor: id=79, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=80, shape=(), dtype=string, numpy=b'Around, a darksome trench; beyond, a fence'>, <tf.Tensor: id=81, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=82, shape=(), dtype=string, numpy=b'of fleet Acamas chief of the Thracians. "Sons of Priam," said he, "how'>, <tf.Tensor: id=83, shape=(), dtype=int64, numpy=2>)


In [8]:
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
vocab_size

17178

In [9]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [10]:
example_text = next(iter(all_labeled_data))[0].numpy()
print(example_text)

b'While, every Grecian heart he tamed, and took'


In [11]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[2976, 5831, 724, 3308, 11976, 6430, 12047, 15273]


In [12]:
def encode(text_tensor, label):
  encoded_text = encoder.encode(text_tensor.numpy())
  return encoded_text, label

def encode_map_fn(text, label):
  return tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))

all_encoded_data = all_labeled_data.map(encode_map_fn)

In [13]:
train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=([-1],[]))

test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=([-1],[]))

In [14]:
sample_text, sample_labels = next(iter(test_data))

sample_text[0], sample_labels[0]

(<tf.Tensor: id=99547, shape=(16,), dtype=int64, numpy=
 array([ 2976,  5831,   724,  3308, 11976,  6430, 12047, 15273,     0,
            0,     0,     0,     0,     0,     0,     0], dtype=int64)>,
 <tf.Tensor: id=99551, shape=(), dtype=int64, numpy=0>)

In [15]:
vocab_size += 1

In [16]:
model = tf.keras.Sequential()

In [17]:
model.add(tf.keras.layers.Embedding(vocab_size, 64))

In [18]:
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

In [19]:
# One or more dense layers.
# Edit the list in the `for` line to experiment with layer sizes.
for units in [64, 64]:
  model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer. The first argument is the number of labels.
model.add(tf.keras.layers.Dense(3, activation='softmax'))

In [20]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [21]:
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x225471c7c08>

In [22]:
eval_loss, eval_acc = model.evaluate(test_data)

print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))


Eval loss: 0.382, Eval accuracy: 0.844
