In [1]:
# Load text

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [3]:
import tensorflow as tf

import tensorflow_datasets as tfds
import os

In [4]:
# 텍스트들은 William Cowper, Edward, earl of Derby, Samuel Butler의 글들이 있다.
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

In [5]:
for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL+name)

parent_dir = os.path.dirname(text_dir)

In [6]:
parent_dir

'C:\\Users\\tjsxo\\.keras\\datasets'

In [7]:
# tf.data.Dataset.map를 이용하여 데이터와 라벨을 매핑한다.
# tf.cast는 객체를 새로운 텐서 속성으로 변환시켜준다.
def labeler(example, index):
    return example, tf.cast(index, tf.int64)

In [8]:
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name)) # 읽어들인 txt파일을 한줄단위로 저장한다.
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex,i)) # 각 줄을 저자를 label로 매핑한다.
    labeled_data_sets.append(labeled_dataset) # 매핑된 각 줄 데이터들을 한 리스트에 모은다.

In [9]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

In [10]:
all_labeled_data = labeled_data_sets[0]

# concatenate를 사용하여 데이터를 합쳐준다.
for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
    
all_labeled_data = all_labeled_data.shuffle(BUFFER_SIZE, reshuffle_each_iteration=False)

In [11]:
for ex in all_labeled_data.take(5):
    print(ex)

(<tf.Tensor: id=74, shape=(), dtype=string, numpy=b'But when Bellerophon, at last, himself'>, <tf.Tensor: id=75, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=76, shape=(), dtype=string, numpy=b'That day the son of Atreus, in the midst'>, <tf.Tensor: id=77, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=78, shape=(), dtype=string, numpy=b'Son of Arisbas. Lycomedes saw'>, <tf.Tensor: id=79, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=80, shape=(), dtype=string, numpy=b'Thy love and thy regard, divide the prize'>, <tf.Tensor: id=81, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=82, shape=(), dtype=string, numpy=b'fire--though his hands be fire and his strength iron."'>, <tf.Tensor: id=83, shape=(), dtype=int64, numpy=2>)


In [12]:
# tensorflowdatabases의 Tokenizer를 사용하여 텍스트 안의 단어들을 토큰화 시켜 변환시킨다.
# 단어를 숫자로 치환한다.
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()

# 저자의 정보는 중요하지 않으므로 _로 표시한다..
for text_tensor, _ in all_labeled_data:
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
vocab_size

17178

In [13]:
# 인코더를 거침으로서 단어를 숫자로 치환할 수 있다.
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [14]:
example_text = next(iter(all_labeled_data))[0].numpy()
print(example_text)

b'But when Bellerophon, at last, himself'


In [15]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[9745, 11380, 7089, 10481, 12160, 1357]


In [16]:
def encode(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy())
    return encoded_text, label

In [17]:
def encode_map_fn(text, label):
    encoded_text, label = tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))

    encoded_text.set_shape([None])
    label.set_shape([])

    return encoded_text, label

all_encoded_data = all_labeled_data.map(encode_map_fn)

In [18]:
train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

In [19]:
sample_text, sample_labels = next(iter(test_data))

sample_text[0], sample_labels[0]

(<tf.Tensor: id=99547, shape=(16,), dtype=int64, numpy=
 array([ 9745, 11380,  7089, 10481, 12160,  1357,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0], dtype=int64)>,
 <tf.Tensor: id=99551, shape=(), dtype=int64, numpy=0>)

In [20]:
vocab_size += 1

In [21]:
# Build the model
model = tf.keras.Sequential()

In [22]:
model.add(tf.keras.layers.Embedding(vocab_size, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
for units in [64, 64]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))

model.add(tf.keras.layers.Dense(3))

In [23]:
# Compile the model
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [24]:
# Train the model
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x241e4332148>

In [25]:
eval_loss, eval_acc = model.evaluate(test_data)

print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))


Eval loss: 0.378, Eval accuracy: 0.837
