In [1]:
## Importing all required libraries

import tensorflow as tf
import tensorflow_datasets as tfds
import os
import datetime

In [2]:
## Define the directories and global constants

parent_dir = "/content"
FILE_NAMES = ['news.txt']

BUFFER_SIZE = 2000
BATCH_SIZE = 128
TAKE_SIZE = 200000

In [3]:
def labeler(example, index):
  return example, tf.cast(index, tf.int64)

In [4]:
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
  lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir, file_name))
  labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
  labeled_data_sets.append(labeled_dataset)

all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
  all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
  
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

print("Dataset Items Before Encoding:")
print("-------------------------------")
for ex in all_labeled_data.take(2):
  print(ex)
print("-------------------------------")

tokenizer = tfds.deprecated.text.Tokenizer()

vocabulary_set = set()

for text_tensor, _ in all_labeled_data:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)

print("Vocabulary size.   :" + str(vocab_size))
print("-------------------------------")
print(vocabulary_set)
print("-------------------------------")

encoder = tfds.deprecated.text.TokenTextEncoder(vocabulary_set)

example_text = next(iter(all_labeled_data))[0].numpy()
print("Example Sentence:")
print("-------------------------------")
print(example_text)

print("Encoded Example Sentence:")
print("-------------------------------")
encoded_example = encoder.encode(example_text)
print(encoded_example)

Dataset Items Before Encoding:
-------------------------------
(<tf.Tensor: shape=(), dtype=string, numpy=b'dance Sexy Nukim fall'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'bts member RM J Hope see sit audience'>, <tf.Tensor: shape=(), dtype=int64, numpy=0>)
-------------------------------
Vocabulary size.   :13375
-------------------------------
-------------------------------
Example Sentence:
-------------------------------
b'dance Sexy Nukim fall'
Encoded Example Sentence:
-------------------------------
[1702, 5366, 2003, 11276]


In [5]:
def encode(text_tensor, label):
  encoded_text = encoder.encode(text_tensor.numpy())
  return encoded_text, label

In [6]:
def encode_map_fn(text, label):
  encoded_text, label = tf.py_function(encode, 
                                       inp=[text, label], 
                                       Tout=(tf.int64, tf.int64))
  return encoded_text, label

In [7]:
all_encoded_data = all_labeled_data.map(encode_map_fn)

In [8]:
# train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = all_encoded_data.shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=([2000],()))

test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=([2000],()))


sample_text, sample_labels = next(iter(test_data))

print(sample_text[10])
print(sample_labels[10])

print(sample_text[11])
print(sample_labels[11])

tf.Tensor([2872 7262 1785 ...    0    0    0], shape=(2000,), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)
tf.Tensor([ 7919 10845 10058 ...     0     0     0], shape=(2000,), dtype=int64)
tf.Tensor(0, shape=(), dtype=int64)


In [9]:
#Training a LSTM model to test the data pipeline
vocab_size += 1

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

for units in [64, 64]:
  model.add(tf.keras.layers.Dense(units, activation='relu'))

In [11]:
# Output layer. The first argument is the number of labels.
model.add(tf.keras.layers.Dense(3, activation='softmax'))

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005, amsgrad=True)


model.compile(optimizer= optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'],
              run_eagerly=True)

log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)


model.fit(train_data, epochs=10, steps_per_epoch=4, validation_data=test_data, callbacks=[tensorboard_callback])

eval_loss, eval_acc = model.evaluate(test_data)

print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Eval loss: 0.599, Eval accuracy: 1.000
