##### Copyright 2018 The TensorFlow Authors.



In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Load text with tf.data

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/alpha/tutorials/load_data/text"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/load_data/text.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/docs/blob/master/site/en/tutorials/load_data/text.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

This tutorial provides an example of how to use `tf.data.TextLineDataset` to load examples from text files. `TextLineDataset` is designed to create a dataset from a text file, in which each example is a line of text from the original file. This is potentially useful for any text data that is primarily line-based (for example, poetry or dialogue).

In this tutorial, we'll use three different English translations of the same work, Homer's Iliad, and train a model to identify the translator given a single line of text.

## Setup

In [0]:
from __future__ import absolute_import, division, print_function 

import os
import requests
import time

import tensorflow as tf
import tensorflow_datasets as tfds


The texts of the three translations are from Project Gutenberg:

 - [William Cowper](http://www.gutenberg.org/cache/epub/16452/pg16452.txt)
 
 - [Edward, Earl of Derby](http://www.gutenberg.org/cache/epub/6150/pg6150.txt)
 
- [Samuel Butler](http://www.gutenberg.org/cache/epub/2199/pg2199.txt)

The text files used in this tutorial have undergone some typical preprocessing tasks, mostly removing extraneous material — the document header and footer, line numbers, and chapter titles. Download these lightly munged files locally.

In [0]:
DIRECTORY_URL = 'https://s3.amazonaws.com/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# Download each translation into the working directory under its own name.
for name in FILE_NAMES:
  response = requests.get(DIRECTORY_URL + name, allow_redirects=True)
  # Fail loudly on an HTTP error instead of saving an error page as text data.
  response.raise_for_status()
  # `response.content` is bytes, so the file must be opened in binary mode;
  # text mode ('w') raises TypeError on Python 3.
  with open(name, 'wb') as f:
    f.write(response.content)

In [0]:
!ls

butler.txt  cowper.txt	derby.txt  sample_data


## Load text into datasets

Iterate through the files, loading each one into its own dataset.

Each example needs to be individually labeled, so use `tf.data.Dataset.map` to apply a labeler function to each one. This will iterate over every example in the dataset, returning (`example, label`) pairs.

In [0]:
def get_labeler(label_int):
  """Builds a function that pairs every example with one fixed label.

  Args:
    label_int: Integer label to attach; it is cast to `tf.int64` once, here.

  Returns:
    A callable that maps `example` to the pair `(example, label)`.
  """
  fixed_label = tf.cast(label_int, tf.int64)
  # The closure captures the already-cast label, so the cast is not repeated
  # for each example.
  return lambda example: (example, fixed_label)

# One labeled dataset per file; the label is the file's position in FILE_NAMES.
labeled_data_sets = [
    tf.data.TextLineDataset(file_name).map(get_labeler(label_index))
    for label_index, file_name in enumerate(FILE_NAMES)
]
  
  
  

Combine these labeled datasets into a single dataset, and shuffle it.


In [0]:
BUFFER_SIZE = 50000

# Chain every labeled dataset, however many files FILE_NAMES lists, rather
# than hard-coding indices 0, 1 and 2.
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
  all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

# Shuffle once and keep that order on every later iteration, so repeated
# passes over the dataset (and the take/skip split further down) see the
# same sequence of examples.
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

You can use `tf.data.Dataset.take` and `print` to see what the `(example, label)` pairs look like. The `numpy` property shows each Tensor's value.

In [0]:
# Peek at the first five (text, label) pairs.
for text, label in all_labeled_data.take(5):
  print((text, label))

(<tf.Tensor: id=65, shape=(), dtype=string, numpy='What Prince or Chief of the degenerate race'>, <tf.Tensor: id=66, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=69, shape=(), dtype=string, numpy='The Trojans, whom your deadly hate pursues.'>, <tf.Tensor: id=70, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=73, shape=(), dtype=string, numpy="All clad in gold, the golden lash he grasp'd">, <tf.Tensor: id=74, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=77, shape=(), dtype=string, numpy="The beakers crown'd, and wine from right to left">, <tf.Tensor: id=78, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=81, shape=(), dtype=string, numpy='"Now, however, let us obey the behests of night and get our suppers,'>, <tf.Tensor: id=82, shape=(), dtype=int64, numpy=2>)


## Encode text lines as numbers

Machine Learning models work on numbers, not words, so the string values need to be converted into lists of numbers. To do that, map each unique word to a unique integer.

### Build Vocabulary

First, build a vocabulary by tokenizing the text into a collection of individual unique words. There are a few ways to do this in both TensorFlow and Python. For this tutorial:

1. Iterate over each example's `numpy` value.
2. Use `tfds.features.text.Tokenizer` to split it into tokens.
3. Collect these tokens into a Python set, to remove duplicates.
4. Get the size of the vocabulary for later use.

In [0]:
tokenizer = tfds.features.text.Tokenizer()

# Union the tokens of every line into one set; the set drops duplicates.
vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
  vocabulary_set |= set(tokenizer.tokenize(text_tensor.numpy()))

# Number of distinct tokens, needed later to size the embedding layer.
vocab_size = len(vocabulary_set)
vocab_size

17178

### Encode examples

Create an encoder by passing the `vocabulary_set` to `tfds.features.text.TokenTextEncoder`. The encoder's `encode` method takes in a string of text and returns a list of integers.

In [0]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

You can try this on a single line to see what the output looks like.

In [0]:
example_text = next(iter(all_labeled_data))[0].numpy()
example_text

'What Prince or Chief of the degenerate race'

In [0]:
encoded_example = encoder.encode(example_text)
encoded_example

[14475, 9470, 14887, 6973, 14875, 5117, 8812, 3339]

Now run the encoder on the dataset by wrapping it in `tf.py_function` and  passing that to the dataset's `map` method.

In [0]:
def encode(text_tensor, label):
  """Integer-encodes one line of text with the module-level `encoder`.

  Args:
    text_tensor: Eager tensor holding the text; read through `.numpy()`.
    label: Label for the line, passed through unchanged.

  Returns:
    The pair `(encoded_text, label)`.
  """
  return encoder.encode(text_tensor.numpy()), label

def encode_map_fn(text, label):
  """Wraps `encode` in `tf.py_function` so `Dataset.map` can call it.

  Args:
    text: Scalar string tensor holding one line of text.
    label: Scalar int64 label tensor.

  Returns:
    The pair `(encoded_text, label)`: a 1-D int64 tensor of token ids and the
    scalar int64 label.
  """
  encoded_text, label = tf.py_function(
      encode, inp=[text, label], Tout=(tf.int64, tf.int64))
  # `tf.py_function` outputs carry no static shape. Restore it so that
  # downstream steps (`padded_batch`, Keras) can rely on it.
  encoded_text.set_shape([None])
  label.set_shape([])
  return encoded_text, label

all_encoded_data = all_labeled_data.map(encode_map_fn)

In [0]:
#MAX_LINE_LEN = max([tf.size(ex[0]) for ex in all_encoded_data]).numpy()
#MAX_LINE_LEN = 20

In [0]:
#MAX_LINE_LEN

## Split the dataset into test and train batches

Use `tf.data.Dataset.take` and `tf.data.Dataset.skip` to create a small test dataset and a larger training set.

Before being passed into the model, the datasets need to be batched. Typically, the examples inside of a batch need to be the same size and shape. But, the examples in these datasets are not all the same size — each line of text had a different number of words. So use `tf.data.Dataset.padded_batch` (instead of `batch`) to pad the examples to the same size.

In [0]:
TAKE_SIZE = 5000
PADDED_BATCH_SIZE = 50

# `take` and `skip` must use the same count so the test and training sets
# are disjoint and together cover the whole dataset.
# Lines have different lengths, so each batch is padded to its longest line.
test_data = all_encoded_data.take(TAKE_SIZE).padded_batch(
    PADDED_BATCH_SIZE, padded_shapes=([-1], []))
train_data = all_encoded_data.skip(TAKE_SIZE).padded_batch(
    PADDED_BATCH_SIZE, padded_shapes=([-1], []))

Now, `test_data` and `train_data` are not collections of (`example, label`) pairs, but collections of batches. Each batch is a pair of (*many examples*, *many labels*) represented as arrays.

To illustrate:

In [0]:
one_batch = next(iter(test_data))

# First encoded line of the batch (one of many).
# The trailing zeros are the padding that fills the line out to batch width.
first_encoded_line = one_batch[0].numpy()[0]
first_encoded_line

array([14475,  9470, 14887,  6973, 14875,  5117,  8812,  3339,     0,
           0,     0,     0,     0,     0,     0])

In [0]:
# a bunch of labels
one_batch[1]

<tf.Tensor: id=149372, shape=(50,), dtype=int64, numpy=
array([0, 1, 1, 0, 2, 2, 2, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 2, 2, 0,
       0, 1, 2, 0, 2, 1, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 2, 1, 1, 1, 0, 0,
       1, 0, 0, 1, 2, 2])>

Since we have introduced a new token encoding (the zero used for padding), the vocabulary size has increased by one.

In [0]:
# Recompute from the vocabulary rather than incrementing in place, so that
# re-running this cell cannot keep growing the value. The extra 1 accounts
# for the zero id used as padding.
vocab_size = len(vocabulary_set) + 1

## Build the model



For simplicity's sake, start with a `tf.keras.Sequential` model.

In [0]:
model = tf.keras.Sequential()

The input is a collection of integers representing text tokens (words). There are two downsides to this representation:

*  The integer-encoding is arbitrary (it does not capture any relationship between words).

*  An integer-encoding can be challenging for a model to interpret. A linear classifier, for example, learns a single weight for each feature. Because different words may have a similar encoding, this feature-weight combination is not meaningful.

To overcome these challenges, the first layer of the model will be a `tf.keras.layers.Embedding` layer. This transforms each integer into a dense collection of floating point values. The exact encoding of integers to collections of floats is learned during training, so the encodings become meaningful to the model, rather than arbitrary integer assignments.

When creating an embedding layer, the two required arguments are:

*   input dimensionality — the number of integers appearing in the input set (that is, the size of the vocabulary)
*   embedding dimension — the number of floating point values that will represent a single integer (higher values discover more complex relationships between words, but require more data)

See the [Word Embeddings](../tutorials/sequences/word_embeddings) tutorial for more details on using this layer.

In [0]:
model.add(tf.keras.layers.Embedding(vocab_size, 64))

Next, we'll create a bidirectional Long Short-Term Memory layer by wrapping `tf.keras.layers.LSTM` in `tf.keras.layers.Bidirectional`. An LSTM is a type of Recurrent Neural Network that allows the model to understand data points in relationship to the data points that came before them. That is, a word in the context of a sentence, rather than a word in isolation.

To see the effect of the LSTM, you can try building the model with and without it, or try replacing it with an additional dense layer. 

In [0]:
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

Finally we'll have a series of one or more densely connected layers, with the last one being the output layer.

The output layer has one unit for each of the three labels (the labels representing the three different translators). Use the `softmax` activation so that it outputs three probability values, one per translator.



In [0]:
# One or more dense layers.
# Edit the list in the `for` line to experiment with layer sizes.
for units in [64, 64]:
  model.add(tf.keras.layers.Dense(units, activation=tf.keras.backend.relu))

# Output layer. The first argument is the number of labels.
model.add(tf.keras.layers.Dense(3, activation='softmax'))

Finally, compile the model. For a softmax categorization model, use `sparse_categorical_crossentropy` as the loss function. You can try other optimizers, but `adam` is very common.

In [0]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

## Train the model

This model running on this data produces decent results (about 83%) after just two epochs and levels off quickly. A third epoch lowers accuracy by a tiny amount.

If you edit the model, try running several epochs to see when improvement levels off and overfitting sets in. To check progress after each epoch, validate against the `test_data` set. (This is the same as running `model.evaluate(test_data)` after each epoch.)

In [0]:
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fdf7a87bfd0>

In [0]:
model.evaluate(test_data)

    100/Unknown - 3s 28ms/step - loss: 0.4339 - accuracy: 0.8308

[0.4339443758130074, 0.8308]

## Make text files available to Python

In the local directory, examples are split into `train` and `test` directories. Within each of those, positive reviews will be in a directory called `pos`, and negative ones in a directory called `neg`.

Your text data is probably organized differently than this, and may be in a database or other format. The important thing to notice in this step is making the text files available in a Python iterable. In this example, the iterable is a list of file names.

In [0]:
# NOTE(review): `path` is not defined anywhere in the visible notebook;
# presumably it is the root of the extracted dataset — confirm before running.
label_dirs = ['pos', 'neg']

# One glob pattern per label directory, for each of the two splits.
train_files = [os.path.join(path, 'train', label, '*') for label in label_dirs]
test_files = [os.path.join(path, 'test', label, '*') for label in label_dirs]

## Create TensorFlow datasets

We need to turn a bunch of files into labeled data. 

To do this:

1. Use `tf.data.Dataset.list_files` to create a Dataset of file names.
2. Use `tf.data.Dataset.flat_map` to iterate through each file name and:

  a. Label the item `1` for positive or `0` for negative.
  
  b. Load the text from the file with `tf.data.TextLineDataset`.
  
  c. Combine the label with the text data using `tf.data.Dataset.zip`.

Apply this process to both the training data files and the test data files.


In [0]:
def get_labeled_dataset(patterns):
  """Builds a dataset of `(text_line, label)` pairs from file glob patterns.

  Args:
    patterns: List of glob patterns naming the text files to load.

  Returns:
    A `tf.data.Dataset` of `(line, label)` pairs. The label is a float64
    scalar: 1.0 when the file name matches `'^.*pos.*$'`, otherwise 0.0.
  """
  # NOTE(review): the shuffle buffer is the number of patterns, not the
  # number of matched files — confirm this is the intended buffer size.
  files = tf.data.Dataset.list_files(patterns).shuffle(len(patterns))

  def flat_map_fn(filename):
    """Maps a file name to a dataset of (review, sentiment) pairs."""
    label = tf.data.Dataset.from_tensors(
        tf.cast(tf.strings.regex_full_match(filename, '^.*pos.*$'), tf.float64))
    # `zip` stops at its shortest input. Repeat the single label so every
    # line of the file is kept, not only the first.
    return tf.data.Dataset.zip(
        (tf.data.TextLineDataset(filename), label.repeat()))

  return files.flat_map(flat_map_fn)

train_data = get_labeled_dataset(train_files)
test_data = get_labeled_dataset(test_files)

In [0]:
tokenizer = tfds.features.text.Tokenizer()

In [0]:
ex = next(iter(train_data))

In [0]:
tokenized_ex = tokenizer.tokenize(ex[0].numpy())

In [0]:
# create vocabulary
vocabulary_list = []
for text_tensor, label_tensor in train_data.concatenate(test_data):
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_list = vocabulary_list + some_tokens
  
len(vocabulary_list)

In [0]:
filename = next(iter(train_files))

In [0]:
filename

In [0]:
tl_ds = tf.data.TextLineDataset('/root/.keras/datasets/aclImdb/train/pos/0_9.txt')

In [0]:
for text_line in tl_ds.take(5):
  print(text_line.numpy())

## Build a vocabulary

There are many ways to represent text data for input into a machine learning model. For this tutorial, the text of each review will be represented as a list of integers, with each integer representing a single unique word.

The first step to doing this is to create a map of unique words to integers. This mapping will be called the `vocabulary`, and will be a Python dictionary with words as keys and integer Tensors as values.

To build the vocabulary:

1. Use `tf.data.Dataset.map` to tokenize each example:

  a. Remove punctuation and other non-word characters from the text examples.
  
  b. Split them into arrays of tokens (word-like substrings).

2. Create a new dataset in which each element is a token from the text data.

3. Remove duplicates with `tf.data.experimental.unique`.

4. Assign a unique integer to each token with `tf.data.experimental.Counter`.

5. Create a Python dictionary in which the keys are tokens and the values are integer tensors.

6. Pass the keys (words) and values (integers) to `tf.lookup.KeyValueTensorInitializer`.

7. Use the initializer to create a `tf.lookup.StaticVocabularyTable`.

The `StaticVocabularyTable` holds the mapping of word tokens to integers, and also handles the encoding of word lists to integer lists.

In [0]:
def tokenize(text):
  """Splits a scalar string tensor into word-like tokens.

  Args:
    text: Scalar string tensor holding the raw text.

  Returns:
    A 1-D string tensor of tokens.
  """
  # Replace line breaks with spaces.
  text = tf.strings.regex_replace(text, r'\<br \/\>', ' ')
  # Replace punctuation with spaces.
  text = tf.strings.regex_replace(text, r'\W', ' ')
  # Split on runs of whitespace. With an explicit `sep=" "`, consecutive
  # spaces (which the replacements above produce routinely) delimit
  # empty-string tokens; the default separator groups them instead.
  tokens = tf.strings.split([text]).values
  return tokens


def get_vocabulary_table(dataset):
  """Builds a lookup table mapping every distinct token to an integer id.

  Args:
    dataset: `tf.data.Dataset` of `(text, label)` pairs.

  Returns:
    A `tf.lookup.StaticVocabularyTable` with one out-of-vocabulary bucket.
  """
  # Tokenize the example text and drop the label.
  dataset = dataset.map(lambda text, label: tokenize(text))
  # Gather all the word tensors into a single dataset.
  dataset = dataset.flat_map(tf.data.Dataset.from_tensor_slices)
  # Remove duplicates.
  dataset = dataset.apply(tf.data.experimental.unique())
  # Assign an integer to each token.
  dataset = tf.data.Dataset.zip((dataset, tf.data.experimental.Counter()))

  # Materialise the (word, index) pairs as two parallel Python lists. Dict
  # views (`.keys()` / `.values()`) are not sequences on Python 3, and the
  # tokens are already unique, so no dict is needed.
  words = []
  indices = []
  for word, index in dataset:
    words.append(word.numpy())
    indices.append(int(index.numpy()))

  vocabulary_table_initializer = tf.lookup.KeyValueTensorInitializer(
      words,
      indices,
      key_dtype=tf.string,
      value_dtype=tf.int64
  )

  return tf.lookup.StaticVocabularyTable(vocabulary_table_initializer, 1)

vocabulary_table = get_vocabulary_table(train_data.concatenate(test_data))

The `tf.lookup.StaticVocabularyTable` has a method `lookup`, which converts a list of words into a list of integers.

For example:

In [0]:
vocabulary_table.lookup(tf.constant(['I', 'loved', 'this', 'movie']))

## Turn text datasets into integer datasets

Now that we have a numbered vocabulary, we can encode each review as a Tensor of integers.

Each input Tensor needs to be the same length. So, first determine the length of the longest review. Then, tokenize the examples and encode them as lists of integers, with a padding of zeroes to make all examples the same length.




In [0]:
next(iter(train_data))

In [0]:
tokenized_train_data = train_data.map(lambda text, label: (tokenize(text), label))

In [0]:


def get_int_encoded_dataset(vocabulary, dataset):
  """Tokenizes, integer-encodes, pads, shuffles and batches a labeled dataset.

  NOTE(review): `MAX_LEN` and `BATCH_SIZE` are read as module-level globals
  and are not defined anywhere in the visible notebook — confirm where they
  are meant to come from.

  Args:
    vocabulary: Object indexed as `vocabulary[word]` for every token, where
      `word` is the value read from `.numpy()`. Presumably a dict mapping
      tokens to integer ids — verify against the caller.
    dataset: `tf.data.Dataset` of `(text, label)` pairs.

  Returns:
    A shuffled, batched dataset of `(token_ids, label)` pairs in which every
    `token_ids` row has exactly `MAX_LEN` entries.
  """

  def encode_and_pad(tokenized_text, label):
    """Encodes one tokenized example and pads it to `MAX_LEN` ids."""

    def helper(tokenized_text):
      ids = [vocabulary[word] for word in tokenized_text.numpy()]
      # Truncate first: `tf.pad` rejects a negative padding amount, which is
      # what a line longer than MAX_LEN would otherwise produce.
      ids = ids[:MAX_LEN]
      pad_amount = MAX_LEN - len(ids)
      # Build the tensor as int64 explicitly so it matches the `Tout`
      # declared for `tf.py_function` below.
      id_tensor = tf.constant(ids, dtype=tf.int64)
      return tf.pad(id_tensor, [[0, pad_amount]], 'CONSTANT')

    encoded = tf.py_function(helper, [tokenized_text], tf.int64)
    # `tf.py_function` outputs carry no static shape; every row is MAX_LEN.
    encoded.set_shape([MAX_LEN])
    return encoded, label

  # The dataset yields (text, label) pairs. `tokenize` takes only the text,
  # so tokenize it explicitly and carry the label through.
  dataset = dataset.map(lambda text, label: (tokenize(text), label))
  dataset = dataset.map(encode_and_pad)
  dataset = dataset.shuffle(10 * BATCH_SIZE)
  dataset = dataset.batch(BATCH_SIZE)
  return dataset

train_data = get_int_encoded_dataset(vocabulary, train_data)


# THINGS TO TRY
# ragged tensor
# padded batch
# bucket by sequence length
# feature columns

In [0]:
text_batch,label_batch = next(iter(train_data))

In [0]:
text_batch

In [0]:
import matplotlib.pyplot as plt

# Boolean mask of the batch: True where the token id is non-zero.
nonzero_mask = text_batch.numpy() != 0
plt.pcolormesh(nonzero_mask)

In [0]:










def get_model(input_dim, embedding_dim=50, hidden_units=(100,)):
  """Creates and compiles a Keras Sequential binary classifier.

  The layers are: embedding, global max pooling over the sequence, one
  ReLU dense layer per entry of `hidden_units`, and a single sigmoid unit.

  NOTE(review): the embedding's `input_length` reads the module-level
  `MAX_LEN`, which is not defined anywhere in the visible notebook — confirm.

  Args:
    input_dim: (int) Number of distinct integer ids the embedding accepts.
    embedding_dim: (int) Size of each embedding vector.
    hidden_units: Iterable of ints, the dense layer sizes in the order the
      layers are added. The default is an immutable tuple so that it cannot
      be altered between calls.

  Returns:
    A compiled Keras model (adam optimizer, binary cross-entropy loss,
    accuracy metric).
  """

  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Embedding(input_dim=input_dim,
                                      output_dim=embedding_dim,
                                      input_length=MAX_LEN))
  # convolutional layer or RNN
  model.add(tf.keras.layers.GlobalMaxPool1D())
  for units in hidden_units:
    model.add(tf.keras.layers.Dense(units, activation=tf.keras.backend.relu))
  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
  return model










# NOTE(review): `vocabulary` is not defined anywhere in the visible notebook
# (only `vocabulary_table` is) — confirm which object is intended here.
model = get_model(len(vocabulary))
model.fit(train_data, epochs=10)

# `get_indexed_dataset` is not defined in this notebook. The encoder that is
# defined, `get_int_encoded_dataset`, takes a labeled dataset rather than
# file patterns. The result gets its own name so that re-running this cell
# does not encode already-encoded data.
encoded_test_data = get_int_encoded_dataset(vocabulary, test_data)
model.evaluate(encoded_test_data)
