##### Copyright 2018 The TensorFlow Authors.



In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Load text with tf.data

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/alpha/tutorials/load_data/text"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/load_data/text.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/docs/blob/master/site/en/tutorials/load_data/text.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

This tutorial provides an example of how to use `tf.data.Dataset` to load examples from text files. It will also cover preprocessing and preparing text to be usable in a model.

*mention keras preprocessing and some tradeoffs*

## Setup

In [1]:
!pip install tensorflow==2.0.0-alpha0

Collecting tensorflow==2.0.0-alpha0
[?25l  Downloading https://files.pythonhosted.org/packages/bc/84/970bfb6eb04360a675627a38962127f0f5302ac1fd3ac4ad4f5d1befc9b7/tensorflow-2.0.0a0-cp27-cp27mu-manylinux1_x86_64.whl (79.9MB)
[K    100% |████████████████████████████████| 79.9MB 306kB/s 
Collecting tb-nightly<1.14.0a20190302,>=1.14.0a20190301 (from tensorflow==2.0.0-alpha0)
[?25l  Downloading https://files.pythonhosted.org/packages/7a/4f/369f43db86ee663826dc4a7cce7e18b3f9c58c8defc9e78368230b015d2b/tb_nightly-1.14.0a20190301-py2-none-any.whl (3.0MB)
[K    100% |████████████████████████████████| 3.0MB 6.5MB/s 
Collecting tf-estimator-nightly<1.14.0.dev2019030116,>=1.14.0.dev2019030115 (from tensorflow==2.0.0-alpha0)
[?25l  Downloading https://files.pythonhosted.org/packages/13/82/f16063b4eed210dc2ab057930ac1da4fbe1e91b7b051a6c8370b401e6ae7/tf_estimator_nightly-1.14.0.dev2019030115-py2.py3-none-any.whl (411kB)
[K    100% |████████████████████████████████| 419kB 14.8MB/s 
Collecting goo

In [0]:
from __future__ import absolute_import, division, print_function 

import os
import requests
import time

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds


This tutorial shows you how to load text into a TensorFlow dataset. So, before we begin, we're going to download and extract the text examples into a local directory.



In [0]:
DIRECTORY_URL = 'https://s3.amazonaws.com/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']
#FILE_NAMES = ['french-text.txt', 'english-text.txt', 'german-text.txt']

for name in FILE_NAMES:
  r = requests.get("".join([DIRECTORY_URL, name]), allow_redirects=True)
  with open(name, 'w') as f:
    f.write(r.content)

In [0]:
!ls

In [0]:
data_sets = []
labeled_data_sets = []
for i, file_name in enumerate(FILE_NAMES):
  lines_dataset = tf.data.TextLineDataset(file_name)
  labeled_dataset = lines_dataset.map(lambda x: (x, tf.cast(i, tf.int64)))
  data_sets.append(lines_dataset)
  labeled_data_sets.append(labeled_dataset)
  
  
  

In [0]:
all_labeled_data = labeled_data_sets[0].concatenate(labeled_data_sets[1]).concatenate(labeled_data_sets[2])
# interleave?

In [0]:
all_labeled_data = all_labeled_data.shuffle(tf.cast(50000, tf.int64), reshuffle_each_iteration=False)

In [7]:
for ex in all_labeled_data.take(5):
  print(ex)
  
for ex in all_labeled_data.take(5):
  print(ex)

(<tf.Tensor: id=50, shape=(), dtype=string, numpy='army inside the gates. As he was thus doubting Phoebus Apollo drew near'>, <tf.Tensor: id=51, shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: id=54, shape=(), dtype=string, numpy='took them into his own hands, then he said to Idomeneus, "Lay on, till'>, <tf.Tensor: id=55, shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: id=58, shape=(), dtype=string, numpy='We had pronounced it false, and should the more'>, <tf.Tensor: id=59, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=62, shape=(), dtype=string, numpy="She said; and Mars, enrag'd, his brawny thigh">, <tf.Tensor: id=63, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=66, shape=(), dtype=string, numpy='With the encroachments of enfeebling age.'>, <tf.Tensor: id=67, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=76, shape=(), dtype=string, numpy='army inside the gates. As he was thus doubting Phoebus Apollo drew near'>, <tf.Tensor: id=77, shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: 

In [0]:
# build vocabulary

tokenizer = tfds.features.text.Tokenizer()
vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)


  

In [9]:
vocab_size = len(vocabulary_set)
vocab_size

17178

In [0]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [0]:
x = encoder.encode(next(iter(all_labeled_data))[0].numpy())

In [12]:
encoder.decode(x)

u'army inside the gates As he was thus doubting Phoebus Apollo drew near'

In [13]:
x

[8742,
 1028,
 5109,
 10459,
 7506,
 16645,
 5271,
 16831,
 8079,
 11487,
 7342,
 8695,
 9325]

In [0]:
def encode(text_tensor, label):
  encoded_text = encoder.encode(text_tensor.numpy())
  encoded_padded_text = tf.pad(encoded_text, [[0, MAX_LINE_LEN - len(encoded_text)]], 'CONSTANT')
  return encoded_text, label
  
all_encoded_data = all_labeled_data.map(lambda text, label: tf.py_function(
    encode, inp=[text, label], Tout=(tf.int64, tf.int64)
))







In [0]:
#MAX_LINE_LEN = max([tf.size(ex[0]) for ex in all_encoded_data]).numpy()
MAX_LINE_LEN = 20

In [16]:
MAX_LINE_LEN

20

In [0]:
ts = all_encoded_data._element_structure

In [49]:
dir(ts)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__hash__',
 '__init__',
 '__module__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_batch',
 '_flat_nested_structure',
 '_flat_shapes',
 '_flat_shapes_list',
 '_flat_types',
 '_flat_types_list',
 '_from_compatible_tensor_list',
 '_from_tensor_list',
 '_nested_structure',
 '_register_custom_converter',
 '_tf_api_names',
 '_tf_api_names_v1',
 '_to_batched_tensor_list',
 '_to_legacy_output_classes',
 '_to_legacy_output_shapes',
 '_to_legacy_output_types',
 '_to_tensor_list',
 '_unbatch',
 'from_value',
 'is_compatible_with']

In [0]:
test_data = all_encoded_data.take(5000)
train_data = all_encoded_data.skip(5000).padded_batch(64, padded_shapes=([-1],[]))

#test_data = all_encoded_data.take(500).batch(1)
#train_data = all_encoded_data.skip(500).take(4000).batch(1)

In [0]:
def get_model(input_dim, embedding_dim=50, hidden_units=[64, 64]):
  """Create a Keras Sequential model with layers.

  Args:
    input_dim: (int) Input dimensions for input layer.
    embedding_dim: (int) Embedding dimension for embedding layer.
    hidden_units: [int] the layer sizes of the DNN (input layer first)

  Returns:
    A Keras model.
  """

  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Embedding(input_dim+1, 64))
  model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
  #model.add(tf.keras.layers.Conv1D(10, 2, padding='same'))
  #model.add(tf.keras.layers.GlobalMaxPool1D())
  for units in hidden_units:
    model.add(tf.keras.layers.Dense(units, activation=tf.keras.backend.relu))
  model.add(tf.keras.layers.Dense(3, activation='softmax'))
  model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
  return model

In [0]:
# next(iter(train_data))

In [46]:
model = get_model(vocab_size)
model.fit(train_data, epochs=3)


model.evaluate(test_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3

KeyboardInterrupt: ignored

## Make text files available to Python

In the local directory, examples are split into `train` and `test` directories. Within each of those, positive reviews will be in a directory called `pos`, and negative ones in a directory called `neg`.

Your text data is probably organized differently than this, and may be in a database or other format. The important thing to notice in this step is making the text files available in a Python iterable. In this example, the iterable is a list of file names.

In [0]:
train_files = [
    os.path.join(path, 'train', label, '*') for label in ['pos', 'neg']
]
test_files = [
    os.path.join(path, 'test', label, '*') for label in ['pos', 'neg']
]

## Create TensorFlow datasets

We need to turn a bunch of files into labeled data. 

The original dataTo do this:

1. Use `tf.data.Dataset.list_files` to create a Dataset of file names.
2. Use `tf.data.Dataset.flat_map` to iterate through each file name and:

  a. Label the item `1` for positive or `0` for negative.
  
  b. Load the text from the file with `tf.data.TextLineDataset`.
  
  c. Combine the label with the text data using `tf.data.Dataset.zip`.

Apply this process to both the training data files and the test data files.


In [0]:
def get_labeled_dataset(patterns):

  files = tf.data.Dataset.list_files(patterns).shuffle(len(patterns))
  
  # Maps a filename to a dataset that produces (review, sentiment) pair.
  def flat_map_fn(filename):
    label = tf.data.Dataset.from_tensors(
        tf.cast(tf.strings.regex_full_match(filename, '^.*pos.*$'), tf.float64))
    return tf.data.Dataset.zip((tf.data.TextLineDataset(filename), label))

  
  return files.flat_map(flat_map_fn)

train_data = get_labeled_dataset(train_files)
test_data = get_labeled_dataset(test_files)

In [0]:
tokenizer = tfds.features.text.Tokenizer()

In [0]:
ex = next(iter(train_data))

In [0]:
tokenized_ex = tokenizer.tokenize(ex[0].numpy())

In [0]:
# create vocabulary
vocabulary_list = []
for text_tensor, label_tensor in train_data.concatenate(test_data):
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_list = vocabulary_list + some_tokens
  
len(vocabulary_list)

In [0]:
filename = next(iter(train_files))

In [0]:
filename

In [0]:
tl_ds = tf.data.TextLineDataset('/root/.keras/datasets/aclImdb/train/pos/0_9.txt')

In [0]:
for text_line in tl_ds.take(5):
  print(text_line.numpy())

## Build a vocabulary

There are many ways to represent text data for input into a machine learning model. For this tutorial, the text of each review will be represented as a list of integers, with each integer representing a single unique word.

The first step to doing this is to create a map of unique words to integers. This mapping will be called the `vocabulary`, and will be a Python dictionary with words as keys and integer Tensors as values.

To build the vocabulary:

1. Use `tf.data.Dataset.map` to tokenize each example:

  a. Remove punctuation and other non-word characters from the text examples.
  
  b. Split them into arrays of tokens (word-like substrings).

2. Create a new dataset in which each element is a token from the text data.

3. Remove duplicates eith `tf.data.experimental.unique`.

4. Assign a unique integer to each token with `tf.data.experimental.Counter`.

5. Create a Python dictionary in which the keys are tokens and the values are integer tensors.

6. Pass the keys (words) and values (integers) to `tf.lookup.KeyValueTensorInitializer`.

7. Use the initializer to create a `tf.lookup.StaticVocabularyTable`.

The `StaticVocabularyTable` holds the mapping of word tokens to integers, and also handles the encoding of word lists to integer lists.

In [0]:
def tokenize(text):
  # Replace line breaks with spaces.
  text = tf.strings.regex_replace(text, r'\<br \/\>', ' ')
  # Replace punctuation with spaces.
  text = tf.strings.regex_replace(text, r'\W', ' ')
  # Turn the single long string into a list of strings.
  tokens = tf.strings.split([text], sep=" ").values
  return tokens


def get_vocabulary_table(dataset):
  # Tokenize the example text and drop the label.
  dataset = dataset.map(lambda text, label: tokenize(text))
  # Gather all the word tensors into a single dataset.
  dataset = dataset.flat_map(tf.data.Dataset.from_tensor_slices)
  # Remove duplicates.
  dataset = dataset.apply(tf.data.experimental.unique())
  # Assign an integer to each token.
  dataset = tf.data.Dataset.zip((dataset, tf.data.experimental.Counter()))
  # Turn (word, integer) pairs into a dict, so they can be passed easily into initializer.
  vocabulary_dict = {word.numpy():index for word, index in iter(dataset)}
  
  vocabulary_table_initializer = tf.lookup.KeyValueTensorInitializer(
      vocabulary_dict.keys(), 
      vocabulary_dict.values(), 
      tf.string
  )
  
  return tf.lookup.StaticVocabularyTable(vocabulary_table_initializer, 1)
  
vocabulary_table = get_vocabulary_table(train_data.concatenate(test_data))

The `tf.lookup.StaticVocabularyTable` has a method `lookup`, which converts a list of words into a list of integers.

For example:

In [0]:
vocabulary_table.lookup(tf.constant(['I', 'loved', 'this', 'movie']))

## Turn text datasets into integer datasets

Now that we have a numbered vocabulary, we can encode each review as a Tensor of integers.

Each input Tensor needs to be the same length. So, first determine the length of the longest review. Then, tokenize the examples and encode them as a lists of integers, with a padding of zeroes to make all examples the same length.




In [0]:
next(iter(train_data))

In [0]:
tokenized_train_data = train_data.map(lambda text, label: (tokenize(text), label))

In [0]:


def get_int_encoded_dataset(vocabulary, dataset):

  def encode_and_pad(tokenized_text, label):

    def helper(tokenized_text):
      tokenized_text = tokenized_text.numpy()
      result = []
      for word in tokenized_text:
        result.append(vocabulary[word])
      return tf.pad(result, [[0, MAX_LEN - len(result)]], 'CONSTANT')

    return tf.py_function(helper, [tokenized_text], tf.int64), label

  dataset = dataset.map(tokenize)
  dataset = dataset.map(encode_and_pad)
  dataset = dataset.shuffle(10 * BATCH_SIZE)
  dataset = dataset.batch(BATCH_SIZE)
  return dataset

train_data = get_int_encoded_dataset(vocabulary, train_data)


# THINGS TO TRY
# ragged tensor
# padded batch
# bucket by sequence length
# feature columns

In [0]:
text_batch,label_batch = next(iter(train_data))

In [0]:
text_batch

In [0]:
import matplotlib.pyplot as plt

plt.pcolormesh(text_batch.numpy() != 0)

In [0]:










def get_model(input_dim, embedding_dim=50, hidden_units=[100]):
  """Create a Keras Sequential model with layers.

  Args:
    input_dim: (int) Input dimensions for input layer.
    embedding_dim: (int) Embedding dimension for embedding layer.
    hidden_units: [int] the layer sizes of the DNN (input layer first)

  Returns:
    A Keras model.
  """

  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Embedding(input_dim=input_dim,
                                      output_dim=embedding_dim,
                                      input_length=MAX_LEN))
  # convolutional layer or RNN
  model.add(tf.keras.layers.GlobalMaxPool1D())
  for units in hidden_units:
    model.add(tf.keras.layers.Dense(units, activation=tf.keras.backend.relu))
  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
  return model










model = get_model(len(vocabulary))
model.fit(train_data, epochs=10)

test_data = get_indexed_dataset(vocabulary, test_files)
model.evaluate(test_data)
