##### Copyright 2018 The TensorFlow Authors.



In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Load text with tf.data

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/alpha/tutorials/load_data/text"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/load_data/text.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/docs/blob/master/site/en/tutorials/load_data/text.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

This tutorial provides an example of how to use `tf.data.Dataset` to load examples from text files. It will also cover preprocessing and preparing text to be usable in a model. Finally, we'll do basic sentiment analysis, classifying each text example as "positive" or "negative." The text examples are movie reviews from IMDB.

## Setup

In [0]:
from __future__ import absolute_import, division, print_function 

!pip install tensorflow==2.0.0-alpha0

import numpy as np
import os
import tensorflow as tf
import time

This tutorial shows you how to load text into a TensorFlow dataset. So, before we begin, we're going to download and extract the text examples into a local directory.

In the local directory, examples are split into `train` and `test` directories. Within each of those positive reviews will be in a directory called `pos`, and negative ones in a directory called `neg`.

In [0]:
DATA_URL = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

file = 'aclImdb_v1.tar.gz'
file = tf.keras.utils.get_file(file, DATA_URL, extract=True)
path = os.path.join(os.path.dirname(file), 'aclImdb')

## Make text files available to Python

Your text data is probably organized differently than this, and may be in a database or other format. The important thing to notice in this step is making the text files available in a Python iterable.

In [0]:
train_files = [
    os.path.join(path, 'train', label, '*') for label in ['pos', 'neg']
]
test_files = [
    os.path.join(path, 'test', label, '*') for label in ['pos', 'neg']
]

## Build a vocabulary

There are many ways to represent text data for input into a machine learning model. We're going to use a representation called "[bag of words](https://en.wikipedia.org/wiki/Bag-of-words_model)".

Each movie review will be converted into a sparse vector. Each "slot" in the vector will represent a single word. If the word appears in the review, the slot has a `1`, otherwise the spot has a `0`. The length of the vector (the number of slots) will correspond to the number of individual uniqe words in the original set of text examples.

The first step to doing this is to create a map of integers (repesenting slots in the sparse vectors) and unique words. This mapping will be called the `vocabulary`. To build the vocabulary:

1. Remove punctuation and other non-word characters from the text examples.
2. Tokenize the text examples (split them into arrays of words).
3. Iterate through each array of words, adding unique words to the vocabulary index. 

In [0]:





BATCH_SIZE = 512
MAX_LEN = 256
NUM_REVIEWS = 25000


def get_model(input_dim, embedding_dim=50, hidden_units=[100]):
  """Create a Keras Sequential model with layers.

  Args:
    input_dim: (int) Input dimensions for input layer.
    embedding_dim: (int) Embedding dimension for embedding layer.
    hidden_units: [int] the layer sizes of the DNN (input layer first)

  Returns:
    A Keras model.
  """

  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Embedding(input_dim=input_dim,
                                      output_dim=embedding_dim,
                                      input_length=MAX_LEN))
  model.add(tf.keras.layers.GlobalMaxPool1D())
  for units in hidden_units:
    model.add(tf.keras.layers.Dense(units, activation=tf.keras.backend.relu))
  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
  return model


def get_labeled_dataset(patterns):

  # Maps a filename to a dataset that produces (review, sentiment) pair.
  def flat_map_fn(filename):
    label = tf.data.Dataset.from_tensors(
        tf.cast(tf.strings.regex_full_match(filename, '^.*pos.*$'), tf.float64))
    return tf.data.Dataset.zip((tf.data.TextLineDataset(filename), label))

  files = tf.data.Dataset.list_files(patterns).shuffle(NUM_REVIEWS)
  return files.flat_map(flat_map_fn)


def tokenize(line, label):
  # Replace line breaks with spaces.
  line = tf.strings.regex_replace(line, r'\<br \/\>', ' ')
  # Replace periods with spaces.
  line = tf.strings.regex_replace(line, r'\.', ' ')
  tokens = tf.strings.split([line], sep=" ").values
  return tokens, label


def get_vocabulary(patterns):
  dataset = get_labeled_dataset(patterns)
  dataset = dataset.map(tokenize)
  dataset = dataset.flat_map(lambda x, y: tf.data.Dataset.from_tensor_slices(x))
  dataset = dataset.apply(tf.data.experimental.unique())
  dataset = tf.data.Dataset.zip((dataset, tf.data.experimental.Counter()))

  vocabulary = {}
  for word, index in iter(dataset):
    vocabulary[word.numpy()] = index
  return vocabulary


def get_indexed_dataset(vocabulary, patterns):

  def index_and_pad(word_list, label):

    def helper(word_list):
      result = []
      for word, _ in zip(word_list, range(MAX_LEN)):
        result.append(vocabulary[word])
      return tf.pad(result, [[0, MAX_LEN - len(result)]], 'CONSTANT')

    return tf.numpy_function(helper, [word_list], tf.int64), label

  dataset = get_labeled_dataset(patterns)
  dataset = dataset.map(tokenize)
  dataset = dataset.map(index_and_pad)
  dataset = dataset.shuffle(10 * BATCH_SIZE)
  dataset = dataset.batch(BATCH_SIZE)
  return dataset






vocabulary = get_vocabulary(train_files + test_files)
print(len(vocabulary))
train_data = get_indexed_dataset(vocabulary, train_files)

model = get_model(len(vocabulary))
model.fit(train_data, epochs=10)

test_data = get_indexed_dataset(vocabulary, test_files)
model.evaluate(test_data)


345418


W0321 22:29:00.914769 140374431422336 deprecation.py:323] From /usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/script_ops.py:476: py_func (from tensorflow.python.ops.script_ops) is deprecated and will be removed in a future version.
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
     49/Unknown - 24s 496ms/step - loss: 0.3709 - accuracy: 0.8570

[0.37087987332927935, 0.85696]