# Load text with tf.data

In [0]:
from __future__ import absolute_import, division, print_function

import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()
import tensorflow_datasets as tfds

# Number of examples per batch; the shuffle buffer is sized as a multiple of it.
BATCH_SIZE = 512
# Number of word indices kept per review; shorter reviews are padded to this length.
MAX_LEN = 256
# NOTE(review): not referenced by any visible cell — presumably the number of
# reviews in each IMDB split; confirm before relying on it.
NUM_REVIEWS = 25000

# Index used for words that are missing from the vocabulary.
UNKNOWN_WORD = 0

In [0]:
# Get the training data: the TRAIN split of the plain-text IMDB reviews dataset
# from TensorFlow Datasets. Later cells read the 'text' and 'label' entries of
# each example.
train_raw = tfds.load(name='imdb_reviews/plain_text', split=tfds.Split.TRAIN)

The model will not take review texts as input directly. 

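Instead, each review is converted into a fixed-length sequence of integers. Every unique word that occurs at least once in any training review (about 90,000 unique words in the entire dataset) is assigned its own integer index, and a review is represented by the indices of its first `MAX_LEN` words, padded with `0` when it is shorter. Words that are not in the vocabulary are mapped to `0` as well. The model's embedding layer then turns each index into a vector. (This differs from a [bag of words](https://developers.google.com/machine-learning/glossary/#bag_of_words) representation, in which the input has one "slot" per unique word that holds a `1` if the word appears in the review and a `0` otherwise.)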

The first step to making this conversion is to map each unique word in the training data to an integer (its index in the vocabulary, starting at `1` so that `0` can stand for unknown words).

Create a function to *tokenize* a single text entry.


In [0]:
def tokenize(example):
  """Splits the review text of one example into word tokens.

  Args:
    example: dict with a string tensor under the 'text' key (presumably a
      scalar holding one review). Any other entries are passed through as-is.

  Returns:
    A new dict with the same entries as `example`, except that 'text' holds a
    1-D string tensor with one element per word. The input dict is left
    unmodified.
  """
  text = example['text']
  # Replace line breaks with spaces.
  text = tf.strings.regex_replace(text, r'\<br \/\>', ' ')
  # Replace each run of non-word characters and digits with a single space.
  text = tf.strings.regex_replace(text, r'[\W\d]+', ' ')
  # Split string into an array of words. `sep` is left unset so that the split
  # is on whitespace: with sep=' ' a leading or trailing space (left behind by
  # punctuation at either end of the review) would produce an empty-string
  # token that ends up in the vocabulary.
  words = tf.strings.split([text]).values
  return dict(example, text=words)

def get_vocabulary(dataset):
  """Builds a word -> index lookup table from every review in `dataset`.

  Args:
    dataset: tf.data.Dataset of examples accepted by `tokenize`.

  Returns:
    A dict keyed by each distinct word (the `.numpy()` value of the string
    element). Each value is the counter element paired with that word, exactly
    as yielded by the dataset (not converted to a Python int). Numbering
    starts at 1, in order of first appearance.
  """
  # One dataset element per word, across all reviews.
  words = dataset.map(tokenize).flat_map(
      lambda example: tf.data.Dataset.from_tensor_slices(example['text']))
  distinct_words = words.apply(tf.data.experimental.unique())
  # Pair every distinct word with a running counter that starts at 1.
  numbered_words = tf.data.Dataset.zip(
      (distinct_words, tf.data.experimental.Counter(1)))

  return {word.numpy(): index for word, index in numbered_words}

# Build the word -> index table from the training split.
vocabulary = get_vocabulary(train_raw)

In [0]:
def get_indexed_dataset(vocabulary, dataset):
  """Turns raw review examples into shuffled batches of (word indices, label).

  Args:
    vocabulary: dict mapping a word (as bytes) to its integer index. Values
      may be Python ints or scalar integer tensors.
    dataset: tf.data.Dataset of dicts with 'text' and 'label' entries.

  Returns:
    A tf.data.Dataset of `(text, label)` batches of up to BATCH_SIZE examples,
    where `text` holds MAX_LEN int64 word indices per review.
  """

  def index_and_pad(example):
    """Replaces the tokens of one example with MAX_LEN int64 word indices."""

    def helper(words):
      # Look up at most MAX_LEN words; words missing from the vocabulary map
      # to UNKNOWN_WORD. int() normalises tensor-valued entries.
      ids = [int(vocabulary.get(word, UNKNOWN_WORD))
             for word in words[:MAX_LEN]]
      # Right-pad with zeros up to MAX_LEN.
      ids.extend([0] * (MAX_LEN - len(ids)))
      # Build the result with an explicit dtype. Letting TensorFlow infer it
      # from the list yields float32 for an empty review and int32 when no
      # word is in the vocabulary, neither of which matches the int64 declared
      # to tf.numpy_function below.
      return tf.constant(ids, dtype=tf.int64)

    indexed = tf.numpy_function(helper, [example['text']], tf.int64)
    # tf.numpy_function outputs carry no static shape; helper always returns
    # exactly MAX_LEN values, so record that for downstream layers.
    indexed.set_shape([MAX_LEN])
    # Return a copy rather than mutating the caller's dict.
    return dict(example, text=indexed)

  dataset = dataset.map(tokenize)
  dataset = dataset.map(index_and_pad)
  dataset = dataset.shuffle(10 * BATCH_SIZE)
  dataset = dataset.batch(BATCH_SIZE)
  dataset = dataset.map(lambda example: (example['text'], example['label']))
  return dataset

# Training split as batches of (word indices, label), ready for model.fit.
train_data = get_indexed_dataset(vocabulary, train_raw)

In [0]:
def get_model(input_dim, embedding_dim=50, hidden_units=[100]):
  """Builds and compiles the Keras Sequential classifier.

  The network is an embedding layer over MAX_LEN word indices, a global max
  pool, one ReLU dense layer per entry of `hidden_units`, and a single sigmoid
  output unit.

  Args:
    input_dim: (int) Input dimension of the embedding layer.
    embedding_dim: (int) Output dimension of the embedding layer.
    hidden_units: [int] Sizes of the dense layers, listed starting from the
      one closest to the input. The default list is only read, never modified.

  Returns:
    A Keras model compiled with the adam optimizer, binary cross-entropy loss
    and the accuracy metric.
  """
  layers = [
      tf.keras.layers.Embedding(input_dim=input_dim,
                                output_dim=embedding_dim,
                                input_length=MAX_LEN),
      tf.keras.layers.GlobalMaxPool1D(),
  ]
  layers.extend(
      tf.keras.layers.Dense(units, activation=tf.keras.backend.relu)
      for units in hidden_units)
  layers.append(tf.keras.layers.Dense(1, activation='sigmoid'))

  model = tf.keras.Sequential(layers)
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
  return model


# Word indices start at 1 and index 0 is used for unknown words, so the largest
# index equals len(vocabulary); the embedding therefore needs one extra row.
model = get_model(len(vocabulary) + 1)
# Train for a single pass over the training batches.
model.fit(train_data, epochs=1)




<google3.third_party.tensorflow.python.keras.callbacks.History at 0x7fbb154779d0>

In [0]:
# Evaluate on the TEST split, indexed with the vocabulary built from the
# training data; test words missing from it are mapped to UNKNOWN_WORD.
test_raw = tfds.load(name='imdb_reviews/plain_text', split=tfds.Split.TEST)
test_data = get_indexed_dataset(vocabulary, test_raw)
model.evaluate(test_data)

     49/Unknown - 24s 498ms/step - loss: 0.6518 - accuracy: 0.7999

[0.65184134001634564, 0.79992002]