In [1]:
import tensorflow as tf
import numpy as np
import tensorflow_datasets as tfds

In [2]:
########## Dataset ##########

In [3]:
# Turn off tensorflow_datasets' progress-bar output for the cells below.
tfds.disable_progress_bar()

In [10]:
# Load the IMDB reviews dataset. No `split` is requested, so the result is
# indexed by split name; as_supervised=True yields (text, label) pairs.
dataset = tfds.load(
    'imdb_reviews',
    as_supervised=True,
)

In [11]:
# Give the two splits of the loaded dataset their own names.
train_dataset = dataset['train']
test_dataset = dataset['test']

In [12]:
# Input-pipeline sizes: shuffle buffer length and examples per batch.
BUFFER_SIZE = 10000
BATCH_SIZE = 64

# Fix TensorFlow's global random seed so the shuffle order and the initial
# model weights are repeatable after "Restart Kernel -> Run All".
# NOTE(review): some GPU kernels remain non-deterministic, so metrics may
# still vary slightly between runs.
SEED = 42
tf.random.set_seed(SEED)

In [13]:
# shuffle  - fills a buffer with `buffer_size` elements, then randomly samples
#            elements from this buffer, replacing the selected elements with
#            new elements.
# batch    - combines consecutive elements of this dataset into batches.
# prefetch - allows later elements to be prepared while the current element is
#            being processed.
#
# Both pipelines are built from the raw splits in `dataset` rather than from
# `train_dataset` / `test_dataset` themselves, so re-running this cell does not
# batch an already-batched dataset (the cell is idempotent).
train_dataset = (
    dataset['train']
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
# The test split is only batched and prefetched; it is not shuffled.
test_dataset = (
    dataset['test']
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [13]:
########## Model ##########

In [16]:
# Upper bound on the vocabulary size learned by the text encoder.
VOCAB_SIZE = 1000

# TextVectorization maps raw strings to integer token indices. The index
# tensors are 0-padded to the longest sequence in the batch (unless a fixed
# `output_sequence_length` is set).
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

# Build the vocabulary from the training texts only; labels are dropped.
train_text_only = train_dataset.map(lambda text, _label: text)
encoder.adapt(train_text_only)

In [17]:
# Layers are created in the same order as they are stacked so that Keras'
# auto-generated layer names stay unchanged.

# Token index -> 64-dim vector; mask_zero=True treats index 0 as padding.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=encoder.vocabulary_size(),
    output_dim=64,
    mask_zero=True,
)
# A 64-unit LSTM wrapped so the sequence is read in both directions.
recurrent_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
hidden_layer = tf.keras.layers.Dense(64, activation='relu')
# Single sigmoid unit: the model outputs a value in (0, 1).
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

# Raw strings in, one score per review out.
model = tf.keras.models.Sequential([
    encoder,
    embedding_layer,
    recurrent_layer,
    hidden_layer,
    output_layer,
])

In [18]:
# The model's last layer applies a sigmoid, so the loss receives probabilities
# rather than logits; from_logits=False is the default, spelled out here.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

In [19]:
# Configure training: Adam optimizer, the binary cross-entropy loss defined
# above, and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss=loss,
    metrics=['accuracy'],
)

In [22]:
# Train for 10 epochs. After each epoch, validation stops after 30 batches of
# `test_dataset` (validation_steps=30) instead of running over all of it.
# NOTE(review): validation uses the same test split that is evaluated
# afterwards, so the validation metrics are not an independent hold-out.
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
# Score the trained model on the full test pipeline; the displayed result is
# the loss followed by the accuracy metric set in `compile`.
model.evaluate(x=test_dataset, verbose=2)

391/391 - 46s - loss: 0.3812 - accuracy: 0.8524


[0.3812360465526581, 0.8524399995803833]

In [25]:
# A hand-written positive review, wrapped over several literals for width.
sample_text = (
    'The movie was cool. '
    'The animation and the graphics were out of this world. '
    'I would recommend this movie.'
)
# The model takes a batch of raw strings, so wrap the review in a 1-element array.
sample_batch = np.array([sample_text])
model.predict(sample_batch)

array([[0.5769514]], dtype=float32)

In [34]:
# A hand-written negative review, passed as a 1-element batch of raw strings.
negative_batch = np.array(['The movie was horrible!'])
model.predict(negative_batch)

array([[0.00467615]], dtype=float32)