In [2]:
import tensorflow as tf
import numpy as np
import tensorflow_datasets as tfds

In [3]:
########## Dataset ##########

In [4]:
# Turn off tensorflow_datasets progress bars before the dataset is loaded.
tfds.disable_progress_bar()

In [5]:
# Load the IMDB reviews corpus.
#   as_supervised=True -> each split yields (text, label) tuples
#   with_info=True     -> the dataset metadata object is returned as well
dataset, info = tfds.load(
    name='imdb_reviews',
    as_supervised=True,
    with_info=True,
)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m
Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete5K3Q2I/imdb_reviews-train.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete5K3Q2I/imdb_reviews-test.tfrecord
Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete5K3Q2I/imdb_reviews-unsupervised.tfrecord




[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [6]:
# Display the human-readable description carried by the dataset metadata.
info.description

'Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.'

In [7]:
# Pull the two labelled splits out of the split dictionary returned by tfds.load.
train_dataset = dataset['train']
test_dataset = dataset['test']

In [8]:
# Inspect the structure of one dataset element: a (text, label) pair of scalar tensors.
train_dataset.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [9]:
# Peek at the first (review, label) pair of the training split.
example, label = next(iter(train_dataset.take(1)))
print('example: ', example.numpy())
print('label: ', label.numpy())

example:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


In [10]:
# Input-pipeline settings used when the splits are shuffled and batched.
BUFFER_SIZE = 10_000  # size of the shuffle buffer
BATCH_SIZE = 64       # number of reviews per batch

In [11]:
# shuffle  - fills a buffer with `buffer_size` elements, then randomly samples elements
#            from this buffer, replacing the selected elements with new elements.
# batch    - combines consecutive elements of this dataset into batches.
# prefetch - allows later elements to be prepared while the current element is being processed.
#
# Both pipelines are built from the raw splits in `dataset` rather than from the current
# values of `train_dataset` / `test_dataset`, so re-running this cell does not stack a
# second shuffle/batch stage on top of data that has already been batched.
train_dataset = (
    dataset['train']
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    dataset['test']
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [12]:
# Draw one batch and confirm that texts and labels contain the same number of entries.
example, label = next(iter(train_dataset.take(1)))
print(len(example), len(label))

64 64


In [13]:
########## Model ##########

In [14]:
# Text-to-token-id layer whose vocabulary is capped at VOCAB_SIZE entries.
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)

# The vocabulary is learned from the review text alone, so the label of each
# (text, label) pair is dropped before adapting.
encoder.adapt(train_dataset.map(lambda review, _label: review))

In [15]:
# Keep the learned vocabulary as an array so it can be indexed by token id,
# then show its first ten entries.
vocab = np.asarray(encoder.get_vocabulary())
print(vocab[:10])

['' '[UNK]' 'the' 'and' 'a' 'of' 'to' 'is' 'in' 'it']


In [16]:
# Vectorize the batch of reviews left in `example` by the batch-inspection loop above.
# NOTE(review): this relies on that earlier cell having run in the current kernel session.
encoded_example = encoder(example)
print(encoded_example)

tf.Tensor(
[[ 11   7  87 ...   0   0   0]
 [252 108   1 ...   0   0   0]
 [ 10  26   6 ...   0   0   0]
 ...
 [ 10 731   1 ...   0   0   0]
 [ 10  67   1 ...   0   0   0]
 [ 11   2   1 ...   0   0   0]], shape=(64, 860), dtype=int64)


In [17]:
# Raw bytes of the first review in the batch, for comparison with its encoded form.
example[0].numpy()

b'This is how I feel about the show.<br /><br />I started watching the show in reruns in 2001.<br /><br />I enjoy the show but it had too many faults.<br /><br />I HATE THE MICHELLE & JOEY CHARACTERS!<br /><br />Stealing story lines from old TV shows. They even stole from "The Partirdge Family." Then in 1 episode "The Partridge Family" was mentioned.<br /><br />Actors playing different roles in different episodes. MTV Martha Quinn the most notable doing this, especially when she played herself in 1 episode.<br /><br />The Michelle character COULD NOT take a joke but then they had this little kid act out "revenge" to her sisters for a joke by them on her.<br /><br />Story lines that came & went in 1 episode. Joey getting the TV show with Frankie & Annette, never heard from it again after that. Danny all of a sudden playing the guitar. 1 episode he is coaching soccer, 1 episode he is coaching softball/baseball. 1 game & you are out huh Danny? <br /><br />Jesse & Joey keep getting jobs RE

In [18]:
# Map the first review's token ids back to vocabulary words.
# Id 0 is the padding token (an empty string in `vocab`), so it is skipped to avoid a
# long run of trailing spaces; `.numpy()` makes the tensor-to-array conversion explicit
# instead of indexing a NumPy array with a TensorFlow tensor.
" ".join(vocab[token_id] for token_id in encoded_example[0].numpy() if token_id != 0)

'this is how i feel about the [UNK] br i started watching the show in [UNK] in [UNK] br i enjoy the show but it had too many [UNK] br i hate the [UNK] [UNK] [UNK] br [UNK] story lines from old tv shows they even [UNK] from the [UNK] family then in 1 episode the [UNK] family was [UNK] br actors playing different roles in different episodes [UNK] [UNK] [UNK] the most [UNK] doing this especially when she played herself in 1 [UNK] br the [UNK] character could not take a joke but then they had this little kid act out [UNK] to her [UNK] for a joke by them on [UNK] br story lines that came went in 1 episode [UNK] getting the tv show with [UNK] [UNK] never heard from it again after that [UNK] all of a [UNK] playing the [UNK] 1 episode he is [UNK] [UNK] 1 episode he is [UNK] [UNK] 1 game you are out [UNK] [UNK] br br [UNK] [UNK] keep getting [UNK] really quickly with no experience only in a tv [UNK] br i did like the [UNK] [UNK] characters wish [UNK] [UNK] could have [UNK] from [UNK] [UNK] [UNK

In [19]:
# Model pipeline, layer by layer:
#   1. `encoder` converts raw text to a sequence of token indices.
#   2. The embedding layer stores one trainable vector per vocabulary entry and turns
#      each sequence of indices into a sequence of vectors; `mask_zero=True` marks
#      index 0 (padding) as masked.
#   3. The RNN (LSTM) processes the sequence by iterating through its elements, passing
#      the output of one timestep to the input of the next. The Bidirectional wrapper
#      propagates the input forwards and backwards through the RNN layer and then
#      concatenates the final outputs.
#   4. A ReLU dense layer followed by a single sigmoid unit produces the prediction.
model = tf.keras.models.Sequential()
model.add(encoder)
model.add(
    tf.keras.layers.Embedding(
        input_dim=encoder.vocabulary_size(),
        output_dim=64,
        mask_zero=True,
    )
)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [20]:
# The model ends in a sigmoid unit, so the loss is given probabilities rather than logits.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

In [21]:
# Configure training: Adam optimizer, the binary cross-entropy loss defined above,
# and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss=loss,
    metrics=['accuracy'],
)

In [22]:
# Train for 10 epochs. `validation_steps=30` limits each per-epoch validation pass
# to 30 batches of the test split; the full test split is scored separately afterwards.
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
# Score the trained model on the whole test split; the result is [loss, accuracy].
model.evaluate(test_dataset, verbose=2)

391/391 - 46s - loss: 0.3812 - accuracy: 0.8524


[0.3812360465526581, 0.8524399995803833]

In [25]:
# Run the model on a hand-written review.
# The text is passed as a tf.string tensor (the same dtype the dataset yields) rather
# than as a NumPy unicode array, so the input type matches what the model was trained on.
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
model.predict(tf.constant([sample_text]))

array([[0.5769514]], dtype=float32)

In [34]:
# Run the model on a short negative review, passed as a tf.string tensor (the same
# dtype the dataset yields) rather than as a NumPy unicode array.
model.predict(tf.constant(['The movie was horrible!']))

array([[0.00467615]], dtype=float32)