In [1]:
import tensorflow as tf
import numpy as np
import tensorflow_datasets as tfds

In [2]:
########## Dataset ##########

In [3]:
# Silence tfds progress bars for the dataset download/prepare step that follows.
tfds.disable_progress_bar()

In [4]:
# Load the IMDB reviews dataset through tensorflow_datasets.
#   with_info=True     -> the call returns a (dataset, info) pair; `info` holds metadata
#   as_supervised=True -> each element is a (text, label) 2-tuple
dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)

In [5]:
# Display the free-text description stored in the dataset metadata.
info.description

'Large Movie Review Dataset.\nThis is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.'

In [6]:
# Pull the two splits out of the mapping returned by tfds.load.
train_dataset = dataset['train']
test_dataset = dataset['test']

In [7]:
# Inspect the per-element structure: a pair of scalar tensors (string text, int64 label).
train_dataset.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [8]:
# Peek at a single raw (not yet batched) training element: review bytes + label.
# NOTE(review): the sample shown is a negative review with label 0 —
# presumably 0 = negative, 1 = positive; confirm against the dataset card.
for example, label in train_dataset.take(1):
    print('example: ', example.numpy())
    print('label: ', label.numpy())

example:  b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
label:  0


In [9]:
BUFFER_SIZE = 10000  # number of elements held in the shuffle buffer
BATCH_SIZE = 64      # reviews per batch for both training and evaluation

# Fix TensorFlow's global seed before any shuffling or weight initialisation
# happens, so a Restart & Run All draws the same random numbers each time.
# (Some GPU kernels may still be nondeterministic.)
SEED = 42
tf.random.set_seed(SEED)

In [10]:
# Input pipeline stages:
#   shuffle  - keeps a buffer of BUFFER_SIZE elements and samples randomly from
#              it, refilling each drawn slot with the next element.
#   batch    - groups consecutive elements into batches of BATCH_SIZE.
#   prefetch - prepares upcoming elements while the current one is consumed.
#
# NOTE: both names are rebound to their batched versions, so re-running this
# cell on its own would batch already-batched data; re-run the split cell first.

# Training data is shuffled before batching.
train_dataset = (
    train_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Test data keeps its original order; it is only batched and prefetched.
test_dataset = (
    test_dataset
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [11]:
# Sanity-check one training batch: texts and labels should both have BATCH_SIZE entries.
# NOTE: `example` and `label` stay bound to this batch after the loop; later
# cells (`encoder(example)`, `example[0]`) rely on that leftover binding.
for example, label in train_dataset.take(1):
    print(len(example), len(label))

64 64


In [12]:
########## Model ##########

In [13]:
# Cap on the vocabulary size handed to the vectorization layer.
VOCAB_SIZE = 1000

# The encoder only needs the review text, so drop the labels before adapting.
text_only_dataset = train_dataset.map(lambda text, label: text)

# Build the text -> token-id layer and fit its vocabulary on the training text.
encoder = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(text_only_dataset)

In [14]:
# Keep the learned vocabulary as a NumPy array so arrays of token ids can
# index into it. The first two entries are '' (index 0) and '[UNK]' (index 1).
vocab = np.asarray(encoder.get_vocabulary())
print(vocab[0:10])

['' '[UNK]' 'the' 'and' 'a' 'of' 'to' 'is' 'in' 'it']


In [15]:
# Vectorize the batch of reviews left bound to `example` by the batch-check loop.
# Each row is a sequence of vocabulary indices with trailing zeros as padding
# (presumably padded to the longest review in the batch).
encoded_example = encoder(example)
print(encoded_example)

tf.Tensor(
[[ 49 557   6 ...   0   0   0]
 [ 10 116   1 ...   0   0   0]
 [ 10  26   1 ...   0   0   0]
 ...
 [ 11  18   7 ...   0   0   0]
 [ 57   1  13 ...   0   0   0]
 [ 10  14 176 ...   0   0   0]], shape=(64, 814), dtype=int64)


In [16]:
# Raw bytes of the first review in the batch, for comparison with its encoding.
example[0].numpy()

b'What happens to washed up rock-n-roll stars in the late 1990\'s? They launch a comeback / reunion tour. At least, that\'s what the members of Strange Fruit, a (fictional) 70\'s stadium rock group do.<br /><br />Tony (Stephen Rea) has the concession on condom vending machines when he runs into the son of the promoter of a famous music festival. It was at that festival in the 70\'s that Strange Fruit broke up. The 70\'s are "retro" and the time is right to wide that wave. He sets off in search of the other members of the band.<br /><br />Part of what broke up the band was the death and replacement of Keith, the lead singer and brilliant song writer. The band was known for its excessive lifestyle and now they are all back amongst the working class from which they came. Beano, the drummer, played by Timothy Spall (who was brilliant in Secrets and Lies) is a layabout, the bass player is a roofer, and their lead singer is still a rocker. While he owns a huge mansion he has been forced to s

In [17]:
# Round-trip the first review: map its token ids back to vocabulary strings.
# Words outside the vocabulary come back as '[UNK]'.
first_review_token_ids = encoded_example[0].numpy()
" ".join(vocab[first_review_token_ids])

'what happens to [UNK] up [UNK] stars in the late [UNK] they [UNK] a [UNK] [UNK] [UNK] at least thats what the [UNK] of strange [UNK] a [UNK] 70s [UNK] rock group [UNK] br [UNK] [UNK] [UNK] has the [UNK] on [UNK] [UNK] [UNK] when he [UNK] into the son of the [UNK] of a famous music [UNK] it was at that [UNK] in the 70s that strange [UNK] [UNK] up the 70s are [UNK] and the time is right to [UNK] that [UNK] he sets off in [UNK] of the other [UNK] of the [UNK] br part of what [UNK] up the [UNK] was the death and [UNK] of [UNK] the lead [UNK] and brilliant song writer the [UNK] was known for its [UNK] [UNK] and now they are all back [UNK] the working class from which they came [UNK] the [UNK] played by [UNK] [UNK] who was brilliant in [UNK] and [UNK] is a [UNK] the [UNK] [UNK] is a [UNK] and their lead [UNK] is still a [UNK] while he [UNK] a huge [UNK] he has been forced to [UNK] it as his [UNK] has not [UNK] [UNK] the lead [UNK] is dead so a young [UNK] is [UNK] to [UNK] [UNK] br somewhat

In [18]:
# Token ids -> 64-dimensional vectors. mask_zero=True marks index 0 as padding
# so downstream layers can skip the zero-padded positions.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=encoder.vocabulary_size(),
    output_dim=64,
    mask_zero=True,
)

# A 64-unit LSTM wrapped in Bidirectional.
recurrent_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))

# Classification head: one hidden ReLU layer, then a single sigmoid unit.
hidden_layer = tf.keras.layers.Dense(64, activation='relu')
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

# The adapted encoder is the first layer, so the model consumes raw strings.
model = tf.keras.models.Sequential([
    encoder,
    embedding_layer,
    recurrent_layer,
    hidden_layer,
    output_layer,
])

In [19]:
# Binary cross-entropy with default settings (inputs are treated as
# probabilities, not logits), matching the model's sigmoid output layer.
loss = tf.keras.losses.BinaryCrossentropy()

In [20]:
# Configure training: Adam optimizer, the loss object defined above, and
# accuracy reported alongside the loss.
model.compile(
    optimizer='adam',
    loss=loss,
    metrics=['accuracy'],
)

In [21]:
# Train for 10 epochs. Validation after each epoch is limited to 30 batches of
# the test set (validation_steps=30) to keep epochs short; the returned History
# object is kept in `history`.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    validation_steps=30,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
# Score the trained model on the full test set (fit() only validated on
# `validation_steps` batches per epoch). verbose=2 prints a single summary line
# instead of a progress bar.
model.evaluate(test_dataset, verbose=2)

In [None]:
# A clearly positive review; the model should return a probability near 1
# if it has learned the task.
sample_text = (
    'The movie was cool. The animation and the graphics '
    'were out of this world. I would recommend this movie.'
)

# Feed a tf.string tensor of shape (1,) rather than a NumPy unicode array:
# it is the same dtype the dataset elements have (see element_spec), so the
# prediction does not depend on how Keras converts NumPy string arrays.
model.predict(tf.constant([sample_text]))