In [9]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
!rm -r aclImdb/train/unsup

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  33.8M      0  0:00:02  0:00:02 --:--:-- 33.8M


In [10]:
import numpy as np
import re
import string

import tensorflow as tf
import torch

In [11]:
torch.cuda.get_device_name(0)

'Tesla K80'

In [12]:
BATCH_SIZE = 32
MAX_FEATURES = 20000
EMBEDDING_DIM = 128
SEQUENCE_LENGTH = 500

In [13]:
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    subset="training",
    seed=1337
)

raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    subset="validation",
    seed=1337
)

raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=BATCH_SIZE
)

print(f"Number of batches in raw_train_ds {tf.data.experimental.cardinality(raw_train_ds)}")
print(f"Number of batches in raw_val_ds {tf.data.experimental.cardinality(raw_val_ds)}")
print(f"Number of batches in raw_train_ds {tf.data.experimental.cardinality(raw_test_ds)}")

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
Number of batches in raw_train_ds 625
Number of batches in raw_val_ds 157
Number of batches in raw_train_ds 782


In [14]:
for text_batch, label_batch in raw_train_ds.take(1):
  for i in range(5):
    print(text_batch.numpy()[i].decode("utf-8"))
    print(label_batch.numpy()[i])
    print()

I've seen tons of science fiction from the 70s; some horrendously bad, and others thought provoking and truly frightening. Soylent Green fits into the latter category. Yes, at times it's a little campy, and yes, the furniture is good for a giggle or two, but some of the film seems awfully prescient. Here we have a film, 9 years before Blade Runner, that dares to imagine the future as somthing dark, scary, and nihilistic. Both Charlton Heston and Edward G. Robinson fare far better in this than The Ten Commandments, and Robinson's assisted-suicide scene is creepily prescient of Kevorkian and his ilk. Some of the attitudes are dated (can you imagine a filmmaker getting away with the "women as furniture" concept in our oh-so-politically-correct-90s?), but it's rare to find a film from the Me Decade that actually can make you think. This is one I'd love to see on the big screen, because even in a widescreen presentation, I don't think the overall scope of this film would receive its due. Ch

In [15]:
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, "[%s]" % re.escape(string.punctuation), ""
    )

In [16]:
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=MAX_FEATURES,
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH
)

text_ds = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)

In [17]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [18]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

train_ds = train_ds.cache().prefetch(buffer_size=10)
val_ds = val_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)

In [19]:
inputs = tf.keras.Input(shape=(None, ), dtype="int64")

x = tf.keras.layers.Embedding(MAX_FEATURES, EMBEDDING_DIM)(inputs)
x = tf.keras.layers.Dropout(0.5)(x)

x = tf.keras.layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = tf.keras.layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)

x = tf.keras.layers.Dense(128, activation="relu")(x)
x = tf.keras.layers.Dropout(0.5)(x)

predictions = tf.keras.layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [20]:
epochs = 5
model.fit(train_ds, validation_data=val_ds, epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f22c0cb2090>

In [21]:
model.evaluate(test_ds)



[0.13627989590168, 0.9703599810600281]