In [2]:
import tensorflow as tf 
from tensorflow import keras
import numpy as np
import tensorflow_datasets as tfds
import os
import time
from pathlib import Path

In [3]:
DOWNLOAD_ROOT = "http://ai.stanford.edu/~amaas/data/sentiment/"
FILENAME = "aclImdb_v1.tar.gz"
DATASET_PATH = Path(keras.utils.get_file(FILENAME, DOWNLOAD_ROOT + FILENAME, extract=False)).parent / "aclImdb"

if not DATASET_PATH.exists():
    filepath = keras.utils.get_file(FILENAME, DOWNLOAD_ROOT + FILENAME, extract=True)
else:
    print("Dataset already downloaded.")

path = DATASET_PATH

Dataset already downloaded.


In [4]:
def review_paths(dirpath):
    return [str(path) for path in dirpath.glob("*.txt")]

train_pos = review_paths(path / "train" / "pos")
train_neg = review_paths(path / "train" / "neg")
test_valid_pos = review_paths(path / "test" / "pos")
test_valid_neg = review_paths(path / "test" / "neg")

In [5]:
test_pos = test_valid_pos[:5000]
test_neg = test_valid_neg[:5000]
valid_pos = test_valid_pos[5000:]
valid_neg = test_valid_neg[5000:]

In [6]:
def imdb_dataset(filepaths_positive, filepaths_negative):
    reviews = []
    labels = []
    for filepaths, label in ((filepaths_negative, 0), (filepaths_positive, 1)):
        for filepath in filepaths:
            with open(filepath, encoding="utf8") as review_file:
                reviews.append(review_file.read())
            labels.append(label)
    return tf.data.Dataset.from_tensor_slices((tf.constant(reviews), tf.constant(labels)))

for X, y in imdb_dataset(train_pos, train_neg).take(3):
    print(X)
    print(y)
    print()

raw_train_ds = imdb_dataset(train_pos, train_neg).shuffle(buffer_size=(len(train_neg) + len(train_pos)))
raw_valid_ds = imdb_dataset(valid_pos, valid_neg).shuffle(buffer_size=(len(train_neg) + len(train_pos)))
raw_test_ds = imdb_dataset(test_pos, test_neg)

tf.Tensor(b"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.", shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int32)

tf.Tensor(b"Airport '77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman Philip Stevens (James Stewart) who is flying them & a bunch of VIP's to his estate in preparation of it being opened to the public a

In [7]:
max_features = 10000
sequence_length = 250

def remove_html(text):
    html_tag_pattern = r'<.*?>'
    text = tf.strings.regex_replace(text, html_tag_pattern, '')
    return text

@tf.function
def tf_standardize(text):
    text_no_html = remove_html(text) 
    text_standardized = tf.strings.regex_replace(text_no_html, r"[^\w\s]", "")
    text_standardized = tf.strings.lower(text_standardized)
    return text_standardized

vectorization_layer = keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
    standardize=tf_standardize
    )

text_ds = imdb_dataset(train_pos, train_neg).map(lambda x, y: x)

vectorization_layer.adapt(text_ds)

In [8]:
def vectorize_text(text, label):
    # text = tf.expand_dims(text, -1) 
    # print(f"Input text shape: {text.shape}")
    return vectorization_layer(text), label

text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch, label_batch
print("Review", first_review)
print("Label", first_label)
vectorized_review = vectorize_text(first_review, first_label)
print("Vectorized review", vectorized_review)

vocab = vectorization_layer.get_vocabulary()
print("0 ---> ", vocab[0])  # padding token
print("1 ---> ", vocab[1])  # [UNK] token (unknown words)
print("1 ---> ", vocab[2])
vectorized_sequence = vectorized_review[0].numpy().squeeze()  # Remove extra dimensions
print(f"{vectorized_sequence[0]} ---> {vocab[vectorized_sequence[0]]}")
print(f"{vectorized_sequence[1]} ---> {vocab[vectorized_sequence[1]]}")
print(f"{vectorized_sequence[2]} ---> {vocab[vectorized_sequence[2]]}")
print('Vocabulary size: {}'.format(len(vectorization_layer.get_vocabulary())))

Review tf.Tensor(b'Judy Holliday struck gold in 1950 withe George Cukor\'s film version of "Born Yesterday," and from that point forward, her career consisted of trying to find material good enough to allow her to strike gold again.<br /><br />It never happened. In "It Should Happen to You" (I can\'t think of a blander title, by the way), Holliday does yet one more variation on the dumb blonde who\'s maybe not so dumb after all, but everything about this movie feels warmed over and half hearted. Even Jack Lemmon, in what I believe was his first film role, can\'t muster up enough energy to enliven this recycled comedy. The audience knows how the movie will end virtually from the beginning, so mostly it just sits around waiting for the film to catch up.<br /><br />Maybe if you\'re enamored of Holliday you\'ll enjoy this; otherwise I wouldn\'t bother.<br /><br />Grade: C', shape=(), dtype=string)
Label tf.Tensor(0, shape=(), dtype=int32)
Vectorized review (<tf.Tensor: shape=(250,), dtype=

In [9]:
for X_batch, y_batch in raw_train_ds.take(1):  # Take one batch of the dataset
    print(X_batch)
    print("Labels: ", y_batch.numpy()) 

tf.Tensor(b'Michael Keaton is "Johnny Dangerously" in this take-off on gangster movies done in 1984. Maureen Stapleton plays his sickly mother, Griffin Dunne is his DA brother, Peter Boyle is his boss, and Marilu Henner is his girlfriend. Other stars include Danny DeVito and Joe Piscopo. Keaton plays a pet store owner in the 1930s who catches a kid stealing a puppy and then tells him, in flashback, how he came to own the pet store. He turned to thievery at a young age to get his mother a pancreas operation ($49.95, special this week) and began working for a mob boss (Boyle). Johnny uses the last name "Dangerously" in the mobster world.<br /><br />There are some hilarious scenes in this film, and Stapleton is a riot as Johnny\'s foul-mouthed mother who needs ever organ in her body replaced. Peter Boyle as Johnny\'s boss gives a very funny performance, as does Griffin Dunne, a straight arrow DA who won\'t "play ball" with crooked Burr (Danny De Vito). As Johnny\'s nemesis, Joe Piscopo is

In [10]:
def load_dataset_with_tfds(batch_size=16):
    dataset, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
    train_ds, val_test_ds = dataset["train"], dataset["test"]

    val_test_size = sum(1 for _ in val_test_ds)

    test_size = int(0.5 * val_test_size)

    test_ds = val_test_ds.take(test_size)
    val_ds = val_test_ds.skip(test_size)

    train_ds = train_ds.map(vectorize_text)
    val_ds = val_ds.map(vectorize_text)
    test_ds = test_ds.map(vectorize_text)

    AUTOTUNE = tf.data.AUTOTUNE

    train_ds = train_ds.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)
    val_ds = val_ds.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)
    test_ds = test_ds.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)
    
    return train_ds, val_ds, test_ds

In [11]:
def prepare_datasets(tfds=False, batch_size=16):
    if tfds:
        return load_dataset_with_tfds(batch_size)
    else:
        train_ds = raw_train_ds.map(vectorize_text)
        valid_ds = raw_valid_ds.map(vectorize_text)
        test_ds = raw_test_ds.map(vectorize_text)

        AUTOTUNE = tf.data.AUTOTUNE

        train_ds = train_ds.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)
        valid_ds = valid_ds.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)
        test_ds = test_ds.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)
        return train_ds, valid_ds, test_ds

In [12]:
def compute_mean_embedding(inputs):
    not_padding = tf.math.count_nonzero(inputs, axis=-1)
    n_words = tf.math.count_nonzero(not_padding, axis=-1, keepdims=True)
    sqrt_n_words = tf.math.sqrt(tf.cast(n_words, tf.float32))
    return tf.reduce_sum(inputs, axis=1) / sqrt_n_words

def build_model(type=0, max_features=10000, embedding_dim=64, network_width_factor=1, l2_reg=0.01, learning_rate=1e-4):
    model = tf.keras.Sequential([
        keras.layers.Embedding(max_features, embedding_dim, mask_zero=True, 
                               embeddings_regularizer=keras.regularizers.l2(l2_reg)),

        keras.layers.Conv1D(network_width_factor * 8, 5, kernel_regularizer=keras.regularizers.l2(l2_reg)),
        keras.layers.BatchNormalization(),
        keras.layers.Activation('relu'),
        keras.layers.MaxPooling1D(pool_size=2),

        keras.layers.Conv1D(network_width_factor * 16, 5, kernel_regularizer=keras.regularizers.l2(l2_reg)),
        keras.layers.BatchNormalization(),
        keras.layers.Activation('relu'),
        keras.layers.MaxPooling1D(pool_size=2),

        keras.layers.Conv1D(network_width_factor * 32, 5, kernel_regularizer=keras.regularizers.l2(l2_reg)),
        keras.layers.BatchNormalization(),
        keras.layers.Activation('relu'),
        keras.layers.MaxPooling1D(pool_size=2),

        keras.layers.GlobalAveragePooling1D() if type == 0 else keras.layers.Lambda(compute_mean_embedding),

        keras.layers.Dense(network_width_factor * 32, kernel_regularizer=keras.regularizers.l2(l2_reg)),
        keras.layers.BatchNormalization(),
        keras.layers.Activation('relu'),
        keras.layers.Dropout(0.5),

        keras.layers.Dense(1, activation='sigmoid')])
    
    optimizer = keras.optimizers.Nadam(learning_rate=learning_rate, )  
    
    model.compile(loss=keras.losses.BinaryCrossentropy(),
                  optimizer=optimizer,
                  metrics=['accuracy'])
    
    return model

# note on GlobalAveragePooling1D: 

# Purpose: This layer takes the average of the embeddings across the 
# sequence dimension. Instead of feeding each word's embedding individually 
# into the dense layers, it computes a single vector by averaging the 
# embeddings of all words in the review. Hence, you for embedding_dim 16
# you get a single 16-dimensional vector for each review.

# Why: This global average pooling effectively reduces the 1D sequence 
# of word embeddings into a single fixed-size vector, which represents 
# the overall sentiment or meaning of the entire review.

# Example: If a sequence is represented as a 250x16 matrix (250 words, 
# each with a 16-dimensional embedding), this layer will compute a single 
# 16-dimensional vector by averaging the embeddings across the 250 words.

In [13]:
def get_log_path():
    root_logdir = os.path.join(os.curdir, "my_logs")
    run_id = time.strftime("run_%Y_%m_%d-%H_%M")
    return os.path.join(root_logdir, run_id)

In [15]:
epochs = 100
run_log_dir = get_log_path()
embedding_dim = 64
learning_rate = 1e-4
l2_reg = 0.02
network_width_factor = 3
batch_size = 16
early_stopping_patience = 10
lr_scheduler_factor = 0.5
lr_scheduler_patience = 5

for i in range(2):
    for j in range(2):
        j = bool(j)
        train_ds, valid_ds, test_ds = prepare_datasets(tfds=j, batch_size=batch_size)   

        if i == 0:
            model_name = "imdb_global_average_pooling"
        else:
            model_name = "imdb_early_mean_embedding"

        if j == 0:
            model_name += "_tfds.keras"
        else:
            model_name += "_manual.keras"

        model_logdir = f"{run_log_dir}/{model_name}"
        os.makedirs(model_logdir, exist_ok=True)
        tensorboard_cb = keras.callbacks.TensorBoard(model_logdir)
        early_stopping_cb = keras.callbacks.EarlyStopping(patience=early_stopping_patience, restore_best_weights=True)
        lr_scheduler = keras.callbacks.ReduceLROnPlateau(factor=lr_scheduler_factor, patience=lr_scheduler_patience)

        model = build_model(i)
        model.fit(train_ds, epochs=epochs, validation_data=valid_ds, callbacks=[tensorboard_cb, early_stopping_cb, lr_scheduler])

        model.save(f"{model_logdir}/{model_name}")

        loss, accuracy = model.evaluate(test_ds)
        print(f"{model_name} Accuracy: {accuracy} Loss: {loss}")

        with open(f"{run_log_dir}/results.txt", "a") as f:
            f.write(f"{model_name} Accuracy: {accuracy} Loss: {loss}\n")

    
with open(f"{run_log_dir}/results.txt", "a") as f:
    f.write(f"\n---------------------------\n")
    f.write(f"NETWORK PARAMETERS:\n")
    f.write(f"---------------------------\n")
    f.write(f"No. of epochs: {epochs}\n")
    f.write(f"Embedding dim: {embedding_dim}\n")
    f.write(f"Learning rate: {learning_rate}\n")
    f.write(f"L2 reg: {l2_reg}\n")
    f.write(f"Network width factor: {network_width_factor}\n")
    f.write(f"Batch size: {batch_size}\n")
    f.write(f"Early stopping patience: {early_stopping_patience}\n")
    f.write(f"LR scheduler factor: {lr_scheduler_factor}\n")
    f.write(f"LR scheduler patience: {lr_scheduler_patience}\n")


Epoch 1/100




[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 32ms/step - accuracy: 0.5197 - loss: 4.6971 - val_accuracy: 0.7117 - val_loss: 2.1691 - learning_rate: 1.0000e-04
Epoch 2/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 38ms/step - accuracy: 0.7344 - loss: 1.8792 - val_accuracy: 0.8239 - val_loss: 1.2649 - learning_rate: 1.0000e-04
Epoch 3/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 37ms/step - accuracy: 0.8509 - loss: 1.1536 - val_accuracy: 0.8211 - val_loss: 1.0089 - learning_rate: 1.0000e-04
Epoch 4/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 37ms/step - accuracy: 0.8909 - loss: 0.8590 - val_accuracy: 0.7983 - val_loss: 0.9505 - learning_rate: 1.0000e-04
Epoch 5/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 32ms/step - accuracy: 0.9321 - loss: 0.6758 - val_accuracy: 0.8078 - val_loss: 0.9491 - learning_rate: 1.0000e-04
Epoch 6/100
[1m1563/1563[0m [32m━━



[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 34ms/step - accuracy: 0.5493 - loss: 4.7547 - val_accuracy: 0.7908 - val_loss: 2.1879 - learning_rate: 1.0000e-04
Epoch 2/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 27ms/step - accuracy: 0.8087 - loss: 1.9204 - val_accuracy: 0.8324 - val_loss: 1.3626 - learning_rate: 1.0000e-04
Epoch 3/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 27ms/step - accuracy: 0.8738 - loss: 1.2153 - val_accuracy: 0.8355 - val_loss: 1.0782 - learning_rate: 1.0000e-04
Epoch 4/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 26ms/step - accuracy: 0.9073 - loss: 0.9072 - val_accuracy: 0.8211 - val_loss: 0.9867 - learning_rate: 1.0000e-04
Epoch 5/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 26ms/step - accuracy: 0.9407 - loss: 0.7203 - val_accuracy: 0.8074 - val_loss: 1.0109 - learning_rate: 1.0000e-04
Epoch 6/100
[1m1563/1563[0m [32m━━



[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 60ms/step - accuracy: 0.5446 - loss: 5.0869 - val_accuracy: 0.7729 - val_loss: 2.5883 - learning_rate: 1.0000e-04
Epoch 2/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 52ms/step - accuracy: 0.7885 - loss: 2.2561 - val_accuracy: 0.8112 - val_loss: 1.5542 - learning_rate: 1.0000e-04
Epoch 3/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 29ms/step - accuracy: 0.8586 - loss: 1.3641 - val_accuracy: 0.8249 - val_loss: 1.1663 - learning_rate: 1.0000e-04
Epoch 4/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 20ms/step - accuracy: 0.9018 - loss: 0.9846 - val_accuracy: 0.8143 - val_loss: 1.0482 - learning_rate: 1.0000e-04
Epoch 5/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 24ms/step - accuracy: 0.9312 - loss: 0.7709 - val_accuracy: 0.8178 - val_loss: 0.9612 - learning_rate: 1.0000e-04
Epoch 6/100
[1m1563/1563[0m [32m━



[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 26ms/step - accuracy: 0.5667 - loss: 4.7602 - val_accuracy: 0.7838 - val_loss: 2.1803 - learning_rate: 1.0000e-04
Epoch 2/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 20ms/step - accuracy: 0.8047 - loss: 1.8930 - val_accuracy: 0.8213 - val_loss: 1.3217 - learning_rate: 1.0000e-04
Epoch 3/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 20ms/step - accuracy: 0.8650 - loss: 1.1665 - val_accuracy: 0.8296 - val_loss: 1.0259 - learning_rate: 1.0000e-04
Epoch 4/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 19ms/step - accuracy: 0.9060 - loss: 0.8595 - val_accuracy: 0.8273 - val_loss: 0.9455 - learning_rate: 1.0000e-04
Epoch 5/100
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 19ms/step - accuracy: 0.9390 - loss: 0.6796 - val_accuracy: 0.7583 - val_loss: 1.2360 - learning_rate: 1.0000e-04
Epoch 6/100
[1m1563/1563[0m [32m━━