# Sentiment Analysis

Download the data using the following command.

In [1]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  21.6M      0  0:00:03  0:00:03 --:--:-- 21.6M


In [2]:
!rm -r aclImdb/train/unsup

In [3]:
import os, pathlib, shutil, random
from tensorflow.keras.layers import TextVectorization
from tensorflow import keras
from tensorflow.keras import layers, Model
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.optimizers import Adam
import keras_nlp

The following will create a validation data set to use when training the models.

In [4]:
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"

for category in ("neg", "pos"):
  os.makedirs(val_dir / category)
  files = os.listdir(train_dir / category)
  random.Random(1337).shuffle(files)
  num_val_samples = int(0.2 * len(files))
  val_files = files[-num_val_samples:]
  for fname in val_files:
    shutil.move(train_dir / category / fname,
                val_dir / category / fname)

In [5]:
batch_size = 32
train_ds = keras.utils.text_dataset_from_directory("aclImdb/train", batch_size=batch_size)
val_ds = keras.utils.text_dataset_from_directory("aclImdb/val", batch_size=batch_size)
test_ds = keras.utils.text_dataset_from_directory("aclImdb/test", batch_size=batch_size)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


## MLP / Feed Forward Simple Network

The following chunks of code will include different feed forward architectures. The model will be the same, but the vectorization will change

The simplest model possible, with just a relu layer and a dropout for regularization

In [6]:
def get_model(max_tokens=20000, hidden_dim=16):
  inputs = keras.Input(shape=(max_tokens,))
  x = layers.Dense(hidden_dim, activation="relu")(inputs)
  x = layers.Dropout(0.5)(x)
  outputs = layers.Dense(1, activation="sigmoid")(x)

  model = keras.Model(inputs, outputs)
  model.compile(optimizer="adam",
                loss="binary_crossentropy",
                metrics=["accuracy"])

  return model

### MLP: 1 Gram

Vectorizing the data, no order and selecting the most used 20000 words

In [7]:
text_vectorization = TextVectorization(
    max_tokens=20000,
    output_mode="multi_hot",)

text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)

binary_1gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_1gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_1gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

In [8]:
model = get_model()
model.summary()

callbacks = [
    keras.callbacks.ModelCheckpoint("binary_1gram.keras", save_best_only=True),
    keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.2)
]

model.fit(binary_1gram_train_ds.cache(),
          validation_data=binary_1gram_val_ds.cache(),
          epochs=20,
          callbacks=callbacks)

model = keras.models.load_model("binary_1gram.keras")
print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Epoch 1/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.7751 - loss: 0.4816 - val_accuracy: 0.8918 - val_loss: 0.2702 - learning_rate: 0.0010
Epoch 2/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9063 - loss: 0.2427 - val_accuracy: 0.8928 - val_loss: 0.2631 - learning_rate: 0.0010
Epoch 3/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9337 - loss: 0.1752 - val_accuracy: 0.8938 - val_loss: 0.2741 - learning_rate: 0.0010
Epoch 4/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9564 - loss: 0.1283 - val_accuracy: 0.8920 - val_loss: 0.3027 - learning_rate: 0.0010
Epoch 5/20
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9708 - loss: 0.0939 - val_accuracy: 0.8948 - val_loss: 0.3160 - learning_rate: 2.0000e-04
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 

### MLP: 2 gram

Using N-gram = 2 to keep some of the order when vectorizing

In [12]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="multi_hot",
    )

text_vectorization.adapt(text_only_train_ds)

Slight increase, order matters

In [13]:
binary_2gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_2gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
binary_2gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("binary_2gram.keras", save_best_only=True),
    keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.2)
]
model.fit(binary_2gram_train_ds.cache(),
          validation_data=binary_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)

model = keras.models.load_model("binary_2gram.keras")
print(f"Test acc: {model.evaluate(binary_2gram_test_ds)[1]:.3f}")

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7850 - loss: 0.4660 - val_accuracy: 0.8936 - val_loss: 0.2549 - learning_rate: 0.0010
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9235 - loss: 0.2070 - val_accuracy: 0.8960 - val_loss: 0.2521 - learning_rate: 0.0010
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9481 - loss: 0.1457 - val_accuracy: 0.8928 - val_loss: 0.2708 - learning_rate: 0.0010
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9640 - loss: 0.1026 - val_accuracy: 0.8940 - val_loss: 0.3138 - learning_rate: 0.0010
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9741 - loss: 0.0739 - val_accuracy: 0.8962 - val_loss: 0.3190 - learning_rate: 2.0000e-04
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 

In [None]:
model.save("binary_2gram.h5")

In [14]:
import json

# Get config
config = text_vectorization.get_config()

# Get vocabulary
vocab = text_vectorization.get_vocabulary()

# Save config to JSON
with open("text_vectorization_config.json", "w") as f:
    json.dump(config, f)

# Save vocabulary to plain text or JSON
with open("text_vectorization_vocab.json", "w") as f:
    json.dump(vocab, f)


### MLP: 2 gram + tf_idf

By adding tf_idf, we also have in consideration the amount of time a word appears in the same sentence, in relation to the amount of time that word appears in the whole dataset

In [15]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode="tf_idf")

text_vectorization.adapt(text_only_train_ds)

Small worse value, most of the times the value should be higher for NLP

In [16]:
tfidf_2gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
tfidf_2gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
tfidf_2gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

model = get_model()
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("tfidf_2gram.keras", save_best_only=True),
    keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.2)
]
model.fit(tfidf_2gram_train_ds.cache(),
          validation_data=tfidf_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks)

model = keras.models.load_model("tfidf_2gram.keras")
print(f"Test acc: {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}")

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.6798 - loss: 0.6960 - val_accuracy: 0.8894 - val_loss: 0.3011 - learning_rate: 0.0010
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8177 - loss: 0.3536 - val_accuracy: 0.8958 - val_loss: 0.2638 - learning_rate: 0.0010
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8633 - loss: 0.2901 - val_accuracy: 0.8924 - val_loss: 0.2690 - learning_rate: 0.0010
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8735 - loss: 0.2623 - val_accuracy: 0.8978 - val_loss: 0.2684 - learning_rate: 0.0010
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8867 - loss: 0.2288 - val_accuracy: 0.8980 - val_loss: 0.2752 - learning_rate: 2.0000e-04
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 

### MLP Testing

Use the following piece of code to test different sentences

In [17]:
inputs = keras.Input(shape=(1,), dtype="string")
processed_inputs = text_vectorization(inputs)
outputs = model(processed_inputs)
inference_model = keras.Model(inputs, outputs)

import tensorflow as tf
raw_text_data = tf.convert_to_tensor([
["I'm really happy this is going to work really well"],
])
predictions = inference_model(raw_text_data)
print(f"{float(predictions[0] * 100):.2f} percent positive")

61.79 percent positive


---------

## LSTM

Using LSTM - In order to keep a manageable input size, we’ll truncate the inputs after the first 600 words. This is a reasonable choice, since the average review length is 233 words, and only 5% of reviews are longer than 600 words.

Notice that the output mode this time is int, this will keep the order each token appears in the sentence, instead of just keeping a binary value

In [36]:
from tensorflow.keras import layers
max_length = 600
max_tokens = 20000
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
    )

text_vectorization.adapt(text_only_train_ds)
int_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

### LSTM: With embedding


In [37]:
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("embeddings_bidir_lstm.keras", save_best_only=True),
    keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.2)]

model.fit(int_train_ds, validation_data=int_val_ds, epochs=10,callbacks=callbacks)
model = keras.models.load_model("embeddings_bidir_lstm.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 40ms/step - accuracy: 0.6704 - loss: 0.5861 - val_accuracy: 0.6922 - val_loss: 0.5838 - learning_rate: 0.0010
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 40ms/step - accuracy: 0.8255 - loss: 0.4153 - val_accuracy: 0.8170 - val_loss: 0.4305 - learning_rate: 0.0010
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 39ms/step - accuracy: 0.8984 - loss: 0.2794 - val_accuracy: 0.8586 - val_loss: 0.4168 - learning_rate: 0.0010
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 39ms/step - accuracy: 0.9342 - loss: 0.1948 - val_accuracy: 0.8530 - val_loss: 0.4254 - learning_rate: 0.0010
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 39ms/step - accuracy: 0.9580 - loss: 0.1294 - val_accuracy: 0.8516 - val_loss: 0.4795 - learning_rate: 0.0010
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [38]:
model.save("sentiment_LSTM.h5")

import json

# Get config
config = text_vectorization.get_config()

# Get vocabulary
vocab = text_vectorization.get_vocabulary()

# Save config to JSON
with open("sentiment_LSTM_text_vectorization_config.json", "w") as f:
    json.dump(config, f)

# Save vocabulary to plain text or JSON
with open("sentiment_LSTM_text_vectorization_vocab.json", "w") as f:
    json.dump(vocab, f)




### LSTM: Embedding + masking

small increase in performance

In [39]:
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(
input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("embeddings_bidir_lstm.keras", save_best_only=True),
    keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.2)]

model.fit(int_train_ds, validation_data=int_val_ds, epochs=10,callbacks=callbacks)
model = keras.models.load_model("embeddings_bidir_gru_with_masking.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 40ms/step - accuracy: 0.7389 - loss: 0.4991 - val_accuracy: 0.8714 - val_loss: 0.3185 - learning_rate: 0.0010
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 40ms/step - accuracy: 0.9133 - loss: 0.2198 - val_accuracy: 0.8808 - val_loss: 0.2964 - learning_rate: 0.0010
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 39ms/step - accuracy: 0.9595 - loss: 0.1125 - val_accuracy: 0.8672 - val_loss: 0.3471 - learning_rate: 0.0010
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 39ms/step - accuracy: 0.9705 - loss: 0.0838 - val_accuracy: 0.8698 - val_loss: 0.4391 - learning_rate: 0.0010
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 39ms/step - accuracy: 0.9893 - loss: 0.0381 - val_accuracy: 0.8840 - val_loss: 0.4542 - learning_rate: 2.0000e-04
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

### LSTM: Using a pre-trained embedding (Glove)

use the following code to install and set up glove to correspond to our current text vectorization

In [23]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2025-05-16 15:58:06--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-05-16 15:58:06--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-05-16 15:58:07--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [24]:
import numpy as np

path_to_glove_file = "glove.6B.100d.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
  for line in f:
    word, coefs = line.split(maxsplit=1)
    coefs = np.fromstring(coefs, "f", sep=" ")
    embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


In [None]:
import numpy as np

path_to_glove_file = "glove.6B.100d.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
  for line in f:
    word, coefs = line.split(maxsplit=1)
    coefs = np.fromstring(coefs, "f", sep=" ")
    embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

embedding_dim = 100

vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
  if i < max_tokens:
    embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

embedding_layer = layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
    )

inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("glove_embeddings_sequence_model.keras", save_best_only=True),
    keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.2)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=30, callbacks=callbacks)
model = keras.models.load_model("glove_embeddings_sequence_model.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Model itself, no greater result than the simple model.

In [None]:
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

model.summary()
callbacks = [
    keras.callbacks.ModelCheckpoint("glove_embeddings_sequence_model.keras", save_best_only=True),
    keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.2)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=30, callbacks=callbacks)
model = keras.models.load_model("glove_embeddings_sequence_model.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 45ms/step - accuracy: 0.6432 - loss: 0.6168 - val_accuracy: 0.7892 - val_loss: 0.4510 - learning_rate: 0.0010
Epoch 2/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 45ms/step - accuracy: 0.8037 - loss: 0.4405 - val_accuracy: 0.8168 - val_loss: 0.4034 - learning_rate: 0.0010
Epoch 3/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 39ms/step - accuracy: 0.8317 - loss: 0.3921 - val_accuracy: 0.8178 - val_loss: 0.4055 - learning_rate: 0.0010
Epoch 4/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 39ms/step - accuracy: 0.8449 - loss: 0.3636 - val_accuracy: 0.8296 - val_loss: 0.4075 - learning_rate: 0.0010
Epoch 5/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 44ms/step - accuracy: 0.8656 - loss: 0.3240 - val_accuracy: 0.8484 - val_loss: 0.3360 - learning_rate: 2.0000e-04
Epoch 6/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
model.save("sentiment_glove.h5")

import json

# Get config
config = text_vectorization.get_config()

# Get vocabulary
vocab = text_vectorization.get_vocabulary()

# Save config to JSON
with open("sentiment_glove_text_vectorization_config.json", "w") as f:
    json.dump(config, f)

# Save vocabulary to plain text or JSON
with open("sentiment_glove_text_vectorization_vocab.json", "w") as f:
    json.dump(vocab, f)


## Transformer

Preprocess the text data to make it a simpler model
We will only consider the most common 10,000 words
Allow for a maximum sequence length of 250
This keeps the model simple and faster

In [27]:
max_features = 20000
sequence_length = 600
vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length
)

# Adapt the layer to the training text
train_text = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(train_text)

This code processes the text data and converts it into a sequence of integers. The integers represent the words in the vocabular. The labels are kept as they are We will use the `map` function to apply the vectorization to the training and test datasets. The `cache` function caches the data in memory. The `prefetch` function allows the data to be loaded in the background while the model is training. This speeds up the training process. The `AUTOTUNE` parameter allows TensorFlow to automatically tune the prefetching to the available resources

In [28]:
def vectorize_text(text, label):
    text = vectorize_layer(text)
    return text, label

train_ds_2 = train_ds.map(vectorize_text).cache().prefetch(tf.data.AUTOTUNE)
test_ds_2 = test_ds.map(vectorize_text).cache().prefetch(tf.data.AUTOTUNE)
val_ds_2 = val_ds.map(vectorize_text).cache().prefetch(tf.data.AUTOTUNE)

# Now the data was trasnformed into a sequence of integers
for batch in train_ds.take(1):
    texts, labels = batch
    print("First text:",  texts[0])
    print("First label:", labels[0])

First text: tf.Tensor(b'Oh yes, Sakura Killers is a goofy, horrible ninja movie, make no mistake. But it\'s also an incredibly enjoyable one. This is largely thanks to the awesome presence of one Chuck Connors, who is billed as starring in the movie but really only shines in a few scenes. I suppose he\'s supposed to be sort of an Obi Wan Kenobi type ("The tough ninja-buster", the box copy exclaims) but his \'wisdom\' is laughable. "Move without thinking"??? My friend says this is the sign of mental retardation, not of supreme concentration.<br /><br />But really, his two aides, Sonny and Dennis, have such horrible dialogue that \'Brooklyn\', as we call The Colonel, tends to shine in comparison. Especially watch for Dennis\' logic regarding the \'genetic splicing\' the Sakura are involved with. If you know anything about cloning you will die laughing. And yes, this is a major plot point, folks.<br /><br />A terribly fun movie, Sakura Killers is a hard-to-find gem. I won\'t spoil the \'t

In [29]:
def get_positional_encoding(seq_length, d_model):
    # Calculate positional encoding
    positions = np.arange(seq_length)[:, np.newaxis]
    depths = np.arange(d_model)[np.newaxis, :] // 2 * 2  # Integer division

    # Create angle rates
    angle_rates = 1 / np.power(10000, (2 * (depths // 2)) / np.float32(d_model))
    angle_rads = positions * angle_rates

    # Apply sin/cos to even/odd indices
    pos_encoding = np.zeros(angle_rads.shape)
    pos_encoding[:, 0::2] = np.sin(angle_rads[:, 0::2])
    pos_encoding[:, 1::2] = np.cos(angle_rads[:, 1::2])

    return tf.cast(pos_encoding, dtype=tf.float32)


# Create a basic transformer model
def create_transformer_model():
    inputs = layers.Input(shape=(sequence_length,))

    embdding_dim = 128

    # Embedding layer for text
    embedding_layer = layers.Embedding(max_features,  embdding_dim,  mask_zero=True)(inputs)

    # Embedding layer for positional encoding
    pos_encoding = get_positional_encoding(sequence_length,  embdding_dim)
    pos_encoding = tf.keras.backend.constant(pos_encoding)

    embedded_with_pos = embedding_layer + pos_encoding

    # Transformer block
    transformer_block = layers.MultiHeadAttention(
        num_heads=4, key_dim=64
    )(embedded_with_pos, embedded_with_pos)

    norm_transformer_block = layers.LayerNormalization(epsilon=1e-6)(transformer_block + embedded_with_pos)

    # Feed Forward Network
    ffn = layers.Dense(128, activation='relu')(norm_transformer_block)
    ffn = layers.Dense(128)(ffn)
    ffn = layers.Dropout(0.1)(ffn)
    ffn_output = layers.LayerNormalization(epsilon=1e-6)(norm_transformer_block + ffn)

    # Global pooling and output
    pooled = layers.GlobalAveragePooling1D()(ffn_output)
    dropout = layers.Dropout(0.1)(pooled)
    outputs = layers.Dense(1, activation='sigmoid')(dropout)

    model = Model(inputs=inputs, outputs=outputs)
    return model


# Create and compile the model
model = create_transformer_model()
model.compile(
    optimizer=Adam(1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

Comment the model performance

In [30]:
# Train the model
callbacks = [
    keras.callbacks.ModelCheckpoint("transformer.keras", save_best_only=True),
    keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.2)
]
model.fit(train_ds_2, validation_data=val_ds_2, epochs=30,callbacks=callbacks)
model = keras.models.load_model("transformer.keras")
print(f"Test acc: {model.evaluate(test_ds_2)[1]:.3f}")

Epoch 1/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 19ms/step - accuracy: 0.5051 - loss: 0.7231 - val_accuracy: 0.6262 - val_loss: 0.6846 - learning_rate: 1.0000e-04
Epoch 2/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.5622 - loss: 0.6760 - val_accuracy: 0.7852 - val_loss: 0.4688 - learning_rate: 1.0000e-04
Epoch 3/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.8090 - loss: 0.4208 - val_accuracy: 0.8416 - val_loss: 0.3613 - learning_rate: 1.0000e-04
Epoch 4/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.8651 - loss: 0.3125 - val_accuracy: 0.8496 - val_loss: 0.3484 - learning_rate: 1.0000e-04
Epoch 5/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.8987 - loss: 0.2499 - val_accuracy: 0.8524 - val_loss: 0.3549 - learning_rate: 1.0000e-04
Epoch 6/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━

In [None]:
model.save("sentiment_transformer.h5")

import json

# Get config
config = vectorize_layer.get_config()

# Get vocabulary
vocab = vectorize_layer.get_vocabulary()

# Save config to JSON
with open("sentiment_transformer_text_vectorization_config.json", "w") as f:
    json.dump(config, f)

# Save vocabulary to plain text or JSON
with open("sentiment_transformer_text_vectorization_vocab.json", "w") as f:
    json.dump(vocab, f)


## BERT: A pretrained model

BERT is a know model, capable of getting accuracy scores in the IMDB up to 94 or even 95%. By freezing its hidden layers, we can leverage the already prebuilt weights and train the model only to categorize the sentiment as positve or negative.

In [31]:
bert_preset = "bert_base_en"
sequence_length = 250

# Load BERT tokenizer (does not include preprocessing layers)
tokenizer = keras_nlp.models.BertTokenizer.from_preset(bert_preset)

# Load BERT preprocessor for truncation/padding (tokenizer only + packing)
preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
    bert_preset,
    sequence_length=sequence_length
)

def preprocess_text(text, label):
    encoded = preprocessor(tf.convert_to_tensor(text))
    return (
        {
            "token_ids": encoded["token_ids"],
            "segment_ids": encoded["segment_ids"],
            "padding_mask": encoded["padding_mask"]
        },
        label
    )

In [32]:
train_ds_3 = train_ds.map(preprocess_text).cache().prefetch(tf.data.AUTOTUNE)
val_ds_3 = val_ds.map(preprocess_text).cache().prefetch(tf.data.AUTOTUNE)
test_ds_3 = test_ds.map(preprocess_text).cache().prefetch(tf.data.AUTOTUNE)

An additional dense layer was added to the model, in order to get better results.

In [None]:
input_token_ids = keras.Input(shape=(sequence_length,), dtype=tf.int32, name="token_ids")
input_segment_ids = keras.Input(shape=(sequence_length,), dtype=tf.int32, name="segment_ids")
input_padding_mask = keras.Input(shape=(sequence_length,), dtype=tf.int32, name="padding_mask")

inputs = {
    "token_ids": input_token_ids,
    "segment_ids": input_segment_ids,
    "padding_mask": input_padding_mask
}

# Frozen BERT backbone
bert_backbone = keras_nlp.models.BertBackbone.from_preset("bert_base_en")
bert_backbone.trainable = False

# Forward pass
encoder_outputs = bert_backbone(inputs)
cls_output = encoder_outputs["pooled_output"]

# Classification head
x = layers.Dense(256, activation="relu")(cls_output)
x = layers.Dropout(0.3)(x)
output = layers.Dense(1, activation="sigmoid")(x)


# Define model
model = keras.Model(inputs=inputs, outputs=output)
model.compile(
    optimizer=keras.optimizers.Adam(3e-5),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()

In [34]:
callbacks = [
    keras.callbacks.ModelCheckpoint("bert_static_preprocessed.keras", save_best_only=True),
    keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.2)
]

model.fit(train_ds_3, validation_data=val_ds_3, epochs=50, callbacks=callbacks)


Epoch 1/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 64ms/step - accuracy: 0.5403 - loss: 0.7114 - val_accuracy: 0.6450 - val_loss: 0.6411 - learning_rate: 3.0000e-05
Epoch 2/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - accuracy: 0.6174 - loss: 0.6547 - val_accuracy: 0.6640 - val_loss: 0.6249 - learning_rate: 3.0000e-05
Epoch 3/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - accuracy: 0.6457 - loss: 0.6347 - val_accuracy: 0.6766 - val_loss: 0.6126 - learning_rate: 3.0000e-05
Epoch 4/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - accuracy: 0.6587 - loss: 0.6229 - val_accuracy: 0.6806 - val_loss: 0.6041 - learning_rate: 3.0000e-05
Epoch 5/50
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 46ms/step - accuracy: 0.6637 - loss: 0.6177 - val_accuracy: 0.6834 - val_loss: 0.5972 - learning_rate: 3.0000e-05
Epoch 6/50
[1m625/625[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x7a2b9ce00c90>

In [35]:
model = keras.models.load_model("bert_static_preprocessed.keras")
print(f"Test accuracy: {model.evaluate(test_ds_3)[1]:.3f}")


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 33ms/step - accuracy: 0.7359 - loss: 0.5276
Test accuracy: 0.735


In [None]:
model.save("sentiment_bert.h5")

import json

# Get config
config = preprocessor.get_config()

# Save config to JSON
with open("sentiment_bert_text_vectorization_config.json", "w") as f:
    json.dump(config, f)