### Import the libraries

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers
from tensorflow.keras import losses
import re
import string
import matplotlib.pyplot as plt

In [2]:
# One seed shared by every library that draws random numbers in this notebook
seed = 42

# Seed TensorFlow's global random generator
tf.random.set_seed(seed)

# Seed NumPy's global random generator
np.random.seed(seed)

print(f"Random seed set to {seed}")

Random seed set to 42


### Load the data

In [8]:
# Root folder of the review dataset; the train/ and test/ splits are read from beneath it
data_dir = './data/aclImdb'

In [None]:
# Build three batched tf.data.Dataset objects from the review text files on disk.
#
# tf.keras.utils.text_dataset_from_directory reads text files that are organised
# as one sub-directory per class:
#   * labels = 'inferred'  -> each file's label comes from the sub-directory it lives in.
#   * label_mode = 'int'   -> labels are encoded as integers ('categorical' would give
#                             one-hot vectors, 'binary' expects exactly two classes).
#   * validation_split / subset -> hold back 20% of train/ for validation. The training
#                             and validation calls must use the same seed so that the
#                             two subsets do not overlap.

# Training subset (80% of train/)
raw_training_set = tf.keras.utils.text_dataset_from_directory(
    f'{data_dir}/train',
    labels = 'inferred',
    label_mode = 'int',
    batch_size = 32,
    validation_split = 0.2,
    subset = 'training',
    seed = seed
)

# Validation subset (the remaining 20% of train/)
raw_validation_set = tf.keras.utils.text_dataset_from_directory(
    f'{data_dir}/train',
    labels = 'inferred',
    label_mode = 'int',
    batch_size = 32,
    validation_split = 0.2,
    subset = 'validation',
    seed = seed
)

# Create the test set
raw_test_set = tf.keras.utils.text_dataset_from_directory(
    f'{data_dir}/test',
    labels = 'inferred',
    label_mode = 'int',
    batch_size = 32
)

'''
Found 5000 files belonging to 2 classes.
Using 4000 files for training.
Found 5000 files belonging to 2 classes.
Using 1000 files for validation.
Found 5000 files belonging to 2 classes.
'''

In [None]:
# Show which sub-directory name each integer label stands for
for label_id in range(2):
    print(f'Label {label_id} corresponds to {raw_training_set.class_names[label_id]}')

'''
Label 0 corresponds to neg
Label 1 corresponds to pos
'''

In [None]:
# Pull a single batch from the dataset and print its first three reviews with their labels
for text_batch, label_batch in raw_training_set.take(1):
    reviews = text_batch.numpy()
    review_labels = label_batch.numpy()
    for i in range(3):
        print(f"Review:\n {reviews[i]}")
        print(f"Label :\n {review_labels[i]}")

'''
Review:
 b'This is a reunion, a team, and a great episode of Justice. From hesitation to resolution,
 Clark has made a important leap from a troubled teenager who was afraid of a controlled destiny,
 to a Superman who, like Green Arrow, sets aside his emotions to his few loved ones, ready to save
 the whole planet. This is not just a thrilling story about teamwork, loyalty, and friendship;
 this is also about deciding what\'s more important in life, a lesson for Clark. I do not want the
 series to end, but I hope the ensuing episodes will strictly stick to what Justice shows without any
 "rewind" pushes and put a good end here of Smallville---and a wonderful beginning of Superman.<br /
 ><br />In this episode, however, we should have seen more contrast between Lex and the Team. Nine
 stars should give it enough credit.'
Label: 1

Review:
b'"Hey Babu Riba" is a film about a young woman, Mariana (nicknamed "Esther" after a famous American
movie star), and four young men, Glenn, Sacha, Kicha, and Pop, all perhaps 15-17 years old in 1953
Belgrade, Yugoslavia. The five are committed friends and crazy about jazz, blue jeans, or anything
American it seems.<br /><br />The very close relationship of the teenagers is poignant, and ultimately
a sacrifice is willingly made to try to help one of the group who has fallen on unexpected difficulties.
In the wake of changing communist politics, they go their separate ways and reunite in 1985 (the year
before the film was made).<br /><br />I enjoyed the film with some reservations. The subtitles for one
thing were difficult. Especially in the beginning, there were a number of dialogues which had no subtitles
at all. Perhaps the conversational pace required it, but I couldn\'t always both read the text and absorb
the scene, which caused me to not always understand which character was involved. I watched the movie (a video
from our public library) with a friend, and neither of us really understood part of the story about acquiring
streptomycin for a sick relative.<br /><br />This Yugoslavian coming of age film effectively conveyed the
teenagers\' sense of invulnerability, idealism, and strong and loyal bonds to each other. There is a main flashforward,
and it was intriguing, keeping me guessing until the end as to who these characters were vis-a-vis the 1953 cast, and
what had actually happened.<br /><br />I would rate it 7 out of 10, and would like to see other films by the director,
Jovan Acin (1941-1991).'
Label: 1
'''

### Prepare the Data

In [None]:
# Upper bound on the vocabulary size: passed as max_tokens to the TextVectorization
# layer and as the input dimension of the Embedding layers further down
max_features = 10000

# Text clean-up that the vectorization layer runs before tokenization
def custom_standardization(input_data):
  """Lowercase the text, replace `<br />` with a space and delete punctuation.

  Args:
    input_data: String tensor with the raw text (the TextVectorization layer
      passes its input here when this function is used as `standardize`).

  Returns:
    The cleaned string tensor.
  """
  # Regex character class that matches any single character of string.punctuation.
  # re.escape backslash-escapes the regex metacharacters in that string so they
  # are matched literally, giving a pattern of the form [!"\#\$%...].
  punctuation_pattern = f'[{re.escape(string.punctuation)}]'

  text = tf.strings.lower(input_data)

  # Only the literal `<br />` line-break tag is handled; it becomes a space
  text = tf.strings.regex_replace(text, '<br />', ' ')

  # Punctuation is removed outright (replaced by the empty string)
  return tf.strings.regex_replace(text, punctuation_pattern, '')

# Create a layer that you can use to convert text to vectors.
# TextVectorization is a Keras preprocessing layer that maps raw strings to integer sequences:
#   standardize            - callable applied to the text before it is split into tokens
#   max_tokens             - maximum number of entries kept in the vocabulary
#   output_mode = 'int'    - emit one integer index per token
#   output_sequence_length - every output is padded with zeros or truncated to exactly
#                            this many tokens, so all examples share the same shape
vectorize_layer = layers.TextVectorization(
    max_tokens = max_features,
    standardize = custom_standardization,
    output_sequence_length = 250,
    output_mode = 'int'
)

In [None]:
# Build the vocabulary from the training reviews only.
# The dataset yields (text, label) pairs; adapt() needs just the text, so the
# label is dropped from every pair first.
train_text = raw_training_set.map(lambda text, _label: text)

# adapt() scans train_text, collects the tokens with their frequencies and keeps
# at most max_tokens of them as the layer's vocabulary
vectorize_layer.adapt(train_text)

# Report how many entries the learned vocabulary holds
vocabulary = vectorize_layer.get_vocabulary()
print(f"Vocabulary size : {len(vocabulary)}")

'''
Vocabulary size: 10000
'''

In [None]:
# Mapping helper used to vectorize the text of a (text, label) pair
def vectorize_text(text, label):
  """Turn raw text into integer token ids and pass the label through.

  Args:
    text: String tensor with the review text.
    label: Label that belongs to `text`; returned unchanged.

  Returns:
    Tuple of (output of `vectorize_layer` on `text` with a trailing axis added, `label`).
  """
  text_with_axis = tf.expand_dims(text, axis=-1)
  return vectorize_layer(text_with_axis), label

# Fetch a single batch and keep its first (review, label) pair
text_batch, label_batch = next(iter(raw_training_set))
first_review = text_batch[0]
first_label = label_batch[0]

# Raw review and its class name
print(f"Review: \n{first_review}")
print(f"\nLabel: {raw_training_set.class_names[first_label]}")

# The same example after vectorization
vectorized_example = vectorize_text(first_review, first_label)
print(f"\nVectorized review\n{vectorized_example}")

'''
Review:
b"Okay, so the plot is on shaky ground. Yeah, all right, so there are some randomly inserted song and/or dance sequences
(for example: Adam's concert and Henri's stage act). And Leslie Caron can't really, um, you know... act.<br /><br />But
somehow, 'An American In Paris' manages to come through it all as a polished, first-rate musical--largely on the basis of Gene
Kelly's incredible dancing talent and choreography, and the truckloads of charm he seems to be importing into each scene with
Caron. (He needs to, because she seems to have a... problem with emoting.) <br /><br />The most accomplished and technically
awe-inspiring number in this musical is obviously the 16-minute ballet towards the end of the film. It's stunningly filmed,
and Kelly and Caron dance beautifully. But my favourite number would have to be Kelly's character singing 'I Got Rhythm' with
a bunch of French school-children, then breaking into an array of American dances. It just goes to prove how you don't need special
effects when you've got some real *talent*.<br /><br />Not on the 'classics' level with 'Singin' In The Rain', but pretty high
up there nonetheless. Worth the watch!"

Label: pos

Vectorized review
(<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[ 947,   38,    2,  112,    7,   20, 6022, 1754, 1438,   31,  201,
          38,   46,   24,   47, 6565, 8919,  603, 2928,  831,  858,   15,
         476, 3241, 3010,    4,    1,  892,  478,    4, 3553, 5885,  175,
          63, 6992,   21,  118,  478,   18,  813,   33,  329,    8, 1466,
        1029,    6,  227,  143,    9,   31,   14,    3, 6590, 9055,    1,
          20,    2, 3025,    5, 1996,    1, 1085,  914,  597,    4, 2733,
           4,    2,    1,    5, 1411,   27,  190,    6,   26,    1,   77,
         244,  130,   16, 5885,   27,  731,    6,   80,   53,  190,    6,
          25,    3,  425,   16,    1,    2,   85, 3622,    4, 2603,    1,
         593,    8,   10,  663,    7,  506,    2,    1, 4342, 1089,    2,
         121,    5,    2,   19,   29, 5994,  886,    4, 1561,    4, 5885,
         831, 1415,   18,   55, 1496,  593,   62,   25,    6,   26,    1,
         105,  965,   11,  186, 4687,   16,    3,  862,    5, 1001,    1,
          96, 2442,   77,   33, 7537,    5,  329, 4825,    9,   41,  264,
           6, 2131,   86,   21,   87,  333,  290,  317,   51,  699,  186,
          47,  144,  597,   23,   20,    2, 2008,  557,   16, 7714,    8,
           2, 2477,   18,  179,  307,   57,   46, 2878,  268,    2,  106,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]])>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)
  '''

In [None]:
# Apply the vectorization step to every split
train_ds, val_ds, test_ds = (
    raw_split.map(vectorize_text)
    for raw_split in (raw_training_set, raw_validation_set, raw_test_set)
)

### Configure the Dataset

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# cache()    keeps the vectorized examples after the first pass, so later passes
#            over the dataset do not repeat the text preprocessing.
# prefetch() prepares the data for the next step while the current step is being
#            executed; AUTOTUNE lets TensorFlow pick the buffer size.
# The validation set is iterated once per epoch during fit(), so it gets the same
# treatment as the training and test sets.
train_ds = train_ds.cache().prefetch(buffer_size = AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size = AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size = AUTOTUNE)

### Create a Sequential Model

In [None]:
embedding_dim = 16

# Assemble the classifier one layer at a time
model_sequential = tf.keras.Sequential()

# Embedding matrix: max_features rows (one per token id), embedding_dim columns
model_sequential.add(layers.Embedding(max_features, embedding_dim))

# Average the token vectors of each review into a single vector
model_sequential.add(layers.GlobalAveragePooling1D())

# Single sigmoid unit producing one score per review
model_sequential.add(layers.Dense(1, activation = 'sigmoid'))

# Print out the summary of the model
model_sequential.summary()

'''
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 embedding (Embedding)       (None, None, 16)          160000

 global_average_pooling1d (  (None, 16)                0
 GlobalAveragePooling1D)

 dense (Dense)               (None, 1)                 17

=================================================================
Total params: 160017 (625.07 KB)
Trainable params: 160017 (625.07 KB)
Non-trainable params: 0 (0.00 Byte)
'''

In [None]:
# Binary cross-entropy loss, Adam optimizer, accuracy reported during training
model_sequential.compile(
    optimizer = 'adam',
    loss = losses.BinaryCrossentropy(),
    metrics = ['accuracy']
)

### Create a Model using Functional API

In [None]:
# Create the two intermediate layers first ...
embedding = layers.Embedding(max_features, embedding_dim)
pooling = layers.GlobalAveragePooling1D()

# ... then wire them together: token ids -> embeddings -> average over the sequence
inputs = tf.keras.Input(shape = (None,))
x = pooling(embedding(inputs))

# Output layer: a single sigmoid unit applied to the pooled vector
outputs = layers.Dense(1, activation = 'sigmoid')(x)

# Tie the input and output tensors into a model
model_functional = tf.keras.Model(inputs = inputs, outputs = outputs)

# Print out the summary of the model
model_functional.summary()

'''
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #
=================================================================
 input_1 (InputLayer)        [(None, None)]            0

 embedding_1 (Embedding)     (None, None, 16)          160000

 global_average_pooling1d_1  (None, 16)                0
  (GlobalAveragePooling1D)

 dense_1 (Dense)             (None, 1)                 17

=================================================================
Total params: 160017 (625.07 KB)
Trainable params: 160017 (625.07 KB)
Non-trainable params: 0 (0.00 Byte)
'''

In [None]:
# Same training configuration as the Sequential model:
# binary cross-entropy loss, Adam optimizer, accuracy metric
model_functional.compile(
    loss = losses.BinaryCrossentropy(),
    optimizer = 'adam',
    metrics = ['accuracy']
)

### Train the model

In [None]:
# Select which model you want to train; both variants are built from the same
# layers, so the results should be comparable
model = model_functional

# To train the Sequential variant instead, use:
# model = model_sequential

In [None]:
# Train for a fixed number of epochs, evaluating on the validation split after each one
epochs = 25
history = model.fit(
    x = train_ds,
    epochs = epochs,
    validation_data = val_ds,
    verbose = 2  # one summary line per epoch
)

'''
Epoch 1/25
125/125 - 2s - loss: 0.6904 - accuracy: 0.5490 - val_loss: 0.6869 - val_accuracy: 0.6530 - 2s/epoch - 17ms/step
Epoch 2/25
125/125 - 1s - loss: 0.6792 - accuracy: 0.6955 - val_loss: 0.6731 - val_accuracy: 0.7140 - 757ms/epoch - 6ms/step
Epoch 3/25
125/125 - 1s - loss: 0.6590 - accuracy: 0.7445 - val_loss: 0.6511 - val_accuracy: 0.7390 - 777ms/epoch - 6ms/step
Epoch 4/25
125/125 - 1s - loss: 0.6305 - accuracy: 0.7735 - val_loss: 0.6236 - val_accuracy: 0.7660 - 721ms/epoch - 6ms/step
Epoch 5/25
125/125 - 1s - loss: 0.5970 - accuracy: 0.7908 - val_loss: 0.5944 - val_accuracy: 0.7890 - 730ms/epoch - 6ms/step

....

Epoch 21/25
125/125 - 1s - loss: 0.2271 - accuracy: 0.9510 - val_loss: 0.3579 - val_accuracy: 0.8620 - 701ms/epoch - 6ms/step
Epoch 22/25
125/125 - 1s - loss: 0.2156 - accuracy: 0.9535 - val_loss: 0.3534 - val_accuracy: 0.8610 - 691ms/epoch - 6ms/step
Epoch 23/25
125/125 - 1s - loss: 0.2048 - accuracy: 0.9580 - val_loss: 0.3495 - val_accuracy: 0.8610 - 699ms/epoch - 6ms/step
Epoch 24/25
125/125 - 1s - loss: 0.1946 - accuracy: 0.9622 - val_loss: 0.3460 - val_accuracy: 0.8610 - 717ms/epoch - 6ms/step
Epoch 25/25
125/125 - 1s - loss: 0.1849 - accuracy: 0.9640 - val_loss: 0.3430 - val_accuracy: 0.8610 - 835ms/epoch - 7ms/step
'''

In [None]:
# Score the trained model on the test set
loss, accuracy = model.evaluate(test_ds)

print(f"Loss : {loss}", f"Accuracy : {accuracy}", sep = "\n")

'''
157/157 [==============================] - 2s 12ms/step - loss: 0.3644 - accuracy: 0.8452
Loss: 0.36437687277793884
Accuracy: 0.8452000021934509
'''

In [None]:
def plot_metrics(history, metric):
  """Plot the training and validation curves of one metric and show the figure.

  Args:
    history: Object whose `history` attribute maps metric names to per-epoch
      values (such as the value returned by `model.fit`).
    metric: Name of the training metric; the validation curve is read from
      the `val_<metric>` entry.
  """
  series_names = [metric, f'val_{metric}']

  # Training curve first, validation curve second - same order as the legend
  for series_name in series_names:
    plt.plot(history.history[series_name])

  plt.xlabel('Epochs')
  plt.ylabel(metric.title())
  plt.legend(series_names)
  plt.show()

# Training vs. validation curves for both tracked quantities
plot_metrics(history = history, metric = "accuracy")
plot_metrics(history = history, metric = "loss")

### Prediction

In [None]:
# Make a new sequential model from the vectorization layer and the model just trained,
# so that raw strings can be passed in directly
export_model = tf.keras.Sequential(layers = [vectorize_layer, model])

# Compile the model
export_model.compile(
    optimizer = 'adam',
    # from_logits=True : the loss applies a sigmoid to the predictions before computing
    #                    the binary cross-entropy.
    # from_logits=False: the loss treats the predictions as probabilities (values
    #                    between 0 and 1) and computes the cross-entropy from them directly.
    # The wrapped model ends in a sigmoid layer, hence from_logits=False.
    loss = losses.BinaryCrossentropy(from_logits = False),
    metrics = ['accuracy']
)

In [None]:
# A few hand-written reviews to run through the exported model
examples = [
    'this movie was very, very good',
    'quite ok',
    'the movie was not bad',
    'bad',
    'negative disappointed bad scary',
    'this movie was stupid',
]

results = export_model.predict(examples, verbose=False)

# One line per review: the raw score, the score rounded to a 0/1 label, and the text
for example, result in zip(examples, results):
    probability = result[0]
    predicted_label = int(np.round(probability))
    print(f"Result: {probability:.3f}, Label: {predicted_label} Review: {example}")

'''
Result: 0.625,   Label: 1,   Review: this movie was very, very good
Result: 0.541,   Label: 1,   Review: quite ok
Result: 0.427,   Label: 0,   Review: the movie was not bad
Result: 0.473,   Label: 0,   Review: bad
Result: 0.428,   Label: 0,   Review: negative disappointed bad scary
Result: 0.455,   Label: 0,   Review: this movie was stupid
'''