# IMDB Sentiment analysis

In [1]:
import random
import time
import datetime
import os

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

2024-08-17 10:49:39.959631: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-17 10:49:40.111534: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-17 10:49:40.152941: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-17 10:49:40.272660: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Loading data

In [2]:
def separate_text_and_label(data_path):
    """
    Read all the data and parse it (remove HTML and lowercase)

    :param data_path:
    :return:
    """
    texts = []
    labels = []
    classes = {}
    for category in ['pos', 'neg']:
        classes[category] = []
        full_path = os.path.join(data_path, category)
        for filename in sorted(os.listdir(full_path)):
            if filename.endswith('.txt'):
                with open(os.path.join(full_path, filename)) as file:
                    text = file.read().lower().replace('<br />', '')
                    texts.append(text)
                    classes[category].append(text)
                labels.append(0 if category == 'neg' else 1)

    return texts, labels, classes


train_texts, train_labels, train_classes = separate_text_and_label('aclImdb/train')
test_texts, test_labels, test_classes = separate_text_and_label('aclImdb/test')

timestamp = int(time.time())
random.Random(timestamp).shuffle(train_texts)
random.Random(timestamp).shuffle(train_labels)

## Analyzing the dataset

- Number of samples
- Number of classes,
- Number of samples per class
- Average number of words per sample,
- Distribution of words per category and globally,
- Distribution of the number of words per category and globally.

In [3]:
def get_words(text):
    """
    Get all the words in text
    :param text: the text to extract words from
    :return: list of words
    """
    return text.split()


def number_of_words_from_text(text):
    """
    Get the number of words in text
    :param text: the text to extract words from
    :return: the number of words
    """
    return len(get_words(text))


def flatten_comprehension(list_of_lists):
    """
    Flatten a list of lists
    :param list_of_lists: a list of lists
    :return: a flattened list of lists
    """
    return [item for row in list_of_lists for item in row]

In [4]:
nb_samples = len(train_texts)
classes_list, class_indexes, nb_samples_per_class = np.unique(train_labels, return_index=True, return_counts=True)
nb_classes = len(classes_list)

# np.vectorize maps every element of the array with a specific function
word_counts = np.vectorize(number_of_words_from_text)(train_texts)
# Thanks to this trick, we just have to calculate a mean of all the values
avg_nb_words = np.mean(word_counts)

In [None]:
from collections import Counter


def plot_word_distribution(texts, category):
    """
    Plot the distribution of words and length across samples of text
    :param texts: a list of texts
    :param category: the category of text
    """
    split = list(map(get_words, texts))

    all_words = np.array(flatten_comprehension(split))
    # Count the frequency of each word
    word_counts = Counter(all_words)
    # Extract words and their frequencies
    words, frequencies = zip(*word_counts.most_common(30))

    plt.figure(figsize=(12, 8))  # Increase figure size if needed

    plt.subplot(1, 2, 1)
    plt.bar(words, frequencies)
    plt.title(f'Frequency distribution of words for {category} samples')
    plt.xlabel('Words')
    plt.ylabel('Frequency in all texts')
    plt.xticks(rotation=45)
    plt.tight_layout()  # Adjust layout to fit labels
    plt.xlim(0, 40)

    nb_words_distribution = np.fromiter(map(lambda wds: len(wds), split), dtype=int)

    plt.subplot(1, 2, 2)
    plt.hist(nb_words_distribution, 40)
    plt.xlabel('Number of words')
    plt.ylabel('Number of samples')
    plt.title(f'Distribution of number of words for {category} samples')
    

print(f'Number of samples: {nb_samples}\nNumber of classes_list: {nb_classes}\nNumber of samples per class:')
for i in range(len(classes_list)):
      print(f'- {classes_list[i]}: {nb_samples_per_class[i]}')

print(f'Average number of words per sample: {avg_nb_words}')

for category in train_classes.keys():
    plot_word_distribution(train_classes[category], category)
plot_word_distribution(train_texts, 'all')

## Data pipeline

### N-Grams data structure

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

NUM_FEATURES = 20_000
NGRAM_RANGE = (1, 2)

def vectorize_ngrams(train_texts, train_labels, validation_texts):
    # Step 1 - Feature extraction : we want to extract n-gram frequencies from our texts
    vectorizer = TfidfVectorizer(
        # For more security, we allow 1-grams to be extracted (if the sentence only contains one word for instance)
        ngram_range=NGRAM_RANGE,
        # In case accents are present (for instance, if the comments are from other languages than English)
        # Here we know our data is in English, but this parameter could be useful in the future if we want to generalize
        # ou rmodel
        strip_accents='unicode',
        # If other characters than unicode are detected, just replace them ; the essential is having a good vocabulary
        # without repetitions
        decode_error='replace',
        # 'max_features': 20_000, # Will be treated by a feature selection process later

        # Integer are simpler to compute, and we don't need floats for the moment (we are only processing the data here,
        # not interacting with the coefficients)
        dtype=np.float64,
        # 'norm': None,

        # Remove tokens that shows only once (only keep those which count is more than 2)
        min_df=2
    )

    # This operation consists of:
    # - Fitting: read texts, learn the vocabulary and calculate frequencies
    # - Transforming: parse this data into a matrix
    # The returned value is a matrix of shape (n_samples, n_features) : we can get the number of extracted features
    # right from it. It will then be used for our MLP model
    x_train = vectorizer.fit_transform(train_texts)
    # Here we don't need to learn the vocabulary, this was done at the precedent line
    # (and it could cause problems if some words are different)
    x_validation = vectorizer.transform(validation_texts)

    # Step 2 - Feature selection : we only consider the features that are relevant for our purpose We choose the
    # classification function as we face a classification problem

    # Note the difference between the vectorizer - to return a vector out of another object - and the transformer

    # BE CAREFUL: if the dataset has less than 20k
    # features, we need a fallback -> the number of features already present
    selector = SelectKBest(score_func=f_classif, k=min(NUM_FEATURES, x_train.shape[1]))
    selector.fit(x_train, train_labels)

    x_train = selector.transform(x_train).astype(np.float64)
    x_validation = selector.transform(x_validation).astype(np.float64)

    return x_train, x_validation

### Creating the vocabulary and the vectorization preprocessing

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

num_features = 20_000
max_sequence_length = 500
validation_split = 0.2

count = CountVectorizer(
  lowercase=True,
  min_df=2,
  strip_accents='unicode',
  max_features=num_features
)

count.fit(train_texts)

vocabulary = list(count.vocabulary_.keys())
vectorize_layer = tf.keras.layers.TextVectorization(
  vocabulary=vocabulary,
  output_mode='int',
  output_sequence_length=max_sequence_length
)

def create_dataset(texts, labels):
    # Create a unique dataset for the two parts, features and labels, then use
    # a pre-processing pipeline which TensorFlow will optimize
    return tf.data.Dataset.from_tensor_slices((texts, labels)).batch(32).map(lambda x, y: (vectorize_layer(x), y))

I0000 00:00:1723884608.515898   36361 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1723884608.911237   36361 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1723884608.911910   36361 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1723884608.915200   36361 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

In [6]:

dataset = create_dataset(train_texts, train_labels)
# Be careful when splitting the dataset: .take() and .skip() count in batches, not number of samples
dataset_size = sum(1 for _ in dataset)
validation_size = int(validation_split * dataset_size)
validation_dataset = dataset.take(validation_size)
train_dataset = dataset.skip(validation_size)

test_dataset = create_dataset(test_texts, test_labels)

2024-08-17 10:50:11.805293: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


### Getting a word embedding

In [7]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2024-08-17 10:50:12--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-08-17 10:50:13--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-08-17 10:50:14--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... ^C
replace glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^

In [8]:
embedding_path = 'glove.6B.300d.txt'
embeddings = {}

with open(embedding_path, 'r') as f:
    for line in f.readlines():
        word, coefs = line.split(maxsplit=1)
        vec = np.fromstring(coefs, 'f', sep=' ')
        embeddings[word] = vec
        
embedding_dim = embeddings['the'].shape[0]

embedding_matrix = np.zeros((num_features, embedding_dim))
for i, word in enumerate(vocabulary):
    vec = embeddings.get(word)
    if vec is not None:
        embedding_matrix[i] = vec

## Creating various models

### Skeleton

In [10]:
def get_units_and_activation(num_classes):
    """
        - One class : useless prediction
        - Two classes : a binary choice for one of them (in or out)
        - More classes : we have to give probabilities to belong to each class
    :param num_classes:
    :return: a tuple of (number of outputs, last activation function)
    """

    if num_classes > 2:
        return num_classes, 'softmax'
    else:
        return 1, 'sigmoid'

In [None]:
def compile_classification_model(
        model,
        num_classes):
    """
    Compile a model specifically for classification, using a loss fitted for the problem, and an Adam optimizer

    :param model:
    :param num_classes:
    :return: an array of callbacks to plug when fitting, i.e. early stopping
    """

    # We are in a classification problem, so we might use other losses (because our probabilities are either solids 1 or
    # 0 in our dataset)
    # If we only have two classes, our probability is straightforward : in or out => binary
    loss = tf.keras.losses.BinaryCrossentropy()
    if num_classes > 2:
        loss = tf.keras.losses.SparseCategoricalCrossentropy()
    optimizer = tf.keras.optimizers.Adam(model_params["learning_rate"])

    # Classification problem: we use accuracy as our metric
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

    callbacks = []
    if model_params["early_stopping"] is not None:
        # Stop the training early if the validation loss doesn't decrease in 2 consecutive steps
        callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='loss', patience=model_params["early_stopping"]))

    return callbacks

In [None]:
def fit_model_tensorboard(model,
                          dataset=None,
                          features=None,
                          labels=None,
                          callbacks=None):
    """
    Fit the model using either:
        - a given dataset
        - separate features and labels
    Each one of them will be split into training and validation sets
    All the data will then be exported for TensorBoard analysis

    :param callbacks: additional callbacks in addition to TensorBoard
    :param labels:
    :param features:
    :param dataset: tf.data.Dataset object
    :param model The Keras model to fit
    """

    # Trick given by PEP to have immutable function arguments, and then fill with what we want
    if callbacks is None:
        callbacks = []

    date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    log_dir = f'{model_params["log_dir"]}/{model_params["name"]}-{date}'
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    callbacks.append(tensorboard_callback)

    kwargs = {
        'epochs': model_params["epochs"],
        'verbose': model_params["verbose"],

        # 'batch_size': model_params["batch_size"],
        'callbacks': callbacks
    }

    if dataset is None:
        kwargs['validation_split'] = model_params["validation_split"]
        model.fit(
            tf.Variable(features),
            labels,
            **kwargs
        )
    else:
        dataset_size = sum(1 for _ in dataset)
        dataset = dataset.shuffle(buffer_size=1000)
        validation_size = int(model_params["validation_split"] * dataset_size)

        validation_dataset = dataset.take(validation_size)
        train_dataset = dataset.skip(validation_size)
        kwargs['validation_data'] = validation_dataset
        model.fit(train_dataset, **kwargs)

    # Save model and parameters
    model.save(f'models/{model_params["name"]}-{date}.keras')
    with open(f'models/{model_params["name"]}-{date}.json', 'w') as file:
        json.dump(model_params, file)
    tf.keras.utils.plot_model(model, show_shapes=True)
    print(f'Saved model to models/{model_params["name"]}.keras')

### Multi-layer perceptron (MLP)

In [None]:
def create_sequential(num_layers, units, activation, num_classes, input_shape, dropout_rate, normalization=None):
    op_units, op_activation = get_units_and_activation(num_classes)

    # Simpler to use: add layers as they are created
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=input_shape, sparse=True))
    # model.add(tf.keras.layers.Dropout(rate=dropout_rate))
    if normalization is not None:
        model.add(normalization)

    for i in range(num_layers):
        model.add(tf.keras.layers.Dense(units=units, activation=activation))
        # We should add a Dropout layer here to catch examples before they get to the output
        # model.add(tf.keras.layers.Dropout(rate=dropout_rate))

    model.add(tf.keras.layers.Dense(units=op_units, activation=op_activation))
    return model

In [None]:
def mlp_model(num_classes,
              train_features,
              train_labels,
              layers=2,
              epochs=1000,
              learning_rate=1e-3):
    # Input shape is a (n, m) matrix where :
    # - n is the number of samples
    # - m is the vocabulary size (so the number of probabilities)
    # Hence the input shape of our model = the vocabulary size
    model = create_sequential(layers, 64, 'relu', num_classes, train_features.shape[1:], 0.2)

    callbacks = utils.compile_classification_model(model, num_classes)

    utils.fit_model_tensorboard(model, features=train_features, labels=train_labels, callbacks=callbacks)

### CNN

In [None]:
def get_cnn(num_blocks=3,
            dropout_rate=0.2,
            kernel_size=3,
            num_filters=64,
            pool_size=3):
    
    # ------------------
    # STEP 1: tokenize our dataset to transform it into word sequences (those words are mapped to integers)
    # ------------------
    # This is done using the vectorization layer, which cannot be implemented here (only available to Functional API)
    # Instead, we do the vectorization outside
    
    # The input shape will be deduced when the model is built, i.e., when we compile and evaluate the model
    # for the first time (because it depends on the chosen sequence length)
    inputs = tf.keras.Input(shape=(None,), dtype="int64")
    
    # ------------------
    # STEP 2: create the word embedding so the model can semantically interpret our sentences
    # ------------------
    num_features = len(vocabulary)
    x = tf.keras.layers.Embedding(
        num_features,
        embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
        # mask_zero=True,  # This option serves to ignore zeros in the padding (after having extended sequences to match
                         # length) when passing to convolution blocks, or more efficiently to RNN blocks
    )(inputs)
    
    for i in range(num_blocks):
        x = tf.keras.layers.Dropout(rate=dropout_rate)(x)
        x = tf.keras.layers.SeparableConv1D(
            kernel_size=kernel_size,
            filters=num_filters,
            padding="same",
            activation='relu',
            depthwise_initializer=tf.keras.initializers.RandomUniform,
            bias_initializer=tf.keras.initializers.RandomUniform
        )(x)
        if i < num_blocks - 1:
            x = tf.keras.layers.MaxPool1D(
                pool_size=pool_size
            )(x)
    
    x = tf.keras.layers.GlobalAveragePooling1D()(x)
    x = tf.keras.layers.Dropout(rate=dropout_rate)(x)
    
    # Every model should have a dense layer at the end to return a coherent result
    op_units, op_activation = get_units_and_activation(nb_classes)
    outputs = tf.keras.layers.Dense(units=op_units, activation=op_activation)(x)
    return tf.keras.Model(inputs=inputs, outputs=outputs)

cnn_model = get_cnn()

In [None]:
def train_new_cnn_model(vocabulary,
                        num_classes,
                        train_texts,
                        train_labels):

    model = init_cnn_model(vocabulary, num_classes)
    text_vectorizer = get_text_vectorizer(vocabulary)

    # We create a unique dataset for our two components : features and labels
    train_dataset = tf.data.Dataset.from_tensor_slices((train_texts, train_labels))
    # We then use a pre-processing pipeline which treatment will be optimized by TensorFlow
    train_dataset = train_dataset.batch(model_params["batch_size"]).map(lambda x, y: (text_vectorizer(x), y))

    # If the model doesn't contain a preprocessing layer (so, using the Functional API) we must
    # transform our dataset outside, and then pass it
    callbacks = utils.compile_classification_model(model, num_classes)

    utils.fit_model_tensorboard(model, dataset=train_dataset, callbacks=callbacks)
    print("Model trained. Check statistics on TensorBoard using the logs/fit directory.")
    return model

### Transformer encoder

We don't transform a text into another, so we don't need a decoder

In [11]:
def get_transformer_encoder():
    inputs = tf.keras.Input(shape=(None,), dtype="int64")
    x = tf.keras.layers.Embedding(
        input_dim=num_features,
        output_dim=embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,  # Not to disrupt the already trained representation
        # mask_zero=True # For more efficiency
    )(inputs)
    connection = x
    x = tf.keras.layers.MultiHeadAttention(num_heads=2, key_dim=embedding_dim)(x, x)
    x = tf.keras.layers.add((x, connection))
    x = tf.keras.layers.LayerNormalization()(x)
    
    residual = x
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = tf.keras.layers.Dense(embedding_dim)(x)  # Must have the same size as the residual connection
    x = tf.keras.layers.add((x, residual))
    x = tf.keras.layers.LayerNormalization()(x)
    x = tf.keras.layers.GlobalMaxPooling1D()(x)
    x = tf.keras.layers.Dropout(rate=0.5)(x)
    
    op_units, op_activation = get_units_and_activation(nb_classes)
    outputs = tf.keras.layers.Dense(units=op_units, activation=op_activation)(x)
    return tf.keras.Model(inputs=inputs, outputs=outputs)
    
    
tf.keras.config.disable_traceback_filtering()
transformer_model = get_transformer_encoder()

In [12]:
transformer_model.compile(
    optimizer=tf.keras.optimizers.RMSprop(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy']
)
transformer_model.summary()

In [13]:
date = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
transformer_model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=20,
    callbacks=[
        tf.keras.callbacks.ModelCheckpoint(f'models/transformer_best.keras', save_best_only=True, monitor='val_loss'),
        tf.keras.callbacks.ModelCheckpoint('models/transformer{epoch:02d}-{val_loss:.2f}.keras'),
        tf.keras.callbacks.BackupAndRestore(backup_dir=f'/tmp/backup/transformer--{date}'),
        tf.keras.callbacks.TensorBoard(log_dir=f'logs/fit/transformer--{date}', histogram_freq=1)
    ]
)

Epoch 1/20


I0000 00:00:1723884707.985003   36410 service.cc:146] XLA service 0x7f5a7002adf0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1723884707.985048   36410 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce GTX 1050, Compute Capability 6.1
2024-08-17 10:51:48.034539: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-08-17 10:51:48.515373: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907
I0000 00:00:1723884714.429655   36410 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step - accuracy: 0.5092 - loss: 0.8531

2024-08-17 10:53:36.436121: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1440000000 exceeds 10% of free system memory.


[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 165ms/step - accuracy: 0.5092 - loss: 0.8529 - val_accuracy: 0.5551 - val_loss: 0.6908
Epoch 2/20
[1m625/626[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 126ms/step - accuracy: 0.5199 - loss: 0.6990

2024-08-17 10:55:18.014340: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1440000000 exceeds 10% of free system memory.


[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 161ms/step - accuracy: 0.5200 - loss: 0.6990 - val_accuracy: 0.6478 - val_loss: 0.6520
Epoch 3/20
[1m625/626[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 122ms/step - accuracy: 0.5998 - loss: 0.6606

2024-08-17 10:56:57.409359: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1440000000 exceeds 10% of free system memory.


[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 158ms/step - accuracy: 0.5999 - loss: 0.6605 - val_accuracy: 0.7282 - val_loss: 0.5372
Epoch 4/20
[1m625/626[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 121ms/step - accuracy: 0.7246 - loss: 0.5346

2024-08-17 10:58:35.104573: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1440000000 exceeds 10% of free system memory.


[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 156ms/step - accuracy: 0.7246 - loss: 0.5346 - val_accuracy: 0.7851 - val_loss: 0.4556
Epoch 5/20
[1m625/626[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 123ms/step - accuracy: 0.7681 - loss: 0.4746

2024-08-17 11:00:15.726377: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1440000000 exceeds 10% of free system memory.


[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 161ms/step - accuracy: 0.7681 - loss: 0.4746 - val_accuracy: 0.8017 - val_loss: 0.4257
Epoch 6/20
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 133ms/step - accuracy: 0.7940 - loss: 0.4380 - val_accuracy: 0.7802 - val_loss: 0.4503
Epoch 7/20
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 160ms/step - accuracy: 0.8064 - loss: 0.4190 - val_accuracy: 0.8079 - val_loss: 0.4125
Epoch 8/20
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 133ms/step - accuracy: 0.8143 - loss: 0.3995 - val_accuracy: 0.7893 - val_loss: 0.4353
Epoch 9/20
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 136ms/step - accuracy: 0.8300 - loss: 0.3798 - val_accuracy: 0.8081 - val_loss: 0.4155
Epoch 10/20
[1m626/626[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 162ms/step - accuracy: 0.8361 - loss: 0.3699 - val_accuracy: 0.8175 - val_loss: 0.3989
Epoch 11/20
[1m6

<keras.src.callbacks.history.History at 0x7f5b1a17a680>

In [14]:
best_transformer = tf.keras.models.load_model('models/transformer.keras')
best_transformer.evaluate(test_dataset)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 43ms/step - accuracy: 0.7949 - loss: 0.4414


[0.4108327627182007, 0.8143600225448608]