# Notebook to test different embedding models

In [1]:
# load libraries
import numpy as np
import pandas as pd

from util_funcs import load_split_data
from sentence_transformers import SentenceTransformer

from keras.layers import Dense, Input, LSTM, LayerNormalization
from keras.models import Sequential
from keras.callbacks import EarlyStopping

In [2]:
# setup PyTorch backend
# NOTE(review): Keras selects its backend when the package is first imported,
# and the imports cell above already pulls in keras.layers / keras.models /
# keras.callbacks. On a fresh "Restart & Run All" this assignment therefore
# probably comes too late to have any effect -- confirm with
# keras.backend.backend() and, if so, set KERAS_BACKEND before the first
# keras import.
import os
os.environ['KERAS_BACKEND'] = 'torch'
import torch
print("Using PyTorch backend")
# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
# `device` is only printed here; no later cell visible in this notebook reads it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"Using device: {device}")

Using PyTorch backend
Using device: cuda


In [None]:
# generate data splits
# load_split_data is imported from the project-local util_funcs module; its
# result is unpacked into train / validation / test inputs and labels.
# NOTE(review): the split ratios, label encoding and any shuffling/seed are
# decided inside load_split_data and are not visible here -- check util_funcs.
X_train, X_val, X_test, y_train, y_val, y_test = load_split_data("../data/Sentiment_Analysis.csv")

## Embedding models

In [4]:
# Load the two sentence-embedding models under comparison, in order:
#   emb_model_1 -> "all-mpnet-base-v2"
#   emb_model_2 -> "all-MiniLM-L6-v2"
emb_model_1, emb_model_2 = (
    SentenceTransformer(checkpoint)
    for checkpoint in ("all-mpnet-base-v2", "all-MiniLM-L6-v2")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# sentence embeddings
def sentence_embeddings(model, data=None):
    """
    Generate sentence embeddings for the train, validation and test texts.

    Args:
        model: SentenceTransformer model (any object with a compatible
            ``encode(texts, show_progress_bar=..., convert_to_numpy=...)``
            method works)
        data: Optional (train, val, test) triple of text collections.
            Defaults to the notebook-level X_train, X_val and X_test, so
            existing ``sentence_embeddings(model)`` calls are unchanged.

    Returns:
        X_train_emb: Training embeddings
        X_val_emb: Validation embeddings
        X_test_emb: Test embeddings

    Raises:
        ValueError: If ``data`` does not contain exactly three splits.
    """
    if data is None:
        # Fall back to the splits created earlier in the notebook.
        data = (X_train, X_val, X_test)

    data = tuple(data)
    if len(data) != 3:
        raise ValueError(
            f"data must contain exactly 3 splits (train, val, test), got {len(data)}"
        )

    # Encode every split before reporting, so the progress bars come first
    # and the three shape lines are printed together afterwards.
    embeddings = tuple(
        model.encode(texts, show_progress_bar=True, convert_to_numpy=True)
        for texts in data
    )

    for label, split_emb in zip(("Training", "Validation", "Test"), embeddings):
        print(f"{label} embeddings shape: {split_emb.shape}")
    return embeddings

In [None]:
# generate sentence embeddings
# One (train, val, test) triple per embedding model. In the recorded run the
# first model produced 768-wide vectors and the second 384-wide vectors
# (see the shapes printed below).
X_train_emb_1, X_val_emb_1, X_test_emb_1 = sentence_embeddings(emb_model_1)
X_train_emb_2, X_val_emb_2, X_test_emb_2 = sentence_embeddings(emb_model_2)

Batches:   0%|          | 0/750 [00:00<?, ?it/s]

Batches:   0%|          | 0/250 [00:00<?, ?it/s]

Batches:   0%|          | 0/250 [00:00<?, ?it/s]

Training embeddings shape: (24000, 768)
Validation embeddings shape: (8000, 768)
Test embeddings shape: (8000, 768)


Batches:   0%|          | 0/750 [00:00<?, ?it/s]

Batches:   0%|          | 0/250 [00:00<?, ?it/s]

Batches:   0%|          | 0/250 [00:00<?, ?it/s]

Training embeddings shape: (24000, 384)
Validation embeddings shape: (8000, 384)
Test embeddings shape: (8000, 384)


In [None]:
# pad sequences
import nltk
nltk.download("punkt_tab")
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences

def process_text_sequences(max_length=40, padding='post', truncating='post', data=None):
    """
    Convert texts to padded sequences of tokens.

    Args:
        max_length: Maximum sequence length (default: 40)
        padding: 'pre' or 'post' padding (default: 'post')
        truncating: 'pre' or 'post' truncation (default: 'post')
        data: Optional iterable of text collections to process, one per
            split. Defaults to the notebook-level (X_train, X_val, X_test),
            so existing zero-argument calls are unchanged.

    Returns:
        A tuple with one padded object array per split, each of shape
        (n_texts, max_length). With the default ``data`` this is:
        X_train_padded: Padded training sequences
        X_val_padded: Padded validation sequences
        X_test_padded: Padded test sequences
    """
    if data is None:
        # Fall back to the splits created earlier in the notebook.
        data = (X_train, X_val, X_test)

    padded_sequences = []

    for texts in data:
        # Tokenize all texts
        sequences = [word_tokenize(text) for text in texts]

        # Truncation is left entirely to pad_sequences. Pre-slicing each
        # sequence with s[:max_length] (as this function used to do) always
        # kept the head of the text, which made truncating='pre' a no-op.
        padded_sequences.append(pad_sequences(
            sequences=sequences,
            maxlen=max_length,
            padding=padding,
            truncating=truncating,
            dtype=object,  # Use object dtype for string tokens
            value=''  # Use empty string as padding token
        ))

    return tuple(padded_sequences)

# Tokenize and pad all three splits with the function defaults
# (max_length=40, post-padding, post-truncation).
X_train_padded, X_val_padded, X_test_padded = process_text_sequences()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [10]:
from functools import cache
@cache
def encode_words(model, text):
    """
    Return ``model.encode(text)``, memoised on the ``(model, text)`` pair.

    Args:
        model: Object exposing an ``encode`` method; must be hashable
            because ``functools.cache`` uses the arguments as the cache key
        text: Text to encode; must be hashable for the same reason

    Returns:
        Whatever ``model.encode(text)`` returned the first time this pair
        was seen. Repeated calls hand back that same cached object, so
        callers should treat it as read-only. The cache is unbounded.
    """
    embedding = model.encode(text)
    return embedding

In [11]:
# convert padded sequences to embeddings
def create_embedding_sequences(model, max_length=40, data=None):
    """
    Convert sequences of words to sequences of embeddings using cached word vectors.
    Ensures all sequences have the same length through zero-padding.

    Args:
        model: SentenceTransformer model
        max_length: Maximum sequence length (default: 40)
        data: Optional iterable of padded token arrays, one per split, in
            which '' marks a padding slot. Defaults to the notebook-level
            (X_train_padded, X_val_padded, X_test_padded), so existing
            calls are unchanged.
    Returns:
        A tuple with one float32 array per split, each of shape
        (n_sequences, max_length, embedding_dim). With the default ``data``:
        X_train_embedded: Training embeddings
        X_val_embedded: Validation embeddings
        X_test_embedded: Test embeddings
    """
    if data is None:
        # Fall back to the padded splits created earlier in the notebook.
        data = (X_train_padded, X_val_padded, X_test_padded)

    embeddings = []
    embedding_dim = model.get_sentence_embedding_dimension()

    for sequences in data:
        # float32 instead of NumPy's float64 default: SentenceTransformer
        # embeddings are float32 by default, so the wider dtype only doubled
        # the memory footprint (a 24000 x 40 x 768 array is ~5.9 GB as
        # float64 versus ~2.9 GB as float32).
        embedded_seqs = np.zeros(
            (len(sequences), max_length, embedding_dim), dtype=np.float32
        )

        for i, seq in enumerate(sequences):
            # Drop padding slots, then cap at max_length so every index
            # written below is guaranteed to be in range.
            valid_tokens = [word for word in seq if word != ''][:max_length]

            # Valid tokens are packed at the front; the remaining rows keep
            # their zero initialisation and act as padding.
            for j, word in enumerate(valid_tokens):
                embedded_seqs[i, j] = encode_words(model, word)

        embeddings.append(embedded_seqs)
        print(f"Shape: {embedded_seqs.shape}")

    return tuple(embeddings)

In [12]:
%%time
# generate sequence embeddings
# Slow cell: every distinct token is encoded individually through
# encode_words; the recorded run took roughly 11 minutes of wall time.
X_train_embedded_1, X_val_embedded_1, X_test_embedded_1 = create_embedding_sequences(emb_model_1)

Shape: (24000, 40, 768)
Shape: (8000, 40, 768)
Shape: (8000, 40, 768)
CPU times: user 10min 56s, sys: 2.36 s, total: 10min 58s
Wall time: 10min 59s


In [13]:
%%time
# generate sequence embeddings
# Same procedure with the second embedding model; the recorded run took
# roughly 5.5 minutes of wall time.
X_train_embedded_2, X_val_embedded_2, X_test_embedded_2 = create_embedding_sequences(emb_model_2)

Shape: (24000, 40, 384)
Shape: (8000, 40, 384)
Shape: (8000, 40, 384)
CPU times: user 5min 23s, sys: 1.26 s, total: 5min 24s
Wall time: 5min 24s


## MLP

In [14]:
# Stop training once val_loss has gone 5 epochs without improving, and roll
# the model back to the best weights seen. This one callback instance is
# passed to every fit() call below.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# mlp model
def mlp_model(input_dim, num_classes=13):
    """
    Create a simple MLP classifier: three 10-unit ReLU layers followed by a
    softmax output layer.

    Args:
        input_dim: Dimension of the input data
        num_classes: Number of output classes (default: 13, the value that
            was previously hard-coded)

    Returns:
        model: Compiled MLP model
    """
    model = Sequential([
        # An explicit Input layer replaces passing input_shape= to the first
        # Dense layer, which Keras 3 reports as a deprecated pattern.
        Input(shape=(input_dim,)),
        Dense(10, activation='relu'),
        Dense(10, activation='relu'),
        Dense(10, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])

    # sparse_categorical_crossentropy: labels are passed as integer class ids.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [None]:
# model 1
# Take the input width from the embeddings themselves rather than repeating
# the magic number 768, so this cell keeps working if emb_model_1 changes.
mlp_model_1 = mlp_model(X_train_emb_1.shape[1])
mlp_history_1 = mlp_model_1.fit(
    X_train_emb_1,
    y_train,
    validation_data=(X_val_emb_1, y_val),
    epochs=30,
    callbacks=[early_stop],
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.2230 - loss: 2.2899 - val_accuracy: 0.2898 - val_loss: 1.9978
Epoch 2/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3114 - loss: 1.9616 - val_accuracy: 0.3416 - val_loss: 1.8931
Epoch 3/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3429 - loss: 1.8834 - val_accuracy: 0.3525 - val_loss: 1.8755
Epoch 4/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3540 - loss: 1.8514 - val_accuracy: 0.3610 - val_loss: 1.8633
Epoch 5/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.3721 - loss: 1.8355 - val_accuracy: 0.3636 - val_loss: 1.8563
Epoch 6/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3779 - loss: 1.8131 - val_accuracy: 0.3671 - val_loss: 1.8519
Epoch 7/30
[1m750/750[0m 

In [None]:
# model 2
# Take the input width from the embeddings themselves rather than repeating
# the magic number 384, so this cell keeps working if emb_model_2 changes.
mlp_model_2 = mlp_model(X_train_emb_2.shape[1])
mlp_history_2 = mlp_model_2.fit(
    X_train_emb_2,
    y_train,
    validation_data=(X_val_emb_2, y_val),
    epochs=30,
    callbacks=[early_stop],
)

Epoch 1/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.2467 - loss: 2.2522 - val_accuracy: 0.3374 - val_loss: 1.9209
Epoch 2/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3395 - loss: 1.9095 - val_accuracy: 0.3461 - val_loss: 1.8883
Epoch 3/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.3568 - loss: 1.8745 - val_accuracy: 0.3560 - val_loss: 1.8699
Epoch 4/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3642 - loss: 1.8642 - val_accuracy: 0.3593 - val_loss: 1.8591
Epoch 5/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3665 - loss: 1.8428 - val_accuracy: 0.3651 - val_loss: 1.8533
Epoch 6/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3717 - loss: 1.8366 - val_accuracy: 0.3643 - val_loss: 1.8534
Epoch 7/30
[1m750/750[0m 

## Simple LSTM

In [7]:
# lstm model
def lstm_model(input_dim, seq_len=40, num_classes=13):
    """
    Create a simple LSTM classifier: a 10-unit LSTM, layer normalisation,
    two 10-unit ReLU layers and a softmax output layer.

    Args:
        input_dim: Dimension of the embedding at each timestep
        seq_len: Number of timesteps per sequence (default: 40, the value
            that was previously hard-coded)
        num_classes: Number of output classes (default: 13, the value that
            was previously hard-coded)

    Returns:
        model: Compiled LSTM model
    """
    model = Sequential([
        # An explicit Input layer replaces passing input_shape= to the LSTM
        # layer, which Keras 3 reports as a deprecated pattern.
        Input(shape=(seq_len, input_dim)),
        # NOTE(review): there is no Masking layer, so any all-zero padding
        # timesteps in the input are processed like real tokens.
        LSTM(10),
        LayerNormalization(),
        Dense(10, activation='relu'),
        Dense(10, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])

    # sparse_categorical_crossentropy: labels are passed as integer class ids.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [15]:
# model 1
# Take the per-timestep width from the embedded sequences rather than
# repeating the magic number 768.
lstm_model_1 = lstm_model(X_train_embedded_1.shape[-1])
lstm_history_1 = lstm_model_1.fit(
    X_train_embedded_1,
    y_train,
    validation_data=(X_val_embedded_1, y_val),
    epochs=30,
    callbacks=[early_stop],
)

Epoch 1/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 10ms/step - accuracy: 0.1782 - loss: 2.2766 - val_accuracy: 0.2735 - val_loss: 2.0619
Epoch 2/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.3003 - loss: 2.0331 - val_accuracy: 0.3390 - val_loss: 1.9565
Epoch 3/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.3328 - loss: 1.9415 - val_accuracy: 0.3461 - val_loss: 1.9148
Epoch 4/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.3413 - loss: 1.9124 - val_accuracy: 0.3506 - val_loss: 1.9008
Epoch 5/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.3529 - loss: 1.8807 - val_accuracy: 0.3500 - val_loss: 1.9021
Epoch 6/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.3554 - loss: 1.8688 - val_accuracy: 0.3562 - val_loss: 1.8949
Epoch 7/30
[1m750/750[0m

In [16]:
# model 2
# Take the per-timestep width from the embedded sequences rather than
# repeating the magic number 384.
lstm_model_2 = lstm_model(X_train_embedded_2.shape[-1])
lstm_history_2 = lstm_model_2.fit(
    X_train_embedded_2,
    y_train,
    validation_data=(X_val_embedded_2, y_val),
    epochs=30,
    callbacks=[early_stop],
)

Epoch 1/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.1870 - loss: 2.2714 - val_accuracy: 0.2362 - val_loss: 2.1438
Epoch 2/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.2459 - loss: 2.1295 - val_accuracy: 0.2971 - val_loss: 2.0315
Epoch 3/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.3047 - loss: 2.0121 - val_accuracy: 0.3240 - val_loss: 1.9744
Epoch 4/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.3259 - loss: 1.9656 - val_accuracy: 0.3341 - val_loss: 1.9427
Epoch 5/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.3347 - loss: 1.9365 - val_accuracy: 0.3402 - val_loss: 1.9400
Epoch 6/30
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.3446 - loss: 1.9287 - val_accuracy: 0.3300 - val_loss: 1.9427
Epoch 7/30
[1m750/750[0m 

## Takeaways

- There are significant hardware constraints with generating sequence embeddings (RAM requirements in Google Colab)
- The more complex embedding model (**all-mpnet-base-v2**) does not give much improvement in performance, despite taking much longer to generate embeddings, and it shows a greater tendency to overfit
- Proceed using the simple embedding model: **all-MiniLM-L6-v2**