# Part 3(c
)

This notebook will highlight the process, result and insights obtained from allowing updates to the embedding matrix during training.

## Import Embedding Matrix

In [1]:
import json
from pathlib import Path

import numpy as np

embedding_path = Path("models/embedding_matrix_oov.npy")
index_from_word_path = Path("models/index_from_word_oov.json")

embedding_matrix = np.load(embedding_path)
with index_from_word_path.open() as f:
    index_from_word = json.load(f)

## Prepare Dataset

(a) Import datasets

In [2]:
from utils.text import tokenize
from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes")
train_dataset = tokenize(dataset["train"])
val_dataset = tokenize(dataset["validation"])
test_dataset = tokenize(dataset["test"])

[nltk_data] Downloading package punkt to /Users/bohua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to /Users/bohua/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/bohua/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


(b) Tokenise dataset

In [3]:
from utils.text import token_to_index

train_dataset = token_to_index(dataset=train_dataset, index_from_word=index_from_word)
val_dataset = token_to_index(dataset=val_dataset, index_from_word=index_from_word)
test_dataset = token_to_index(dataset=test_dataset, index_from_word=index_from_word)

train_dataset

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'tokens', 'original_len', 'indexes'],
    num_rows: 8530
})

In [4]:
train_dataset = train_dataset.select_columns(["label", "original_len", "indexes"])
val_dataset = val_dataset.select_columns(["label", "original_len", "indexes"])
test_dataset = test_dataset.select_columns(["label", "original_len", "indexes"])

In [5]:
train_dataset.set_format(type="torch")
val_dataset.set_format(type="torch")
test_dataset.set_format(type="torch")

## Train RNN

We use Optuna to perform heuristic search on optimal configuration when embeddings are updatable during training.

In [6]:
import optuna
from utils.train import train_rnn_model_with_parameters

_N_TRIALS = 50
SEARCH_SPACE = {
    "batch_size": [2048],
    "learning_rate": [ 1e-2, 1e-1],
    "optimizer_name": ["Adam"],
    # RNN Model Parameters
    "hidden_dim": [32],
    "num_layers": [1, 2, 4],
    "sentence_representation_type": ["last", "average", "max"],
    "bidirectional" : [False, True],

}

def objective(trial):
    hidden_dim = trial.suggest_categorical("hidden_dim", SEARCH_SPACE["hidden_dim"])
    num_layers = trial.suggest_int("num_layers", min(SEARCH_SPACE["num_layers"]), max(SEARCH_SPACE["num_layers"]))
    optimizer_name = trial.suggest_categorical("optimizer_name", SEARCH_SPACE["optimizer_name"])
    batch_size = trial.suggest_categorical("batch_size", SEARCH_SPACE["batch_size"])
    learning_rate = trial.suggest_categorical("learning_rate", SEARCH_SPACE["learning_rate"])
    sentence_representation_type = trial.suggest_categorical("sentence_representation_type", SEARCH_SPACE["sentence_representation_type"])
    bidirectional = trial.suggest_categorical("bidirectional", SEARCH_SPACE["bidirectional"])

    
    log_message = f"---------- batch_size_{batch_size}; lr_{learning_rate}; optimizer_{optimizer_name}; hidden_dim_{hidden_dim}; num_layers_{num_layers}; sentence_representation_{sentence_representation_type}; bidirectional_{bidirectional} ----------"
    print(log_message)

    val_acc = train_rnn_model_with_parameters(
        embedding_matrix=embedding_matrix,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        batch_size=batch_size,
        learning_rate=learning_rate,
        optimizer_name=optimizer_name,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        sentence_representation_type=sentence_representation_type,
        show_progress=True,
        freeze_embedding=False,
        log_dir="rnn_trainable_embeddings",
        rnn_type="GRU", # biGRU model
        bidirectional=bidirectional

    )
    
    return val_acc

# Set up the Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=_N_TRIALS)

[I 2024-11-07 20:00:28,352] A new study created in memory with name: no-name-1353d4f3-5e5f-4681-b447-4680c8a4e9c9
Seed set to 42
/Users/bohua/.pyenv/versions/3.9.6/lib/python3.9/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'rnn_model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['rnn_model'])`.


---------- batch_size_2048; lr_0.01; optimizer_Adam; hidden_dim_32; num_layers_4; sentence_representation_last; bidirectional_False ----------
[Skipping] rnn_trainable_embeddings/batch_size_2048-lr_0.01-optimizer_Adam-hidden_dim_32-num_layers_4-sr_type_last-freeze_False-rnn_type_GRU-bidirectional_False


[I 2024-11-07 20:00:30,440] Trial 0 finished with value: 0.6349595785140991 and parameters: {'hidden_dim': 32, 'num_layers': 4, 'optimizer_name': 'Adam', 'batch_size': 2048, 'learning_rate': 0.01, 'sentence_representation_type': 'last', 'bidirectional': False}. Best is trial 0 with value: 0.6349595785140991.
Seed set to 42
[I 2024-11-07 20:00:30,456] Trial 1 finished with value: 0.66219562292099 and parameters: {'hidden_dim': 32, 'num_layers': 2, 'optimizer_name': 'Adam', 'batch_size': 2048, 'learning_rate': 0.1, 'sentence_representation_type': 'last', 'bidirectional': False}. Best is trial 1 with value: 0.66219562292099.
Seed set to 42
[I 2024-11-07 20:00:30,471] Trial 2 finished with value: 0.5869143009185791 and parameters: {'hidden_dim': 32, 'num_layers': 4, 'optimizer_name': 'Adam', 'batch_size': 2048, 'learning_rate': 0.01, 'sentence_representation_type': 'average', 'bidirectional': False}. Best is trial 1 with value: 0.66219562292099.
Seed set to 42
[I 2024-11-07 20:00:30,486] T

---------- batch_size_2048; lr_0.1; optimizer_Adam; hidden_dim_32; num_layers_2; sentence_representation_last; bidirectional_False ----------
[Skipping] rnn_trainable_embeddings/batch_size_2048-lr_0.1-optimizer_Adam-hidden_dim_32-num_layers_2-sr_type_last-freeze_False-rnn_type_GRU-bidirectional_False
---------- batch_size_2048; lr_0.01; optimizer_Adam; hidden_dim_32; num_layers_4; sentence_representation_average; bidirectional_False ----------
[Skipping] rnn_trainable_embeddings/batch_size_2048-lr_0.01-optimizer_Adam-hidden_dim_32-num_layers_4-sr_type_average-freeze_False-rnn_type_GRU-bidirectional_False
---------- batch_size_2048; lr_0.1; optimizer_Adam; hidden_dim_32; num_layers_2; sentence_representation_last; bidirectional_False ----------
[Skipping] rnn_trainable_embeddings/batch_size_2048-lr_0.1-optimizer_Adam-hidden_dim_32-num_layers_2-sr_type_last-freeze_False-rnn_type_GRU-bidirectional_False
---------- batch_size_2048; lr_0.01; optimizer_Adam; hidden_dim_32; num_layers_2; sent

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/bohua/.pyenv/versions/3.9.6/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:419: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.
/Users/bohua/.pyenv/versions/3.9.6/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:419: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2024-11-07 20:04:09,302] Trial 7 finished with value: 0.5621190667152405 and parameters: {'hidden_dim': 32, 'num_layers': 2, 'optimizer_name': 'Adam', 'batch_size': 2048, 'learning_rate': 0.01, 'sentence_representation_type': 'max', 'bidirectional': False}. Best is trial 1 with value: 0.66219562292099.
Seed set to 42
[I 2024-11-07 20:04:09,348] Trial 8 finished with value: 0.6523199677467346 and parameters: {'hidden_dim': 32, 'num_layers': 1, 'optimizer_name': 'Adam', 'batch_size': 2048, 'learning_rate': 0.1, 'sentence_representation_type': 'average', 'bidirectional': True}. Best is trial 1 with value: 0.66219562292099.
Seed set to 42
[I 2024-11-07 20:04:09,365] Trial 9 finished with value: 0.6445276141166687 and parameters: {'hidden_dim': 32, 'num_layers': 2, 'optimizer_name': 'Adam', 'batch_size': 2048, 'learning_rate': 0.1, 'sentence_representation_type': 'average', 'bidirectional': True}. Best is trial 1 with value: 0.66219562292099.
Seed set to 42
[I 2024-11-07 20:04:09,384] Tr

---------- batch_size_2048; lr_0.1; optimizer_Adam; hidden_dim_32; num_layers_1; sentence_representation_average; bidirectional_True ----------
[Skipping] rnn_trainable_embeddings/batch_size_2048-lr_0.1-optimizer_Adam-hidden_dim_32-num_layers_1-sr_type_average-freeze_False-rnn_type_GRU-bidirectional_True
---------- batch_size_2048; lr_0.1; optimizer_Adam; hidden_dim_32; num_layers_2; sentence_representation_average; bidirectional_True ----------
[Skipping] rnn_trainable_embeddings/batch_size_2048-lr_0.1-optimizer_Adam-hidden_dim_32-num_layers_2-sr_type_average-freeze_False-rnn_type_GRU-bidirectional_True
---------- batch_size_2048; lr_0.1; optimizer_Adam; hidden_dim_32; num_layers_1; sentence_representation_max; bidirectional_True ----------
[Skipping] rnn_trainable_embeddings/batch_size_2048-lr_0.1-optimizer_Adam-hidden_dim_32-num_layers_1-sr_type_max-freeze_False-rnn_type_GRU-bidirectional_True
---------- batch_size_2048; lr_0.1; optimizer_Adam; hidden_dim_32; num_layers_1; sentence_

In [7]:
# Set up the Optuna study
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=_N_TRIALS)

## Train Result Analysis

(a) Load result from all trials

In [8]:
from utils.analytics import load_tensorboard_logs

train_results_df = load_tensorboard_logs(log_dir="tb_logs/rnn_trainable_embeddings")

train_results_df = train_results_df.sort_values(
    by=["val_acc"], ascending=False
).reset_index(drop=True)
train_results_df.head(20)

DATA in match_rnn_log {'val_loss': None, 'val_acc': None, 'epoch': None}
DATA in match_rnn_log {'val_loss': None, 'val_acc': None, 'epoch': None}
DATA in match_rnn_log {'val_loss': None, 'val_acc': None, 'epoch': None}
DATA in match_rnn_log {'val_loss': None, 'val_acc': None, 'epoch': None}
DATA in match_rnn_log {'val_loss': None, 'val_acc': None, 'epoch': None}
DATA in match_rnn_log {'val_loss': None, 'val_acc': None, 'epoch': None}
DATA in match_rnn_log {'val_loss': None, 'val_acc': None, 'epoch': None}
DATA in match_rnn_log {'val_loss': None, 'val_acc': None, 'epoch': None}
DATA in match_rnn_log {'val_loss': None, 'val_acc': None, 'epoch': None}
DATA in match_rnn_log {'val_loss': None, 'val_acc': None, 'epoch': None}
DATA in match_rnn_log {'val_loss': None, 'val_acc': None, 'epoch': None}
DATA in match_rnn_log {'val_loss': None, 'val_acc': None, 'epoch': None}
DATA in match_rnn_log {'val_loss': None, 'val_acc': None, 'epoch': None}
DATA in match_rnn_log {'val_loss': None, 'val_acc':

Unnamed: 0,val_acc,batch_size,hidden_dim,learning_rate,optimizer_name,num_layers,sentence_representation_type,freeze,epoch,val_loss,filename
0,0.77955,2048,32,0.01,Adam,1,max,False,5.0,0.493852,events.out.tfevents.1730965763.Lees-MacBook-Pr...
1,0.76454,2048,32,0.01,Adam,4,max,False,4.0,0.552021,events.out.tfevents.1730972478.Lees-MacBook-Pr...
2,0.751407,2048,32,0.01,Adam,1,average,False,5.0,0.559631,events.out.tfevents.1730967507.Lees-MacBook-Pr...
3,0.748593,2048,32,0.01,Adam,1,last,False,3.0,0.605685,events.out.tfevents.1730966980.Lees-MacBook-Pr...
4,0.744841,2048,32,0.01,Adam,3,average,False,4.0,0.559884,events.out.tfevents.1730967134.Lees-MacBook-Pr...
5,0.743902,2048,32,0.01,Adam,3,max,False,4.0,0.529529,events.out.tfevents.1730969095.Lees-MacBook-Pr...
6,0.742026,2048,32,0.01,Adam,1,average,False,4.0,0.567691,events.out.tfevents.1730967320.Lees-MacBook-Pr...
7,0.739212,2048,32,0.01,Adam,2,max,False,5.0,0.562119,events.out.tfevents.1730980830.Lees-MacBook-Pr...
8,0.738274,2048,32,0.1,Adam,1,average,False,5.0,0.68861,events.out.tfevents.1730978502.Lees-MacBook-Pr...
9,0.738274,2048,32,0.01,Adam,3,average,False,4.0,0.616701,events.out.tfevents.1730970112.Lees-MacBook-Pr...


(b) Configuration for best trial result

In [9]:
best_rnn_model_configuration = train_results_df.head(1)
best_rnn_model_configuration

Unnamed: 0,val_acc,batch_size,hidden_dim,learning_rate,optimizer_name,num_layers,sentence_representation_type,freeze,epoch,val_loss,filename
0,0.77955,2048,32,0.01,Adam,1,max,False,5.0,0.493852,events.out.tfevents.1730965763.Lees-MacBook-Pr...


## Performance on Test Dataset

In [10]:
from utils.analytics import test_top_n_models
from models.RNN import RNNClassifier

test_results_df = test_top_n_models(train_results_df, RNNClassifier, test_dataset, n=5)

/Users/bohua/.pyenv/versions/3.9.6/lib/python3.9/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'rnn_model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['rnn_model'])`.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/bohua/.pyenv/versions/3.9.6/lib/python3.9/site-packages/lightning/pytorch/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/bohua/.pyenv/versions/3.9.6/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=10` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

IndexError: index out of range in self

In [None]:
test_results_df