# Part 3(a)

This notebook will highlight the process, result and insights obtained from allowing updates to the embedding matrix during training.

## Import Embedding Matrix

In [8]:
import json
from pathlib import Path

import numpy as np

embedding_path = Path("models/embedding_matrix.npy")
index_from_word_path = Path("models/index_from_word.json")

embedding_matrix = np.load(embedding_path)
with index_from_word_path.open() as f:
    index_from_word = json.load(f)

## Prepare Dataset

(a) Import datasets

In [9]:
from utils.text import tokenize
from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes")
train_dataset = tokenize(dataset["train"])
val_dataset = tokenize(dataset["validation"])
test_dataset = tokenize(dataset["test"])

(b) Tokenise dataset

In [10]:
from utils.text import token_to_index

train_dataset = token_to_index(dataset=train_dataset, index_from_word=index_from_word)
val_dataset = token_to_index(dataset=val_dataset, index_from_word=index_from_word)
test_dataset = token_to_index(dataset=test_dataset, index_from_word=index_from_word)

train_dataset

Dataset({
    features: ['text', 'label', 'tokens', 'original_len', 'indexes'],
    num_rows: 8530
})

In [11]:
train_dataset = train_dataset.select_columns(["label", "original_len", "indexes"])
val_dataset = val_dataset.select_columns(["label", "original_len", "indexes"])
test_dataset = test_dataset.select_columns(["label", "original_len", "indexes"])

In [12]:
train_dataset.set_format(type="torch")
val_dataset.set_format(type="torch")
test_dataset.set_format(type="torch")

## Train RNN

We use Optuna to perform heuristic search on optimal configuration when embeddings are updatable during training.

In [None]:
import optuna
from utils.train import train_rnn_model_with_parameters

SEARCH_SPACE = {
    "batch_size": [32, 64, 128, 256, 512, 1024, 2048],
    "learning_rate": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    "optimizer_name": ["SGD", "Adagrad", "RMSprop", "Adam"],
    # RNN Model Parameters
    "hidden_dim": [256, 128, 64, 32],
    "num_layers": [1, 2, 4],
    "sentence_representation_type": ["last", "average", "max"],
}

def objective(trial):
    hidden_dim = trial.suggest_categorical("hidden_dim", SEARCH_SPACE["hidden_dim"])
    num_layers = trial.suggest_int("num_layers", min(SEARCH_SPACE["num_layers"]), max(SEARCH_SPACE["num_layers"]))
    optimizer_name = trial.suggest_categorical("optimizer_name", SEARCH_SPACE["optimizer_name"])
    batch_size = trial.suggest_categorical("batch_size", SEARCH_SPACE["batch_size"])
    learning_rate = trial.suggest_categorical("learning_rate", SEARCH_SPACE["learning_rate"])
    sentence_representation_type = trial.suggest_categorical("sentence_representation_type", SEARCH_SPACE["sentence_representation_type"])
    
    log_message = f"---------- batch_size_{batch_size}; lr_{learning_rate}; optimizer_{optimizer_name}; hidden_dim_{hidden_dim}; num_layers_{num_layers}; sentence_representation_{sentence_representation_type} ----------"
    print(log_message)

    val_acc = train_rnn_model_with_parameters(
        embedding_matrix=embedding_matrix,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        batch_size=batch_size,
        learning_rate=learning_rate,
        optimizer_name=optimizer_name,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        sentence_representation_type=sentence_representation_type,
        show_progress=True,
        freeze_embedding=False,
        log_dir="rnn_trainable_embeddings"
    )
    
    return val_acc

# Set up the Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=300)  

## Train Result Analysis

(a) Load result from all trials

In [1]:
from utils.analytics import load_tensorboard_logs

train_results_df = load_tensorboard_logs(log_dir="tb_logs/rnn_trainable_embeddings")

train_results_df = train_results_df.sort_values(
    by=["val_acc"], ascending=False
)
train_results_df.head(20)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,val_acc,train_acc,batch_size,hidden_dim,learning_rate,optimizer_name,num_layers,sentence_representation_type,freeze,epoch,train_loss,val_loss,filename
97,0.786116,0.900981,2048,128,0.001,Adagrad,1,last,False,21.0,0.248302,0.47758,events.out.tfevents.1730811642.yuriarch.655384.84
82,0.781426,0.920133,2048,128,0.0001,Adam,1,last,False,32.0,0.247747,0.479192,events.out.tfevents.1730803905.yuriarch.44842.85
118,0.777674,0.862892,2048,256,0.0001,Adam,1,last,False,21.0,0.323512,0.472788,events.out.tfevents.1730800925.yuriarch.44842.53
64,0.772983,0.899408,2048,32,0.0001,Adam,1,last,False,63.0,0.301803,0.514932,events.out.tfevents.1730802175.yuriarch.44842.69
62,0.772983,0.900458,2048,64,0.0001,Adam,1,last,False,45.0,0.270821,0.491593,events.out.tfevents.1730801993.yuriarch.44842.66
84,0.772045,0.869183,2048,256,0.0001,Adam,1,max,False,25.0,0.30422,0.490388,events.out.tfevents.1730799827.yuriarch.44842.43
135,0.771107,0.865013,2048,256,0.0001,Adam,2,last,False,19.0,0.319195,0.485763,events.out.tfevents.1730803003.yuriarch.44842.79
198,0.769231,0.970308,2048,128,0.001,Adam,3,max,False,9.0,0.103553,0.523066,events.out.tfevents.1730797787.yuriarch.44842.19
112,0.768293,0.813183,2048,256,1e-05,Adam,1,last,False,124.0,0.416392,0.49267,events.out.tfevents.1730802353.yuriarch.44842.72
37,0.767355,0.930754,2048,128,0.001,RMSprop,1,average,False,9.0,0.179622,0.511739,events.out.tfevents.1730808095.yuriarch.655384.32


(b) Configuration for best trial result

In [2]:
best_rnn_model_configuration = train_results_df.head(1)
best_rnn_model_configuration

Unnamed: 0,val_acc,train_acc,batch_size,hidden_dim,learning_rate,optimizer_name,num_layers,sentence_representation_type,freeze,epoch,train_loss,val_loss,filename
97,0.786116,0.900981,2048,128,0.001,Adagrad,1,last,False,21.0,0.248302,0.47758,events.out.tfevents.1730811642.yuriarch.655384.84


## Performance on Test Dataset

In [None]:
from utils.analytics import test_top_n_models
from models.RNN import RNNClassifier

test_results_df = test_top_n_models(train_results_df, RNNClassifier, test_dataset, n=5)

In [38]:
test_results_df

Unnamed: 0,test_acc,test_loss,val_acc,train_acc,batch_size,hidden_dim,learning_rate,optimizer_name,num_layers,sentence_representation_type,freeze,epoch,train_loss,val_loss,filename
0,0.797373,0.440366,0.781426,0.920133,2048,128,0.0001,Adam,1,last,False,32.0,0.247747,0.479192,events.out.tfevents.1730803905.yuriarch.44842.85
1,0.790807,0.462758,0.777674,0.862892,2048,256,0.0001,Adam,1,last,False,21.0,0.323512,0.472788,events.out.tfevents.1730800925.yuriarch.44842.53
2,0.787992,0.482971,0.772983,0.899408,2048,32,0.0001,Adam,1,last,False,63.0,0.301803,0.514932,events.out.tfevents.1730802175.yuriarch.44842.69
3,0.787054,0.460505,0.772983,0.900458,2048,64,0.0001,Adam,1,last,False,45.0,0.270821,0.491593,events.out.tfevents.1730801993.yuriarch.44842.66
4,0.77955,0.476281,0.772045,0.869183,2048,256,0.0001,Adam,1,max,False,25.0,0.30422,0.490388,events.out.tfevents.1730799827.yuriarch.44842.43
