# Part 3(a)

This notebook will highlight the process, result and insights obtained from allowing updates to the embedding matrix during training.

## Import Embedding Matrix

In [1]:
import json
from pathlib import Path

import numpy as np

embedding_path = Path("models/embedding_matrix.npy")
index_from_word_path = Path("models/index_from_word.json")

embedding_matrix = np.load(embedding_path)
with index_from_word_path.open() as f:
    index_from_word = json.load(f)

## Prepare Dataaset

(a) Import datasets

In [2]:
from utils.text import tokenize
from datasets import load_dataset

dataset = load_dataset("rotten_tomatoes")
train_dataset = tokenize(dataset["train"])
val_dataset = tokenize(dataset["validation"])
test_dataset = tokenize(dataset["test"])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\juinl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\juinl\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\juinl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


(b) Tokenise dataset

In [3]:
from utils.text import token_to_index

train_dataset = token_to_index(dataset=train_dataset, index_from_word=index_from_word)
val_dataset = token_to_index(dataset=val_dataset, index_from_word=index_from_word)
test_dataset = token_to_index(dataset=test_dataset, index_from_word=index_from_word)

train_dataset

Dataset({
    features: ['text', 'label', 'tokens', 'original_len', 'indexes'],
    num_rows: 8530
})

In [4]:
train_dataset = train_dataset.select_columns(["label", "original_len", "indexes"])
val_dataset = val_dataset.select_columns(["label", "original_len", "indexes"])
test_dataset = test_dataset.select_columns(["label", "original_len", "indexes"])

In [5]:
train_dataset.set_format(type="torch")
val_dataset.set_format(type="torch")
test_dataset.set_format(type="torch")

## Train RNN

We use Optuna to perform heuristic search on optimal configuration when embeddings are updatable during training.

In [None]:
import optuna
from utils.train import train_rnn_model_with_parameters

SEARCH_SPACE = {
    "batch_size": [32, 64, 128, 256, 512, 1024, 2048],
    "learning_rate": [1e-1, 1e-2, 1e-3, 1e-4],
    "optimizer_name": ["Adam", "Adagrad", "RMSprop"],
    # RNN Model Parameters
    "hidden_dim": [256, 128, 64, 32],
    "num_layers": [1, 2, 4],
    "sentence_representation_type": ["last", "average", "max"],
}

def objective(trial):
    hidden_dim = trial.suggest_categorical("hidden_dim", SEARCH_SPACE["hidden_dim"])
    num_layers = trial.suggest_int("num_layers", min(SEARCH_SPACE["num_layers"]), max(SEARCH_SPACE["num_layers"]))
    optimizer_name = trial.suggest_categorical("optimizer_name", SEARCH_SPACE["optimizer_name"])
    batch_size = trial.suggest_categorical("batch_size", SEARCH_SPACE["batch_size"])
    learning_rate = trial.suggest_categorical("learning_rate", SEARCH_SPACE["learning_rate"])
    sentence_representation_type = trial.suggest_categorical("sentence_representation_type", SEARCH_SPACE["sentence_representation_type"])
    
    log_message = f"---------- batch_size_{batch_size}; lr_{learning_rate}; optimizer_{optimizer_name}; hidden_dim_{hidden_dim}; num_layers_{num_layers}; sentence_representation_{sentence_representation_type} ----------"
    print(log_message)

    val_acc = train_rnn_model_with_parameters(
        embedding_matrix=embedding_matrix,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        batch_size=batch_size,
        learning_rate=learning_rate,
        optimizer_name=optimizer_name,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        sentence_representation_type=sentence_representation_type,
        show_progress=False,
        freeze_embedding=False,
        log_dir="rnn/test/w2v-3a"
    )
    
    return val_acc

# Set up the Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=150)  

# Get the best hyperparameters
best_params = study.best_params


## Train Result Analysis

In [None]:
print("Best hyperparameters:", best_params)

(a) Load result from all trials

In [24]:
from utils.analytics import load_tensorboard_logs

train_results_df = load_tensorboard_logs(log_dir="tb_logs/rnn/test/w2v-3a")

train_results_df = train_results_df.sort_values(
    by=["val_acc"], ascending=False
).reset_index(drop=True)
train_results_df.head(20)

Unnamed: 0,val_acc,train_acc,batch_size,hidden_dim,learning_rate,freeze,optimizer_name,num_layers,sentence_representation_type,epoch,train_loss,val_loss,filename
0,0.77955,0.890533,2048,32,0.001,False,Adam,1,max,15.0,0.434699,0.510937,events.out.tfevents.1730624293.LEEJUIN-PC.2336...
1,0.776735,,2048,32,0.01,False,Adagrad,2,max,8.0,,0.490966,events.out.tfevents.1730640444.LEEJUIN-PC.2336...
2,0.775797,,2048,128,0.001,False,RMSprop,1,max,7.0,,0.487259,events.out.tfevents.1730639894.LEEJUIN-PC.2336...
3,0.772983,0.893217,2048,32,0.0001,False,Adam,1,max,74.0,0.405118,0.525215,events.out.tfevents.1730627220.LEEJUIN-PC.2336...
4,0.772983,,2048,64,0.01,False,Adam,2,max,5.0,,0.52275,events.out.tfevents.1730639566.LEEJUIN-PC.2336...
5,0.772045,0.946746,2048,64,0.001,False,Adam,1,max,10.0,0.186989,0.488326,events.out.tfevents.1730624617.LEEJUIN-PC.2336...
6,0.772045,,2048,64,0.01,False,Adam,1,max,4.0,,0.498227,events.out.tfevents.1730625631.LEEJUIN-PC.2336...
7,0.772045,,2048,256,0.001,False,Adam,1,max,7.0,,0.487313,events.out.tfevents.1730626954.LEEJUIN-PC.2336...
8,0.771107,,2048,128,0.001,False,Adam,1,last,7.0,,0.501685,events.out.tfevents.1730624983.LEEJUIN-PC.2336...
9,0.768293,0.872268,2048,32,0.001,False,Adam,2,max,13.0,0.410194,0.510118,events.out.tfevents.1730623825.LEEJUIN-PC.2336...


(b) Configuration for best trial result

In [25]:
best_rnn_model_configuration = train_results_df.head(1)
best_rnn_model_configuration

Unnamed: 0,val_acc,train_acc,batch_size,hidden_dim,learning_rate,freeze,optimizer_name,num_layers,sentence_representation_type,epoch,train_loss,val_loss,filename
0,0.77955,0.890533,2048,32,0.001,False,Adam,1,max,15.0,0.434699,0.510937,events.out.tfevents.1730624293.LEEJUIN-PC.2336...


## Performance on Test Dataset

(a) Rerun best config trial to save the embeddings

In [6]:
from utils.train import train_rnn_model_with_parameters

train_rnn_model_with_parameters(
    embedding_matrix=embedding_matrix,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    batch_size=2048,
    learning_rate=0.001,
    optimizer_name="Adam",
    hidden_dim=32,
    num_layers=1,
    sentence_representation_type="max",
    show_progress=False,
    freeze_embedding=False,
    write_embeddings=True
)

Seed set to 42
c:\Users\juinl\Documents\GitHub\sc4002-nlp-sentiment-classification\venv\lib\site-packages\lightning\pytorch\utilities\parsing.py:208: Attribute 'rnn_model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['rnn_model'])`.
GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\juinl\Documents\GitHub\sc4002-nlp-sentiment-classification\venv\lib\site-packages\lightning\pytorch\trainer\setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.

  | Name   | Type               | Params | Mode 
------------------------------------------------------
0 | model  | RNN                | 4.5 M  | train
1 | metric | MulticlassAccuracy | 0      | train
------------------------------------------------------
4.5 M     Trainable params
0         Non-trainable params
4.5 M     Total par

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\juinl\Documents\GitHub\sc4002-nlp-sentiment-classification\venv\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
c:\Users\juinl\Documents\GitHub\sc4002-nlp-sentiment-classification\venv\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
c:\Users\juinl\Documents\GitHub\sc4002-nlp-sentiment-classification\venv\lib\site-packages\lightning\pytorch\loops\fit_loop.py:298: The number of training batches (5) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see lo

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

0.7795497179031372

(b) Load updated embeddings

In [26]:
from pathlib import Path

import numpy as np

embedding_path = Path("models/test.npy")
updated_embedding_matrix = np.load(embedding_path)

(c) Model with best trial configuration and updated embedding matrix

In [30]:
from models.RNN import RNN

rnn_model = RNN(
    embedding_matrix=updated_embedding_matrix,
    hidden_dim=32,
    output_dim=2,
    num_layers=1,
    sentence_representation_type="max",
)

In [32]:
from pathlib import Path
from utils.train import RNNClassifier


best_rnn_model_filename = best_rnn_model_configuration["filename"].item()
matched_files = list(Path().rglob(best_rnn_model_filename))

if not matched_files:
    print("Model checkpoint not found!")
else:
    checkpoint_dir = matched_files[0].parent / "checkpoints"
    checkpoint_files = (
        list(checkpoint_dir.glob("*.ckpt")) if checkpoint_dir.exists() else []
    )

    if not checkpoint_files:
        print("No checkpoint files found in the checkpoint directory!")
    else:
        best_checkpoint = checkpoint_files[0] 
        best_rnn_model = RNNClassifier.load_from_checkpoint(
            best_checkpoint, rnn_model=rnn_model
        )
        print(best_rnn_model)

RNNClassifier(
  (model): RNN(
    (embedding): Embedding(14888, 300)
    (rnn): RNN(300, 32, batch_first=True)
    (fc): Linear(in_features=32, out_features=16, bias=True)
    (relu): ReLU()
    (fc2): Linear(in_features=16, out_features=2, bias=True)
  )
  (metric): MulticlassAccuracy()
)


(d) Accuracy test on test dataset

In [33]:
import lightning as L
from torch.utils.data import DataLoader

test_dataloader = DataLoader(test_dataset, shuffle=True)

trainer = L.Trainer(accelerator="cpu")
trainer.test(best_rnn_model, test_dataloader)

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\juinl\Documents\GitHub\sc4002-nlp-sentiment-classification\venv\lib\site-packages\lightning\pytorch\trainer\setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
c:\Users\juinl\Documents\GitHub\sc4002-nlp-sentiment-classification\venv\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:475: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
c:\Users\juinl\Documents\GitHub\sc4002-nlp-sentiment-classification\venv\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc            0.7786116600036621
        test_loss           0.4890058934688568
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.4890058934688568, 'test_acc': 0.7786116600036621}]