# Part 3.4. CNN

## Prepare embedding matrix and metadata

In [6]:
import json
from pathlib import Path

import numpy as np

# Load the pre-built embedding matrix (the variant that accounts for OOV words)
# together with its companion index_from_word lookup.
# NOTE(review): index_from_word presumably maps each word to its row in
# embedding_matrix — confirm against the notebook that produced these files.
embedding_path = Path("models/embedding_matrix_oov.npy")
index_from_word_path = Path("models/index_from_word_oov.json")

embedding_matrix = np.load(embedding_path)
# JSON is UTF-8 by specification; pin the encoding so vocabulary entries with
# non-ASCII characters decode identically regardless of the platform locale.
with index_from_word_path.open(encoding="utf-8") as f:
    index_from_word = json.load(f)

## Prepare dataset

In [None]:
from utils.text import tokenize
from datasets import load_dataset

# Fetch the "rotten_tomatoes" dataset and run `tokenize` over each of its
# three splits.
dataset = load_dataset("rotten_tomatoes")
train_dataset, val_dataset, test_dataset = (
    tokenize(dataset[split_name])
    for split_name in ("train", "validation", "test")
)

In [8]:
from utils.text import token_to_index

# Run `token_to_index` over every split, using the `index_from_word` lookup
# loaded earlier in the notebook.
train_dataset, val_dataset, test_dataset = (
    token_to_index(dataset=split, index_from_word=index_from_word)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [9]:
# Keep only the label, the original sequence length and the index sequence
# in every split; all other columns are dropped.
train_dataset, val_dataset, test_dataset = (
    split.select_columns(["label", "original_len", "indexes"])
    for split in (train_dataset, val_dataset, test_dataset)
)

In [10]:
# Switch every split to the "torch" output format (set_format works in place).
for split_dataset in (train_dataset, val_dataset, test_dataset):
    split_dataset.set_format(type="torch")

In [6]:
# Sanity check: display the prepared training split (its columns and row count).
train_dataset

Dataset({
    features: ['label', 'original_len', 'indexes'],
    num_rows: 8530
})

## Train CNN Model

In [7]:
# Discrete hyper-parameter choices; the Optuna objective draws one value per
# key for every trial.
SEARCH_SPACE = dict(
    # Training / optimiser settings
    batch_size=[512, 1024, 2048],
    learning_rate=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    optimizer_name=["Adam"],
    # CNN model parameters
    dropout=[0.1, 0.3, 0.5, 0.7, 0.9],
    hidden_dim=[600, 500, 400, 300],
    # Each inner list is one candidate n-gram combination, grouped here by
    # the smallest n-gram it contains.
    n_grams=[
        [2], [3], [4], [5], [6],
        [2, 3], [2, 3, 4], [2, 3, 4, 5], [2, 3, 4, 6],
        [3, 4], [3, 4, 5], [3, 4, 6],
        [4, 5], [4, 5, 6],
        [5, 6],
    ],
)

In [None]:
import optuna

from utils.train import (
    CNNArgs,
    DataArgs,
    OptimizerArgs,
    train_cnn_model_with_parameters,
)

# Number of trials the Optuna study runs (passed as n_trials to study.optimize).
_N_TRIALS = 500

def objective(trial: optuna.Trial):
    """Optuna objective: train one CNN configuration and return its score.

    Every hyper-parameter is drawn from the discrete choices in
    ``SEARCH_SPACE``. The sampled values are bundled into ``CNNArgs``,
    ``OptimizerArgs`` and ``DataArgs`` and passed to
    ``train_cnn_model_with_parameters``.

    The ``n_grams`` parameter is sampled as a string key such as ``"2_3"``
    and decoded back into the matching list from ``SEARCH_SPACE["n_grams"]``.
    Optuna only supports ``None``/``bool``/``int``/``float``/``str`` as
    categorical choices and warns on every trial when given lists.

    Args:
        trial: The Optuna trial used to sample this run's hyper-parameters.

    Returns:
        The value returned by ``train_cnn_model_with_parameters`` (treated
        here as the validation accuracy of the trained model).
    """
    batch_size = trial.suggest_categorical("batch_size", SEARCH_SPACE["batch_size"])
    learning_rate = trial.suggest_categorical(
        "learning_rate", SEARCH_SPACE["learning_rate"]
    )
    optimizer_name = trial.suggest_categorical(
        "optimizer_name", SEARCH_SPACE["optimizer_name"]
    )
    # CNN Model Parameters
    dropout = trial.suggest_categorical("dropout", SEARCH_SPACE["dropout"])
    hidden_dim = trial.suggest_categorical("hidden_dim", SEARCH_SPACE["hidden_dim"])

    # Encode each n-gram combination as a string key ("2_3") so the choices
    # are of a type Optuna supports, then map the sampled key back to a list.
    n_grams_by_key = {
        "_".join(map(str, combination)): list(combination)
        for combination in SEARCH_SPACE["n_grams"]
    }
    n_grams_key = trial.suggest_categorical("n_grams", list(n_grams_by_key))
    n_grams = n_grams_by_key[n_grams_key]

    # Built from adjacent f-string pieces (no nested same-type quotes) so the
    # cell also parses on Python versions older than 3.12.
    log_message = (
        f"---------- batch_size_{batch_size}; lr_{learning_rate}; "
        f"optimizer_{optimizer_name}; hidden_dim_{hidden_dim}; "
        f"n_grams_{n_grams_key}; dropout_{dropout}  ----------"
    )
    print(log_message)

    cnn_args = CNNArgs(
        embedding_matrix=embedding_matrix,
        freeze_embedding=False,
        hidden_dim=hidden_dim,
        dropout=dropout,
        n_grams=n_grams,
    )

    optimizer_args = OptimizerArgs(
        optimizer_name=optimizer_name,
        learning_rate=learning_rate,
    )

    data_args = DataArgs(
        batch_size=batch_size,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
    )

    val_acc = train_cnn_model_with_parameters(
        data_args=data_args,
        cnn_args=cnn_args,
        optimizer_args=optimizer_args,
    )

    return val_acc


# Set up the Optuna study.
# TPE is Optuna's default single-objective sampler; it is created explicitly
# here only so that it can be seeded, which makes the sampler's random draws
# repeatable across kernel restarts.
# NOTE(review): the training runs themselves may still be non-deterministic
# unless utils.train seeds torch — confirm.
_SAMPLER_SEED = 42
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=_SAMPLER_SEED),
)
study.optimize(objective, n_trials=_N_TRIALS)

In [2]:
from utils.analytics import load_tensorboard_logs

# Load the logged CNN runs from tb_logs/cnn and rank them by validation
# accuracy, best first, with a fresh 0..n-1 index.
df = (
    load_tensorboard_logs("tb_logs/cnn")
    .sort_values(by="val_acc", ascending=False)
    .reset_index(drop=True)
)
df.head(20)

Unnamed: 0,val_acc,batch_size,hidden_dim,learning_rate,optimizer_name,train_loss,train_acc,n_grams,dropout,epoch,val_loss,filename
0,0.787054,2048,300,0.001,Adam,0.302605,0.953268,2_3,0.1,17.0,0.478298,events.out.tfevents.1730843468.yuriarch.302770...
1,0.78424,2048,400,0.001,Adam,0.308057,0.890524,2_3,0.3,15.0,0.479946,events.out.tfevents.1730837453.yuriarch.302770...
2,0.783302,2048,400,0.001,Adam,0.258119,0.942599,2_3,0.1,15.0,0.479409,events.out.tfevents.1730842147.yuriarch.302770...
3,0.780488,2048,300,0.0001,Adam,0.123483,0.978822,2_3,0.1,108.0,0.481418,events.out.tfevents.1730843851.yuriarch.302770...
4,0.780488,2048,300,0.001,Adam,0.419764,0.79412,2_3,0.9,52.0,0.487362,events.out.tfevents.1730849012.yuriarch.302770...
5,0.77955,2048,600,0.001,Adam,0.223024,0.967416,2_3_4_6,0.1,15.0,0.474464,events.out.tfevents.1730849629.yuriarch.302770...
6,0.778612,2048,400,0.001,Adam,0.291379,0.952866,2_3_4,0.1,16.0,0.480456,events.out.tfevents.1730842900.yuriarch.302770...
7,0.776735,2048,500,0.001,Adam,0.170268,0.964043,2,0.1,13.0,0.483572,events.out.tfevents.1730851518.yuriarch.302770...
8,0.775797,2048,300,0.001,Adam,0.317358,0.860387,2_3,0.7,27.0,0.480353,events.out.tfevents.1730855957.yuriarch.302770...
9,0.774859,2048,300,0.001,Adam,0.344862,0.942173,2_3,0.3,17.0,0.480878,events.out.tfevents.1730845225.yuriarch.302770...


In [3]:
# `df` is sorted by val_acc in descending order, so its first row is the
# configuration with the highest validation accuracy.
best_cnn_model_configuration = df.iloc[:1]
best_cnn_model_configuration

Unnamed: 0,val_acc,batch_size,hidden_dim,learning_rate,optimizer_name,train_loss,train_acc,n_grams,dropout,epoch,val_loss,filename
0,0.787054,2048,300,0.001,Adam,0.302605,0.953268,2_3,0.1,17.0,0.478298,events.out.tfevents.1730843468.yuriarch.302770...


## Best Config on Test set

In [None]:
from utils.analytics import test_top_n_models
from models.CNN import CNNClassifier


# Run test_top_n_models on `df` with n=10, the CNNClassifier class and the test split.
# NOTE(review): presumably this reloads the checkpoint of each of the first 10
# rows of `df` and scores it on test_dataset — confirm in utils.analytics.
# Model selection should stay based on val_acc (row 0); choosing a row by its
# test_acc would leak the test set into the selection.
test_results_df = test_top_n_models(df, CNNClassifier, test_dataset, n=10)

In [13]:
# Display the table returned by test_top_n_models for the evaluated runs.
test_results_df

Unnamed: 0,test_acc,test_loss,val_acc,batch_size,hidden_dim,learning_rate,optimizer_name,train_loss,train_acc,n_grams,dropout,epoch,val_loss,filename
0,0.788931,0.419539,0.787054,2048,300,0.001,Adam,0.302605,0.953268,2_3,0.1,17.0,0.478298,events.out.tfevents.1730843468.yuriarch.302770...
1,0.798311,0.411556,0.78424,2048,400,0.001,Adam,0.308057,0.890524,2_3,0.3,15.0,0.479946,events.out.tfevents.1730837453.yuriarch.302770...
2,0.802064,0.411389,0.783302,2048,400,0.001,Adam,0.258119,0.942599,2_3,0.1,15.0,0.479409,events.out.tfevents.1730842147.yuriarch.302770...
3,0.775797,0.433588,0.780488,2048,300,0.0001,Adam,0.123483,0.978822,2_3,0.1,108.0,0.481418,events.out.tfevents.1730843851.yuriarch.302770...
4,0.780488,0.457076,0.780488,2048,300,0.001,Adam,0.419764,0.79412,2_3,0.9,52.0,0.487362,events.out.tfevents.1730849012.yuriarch.302770...
5,0.798311,0.434948,0.77955,2048,600,0.001,Adam,0.223024,0.967416,2_3_4_6,0.1,15.0,0.474464,events.out.tfevents.1730849629.yuriarch.302770...
6,0.803002,0.432682,0.778612,2048,400,0.001,Adam,0.291379,0.952866,2_3_4,0.1,16.0,0.480456,events.out.tfevents.1730842900.yuriarch.302770...
7,0.789869,0.43547,0.776735,2048,500,0.001,Adam,0.170268,0.964043,2,0.1,13.0,0.483572,events.out.tfevents.1730851518.yuriarch.302770...
8,0.791745,0.429452,0.775797,2048,300,0.001,Adam,0.317358,0.860387,2_3,0.7,27.0,0.480353,events.out.tfevents.1730855957.yuriarch.302770...
9,0.792683,0.422271,0.774859,2048,300,0.001,Adam,0.344862,0.942173,2_3,0.3,17.0,0.480878,events.out.tfevents.1730845225.yuriarch.302770...
