In [None]:
!pip install transformers datasets

In [3]:
import pandas as pd

data_pth = "./data/imdb"
train = pd.read_csv(f"{data_pth}/train_cleaned.csv") # These use same cleaning technique as traditional pipe
val = pd.read_csv(f"{data_pth}/val_cleaned.csv")
test = pd.read_csv(f"{data_pth}/test_cleaned.csv")

train.head(5)

Unnamed: 0,text,label,text_cleaned
0,Everyone involved (and the audience) should se...,0,everyone involved and the audience should seek...
1,The Williams family live on a ranch located in...,1,the williams family live on a ranch located in...
2,This movie surprised me in a good way. From th...,1,this movie surprised me in a good way from the...
3,Forget Neo and Bourne and all those half-baked...,1,forget neo and bourne and all those halfbaked ...
4,I figured that any horror film with Orson Well...,0,i figured that any horror film with orson well...


### Data
Create Vocabulary and Tokenizer; Prepare PyTorch dataloaders

In [4]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import WhitespaceSplit
from datasets import Dataset

vocab_size = 89527

dataset = {"train": Dataset.from_pandas(train),
           "val": Dataset.from_pandas(val),
           "test": Dataset.from_pandas(test) }

tokenizer = Tokenizer(WordLevel(unk_token='<unk>'))
tokenizer.pre_tokenizer = WhitespaceSplit()

trainer = WordLevelTrainer( # should only be 89527 distinct types in IMDB50k
    vocab_size=vocab_size, special_tokens=["<unk>", "<pad>"])
generator_bsz = 512
all_splits_generator = (dataset[split][i:i+generator_bsz]["text_cleaned"]
                        for split in ["train", "val", "test"]
                          for i in range (0, len(dataset[split]), generator_bsz))
tokenizer.train_from_iterator(all_splits_generator, trainer)

orig_vocab = tokenizer.get_vocab()
word_types = sorted(list(orig_vocab.keys()), key=lambda w: orig_vocab[w])
vocab = {w: i for i, w in enumerate(word_types)}
vocab_size = len(vocab)
pad_id = vocab["<pad>"]
print("Vocabulary Size: ", vocab_size)

Vocabulary Size:  89527


In [5]:
from transformers import BertTokenizer
import torch.utils.data as torch_data
import torch

class IMDB50(torch_data.Dataset):
    def __init__(self,
                 text,
                 labels,
                 tokenizer,):

        self.all_text = text
        self.all_labels = labels
        self.tokenizer = tokenizer

        self.is_bert = isinstance(tokenizer, BertTokenizer)

    def __len__(self):
        return len(self.all_text)

    def __getitem__(self, idx):

        if not self.is_bert:
            input_ids = torch.LongTensor(self.tokenizer.encode(self.all_text[idx]).ids)
        else:
            input_ids = self.tokenizer(self.all_text[idx], return_tensors='pt', max_length=512,
                                       padding="do_not_pad", truncation=True).input_ids.squeeze(0)

        label = torch.Tensor([self.all_labels[idx]])
        return input_ids, input_ids.size(0), label

# def tokenize_with_pad_truncate(tokenizer, text:str, max_input_length:int):
#     raw_ids = tokenizer.encode(text).ids
#     raw_len = len(raw_ids)
#     if raw_len < max_input_length:
#         # Padding
#         ids = raw_ids + [pad_id]*(max_input_length-raw_len)
#     else:
#         # Truncating
#         ids = raw_ids[:max_input_length]
#     return ids

import torch.nn.utils.rnn as rnn_utils
def collate_fn(batch):
    # input_id, length, label
    batch.sort(key=lambda x: x[1], reverse=True) # sort by sequence length
    sequences, seq_lengths, targets = zip(*batch)

    # Pad the sequences and stack the targets
    sequences_padded = rnn_utils.pad_sequence(sequences, padding_value=pad_id, batch_first=True)
    targets_stacked = torch.stack(targets)

    return sequences_padded, seq_lengths, targets_stacked

batch_size = 64

train_text, train_label = dataset["train"]["text_cleaned"], dataset["train"]["label"]
validation_text, validation_label = dataset["val"]["text_cleaned"], dataset["val"]["label"]
test_text, test_label = dataset["test"]["text_cleaned"], dataset["test"]["label"]

trainset = IMDB50(train_text, train_label, tokenizer)
valset = IMDB50(validation_text, validation_label, tokenizer)
testset = IMDB50(test_text, test_label, tokenizer)

train_loader = torch_data.DataLoader(trainset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = torch_data.DataLoader(valset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = torch_data.DataLoader(testset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print(next(iter(test_loader))[0].shape, next(iter(test_loader))[1], next(iter(test_loader))[2].shape)

torch.Size([64, 629]) (629, 588, 568, 541, 519, 437, 423, 411, 392, 359, 357, 355, 321, 292, 285, 266, 261, 247, 247, 238, 232, 230, 226, 226, 220, 219, 217, 203, 201, 199, 196, 194, 191, 190, 189, 187, 176, 173, 171, 170, 168, 164, 159, 154, 153, 150, 147, 138, 135, 133, 130, 128, 125, 124, 123, 121, 105, 75, 74, 57, 53, 51, 32, 21) torch.Size([64, 1])


### Model
Model Class are wrapped in rnn_classifier.py

In [6]:
import torch, random
import numpy as np
import torch.nn as nn
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, accuracy_score
from rnn_classifier import RNNBinarySequenceClassifier

class Trainer:

    def __init__(self,
                 exp_name="default",
                 seed=42,
                 device="cuda",
                 info_steps=10,
                 args=None) -> None:

        args.exp_name = exp_name
        args.seed = seed
        args.info_steps = info_steps
        args.device = device

        print("---"*20)
        print("Running Experiment: ", args.exp_name)
        print("---"*20)

        self.args = args
        self.init_train_objects()

    def init_train_objects(self,):

        self.model = RNNBinarySequenceClassifier(
            vocab_size=self.args.vocab_size,
            embedding_size=self.args.embedding_size,
            hidden_size=self.args.hidden_size,
            output_size=self.args.output_size,
            num_layers=self.args.num_layers,
            embedding_dropout=self.args.embedding_dropout,
            output_dropout=self.args.output_dropout,
            rnn_dropout=self.args.rnn_dropout,
            embedding_type=self.args.embedding_type,
            rnn_base_cell=self.args.rnn_base_cell,
            learnable=self.args.embedding_learnable,
            bidirectional=self.args.bidirectional,
            vocab=vocab,
        ).to(self.args.device)

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr)
        self.scheduler = None

        self.loss_fc = nn.BCEWithLogitsLoss()

    def train_loop(self, train_loader, progress_bar):

        self.model.train()
        train_loss_per_epoch = 0.0
        train_loss_per_steps = 0.0
        for i, (input_ids, lengths, labels) in enumerate(train_loader):
            input_ids, labels = input_ids.to(self.args.device), labels.to(self.args.device)
            self.optimizer.zero_grad()
            output = self.model((input_ids, lengths))
            loss = self.loss_fc(output, labels)

            train_loss_per_epoch += loss.item()
            train_loss_per_steps += loss.item()
            loss.backward()
            self.optimizer.step()

            progress_bar.update(1)
            if progress_bar.n % self.args.info_steps == 0:
                print(f"Step {progress_bar.n}/{progress_bar.total}: Training Loss={train_loss_per_steps/self.args.info_steps:.5f}")
                train_loss_per_steps = 0.0

        return train_loss_per_epoch/len(train_loader)

    @torch.no_grad()
    def eval_loop(self, loader, cls_report=False):
        self.model.eval()

        pred_labels, true_labels = [], []
        for i, (input_ids, lengths, labels) in enumerate(loader):
            input_ids, labels = input_ids.to(self.args.device), labels.to(self.args.device)
            preds = self.model.predict((input_ids, lengths))

            pred_labels.extend(preds.squeeze(-1).tolist())
            true_labels.extend(labels.squeeze(-1).tolist())

        acc = accuracy_score(true_labels, pred_labels)
        if cls_report:
            print(classification_report(true_labels, pred_labels))
            print("Accuracy: ", round(acc, 6))
        return acc

    def train(self,
              num_epochs=10,
              batch_size=64,
              num_workers=0,
              trainset:torch_data.Dataset=None,
              testset:torch_data.Dataset=None,
              valset:torch_data.Dataset=None,):

        torch.manual_seed(self.args.seed)
        np.random.seed(self.args.seed)
        random.seed(self.args.seed)

        train_loader = torch_data.DataLoader(trainset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, num_workers=num_workers, pin_memory=True)
        val_loader = torch_data.DataLoader(valset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=num_workers, pin_memory=True)
        test_loader = torch_data.DataLoader(testset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=num_workers, pin_memory=True)

        total_train_steps = len(train_loader) * num_epochs
        print(f"Total Training Steps Per Epoch: {len(train_loader)}")
        progress_bar = tqdm(range(1, total_train_steps), desc="Training Progress")

        best_acc = -1
        for epoch in range(1, num_epochs+1):

            train_loss_per_epoch = self.train_loop(train_loader, progress_bar)
            # train_acc_per_epoch = self.eval_loop(train_loader, cls_report=False)
            val_acc_per_epoch = self.eval_loop(val_loader, cls_report=False)
            print(f"Epoch {epoch} | Training Loss: {train_loss_per_epoch/len(train_loader):.4f}  Accuracy: valset={val_acc_per_epoch:.4f}")

            if val_acc_per_epoch > best_acc:
                torch.save(self.model.state_dict(), f"{self.args.exp_name}_best.pt")
                print(f"Saving Best Model at epoch {epoch}")
                best_acc = val_acc_per_epoch

        self.model.load_state_dict(torch.load(f"{self.args.exp_name}_best.pt", map_location=self.args.device))
        self.eval_loop(train_loader, cls_report=True)
        self.eval_loop(val_loader, cls_report=True)
        self.eval_loop(test_loader, cls_report=True)

class RNN_ARG:
    def __init__(self, lr, vocab_size, embedding_size, hidden_size,
                 output_size=1, num_layers=1, embedding_dropout=0., output_dropout=0., rnn_dropout=.0,
                 rnn_base_cell='vanilla', embedding_type='vanilla', embedding_learnable=True, bidirectional=False) -> None:

        self.lr = lr
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.embedding_dropout = embedding_dropout
        self.output_dropout = output_dropout
        self.rnn_dropout = rnn_dropout
        self.rnn_base_cell = rnn_base_cell
        self.embedding_type = embedding_type
        self.embedding_learnable = embedding_learnable
        self.bidirectional = bidirectional

## Running Experiments

In [7]:
import warnings
warnings.filterwarnings("ignore")
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"

device = "cuda" if torch.cuda.is_available() else "cpu"
seed = 42
device

'cuda'

### Variant 1 [Model Hyper-Parameters]
For this one, we only tune rnn type as well as hidden_size

In [8]:
num_layers = 2
output_size = 1
embedding_dropout = .3
output_dropout = .3
rnn_dropout = .3
num_epochs = 12
lr = 1e-3
bs = 64
num_workers=8

In [None]:
# To Compare the varying effects of hidden size and model types
for hidden_size in [64, 128, 256]:
    for model_type in ["vanilla", "lstm", "gru"]:
        args = RNN_ARG(lr, vocab_size, embedding_size=hidden_size, hidden_size=hidden_size,
                    output_size=output_size, num_layers=num_layers, embedding_dropout=embedding_dropout, output_dropout=output_dropout,
                    rnn_base_cell=model_type, embedding_type="vanilla", embedding_learnable=True, bidirectional=False, rnn_dropout=rnn_dropout)

        trainer = Trainer(exp_name=f"{model_type}_rnn_{hidden_size}", seed=seed, device=device, args=args, info_steps=100)
        trainer.train(num_epochs, bs, num_workers, trainset, testset, valset)

------------------------------------------------------------
Running Experiment:  vanilla_rnn_64
------------------------------------------------------------
Random Initialize
Total Training Steps Per Epoch: 313


Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 100/3755: Training Loss=0.69580
Step 200/3755: Training Loss=0.69016
Step 300/3755: Training Loss=0.68412
Epoch 1 | Training Loss: 0.0022  Accuracy: valset=0.5744
Saving Best Model at epoch 1
Step 400/3755: Training Loss=0.58195
Step 500/3755: Training Loss=0.66666
Step 600/3755: Training Loss=0.65788
Epoch 2 | Training Loss: 0.0021  Accuracy: valset=0.5874
Saving Best Model at epoch 2
Step 700/3755: Training Loss=0.49300
Step 800/3755: Training Loss=0.64646
Step 900/3755: Training Loss=0.62870
Epoch 3 | Training Loss: 0.0021  Accuracy: valset=0.6432
Saving Best Model at epoch 3
Step 1000/3755: Training Loss=0.37608
Step 1100/3755: Training Loss=0.60148
Step 1200/3755: Training Loss=0.66800
Epoch 4 | Training Loss: 0.0020  Accuracy: valset=0.6304
Step 1300/3755: Training Loss=0.30711
Step 1400/3755: Training Loss=0.63681
Step 1500/3755: Training Loss=0.61737
Epoch 5 | Training Loss: 0.0020  Accuracy: valset=0.6540
Saving Best Model at epoch 5
Step 1600/3755: Training Loss=0.21186


Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 100/3755: Training Loss=0.69139
Step 200/3755: Training Loss=0.65621
Step 300/3755: Training Loss=0.64408
Epoch 1 | Training Loss: 0.0021  Accuracy: valset=0.6362
Saving Best Model at epoch 1
Step 400/3755: Training Loss=0.54640
Step 500/3755: Training Loss=0.65291
Step 600/3755: Training Loss=0.63532
Epoch 2 | Training Loss: 0.0020  Accuracy: valset=0.6966
Saving Best Model at epoch 2
Step 700/3755: Training Loss=0.45496
Step 800/3755: Training Loss=0.57191
Step 900/3755: Training Loss=0.54153
Epoch 3 | Training Loss: 0.0018  Accuracy: valset=0.7646
Saving Best Model at epoch 3
Step 1000/3755: Training Loss=0.31582
Step 1100/3755: Training Loss=0.55232
Step 1200/3755: Training Loss=0.53481
Epoch 4 | Training Loss: 0.0017  Accuracy: valset=0.7844
Saving Best Model at epoch 4
Step 1300/3755: Training Loss=0.26432
Step 1400/3755: Training Loss=0.64779
Step 1500/3755: Training Loss=0.52959
Epoch 5 | Training Loss: 0.0018  Accuracy: valset=0.8008
Saving Best Model at epoch 5
Step 1600

Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 100/3755: Training Loss=0.68857
Step 200/3755: Training Loss=0.66654
Step 300/3755: Training Loss=0.64141
Epoch 1 | Training Loss: 0.0021  Accuracy: valset=0.6848
Saving Best Model at epoch 1
Step 400/3755: Training Loss=0.51328
Step 500/3755: Training Loss=0.58364
Step 600/3755: Training Loss=0.55777
Epoch 2 | Training Loss: 0.0018  Accuracy: valset=0.7888
Saving Best Model at epoch 2
Step 700/3755: Training Loss=0.34240
Step 800/3755: Training Loss=0.42295
Step 900/3755: Training Loss=0.41348
Epoch 3 | Training Loss: 0.0013  Accuracy: valset=0.8128
Saving Best Model at epoch 3
Step 1000/3755: Training Loss=0.21967
Step 1100/3755: Training Loss=0.32539
Step 1200/3755: Training Loss=0.31012
Epoch 4 | Training Loss: 0.0010  Accuracy: valset=0.8596
Saving Best Model at epoch 4
Step 1300/3755: Training Loss=0.13535
Step 1400/3755: Training Loss=0.27047
Step 1500/3755: Training Loss=0.26582
Epoch 5 | Training Loss: 0.0009  Accuracy: valset=0.8794
Saving Best Model at epoch 5
Step 1600

Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 100/3755: Training Loss=0.69099
Step 200/3755: Training Loss=0.67323
Step 300/3755: Training Loss=0.65786
Epoch 1 | Training Loss: 0.0022  Accuracy: valset=0.5584
Saving Best Model at epoch 1
Step 400/3755: Training Loss=0.55935
Step 500/3755: Training Loss=0.65332
Step 600/3755: Training Loss=0.63646
Epoch 2 | Training Loss: 0.0021  Accuracy: valset=0.6764
Saving Best Model at epoch 2
Step 700/3755: Training Loss=0.46063
Step 800/3755: Training Loss=0.62223
Step 900/3755: Training Loss=0.60731
Epoch 3 | Training Loss: 0.0020  Accuracy: valset=0.5710
Step 1000/3755: Training Loss=0.36622
Step 1100/3755: Training Loss=0.59949
Step 1200/3755: Training Loss=0.58768
Epoch 4 | Training Loss: 0.0019  Accuracy: valset=0.6952
Saving Best Model at epoch 4
Step 1300/3755: Training Loss=0.27457
Step 1400/3755: Training Loss=0.60059
Step 1500/3755: Training Loss=0.58146
Epoch 5 | Training Loss: 0.0019  Accuracy: valset=0.6824
Step 1600/3755: Training Loss=0.20038
Step 1700/3755: Training Loss

Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 100/3755: Training Loss=0.67712
Step 200/3755: Training Loss=0.62717
Step 300/3755: Training Loss=0.58946
Epoch 1 | Training Loss: 0.0020  Accuracy: valset=0.6724
Saving Best Model at epoch 1
Step 400/3755: Training Loss=0.49706
Step 500/3755: Training Loss=0.56688
Step 600/3755: Training Loss=0.53202
Epoch 2 | Training Loss: 0.0018  Accuracy: valset=0.7300
Saving Best Model at epoch 2
Step 700/3755: Training Loss=0.37226
Step 800/3755: Training Loss=0.48668
Step 900/3755: Training Loss=0.46953
Epoch 3 | Training Loss: 0.0016  Accuracy: valset=0.5070
Step 1000/3755: Training Loss=0.42451
Step 1100/3755: Training Loss=0.68086
Step 1200/3755: Training Loss=0.64275
Epoch 4 | Training Loss: 0.0021  Accuracy: valset=0.7924
Saving Best Model at epoch 4
Step 1300/3755: Training Loss=0.23474
Step 1400/3755: Training Loss=0.60567
Step 1500/3755: Training Loss=0.58509
Epoch 5 | Training Loss: 0.0019  Accuracy: valset=0.6698
Step 1600/3755: Training Loss=0.20618
Step 1700/3755: Training Loss

Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 100/3755: Training Loss=0.67504
Step 200/3755: Training Loss=0.64482
Step 300/3755: Training Loss=0.61098
Epoch 1 | Training Loss: 0.0021  Accuracy: valset=0.7394
Saving Best Model at epoch 1
Step 400/3755: Training Loss=0.44796
Step 500/3755: Training Loss=0.49277
Step 600/3755: Training Loss=0.41822
Epoch 2 | Training Loss: 0.0015  Accuracy: valset=0.8364
Saving Best Model at epoch 2
Step 700/3755: Training Loss=0.27027
Step 800/3755: Training Loss=0.32467
Step 900/3755: Training Loss=0.31235
Epoch 3 | Training Loss: 0.0010  Accuracy: valset=0.8358
Step 1000/3755: Training Loss=0.16570
Step 1100/3755: Training Loss=0.25573
Step 1200/3755: Training Loss=0.23668
Epoch 4 | Training Loss: 0.0008  Accuracy: valset=0.8750
Saving Best Model at epoch 4
Step 1300/3755: Training Loss=0.09647
Step 1400/3755: Training Loss=0.19263
Step 1500/3755: Training Loss=0.19651
Epoch 5 | Training Loss: 0.0006  Accuracy: valset=0.8778
Saving Best Model at epoch 5
Step 1600/3755: Training Loss=0.05962


Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 100/3755: Training Loss=0.69720
Step 200/3755: Training Loss=0.68140
Step 300/3755: Training Loss=0.68512
Epoch 1 | Training Loss: 0.0022  Accuracy: valset=0.5764
Saving Best Model at epoch 1
Step 400/3755: Training Loss=0.57284
Step 500/3755: Training Loss=0.64858
Step 600/3755: Training Loss=0.64089
Epoch 2 | Training Loss: 0.0021  Accuracy: valset=0.6692
Saving Best Model at epoch 2
Step 700/3755: Training Loss=0.45591
Step 800/3755: Training Loss=0.60799
Step 900/3755: Training Loss=0.60766
Epoch 3 | Training Loss: 0.0019  Accuracy: valset=0.6060
Step 1000/3755: Training Loss=0.35694
Step 1100/3755: Training Loss=0.56513
Step 1200/3755: Training Loss=0.54761
Epoch 4 | Training Loss: 0.0018  Accuracy: valset=0.7198
Saving Best Model at epoch 4
Step 1300/3755: Training Loss=0.25727
Step 1400/3755: Training Loss=0.50315
Step 1500/3755: Training Loss=0.48330
Epoch 5 | Training Loss: 0.0016  Accuracy: valset=0.6412
Step 1600/3755: Training Loss=0.19479
Step 1700/3755: Training Loss

Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 100/3755: Training Loss=0.69515
Step 200/3755: Training Loss=0.66054
Step 300/3755: Training Loss=0.62079
Epoch 1 | Training Loss: 0.0021  Accuracy: valset=0.7468
Saving Best Model at epoch 1
Step 400/3755: Training Loss=0.46234
Step 500/3755: Training Loss=0.49407
Step 600/3755: Training Loss=0.45434
Epoch 2 | Training Loss: 0.0015  Accuracy: valset=0.8270
Saving Best Model at epoch 2
Step 700/3755: Training Loss=0.29441
Step 800/3755: Training Loss=0.34887
Step 900/3755: Training Loss=0.34632
Epoch 3 | Training Loss: 0.0011  Accuracy: valset=0.8392
Saving Best Model at epoch 3
Step 1000/3755: Training Loss=0.18734
Step 1100/3755: Training Loss=0.27120
Step 1200/3755: Training Loss=0.33891
Epoch 4 | Training Loss: 0.0010  Accuracy: valset=0.8646
Saving Best Model at epoch 4
Step 1300/3755: Training Loss=0.11281
Step 1400/3755: Training Loss=0.25712
Step 1500/3755: Training Loss=0.24149
Epoch 5 | Training Loss: 0.0008  Accuracy: valset=0.8806
Saving Best Model at epoch 5
Step 1600

Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 100/3755: Training Loss=0.67576
Step 200/3755: Training Loss=0.62883
Step 300/3755: Training Loss=0.58986
Epoch 1 | Training Loss: 0.0020  Accuracy: valset=0.7620
Saving Best Model at epoch 1
Step 400/3755: Training Loss=0.37815
Step 500/3755: Training Loss=0.36916
Step 600/3755: Training Loss=0.32596
Epoch 2 | Training Loss: 0.0012  Accuracy: valset=0.8742
Saving Best Model at epoch 2
Step 700/3755: Training Loss=0.18203
Step 800/3755: Training Loss=0.22958
Step 900/3755: Training Loss=0.23785
Epoch 3 | Training Loss: 0.0008  Accuracy: valset=0.8752
Saving Best Model at epoch 3
Step 1000/3755: Training Loss=0.10776
Step 1100/3755: Training Loss=0.16342
Step 1200/3755: Training Loss=0.15495
Epoch 4 | Training Loss: 0.0005  Accuracy: valset=0.9010
Saving Best Model at epoch 4
Step 1300/3755: Training Loss=0.04950
Step 1400/3755: Training Loss=0.10650
Step 1500/3755: Training Loss=0.11883
Epoch 5 | Training Loss: 0.0004  Accuracy: valset=0.8988
Step 1600/3755: Training Loss=0.02547


In [None]:
# To Compare the varying effects of bi-directional
for hidden_size in [64, 128, 256]:
    args = RNN_ARG(lr, vocab_size, embedding_size=hidden_size, hidden_size=hidden_size,
                output_size=output_size, num_layers=num_layers, embedding_dropout=embedding_dropout, output_dropout=output_dropout,
                rnn_base_cell="gru", embedding_type="vanilla", embedding_learnable=True, bidirectional=True, rnn_dropout=rnn_dropout)

    trainer = Trainer(exp_name=f"gru_bi_rnn_{hidden_size}", seed=seed, device=device, args=args, info_steps=50)
    trainer.train(num_epochs, bs, num_workers, trainset, testset, valset)

------------------------------------------------------------
Running Experiment:  gru_bi_rnn_64
------------------------------------------------------------
Random Initialize
Total Training Steps Per Epoch: 313


Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 50/3755: Training Loss=0.68589
Step 100/3755: Training Loss=0.68212
Step 150/3755: Training Loss=0.66587
Step 200/3755: Training Loss=0.64652
Step 250/3755: Training Loss=0.63398
Step 300/3755: Training Loss=0.59791
Epoch 1 | Training Loss: 0.0021  Accuracy: valset=0.7156
Saving Best Model at epoch 1
Step 350/3755: Training Loss=0.41308
Step 400/3755: Training Loss=0.55496
Step 450/3755: Training Loss=0.55145
Step 500/3755: Training Loss=0.51386
Step 550/3755: Training Loss=0.51122
Step 600/3755: Training Loss=0.46855
Epoch 2 | Training Loss: 0.0017  Accuracy: valset=0.7344
Saving Best Model at epoch 2
Step 650/3755: Training Loss=0.22734
Step 700/3755: Training Loss=0.42649
Step 750/3755: Training Loss=0.39193
Step 800/3755: Training Loss=0.38982
Step 850/3755: Training Loss=0.37540
Step 900/3755: Training Loss=0.36829
Epoch 3 | Training Loss: 0.0012  Accuracy: valset=0.8472
Saving Best Model at epoch 3
Step 950/3755: Training Loss=0.08240
Step 1000/3755: Training Loss=0.32018
St

Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 50/3755: Training Loss=0.68713
Step 100/3755: Training Loss=0.66217
Step 150/3755: Training Loss=0.65991
Step 200/3755: Training Loss=0.61877
Step 250/3755: Training Loss=0.58338
Step 300/3755: Training Loss=0.55345
Epoch 1 | Training Loss: 0.0020  Accuracy: valset=0.6196
Saving Best Model at epoch 1
Step 350/3755: Training Loss=0.37786
Step 400/3755: Training Loss=0.55523
Step 450/3755: Training Loss=0.54099
Step 500/3755: Training Loss=0.48957
Step 550/3755: Training Loss=0.44515
Step 600/3755: Training Loss=0.40770
Epoch 2 | Training Loss: 0.0015  Accuracy: valset=0.7700
Saving Best Model at epoch 2
Step 650/3755: Training Loss=0.17119
Step 700/3755: Training Loss=0.35748
Step 750/3755: Training Loss=0.33881
Step 800/3755: Training Loss=0.32846
Step 850/3755: Training Loss=0.33299
Step 900/3755: Training Loss=0.30844
Epoch 3 | Training Loss: 0.0011  Accuracy: valset=0.8738
Saving Best Model at epoch 3
Step 950/3755: Training Loss=0.06235
Step 1000/3755: Training Loss=0.26074
St

Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 50/3755: Training Loss=0.68296
Step 100/3755: Training Loss=0.63630
Step 150/3755: Training Loss=0.62069
Step 200/3755: Training Loss=0.57384
Step 250/3755: Training Loss=0.55312
Step 300/3755: Training Loss=0.61416
Epoch 1 | Training Loss: 0.0019  Accuracy: valset=0.7732
Saving Best Model at epoch 1
Step 350/3755: Training Loss=0.34154
Step 400/3755: Training Loss=0.41140
Step 450/3755: Training Loss=0.38034
Step 500/3755: Training Loss=0.35131
Step 550/3755: Training Loss=0.33774
Step 600/3755: Training Loss=0.32373
Epoch 2 | Training Loss: 0.0012  Accuracy: valset=0.8592
Saving Best Model at epoch 2
Step 650/3755: Training Loss=0.12677
Step 700/3755: Training Loss=0.24836
Step 750/3755: Training Loss=0.22698
Step 800/3755: Training Loss=0.23887
Step 850/3755: Training Loss=0.25391
Step 900/3755: Training Loss=0.24307
Epoch 3 | Training Loss: 0.0008  Accuracy: valset=0.8748
Saving Best Model at epoch 3
Step 950/3755: Training Loss=0.04356
Step 1000/3755: Training Loss=0.18783
St

### Variant 2 [Embedding]
For this one, we will compare the effect of different embedding methodologies

In [9]:
hidden_size=256
model_type="gru"

#### Vanilla Embedding

In [None]:
args = RNN_ARG(lr, vocab_size, embedding_size=hidden_size, hidden_size=hidden_size,
            output_size=output_size, num_layers=num_layers, embedding_dropout=embedding_dropout, output_dropout=output_dropout,
            rnn_base_cell=model_type, embedding_type="vanilla", embedding_learnable=True, bidirectional=True)

trainer = Trainer(exp_name=f"{model_type}_rnn_vanilla_embedding", seed=seed, device=device, args=args, info_steps=100)
trainer.train(num_epochs, bs, num_workers, trainset, testset, valset)

# This one is obtained above

#### GLoVE Embedding [Fixed]

In [None]:
args = RNN_ARG(lr, vocab_size, embedding_size=hidden_size, hidden_size=hidden_size,
            output_size=output_size, num_layers=num_layers, embedding_dropout=embedding_dropout, output_dropout=output_dropout,
            rnn_base_cell=model_type, embedding_type="glove", embedding_learnable=False, bidirectional=True)

trainer = Trainer(exp_name=f"{model_type}_rnn_glove_fixed_embedding", seed=seed, device=device, args=args, info_steps=100)
trainer.train(num_epochs, bs, num_workers, trainset, testset, valset)

------------------------------------------------------------
Running Experiment:  gru_rnn_glove_fixed_embedding
------------------------------------------------------------
Initialize by GLoVE word embedding
Fix the Embedding Layer
Total Training Steps Per Epoch: 313


Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 100/3755: Training Loss=0.66063
Step 200/3755: Training Loss=0.65737
Step 300/3755: Training Loss=0.64342
Epoch 1 | Training Loss: 0.0021  Accuracy: valset=0.6720
Saving Best Model at epoch 1
Step 400/3755: Training Loss=0.46445
Step 500/3755: Training Loss=0.47657
Step 600/3755: Training Loss=0.43359
Epoch 2 | Training Loss: 0.0015  Accuracy: valset=0.8200
Saving Best Model at epoch 2
Step 700/3755: Training Loss=0.29086
Step 800/3755: Training Loss=0.36216
Step 900/3755: Training Loss=0.34737
Epoch 3 | Training Loss: 0.0011  Accuracy: valset=0.8504
Saving Best Model at epoch 3
Step 1000/3755: Training Loss=0.16759
Step 1100/3755: Training Loss=0.28038
Step 1200/3755: Training Loss=0.27922
Epoch 4 | Training Loss: 0.0009  Accuracy: valset=0.8418
Step 1300/3755: Training Loss=0.10920
Step 1400/3755: Training Loss=0.21752
Step 1500/3755: Training Loss=0.22651
Epoch 5 | Training Loss: 0.0007  Accuracy: valset=0.8794
Saving Best Model at epoch 5
Step 1600/3755: Training Loss=0.06055


#### GLoVE Embedding [Learnable]

In [None]:
args = RNN_ARG(lr, vocab_size, embedding_size=hidden_size, hidden_size=hidden_size,
            output_size=output_size, num_layers=num_layers, embedding_dropout=embedding_dropout, output_dropout=output_dropout,
            rnn_base_cell=model_type, embedding_type="glove", embedding_learnable=True, bidirectional=True)

trainer = Trainer(exp_name=f"{model_type}_rnn_glove_learnable_embedding", seed=seed, device=device, args=args, info_steps=100)
trainer.train(num_epochs, bs, num_workers, trainset, testset, valset)

------------------------------------------------------------
Running Experiment:  gru_rnn_glove_learnable_embedding
------------------------------------------------------------
Initialize by GLoVE word embedding
Total Training Steps Per Epoch: 313


Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 100/3755: Training Loss=0.65632
Step 200/3755: Training Loss=0.57252
Step 300/3755: Training Loss=0.44468
Epoch 1 | Training Loss: 0.0018  Accuracy: valset=0.7982
Saving Best Model at epoch 1
Step 400/3755: Training Loss=0.31842
Step 500/3755: Training Loss=0.31146
Step 600/3755: Training Loss=0.29407
Epoch 2 | Training Loss: 0.0010  Accuracy: valset=0.8856
Saving Best Model at epoch 2
Step 700/3755: Training Loss=0.15936
Step 800/3755: Training Loss=0.20107
Step 900/3755: Training Loss=0.22090
Epoch 3 | Training Loss: 0.0007  Accuracy: valset=0.8720
Step 1000/3755: Training Loss=0.08850
Step 1100/3755: Training Loss=0.13501
Step 1200/3755: Training Loss=0.14639
Epoch 4 | Training Loss: 0.0005  Accuracy: valset=0.9002
Saving Best Model at epoch 4
Step 1300/3755: Training Loss=0.04073
Step 1400/3755: Training Loss=0.09176
Step 1500/3755: Training Loss=0.10399
Epoch 5 | Training Loss: 0.0003  Accuracy: valset=0.9010
Saving Best Model at epoch 5
Step 1600/3755: Training Loss=0.01815


#### BERT Representation as Embedding

In [10]:
from transformers import BertTokenizer
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")

# Need to reconstruct Dataset since changing the tokenizer
trainset_bert = IMDB50(train_text, train_label, tokenizer_bert)
valset_bert = IMDB50(validation_text, validation_label, tokenizer_bert)
testset_bert = IMDB50(test_text, test_label, tokenizer_bert)

bert_d_model = 768
args = RNN_ARG(lr, vocab_size, embedding_size=bert_d_model, hidden_size=hidden_size,
            output_size=output_size, num_layers=num_layers, embedding_dropout=embedding_dropout, output_dropout=output_dropout,
            rnn_base_cell=model_type, embedding_type="bert", embedding_learnable=False, bidirectional=True)

trainer = Trainer(exp_name=f"{model_type}_rnn_bert_representation", seed=seed, device=device, args=args, info_steps=50)
trainer.train(num_epochs, bs, num_workers, trainset_bert, testset_bert, valset_bert)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

------------------------------------------------------------
Running Experiment:  gru_rnn_bert_representation
------------------------------------------------------------
Use BERT representation [fixed]


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT layers are freezed
Total Training Steps Per Epoch: 313


Training Progress:   0%|          | 0/3755 [00:00<?, ?it/s]

Step 50/3755: Training Loss=0.62250
Step 100/3755: Training Loss=0.54019
Step 150/3755: Training Loss=0.44202
Step 200/3755: Training Loss=0.36428
Step 250/3755: Training Loss=0.35120
Step 300/3755: Training Loss=0.31501
Epoch 1 | Training Loss: 0.0014  Accuracy: valset=0.8910
Saving Best Model at epoch 1
Step 350/3755: Training Loss=0.21602
Step 400/3755: Training Loss=0.28300
Step 450/3755: Training Loss=0.28258
Step 500/3755: Training Loss=0.29496
Step 550/3755: Training Loss=0.29528
Step 600/3755: Training Loss=0.28529
Epoch 2 | Training Loss: 0.0009  Accuracy: valset=0.9078
Saving Best Model at epoch 2
Step 650/3755: Training Loss=0.12490
Step 700/3755: Training Loss=0.28069
Step 750/3755: Training Loss=0.24333
Step 800/3755: Training Loss=0.25858
Step 850/3755: Training Loss=0.26847
Step 900/3755: Training Loss=0.25094
Epoch 3 | Training Loss: 0.0008  Accuracy: valset=0.9078
Step 950/3755: Training Loss=0.04752
Step 1000/3755: Training Loss=0.22230
Step 1050/3755: Training Loss=0

In [12]:
# Saving bert variant excluding bert part as it's too large

state_dict = trainer.model.state_dict()
state_dict_copy = state_dict.copy()

layer_keys_to_remove = [key for key in state_dict_copy if 'embedding_layer' in key]

for key in layer_keys_to_remove:
    del state_dict_copy[key]

torch.save(state_dict_copy, 'gru_rnn_bert_representation_best.pt')

In [None]:
state_dict = torch.load('gru_rnn_bert_representation_best.pt')
trainer.model.load_state_dict(state_dict, strict=False)