In [1]:
# Import Embedding Matrix and Embedding Matrix's Train dataset vocab to index dictionary

from utils.file import load_from_local_file

embedding_matrix = load_from_local_file("models/embedding_matrix.pckl")
embedding_matrix_train_dataset_vocab_to_index: dict = load_from_local_file("models/embedding_matrix_train_dataset_vocab_to_index.pckl")

Loading object from local...
Object loaded from local!
Loading object from local...
Object loaded from local!


In [2]:
# Import Dataset

import pandas as pd

train_df = pd.read_csv("datasets/train.csv")
val_df = pd.read_csv("datasets/val.csv")
train_df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [3]:
from torch.utils.data import Dataset, DataLoader
import torch
from utils.text import tokenize_sentence

# Custom Text Dataset
class TextDataset(Dataset):
    def __init__(self, dataframe, max_len):
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Get the sentence and label
        sentence = self.data.iloc[idx, 0]
        label = self.data.iloc[idx, 1]

        # Tokenize sentence
        sentence_tokens: list[str] = tokenize_sentence(sentence)

        # Convert Tokens into indexes used in embeddings layer
        sentence_tokens_indexes = []
        for token in sentence_tokens:
            if token in embedding_matrix_train_dataset_vocab_to_index.keys():
                sentence_tokens_indexes.append(embedding_matrix_train_dataset_vocab_to_index[token])
            else:
                # For OOV words in val and test set
                sentence_tokens_indexes.append(embedding_matrix_train_dataset_vocab_to_index[""])

        # Pad the sentence if it's shorter than max_len, or truncate if it's longer
        if len(sentence_tokens_indexes) < self.max_len:
            sentence_tokens_indexes = sentence_tokens_indexes + [0] * (self.max_len - len(sentence_tokens_indexes)) # Padding with 0
        elif len(sentence_tokens_indexes) > self.max_len:
            sentence_tokens_indexes = sentence_tokens_indexes[:self.max_len] # Truncate to max_len

        # Convert to PyTorch tensors
        sentence = torch.tensor(sentence_tokens_indexes, dtype=torch.long)
        label = torch.tensor(label, dtype=torch.long)

        return sentence, label


[nltk_data] Downloading package punkt to C:\Users\Toh Jing
[nltk_data]     Qiang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to C:\Users\Toh Jing
[nltk_data]     Qiang\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Toh Jing
[nltk_data]     Qiang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
from models.RNN import RNN
import torch.nn as nn
import torch
from solver import train

# Model
model_rnn = RNN(
  embedding_matrix=embedding_matrix,
  hidden_dim=32,
  num_layers=8,
  output_dim=2
)

########################
###### Parameters ######
########################
batch_size = 50
max_epochs = 10_000

# SGD Optimizer
learning_rate = 0.05
optimizer = torch.optim.SGD(model_rnn.parameters(), lr=learning_rate)

# Cross Entropy Loss 
criterion = nn.CrossEntropyLoss()

########################
######## Dataset #######
########################
train_dataset = TextDataset(
  dataframe=train_df,
  max_len=train_df["text"].str.split().apply(len).max()
)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

val_dataset = TextDataset(
  dataframe=val_df,
  max_len=val_df["text"].str.split().apply(len).max()
)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

train(
  model=model_rnn,
  criterion=criterion,
  optimizer=optimizer,
  train_dataloader=train_dataloader,
  val_dataloader=val_dataloader,
  max_epoch=max_epochs
)

Epoch 0 (Train):   0%|          | 0/171 [00:00<?, ?it/s]

Epoch 0 (Train): 100%|██████████| 171/171 [00:10<00:00, 16.86it/s, acc=0.955, loss=0.141]
Epoch 0 (Val): 100%|██████████| 22/22 [00:00<00:00, 48.96it/s, acc=0.515, loss=2.31] 
Epoch 1 (Train): 100%|██████████| 171/171 [00:10<00:00, 17.01it/s, acc=0.914, loss=0.23] 
Epoch 1 (Val): 100%|██████████| 22/22 [00:00<00:00, 49.89it/s, acc=0.515, loss=2.3]  
Epoch 2 (Train): 100%|██████████| 171/171 [00:10<00:00, 16.68it/s, acc=0.926, loss=0.213]
Epoch 2 (Val): 100%|██████████| 22/22 [00:00<00:00, 49.35it/s, acc=0.515, loss=2.33] 
Epoch 3 (Train): 100%|██████████| 171/171 [00:10<00:00, 16.70it/s, acc=0.926, loss=0.208]
Epoch 3 (Val): 100%|██████████| 22/22 [00:00<00:00, 50.78it/s, acc=0.515, loss=2.34] 
Epoch 4 (Train): 100%|██████████| 171/171 [00:10<00:00, 16.89it/s, acc=0.926, loss=0.209]
Epoch 4 (Val): 100%|██████████| 22/22 [00:00<00:00, 38.98it/s, acc=0.515, loss=2.33] 
Epoch 5 (Train): 100%|██████████| 171/171 [00:10<00:00, 16.06it/s, acc=0.926, loss=0.213]
Epoch 5 (Val): 100%|██████████

(RNN(
   (embedding): Embedding(16332, 300)
   (rnn): RNN(300, 32, num_layers=8, batch_first=True)
   (fc): Linear(in_features=32, out_features=2, bias=True)
 ),
 [0.1410138,
  0.22987168,
  0.21258558,
  0.20753047,
  0.208947,
  0.21326254,
  0.21962442,
  0.22728582,
  0.23414059,
  0.23445147,
  0.2353951],
 [0.9549708,
  0.9140351,
  0.925731,
  0.925731,
  0.925731,
  0.925731,
  0.925731,
  0.9198831,
  0.9140351,
  0.9140351,
  0.9140351],
 [2.3114026,
  2.3002942,
  2.32814,
  2.3373911,
  2.33325,
  2.3207536,
  2.3018148,
  2.2754319,
  2.2796082,
  2.2648547,
  2.2679617],
 [0.51545453,
  0.51545453,
  0.51545453,
  0.51545453,
  0.51545453,
  0.51545453,
  0.51545453,
  0.51545453,
  0.51545453,
  0.51545453,
  0.51545453])