## Sentiment Analayis with RNN

### 1. Preparing Data

In [154]:
import torch
from torchtext import datasets
from torch.utils.data import  DataLoader
from torchtext.data import utils
from torchtext import vocab
from torchtext.data import functional
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
from tqdm import tqdm

#### Build Vocabulary

In [91]:
# Load Dataset
train_iter, test_iter = datasets.IMDB()

tokenizer = utils.get_tokenizer("basic_english")

def yield_tokens(text_iter):
    for _, text in text_iter:
        yield tokenizer(text)
        
vocabulary = vocab.build_vocab_from_iterator(yield_tokens(train_iter),
                                            min_freq=1,
                                            specials=["<unk>"])
vocabulary.set_default_index(vocabulary["<unk>"])

#### Build Dataset and Dataloader

In [98]:
text_pipeline = lambda x : vocabulary(tokenizer(x))
label_pipeline = lambda x: 0. if x=='neg' else 1.
BATCH_SIZE = 100

# Load Dataset
train_iter, test_iter = datasets.IMDB()

train_dataset, test_dataset = functional.to_map_style_dataset(train_iter), \
                                functional.to_map_style_dataset(test_iter)
num_test = int(len(test_dataset)*0.90)
split_test, split_valid = random_split(test_dataset, [num_test, len(test_dataset)-num_test])

def collate_batch(batch):
    label_list, text_list = [], []
    for (label, text) in batch:
        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_list.append(processed_text)
        label_list.append(label_pipeline(label))
    seq_list = pad_sequence(text_list, batch_first=True, padding_value=0)
    return seq_list, torch.tensor(label_list)



train_loader = DataLoader(dataset=train_dataset, 
                         batch_size=BATCH_SIZE,
                         shuffle=True,
                         collate_fn=collate_batch)
valid_loader = DataLoader(dataset=split_valid, 
                         batch_size=BATCH_SIZE,
                         shuffle=False,
                         collate_fn=collate_batch)
test_loader = DataLoader(dataset=split_test, 
                         batch_size=BATCH_SIZE,
                         shuffle=False,
                         collate_fn=collate_batch)

### 2. Define Model

In [168]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, output_dim, num_layers=1):
        super(TextClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.RNN(embed_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, X):
        embedded = self.embedding(X)
        h0 = torch.zeros(self.num_layers, X.size(0), self.hidden_size)
        output, hidden = self.rnn(embedded, h0)
        output = output[:, -1, :]
        output = self.fc(output)
        return self.sigmoid(output)

### 3. Train Model

In [169]:
def train(dataloader, model):
    n_samples, n_accurates = 0, 0
    total_batches = len(dataloader)
    for idx, (text, label) in enumerate(dataloader):
        optimizer.zero_grad()
        outputs = model(text)
        outputs = outputs.type(torch.FloatTensor).reshape(-1)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

            
def evaluate(dataloader, model):
    n_samples, n_accurates = 0, 0
    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            outputs = model(text)
            outputs = outputs.reshape(-1).type(torch.FloatTensor)
            label = label.reshape(-1).type(torch.LongTensor)
            n_accurates += (torch.round(outputs) == label).sum().item()
            n_samples += label.size(0)
    return n_accurates/n_samples

In [172]:
# Define some hyperparameters
LR = 0.0001
EPOCHS = 5
output_dim = 1
vocab_size = len(vocabulary)
embed_dim = 64
hidden_size = 50
text_classifier = TextClassifier(vocab_size, embed_dim, hidden_size, output_dim)

# Criterion, Optimizer, learning rate scheduler
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(text_classifier.parameters(), lr=LR)

for epoch in range(1, EPOCHS + 1):
    train(train_loader, text_classifier)
    accu_train = evaluate(train_loader, text_classifier)
    accu_val = evaluate(valid_loader, text_classifier)
    print(f"| Epoch: {epoch} | train_accuracy: {accu_train: .3f} | val_accuracy :  {accu_val: .3f}")

# Test with test set
accu_test = evaluate(test_loader, text_classifier)
print(f"Test Accuracy: {accu_test: .3f}")

  return torch.tensor(seq_list), torch.tensor(label_list)


| epoch: 1 | batches: 125/250 | train_accuracy:  50.424
| epoch: 1 | batches: 250/250 | train_accuracy:  50.188
| End of Epoch: 1 | train_accuracy:  0.520 | val_accuracy :   0.440
| epoch: 2 | batches: 125/250 | train_accuracy:  49.960
| epoch: 2 | batches: 250/250 | train_accuracy:  49.948
| End of Epoch: 2 | train_accuracy:  0.510 | val_accuracy :   0.550
| epoch: 3 | batches: 125/250 | train_accuracy:  50.256
| epoch: 3 | batches: 250/250 | train_accuracy:  49.960
| End of Epoch: 3 | train_accuracy:  0.540 | val_accuracy :   0.550
| epoch: 4 | batches: 125/250 | train_accuracy:  50.048
| epoch: 4 | batches: 250/250 | train_accuracy:  49.772
| End of Epoch: 4 | train_accuracy:  0.610 | val_accuracy :   0.560
| epoch: 5 | batches: 125/250 | train_accuracy:  50.552
| epoch: 5 | batches: 250/250 | train_accuracy:  50.028
| End of Epoch: 5 | train_accuracy:  0.530 | val_accuracy :   0.450
Test Accuracy:  0.520
