## Sentiment Analysis with Pretrained Embedding
### (Embedding, Linear Layer, Embedding Parameters set to trainable)

### 1. Preparing Data

In [2]:
import torch
import torchtext
from torchtext.datasets import IMDB
from torch.utils.data import  DataLoader
from torchtext.data import utils
from torchtext import vocab
from torchtext.data import functional
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import StepLR
import torch.nn as nn
from tqdm import tqdm
import functools
import sys

#### Build Vocabulary

In [3]:
# Load Dataset
train_iter, test_iter = IMDB()

tokenizer = utils.get_tokenizer("basic_english")

def yield_tokens(text_iter):
    for _, text in text_iter:
        yield tokenizer(text)
        
special_tokens = ["<unk>", "<pad>"]
        
vocabulary = vocab.build_vocab_from_iterator(yield_tokens(train_iter),
                                            min_freq=1,
                                            specials=special_tokens)
vocabulary.set_default_index(vocabulary["<unk>"])

#### Build Dataset and Vocabulary

In [4]:
text_pipeline = lambda x : vocabulary(tokenizer(x))
label_pipeline = lambda x: 0. if x=='neg' else 1.
BATCH_SIZE = 128

# Load Dataset
train_iter, test_iter = IMDB()

train_dataset, test_dataset = functional.to_map_style_dataset(train_iter), \
                                functional.to_map_style_dataset(test_iter)
num_test = int(len(test_dataset)*0.90)
split_test, split_valid = random_split(test_dataset, [num_test, len(test_dataset)-num_test])

def collate_batch(batch, pad_index):
    label_list, text_list = [], []
    for (label, text) in batch:
        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_list.append(processed_text)
        label_list.append(label_pipeline(label))
    seq_list = pad_sequence(text_list, batch_first=True, padding_value=pad_index)
    return seq_list, torch.tensor(label_list)

pad_index = vocabulary["<pad>"]
collate_batch = functools.partial(collate_batch, pad_index=pad_index)

train_loader = DataLoader(dataset=train_dataset, 
                         batch_size=BATCH_SIZE,
                         shuffle=True,
                         collate_fn=collate_batch)
valid_loader = DataLoader(dataset=split_valid, 
                         batch_size=BATCH_SIZE,
                         shuffle=False,
                         collate_fn=collate_batch)
test_loader = DataLoader(dataset=split_test, 
                         batch_size=BATCH_SIZE,
                         shuffle=False,
                         collate_fn=collate_batch)

### 2. Define Model

In [5]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, output_dim, pad_index):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_index)
        self.fc = nn.Linear(embed_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self,X):
        embedded = self.embedding(X)
        pooled = embedded.mean(dim=1)
        out = self.fc(pooled)
        return self.sigmoid(out)

### 3. Build and Train Model

#### Define Hyperparamters

In [10]:
EMBED_DIM = 300
vocab_size = len(vocabulary)
OUTPUT_DIM = 1
pad_index = vocabulary['<pad>']

text_classifier = TextClassifier(vocab_size, EMBED_DIM, OUTPUT_DIM, pad_index)        

#### Use Pretrained Embedding Vectors and freeze it 

In [11]:
vectors = torchtext.vocab.GloVe(name='840B', dim=300)
pretrained_embedding = vectors.get_vecs_by_tokens(vocabulary.get_itos())
text_classifier.embedding.weight.data = pretrained_embedding
text_classifier.embedding.weight.requires_grad = False

#### Train Model

In [12]:
def train(dataloader, model):
    for texts, labels in tqdm(dataloader, desc='training...', file=sys.stdout):
        optimizer.zero_grad()
        outputs = model(texts)
        outputs = outputs.reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
def evaluate(dataloader, model):
    n_samples, n_accurates = 0, 0
    with torch.no_grad():
        for texts, labels in dataloader:
            outputs = model(texts)
            outputs = outputs.reshape(-1)
            n_samples += labels.size(0)
            n_accurates += (torch.round(outputs)==labels).sum().item()
    return n_accurates/n_samples

In [13]:
N_EPOCHS = 20
LR = 0.05

# Criterion, Optimizer, learning rate scheduler
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(text_classifier.parameters(), lr=LR)
scheduler = StepLR(optimizer, step_size=4, gamma=0.8)

for epoch in range(1, N_EPOCHS+1):
    train(train_loader, text_classifier)
    accu_train = evaluate(train_loader, text_classifier)
    accu_val = evaluate(valid_loader, text_classifier)
    scheduler.step()
    print(f"| Epoch: {epoch}/{N_EPOCHS} | train_accuracy: {accu_train: .3f} | val_accuracy :  {accu_val: .3f}")
    
    # Test with test set
accu_test = evaluate(test_loader, text_classifier)
print('='*60)
print(f"Test Accuracy: {accu_test: .3f}")

training...: 100%|███████████████████████████████████████████████████████████████████| 196/196 [00:25<00:00,  7.63it/s]
| Epoch: 1/20 | train_accuracy:  0.802 | val_accuracy :   0.790
training...: 100%|███████████████████████████████████████████████████████████████████| 196/196 [00:26<00:00,  7.35it/s]
| Epoch: 2/20 | train_accuracy:  0.822 | val_accuracy :   0.815
training...: 100%|███████████████████████████████████████████████████████████████████| 196/196 [00:29<00:00,  6.74it/s]
| Epoch: 3/20 | train_accuracy:  0.833 | val_accuracy :   0.820
training...: 100%|███████████████████████████████████████████████████████████████████| 196/196 [00:26<00:00,  7.43it/s]
| Epoch: 4/20 | train_accuracy:  0.836 | val_accuracy :   0.821
training...: 100%|███████████████████████████████████████████████████████████████████| 196/196 [00:29<00:00,  6.65it/s]
| Epoch: 5/20 | train_accuracy:  0.841 | val_accuracy :   0.830
training...: 100%|██████████████████████████████████████████████████████████████

In [14]:
def predict_sentiment(text, model, tokenizer, vocab):
    tokens = tokenizer(text)
    ids = [vocab[t] for t in tokens]
    tensor = torch.LongTensor(ids).unsqueeze(dim=0)
    prediction = model(tensor)
    prediction = torch.round(prediction).item()
    predicted_polarity = "pos" if prediction==1 else "neg"
    return predicted_polarity


In [15]:
text = "This film is great!"

predict_sentiment(text, text_classifier, tokenizer, vocabulary)

'pos'

In [16]:
text = "This film is not great, it's terrible! i didn't like it"

predict_sentiment(text, text_classifier, tokenizer, vocabulary)

'neg'

In [2]:
import torchtext

vectors = torchtext.vocab.FastText(language='de')

.vector_cache\wiki.de.vec: 5.97GB [09:25, 10.6MB/s]                                                                    
  0%|                                                                                      | 0/2275233 [00:00<?, ?it/s]Skipping token b'2275233' with 1-dimensional vector [b'300']; likely a header
100%|██████████████████████████████████████████████████████████████████████| 2275233/2275233 [08:24<00:00, 4506.57it/s]
