# Sentiment Analysis
We will use the sequential information in the text to do sentiment analysis. This is done with a recurrent neural network (RNN).

# Prepare data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Locations of the training and validation CSV files.
train_data_path = "datasets/twitter_sentiment_analysis/twitter_training.csv"
test_data_path = "datasets/twitter_sentiment_analysis/twitter_validation.csv"

# header=None: the first row of each file is read as data, not as column names.
train_data = pd.read_csv(train_data_path, header=None)
test_data = pd.read_csv(test_data_path, header=None)

# Both frames get the same four column names.
for _frame in (train_data, test_data):
    _frame.columns = ["Tweet_ID", "entity", "sentiment", "Tweet_content"]

In [3]:
## Include only "Positive" and "Negative" tweets to form a binary classification problem
## Label Positive as 1 and Negative as 0
# .copy() makes each filtered frame independent of the frame it was sliced from,
# so adding the "label" column below does not trigger pandas' SettingWithCopyWarning.
train_data = train_data[train_data.sentiment.isin(["Positive","Negative"])].copy()
train_data["label"] = train_data.sentiment.map({"Positive":1, "Negative":0})
test_data = test_data[test_data.sentiment.isin(["Positive","Negative"])].copy()
test_data["label"] = test_data.sentiment.map({"Positive":1, "Negative":0})

## Prepare data processing pipeline

In [4]:
import torch
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.transforms import VocabTransform, ToTensor

# Alternative device selection, currently disabled: use the MPS backend when it is available.
#device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")  ## Only works on M-Series Mac
# All tensors and models in this notebook are placed on the CPU.
device = torch.device("cpu")

In [5]:
class TwitterDataset:
    """Indexable dataset that pairs each tweet text with its label.

    `texts` and `label` are stored as given and are indexed in parallel,
    so they are expected to have the same length.
    """

    def __init__(self, texts, label):
        self.texts, self.label = texts, label

    def __len__(self):
        # The number of samples is taken from the label sequence.
        return len(self.label)

    def __getitem__(self, idx):
        # Return the (text, label) pair stored at position `idx`.
        text = self.texts[idx]
        target = self.label[idx]
        return text, target

# Tweet contents are converted to str element-wise before being wrapped in the dataset.
train_dataset = TwitterDataset(
    texts=train_data["Tweet_content"].map(str).values,
    label=train_data["label"].values,
)
test_dataset = TwitterDataset(
    texts=test_data["Tweet_content"].map(str).values,
    label=test_data["label"].values,
)

In [6]:
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    """Lazily yield the tokenized form of every text in `data_iter`."""
    yield from map(tokenizer, data_iter)


# The vocabulary is built from the training tweets only, with two special tokens.
vocab = build_vocab_from_iterator(
    yield_tokens(train_data["Tweet_content"].map(str)),
    specials=["<unk>", "<pad>"],
)
# Use the index of "<unk>" as the vocabulary's default index.
vocab.set_default_index(vocab["<unk>"])
# Tensor conversion uses the "<pad>" index as its padding value.
totensor = ToTensor(padding_value=vocab["<pad>"])

In [25]:
## We redefine the collate function so that it will do dynamic padding
def collate_batch(batch):
    """Turn a list of (text, label) samples into batch tensors.

    Each text is tokenized and mapped to vocabulary indices; the index lists
    are then handed to `totensor` to form a single tensor.

    Returns:
        A tuple `(text_tensor, labels, lengths)` where `labels` and `lengths`
        are int64 tensors, and `lengths` holds the number of tokens of each
        text before it went through `totensor`.
    """
    token_ids = [vocab(tokenizer(text)) for text, _ in batch]
    labels = torch.tensor([label for _, label in batch], dtype=torch.int64)
    lengths = torch.tensor([len(ids) for ids in token_ids], dtype=torch.int64)
    padded = totensor(token_ids)
    return padded, labels, lengths

batch_size = 32

# Only the training loader shuffles; both loaders batch samples through collate_batch.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)

In [26]:
# Pull a single batch from the training loader and show the shapes of inputs and labels.
X, y, l = next(iter(train_dataloader))
print(X.shape, y.shape, sep="\n")

torch.Size([32, 150])
torch.Size([32])


# Text Classification

In [30]:
from torch import nn

## A wrong model

In [31]:
class RNNClassifier(nn.Module):
    """Embedding -> RNN -> linear classifier that reads the final time step.

    `forward` always classifies from the RNN output at the last position of
    the sequence axis, regardless of how long each individual text is.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector (the RNN input size).
        hidden_size: size of the RNN hidden state.
        num_layers: number of stacked RNN layers.
        num_class: number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers=1, num_class=2):
        super().__init__()
        self.hidden_size = hidden_size
        # No padding_idx is passed to the embedding.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, num_class)

    def forward(self, x):
        """Return class scores computed from the RNN output at the last time step."""
        sequence_out, _ = self.rnn(self.embedding(x))
        # batch_first=True: axis 1 is the sequence axis, so -1 selects its last position.
        final_step = sequence_out[:, -1]
        return self.fc(final_step)


In [32]:
# Embedding size and hidden size are both 50; the model is moved to `device`.
model = RNNClassifier(vocab_size=len(vocab), embed_dim=50, hidden_size=50).to(device)

## Training

In [33]:
def train(dataloader, model, loss_fn, optimizer, print_per_batches=200):
    """Run one optimisation pass over `dataloader`.

    Each batch is moved to `device`, the model is called on the inputs only
    (the lengths are moved to the device but not passed on), and one optimizer
    step is taken. The loss is printed every `print_per_batches` batches.
    """
    size = len(dataloader.dataset)
    model.train()
    for step, (inputs, targets, lengths) in enumerate(dataloader):
        inputs, targets, lengths = inputs.to(device), targets.to(device), lengths.to(device)

        batch_loss = loss_fn(model(inputs), targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if step % print_per_batches != 0:
            continue
        # Progress is reported as batch index times the size of the current batch.
        loss, current = batch_loss.item(), step*len(inputs)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    """Evaluate `model` on `dataloader` and print accuracy and average loss.

    The model is called on the inputs only. Accuracy is the number of correct
    predictions divided by the dataset size; the loss is averaged over batches.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for inputs, targets, lengths in dataloader:
            inputs, targets, lengths = inputs.to(device), targets.to(device), lengths.to(device)
            scores = model(inputs)
            test_loss += loss_fn(scores, targets).item()
            # The predicted class is the index of the largest score in each row.
            hits = scores.argmax(1) == targets
            correct += hits.type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    

# Cross-entropy loss on the class scores; Adam optimizer over all model parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [34]:
epochs = 10
# Each epoch: one training pass followed by one evaluation on the test loader.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(dataloader=train_dataloader, model=model, loss_fn=loss_fn, optimizer=optimizer)
    test(dataloader=test_dataloader, model=model, loss_fn=loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.711299  [    0/43374]




loss: 0.725538  [ 6400/43374]
loss: 0.702321  [12800/43374]
loss: 0.683266  [19200/43374]
loss: 0.703592  [25600/43374]
loss: 0.682076  [32000/43374]
loss: 0.698702  [38400/43374]
Test Error: 
 Accuracy: 52.1%, Avg loss: 0.692388 

Epoch 2
-------------------------------
loss: 0.664274  [    0/43374]
loss: 0.647142  [ 6400/43374]
loss: 0.690406  [12800/43374]
loss: 0.702946  [19200/43374]
loss: 0.687263  [25600/43374]
loss: 0.697123  [32000/43374]
loss: 0.694179  [38400/43374]
Test Error: 
 Accuracy: 49.0%, Avg loss: 0.692663 

Epoch 3
-------------------------------
loss: 0.685992  [    0/43374]
loss: 0.682557  [ 6400/43374]
loss: 0.691424  [12800/43374]
loss: 0.687053  [19200/43374]
loss: 0.690464  [25600/43374]
loss: 0.706798  [32000/43374]
loss: 0.680312  [38400/43374]
Test Error: 
 Accuracy: 48.8%, Avg loss: 0.696416 

Epoch 4
-------------------------------
loss: 0.682889  [    0/43374]
loss: 0.699637  [ 6400/43374]
loss: 0.714963  [12800/43374]
loss: 0.662530  [19200/43374]
loss

We can see that the model above has almost no predictive power. This is because, in each batch, every text is padded to the length of the longest text in that batch. As a result, the last output is often dominated by padding tokens.

## A quick fix
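Let's try a quick fix: for each text, use the RNN output at the position of its last real token (given by the text length) instead of the output at the last, possibly padded, position.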

In [66]:
class RNNClassifier(nn.Module):
    """Embedding -> RNN -> linear classifier that reads the output at each text's own length.

    Unlike taking the last position of the sequence axis, `forward` picks,
    for every row of the batch, the RNN output at index `l - 1`.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector (the RNN input size).
        hidden_size: size of the RNN hidden state.
        num_layers: number of stacked RNN layers.
        num_class: number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers=1, num_class=2):
        super().__init__()
        self.hidden_size = hidden_size
        # No padding_idx is passed to the embedding.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, num_class)

    def forward(self, x, l):
        """Return class scores from the RNN output at position `l - 1` of each row.

        `l` holds one length per row of `x`. A length of 0 would index
        position -1, i.e. the last position of the sequence axis.
        """
        sequence_out, _ = self.rnn(self.embedding(x))
        # Pair every batch row with its own time index.
        rows = torch.arange(x.size(0), device=x.device)
        last_real = sequence_out[rows, l - 1]
        return self.fc(last_real)

In [70]:
# Embedding size and hidden size are both 50; the model is moved to `device`.
model = RNNClassifier(vocab_size=len(vocab), embed_dim=50, hidden_size=50).to(device)

In [71]:

def train(dataloader, model, loss_fn, optimizer, print_per_batches=200):
    """Run one optimisation pass over `dataloader`.

    Each batch is moved to `device`, the model is called with both the inputs
    and their lengths, and one optimizer step is taken. The loss is printed
    every `print_per_batches` batches.
    """
    size = len(dataloader.dataset)
    model.train()
    for step, (inputs, targets, lengths) in enumerate(dataloader):
        inputs, targets, lengths = inputs.to(device), targets.to(device), lengths.to(device)

        batch_loss = loss_fn(model(inputs, lengths), targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if step % print_per_batches != 0:
            continue
        # Progress is reported as batch index times the size of the current batch.
        loss, current = batch_loss.item(), step*len(inputs)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    """Evaluate `model` on `dataloader` and print accuracy and average loss.

    The model is called with both the inputs and their lengths. Accuracy is
    the number of correct predictions divided by the dataset size; the loss
    is averaged over batches.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for inputs, targets, lengths in dataloader:
            inputs, targets, lengths = inputs.to(device), targets.to(device), lengths.to(device)
            scores = model(inputs, lengths)
            test_loss += loss_fn(scores, targets).item()
            # The predicted class is the index of the largest score in each row.
            hits = scores.argmax(1) == targets
            correct += hits.type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    

# Cross-entropy loss on the class scores; Adam optimizer over all model parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [73]:
epochs = 20
# Each epoch: one training pass followed by one evaluation on the test loader.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(dataloader=train_dataloader, model=model, loss_fn=loss_fn, optimizer=optimizer)
    test(dataloader=test_dataloader, model=model, loss_fn=loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.056843  [    0/43374]
loss: 0.124621  [ 6400/43374]
loss: 0.132309  [12800/43374]
loss: 0.155050  [19200/43374]
loss: 0.074953  [25600/43374]
loss: 0.036090  [32000/43374]
loss: 0.088022  [38400/43374]
Test Error: 
 Accuracy: 97.8%, Avg loss: 0.077875 

Epoch 2
-------------------------------
loss: 0.123090  [    0/43374]
loss: 0.058282  [ 6400/43374]
loss: 0.048432  [12800/43374]
loss: 0.214843  [19200/43374]
loss: 0.020524  [25600/43374]
loss: 0.231911  [32000/43374]
loss: 0.023945  [38400/43374]
Test Error: 
 Accuracy: 98.0%, Avg loss: 0.066188 

Epoch 3
-------------------------------
loss: 0.010380  [    0/43374]
loss: 0.035176  [ 6400/43374]
loss: 0.089242  [12800/43374]
loss: 0.072622  [19200/43374]
loss: 0.065983  [25600/43374]
loss: 0.041987  [32000/43374]
loss: 0.085193  [38400/43374]
Test Error: 
 Accuracy: 98.2%, Avg loss: 0.066941 

Epoch 4
-------------------------------
loss: 0.071263  [    0/43374]
loss: 0.154215  [ 6400/4