# Sentiment Analysis with RNN
We will incorporate sequential information into sentiment analysis by using recurrent neural networks (RNNs).

# Prepare data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Both CSV files are read with header=None (the first row is treated as data),
# so the column names are assigned explicitly after loading.
column_names = ["Tweet_ID", "entity", "sentiment", "Tweet_content"]

train_data_path = "datasets/twitter_sentiment_analysis/twitter_training.csv"
test_data_path = "datasets/twitter_sentiment_analysis/twitter_validation.csv"

train_data = pd.read_csv(train_data_path, header=None)
train_data.columns = column_names

test_data = pd.read_csv(test_data_path, header=None)
test_data.columns = column_names

In [3]:
# Integer class id for each sentiment string found in the "sentiment" column.
sentiment_to_label = {
    "Negative": 0,
    "Irrelevant": 1,
    "Neutral": 2,
    "Positive": 3,
}
train_data["label"] = train_data["sentiment"].map(sentiment_to_label)
test_data["label"] = test_data["sentiment"].map(sentiment_to_label)

# Display the distinct label values produced for the training split.
train_data["label"].unique()

array([3, 2, 0, 1])

## Prepare data processing pipeline

In [4]:
import torch
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.transforms import VocabTransform, ToTensor

# Device that the models and every batch are moved to. The MPS alternative is kept
# disabled below; per the original note it only works on M-series Macs.
#device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")  ## Only works on M-Series Mac
device = torch.device("cpu")

In [5]:
class TwitterDataset:
    """Map-style dataset that pairs each text with its label.

    Args:
        texts: indexable, sized collection of texts.
        label: indexable, sized collection of labels, aligned with ``texts``.

    Raises:
        ValueError: if ``texts`` and ``label`` do not have the same length.
    """

    def __init__(self, texts, label):
        # Fail fast: a mismatch would otherwise surface as a wrong __len__ or a
        # late IndexError in the middle of an epoch.
        if len(texts) != len(label):
            raise ValueError(
                f"texts and label must have the same length, got {len(texts)} and {len(label)}"
            )
        self.texts = texts
        self.label = label

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.label)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        return self.texts[idx], self.label[idx]

# Tweet contents are passed through str() before being stored as the dataset's texts.
train_dataset = TwitterDataset(
    texts=train_data["Tweet_content"].map(str).values,
    label=train_data["label"].values,
)
test_dataset = TwitterDataset(
    texts=test_data["Tweet_content"].map(str).values,
    label=test_data["label"].values,
)

In [6]:
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    """Lazily yield the token list of every text in ``data_iter``."""
    yield from map(tokenizer, data_iter)


# The vocabulary is built from the training tweets only.
vocab = build_vocab_from_iterator(
    yield_tokens(train_data["Tweet_content"].map(str)),
    specials=["<unk>", "<pad>"],
)
# Tokens missing from the vocabulary resolve to the <unk> index.
vocab.set_default_index(vocab["<unk>"])
# Batches are padded with the index of <pad>.
totensor = ToTensor(padding_value=vocab["<pad>"])

In [7]:
## We redefine the collate function so that it will do dynamic padding
def collate_batch(batch):
    """Turn a list of ``(text, label)`` pairs into padded tensors.

    Each text is tokenized and mapped to vocabulary indices; ``totensor`` then
    pads the index lists into a single tensor using its configured padding value.

    Returns:
        A tuple ``(texts, labels, lengths)`` where ``labels`` and ``lengths`` are
        int64 tensors and ``lengths`` holds the unpadded token count of each text.
    """
    token_ids = [vocab(tokenizer(text)) for text, _ in batch]
    labels = [label for _, label in batch]
    lengths = [len(ids) for ids in token_ids]

    padded_texts = totensor(token_ids)
    label_tensor = torch.tensor(labels, dtype=torch.int64)
    length_tensor = torch.tensor(lengths, dtype=torch.int64)
    return padded_texts, label_tensor, length_tensor

batch_size = 32

# Only the training split is shuffled; the evaluation order stays fixed.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)

In [8]:
# Pull a single batch to sanity-check the shapes of the padded texts and the labels.
X, y, l = next(iter(train_dataloader))
print(X.shape, y.shape, sep="\n")

torch.Size([32, 71])
torch.Size([32])


# Text Classification

In [9]:
from torch import nn

## A wrong model

In [10]:
class RNNClassifier(nn.Module):
    """Embedding -> RNN -> linear classifier that reads the final time step.

    The prediction is taken from the last position of the (padded) sequence
    axis for every row, regardless of how long each text actually is.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each token embedding (also the RNN input size).
        hidden_size: size of the RNN hidden state.
        num_layers: number of stacked RNN layers.
        num_class: number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers=1, num_class=4):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # batch_first=True: inputs and outputs are laid out as (batch, seq, feature).
        self.rnn = nn.RNN(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_class)

    def forward(self, x):
        """Return class scores computed from the RNN output at the last sequence position."""
        token_vectors = self.embedding(x)
        step_outputs, _ = self.rnn(token_vectors)
        final_step = step_outputs[:, -1]
        return self.fc(final_step)


In [11]:
# 50-dimensional embeddings and hidden state; num_layers and num_class keep their defaults.
model = RNNClassifier(vocab_size=len(vocab), embed_dim=50, hidden_size=50).to(device)

## Training

In [12]:
def train(dataloader, model, loss_fn, optimizer, print_per_batches=200):
    """Run one training pass over ``dataloader``.

    Args:
        dataloader: yields ``(texts, labels, lengths)`` batches; the lengths
            are not used by this version because the model takes only the texts.
        model: module called as ``model(texts)``.
        loss_fn: callable applied as ``loss_fn(pred, labels)``.
        optimizer: optimizer stepped once per batch.
        print_per_batches: the loss is printed every this many batches.
    """
    size = len(dataloader.dataset)
    model.train()
    # Running count of samples consumed by earlier batches. Unlike
    # ``batch * len(X)``, this stays correct when a batch is shorter than the rest.
    seen = 0
    for batch, (X, y, _) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % print_per_batches == 0:
            print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")
        seen += len(X)

def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean batch loss.

    The model is switched to eval mode and called as ``model(texts)`` with
    gradients disabled. Accuracy is the share of samples whose arg-max
    prediction equals the label; the loss is averaged over batches.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    loss_sum = 0
    hits = 0
    with torch.no_grad():
        for X, y, l in dataloader:
            X, y, l = X.to(device), y.to(device), l.to(device)
            pred = model(X)
            loss_sum += loss_fn(pred, y).item()
            matches = (pred.argmax(1) == y).type(torch.float)
            hits += matches.sum().item()
    test_loss = loss_sum / num_batches
    correct = hits / size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    

# Cross-entropy on the class scores, optimised with Adam at a learning rate of 1e-3.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [13]:
# Each epoch is one pass over train_dataloader followed by an evaluation on test_dataloader.
epochs = 4
for t in range(epochs):
    # t is zero-based; the printed epoch number is one-based.
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!") 

Epoch 1
-------------------------------
loss: 1.412648  [    0/74682]




loss: 1.369063  [ 6400/74682]
loss: 1.337177  [12800/74682]
loss: 1.411565  [19200/74682]
loss: 1.369592  [25600/74682]
loss: 1.409129  [32000/74682]
loss: 1.340999  [38400/74682]
loss: 1.366333  [44800/74682]
loss: 1.421093  [51200/74682]
loss: 1.354804  [57600/74682]
loss: 1.364854  [64000/74682]
loss: 1.404611  [70400/74682]
Test Error: 
 Accuracy: 27.1%, Avg loss: 1.372565 

Epoch 2
-------------------------------
loss: 1.322476  [    0/74682]
loss: 1.313258  [ 6400/74682]
loss: 1.400803  [12800/74682]
loss: 1.391215  [19200/74682]
loss: 1.308881  [25600/74682]
loss: 1.333135  [32000/74682]
loss: 1.409786  [38400/74682]
loss: 1.461099  [44800/74682]
loss: 1.339707  [51200/74682]
loss: 1.361057  [57600/74682]
loss: 1.376830  [64000/74682]
loss: 1.371366  [70400/74682]
Test Error: 
 Accuracy: 27.3%, Avg loss: 1.375028 

Epoch 3
-------------------------------
loss: 1.397185  [    0/74682]
loss: 1.361613  [ 6400/74682]
loss: 1.350316  [12800/74682]
loss: 1.375150  [19200/74682]
loss: 

We can see that the model above has almost no predictive power. This is because, in each batch, every text is padded to the length of the longest text in that batch. As a result, the last output is oftentimes dominated by the padding tokens.

## A quick fix
Let's try a quick fix: for each text, use the RNN output at its true length (index `l - 1`) instead of the output at the last padded position.

In [14]:
class RNNClassifier(nn.Module):
    """Embedding -> RNN -> linear classifier that reads each row at its own length.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each token embedding (also the RNN input size).
        hidden_size: size of the RNN hidden state.
        num_layers: number of stacked RNN layers.
        num_class: number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers=1, num_class=4):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # batch_first=True: inputs and outputs are laid out as (batch, seq, feature).
        self.rnn = nn.RNN(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_class)

    def forward(self, x, l):
        """Return class scores from the RNN output at position ``l - 1`` of each row.

        Args:
            x: batch of token indices, one row per text.
            l: integer tensor with one length per row of ``x``.
        """
        token_vectors = self.embedding(x)
        step_outputs, _ = self.rnn(token_vectors)
        # Pair every row index with that row's last real position.
        row_index = torch.arange(len(x), device=step_outputs.device)
        last_real_step = step_outputs[row_index, l - 1]
        return self.fc(last_real_step)

In [15]:
# Fresh instance of the length-aware classifier: 50-dim embeddings, 50-dim hidden state.
model = RNNClassifier(vocab_size=len(vocab), embed_dim=50, hidden_size=50).to(device)

In [16]:

def train(dataloader, model, loss_fn, optimizer, print_per_batches=200):
    """Run one training pass over ``dataloader`` for a length-aware model.

    Args:
        dataloader: yields ``(texts, labels, lengths)`` batches.
        model: module called as ``model(texts, lengths)``.
        loss_fn: callable applied as ``loss_fn(pred, labels)``.
        optimizer: optimizer stepped once per batch.
        print_per_batches: the loss is printed every this many batches.
    """
    size = len(dataloader.dataset)
    model.train()
    # Running count of samples consumed by earlier batches. Unlike
    # ``batch * len(X)``, this stays correct when a batch is shorter than the rest.
    seen = 0
    for batch, (X, y, l) in enumerate(dataloader):
        X, y, l = X.to(device), y.to(device), l.to(device)

        pred = model(X, l)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % print_per_batches == 0:
            print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")
        seen += len(X)

def test(dataloader, model, loss_fn):
    """Evaluate a length-aware ``model`` and print accuracy and mean batch loss.

    The model is switched to eval mode and called as ``model(texts, lengths)``
    with gradients disabled. Accuracy is the share of samples whose arg-max
    prediction equals the label; the loss is averaged over batches.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    loss_sum = 0
    hits = 0
    with torch.no_grad():
        for X, y, l in dataloader:
            X, y, l = X.to(device), y.to(device), l.to(device)
            pred = model(X, l)
            loss_sum += loss_fn(pred, y).item()
            matches = (pred.argmax(1) == y).type(torch.float)
            hits += matches.sum().item()
    test_loss = loss_sum / num_batches
    correct = hits / size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    

# The optimizer is rebuilt so that it tracks the parameters of the current model.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [17]:
# Each epoch is one pass over train_dataloader followed by an evaluation on test_dataloader.
epochs = 10
for t in range(epochs):
    # t is zero-based; the printed epoch number is one-based.
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!") 

Epoch 1
-------------------------------
loss: 1.509013  [    0/74682]
loss: 1.263869  [ 6400/74682]
loss: 1.200433  [12800/74682]
loss: 1.301165  [19200/74682]
loss: 1.357305  [25600/74682]
loss: 1.312935  [32000/74682]
loss: 1.127126  [38400/74682]
loss: 1.131188  [44800/74682]
loss: 1.235723  [51200/74682]
loss: 1.347865  [57600/74682]
loss: 1.278798  [64000/74682]
loss: 1.061867  [70400/74682]
Test Error: 
 Accuracy: 52.9%, Avg loss: 1.104585 

Epoch 2
-------------------------------
loss: 1.053307  [    0/74682]
loss: 1.042103  [ 6400/74682]
loss: 1.007339  [12800/74682]
loss: 0.941086  [19200/74682]
loss: 0.805227  [25600/74682]
loss: 0.731336  [32000/74682]
loss: 1.038620  [38400/74682]
loss: 1.019140  [44800/74682]
loss: 0.822154  [51200/74682]
loss: 1.080436  [57600/74682]
loss: 0.916930  [64000/74682]
loss: 1.047602  [70400/74682]
Test Error: 
 Accuracy: 68.3%, Avg loss: 0.826997 

Epoch 3
-------------------------------
loss: 1.084673  [    0/74682]
loss: 1.030566  [ 6400/746

## Use LSTM

In [18]:
class LSTMClassifier(nn.Module):
    """Embedding -> LSTM -> linear classifier that reads the final time step.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each token embedding (also the LSTM input size).
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_class: number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers=1, num_class=4):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # batch_first=True: inputs and outputs are laid out as (batch, seq, feature).
        self.rnn = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_class)

    def forward(self, x, l):
        """Return class scores computed from the LSTM output at the last sequence position.

        ``l`` is accepted so the call signature matches the length-aware training
        loop, but it is not used: the readout is the last position of the padded
        sequence, not position ``l - 1``.
        """
        token_vectors = self.embedding(x)
        step_outputs, _ = self.rnn(token_vectors)
        final_step = step_outputs[:, -1]
        return self.fc(final_step)

In [19]:
# LSTM variant with the same sizes as before: 50-dim embeddings, 50-dim hidden state.
model = LSTMClassifier(vocab_size=len(vocab), embed_dim=50, hidden_size=50).to(device)

def train(dataloader, model, loss_fn, optimizer, print_per_batches=200):
    """Run one training pass over ``dataloader``, passing lengths to the model.

    Args:
        dataloader: yields ``(texts, labels, lengths)`` batches.
        model: module called as ``model(texts, lengths)``.
        loss_fn: callable applied as ``loss_fn(pred, labels)``.
        optimizer: optimizer stepped once per batch.
        print_per_batches: the loss is printed every this many batches.
    """
    size = len(dataloader.dataset)
    model.train()
    # Running count of samples consumed by earlier batches. Unlike
    # ``batch * len(X)``, this stays correct when a batch is shorter than the rest.
    seen = 0
    for batch, (X, y, l) in enumerate(dataloader):
        X, y, l = X.to(device), y.to(device), l.to(device)

        pred = model(X, l)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % print_per_batches == 0:
            print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")
        seen += len(X)

def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean batch loss.

    The model is switched to eval mode and called as ``model(texts, lengths)``
    with gradients disabled. Accuracy is the share of samples whose arg-max
    prediction equals the label; the loss is averaged over batches.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    loss_sum = 0
    hits = 0
    with torch.no_grad():
        for X, y, l in dataloader:
            X, y, l = X.to(device), y.to(device), l.to(device)
            pred = model(X, l)
            loss_sum += loss_fn(pred, y).item()
            matches = (pred.argmax(1) == y).type(torch.float)
            hits += matches.sum().item()
    test_loss = loss_sum / num_batches
    correct = hits / size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    

# The optimizer is rebuilt so that it tracks the parameters of the LSTM model.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [20]:
# Each epoch is one pass over train_dataloader followed by an evaluation on test_dataloader.
epochs = 10
for t in range(epochs):
    # t is zero-based; the printed epoch number is one-based.
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!") 

Epoch 1
-------------------------------
loss: 1.479046  [    0/74682]
loss: 1.319519  [ 6400/74682]
loss: 1.347234  [12800/74682]
loss: 1.306167  [19200/74682]
loss: 1.395151  [25600/74682]
loss: 1.354324  [32000/74682]
loss: 1.346324  [38400/74682]
loss: 1.365904  [44800/74682]
loss: 1.314047  [51200/74682]
loss: 1.377275  [57600/74682]
loss: 1.326208  [64000/74682]
loss: 1.278841  [70400/74682]
Test Error: 
 Accuracy: 45.7%, Avg loss: 1.237679 

Epoch 2
-------------------------------
loss: 1.184323  [    0/74682]
loss: 1.163339  [ 6400/74682]
loss: 1.100607  [12800/74682]
loss: 1.123048  [19200/74682]
loss: 1.102641  [25600/74682]
loss: 1.178702  [32000/74682]
loss: 1.108421  [38400/74682]
loss: 0.956805  [44800/74682]
loss: 0.929813  [51200/74682]
loss: 1.013586  [57600/74682]
loss: 1.097093  [64000/74682]
loss: 1.095808  [70400/74682]
Test Error: 
 Accuracy: 67.6%, Avg loss: 0.835711 

Epoch 3
-------------------------------
loss: 1.036163  [    0/74682]
loss: 0.839168  [ 6400/746

It looks like the LSTM does not suffer from the problem of too many '<pad>' tokens dominating the output, even though this classifier reads the output at the last (padded) time step.