# Sentiment Analysis
We will use the sequential (word-order) information in each tweet to perform sentiment analysis, modelling it with a recurrent neural network (RNN).

# Prepare data

In [31]:
import numpy as np
import pandas as pd

In [32]:
# Locations of the training and validation splits.
train_data_path = "datasets/twitter_sentiment_analysis/twitter_training.csv"
test_data_path = "datasets/twitter_sentiment_analysis/twitter_validation.csv"

# Both files are read with header=None, so the column names are assigned by hand.
train_data = pd.read_csv(train_data_path, header=None)
test_data = pd.read_csv(test_data_path, header=None)

train_data.columns = ["Tweet_ID", "entity", "sentiment", "Tweet_content"]
test_data.columns = ["Tweet_ID", "entity", "sentiment", "Tweet_content"]

In [33]:
## Include only "Positive" and "Negative" tweets to form a binary classification problem
## Label Positive as 1 and Negative as 0
## `.assign` adds the label column on a new frame rather than writing into the
## filtered slice, which avoids pandas' SettingWithCopyWarning.
train_data = (
    train_data[train_data.sentiment.isin(["Positive", "Negative"])]
    .assign(label=lambda frame: frame.sentiment.map({"Positive": 1, "Negative": 0}))
)
test_data = (
    test_data[test_data.sentiment.isin(["Positive", "Negative"])]
    .assign(label=lambda frame: frame.sentiment.map({"Positive": 1, "Negative": 0}))
)

## Prepare data processing pipeline

In [34]:
import torch
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.transforms import VocabTransform, ToTensor

# Device selection. The MPS line below is kept for reference but is disabled,
# so `device` is always the CPU.
#device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")  ## Only works on M-Series Mac
device = torch.device("cpu")

In [35]:
class TwitterDataset:
    """Indexable collection of ``(text, label)`` pairs.

    ``texts`` and ``label`` are stored as given and indexed with the same
    position; the dataset length is taken from ``label``.
    """

    def __init__(self, texts, label):
        self.texts, self.label = texts, label

    def __len__(self):
        """Return the number of labels held."""
        n_items = len(self.label)
        return n_items

    def __getitem__(self, idx):
        """Return the ``(text, label)`` tuple stored at position ``idx``."""
        sample = (self.texts[idx], self.label[idx])
        return sample

# Wrap each split: tweet text coerced to `str`, paired with its label array.
train_dataset = TwitterDataset(
    texts=train_data["Tweet_content"].map(str).values,
    label=train_data["label"].values,
)
test_dataset = TwitterDataset(
    texts=test_data["Tweet_content"].map(str).values,
    label=test_data["label"].values,
)

In [36]:
# Tokenizer shared by the vocabulary builder and the collate function.
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    """Lazily yield the token list of every text in ``data_iter``."""
    yield from map(tokenizer, data_iter)


# The vocabulary is built from the training tweets only; "<unk>" and "<pad>"
# are registered as special tokens.
vocab = build_vocab_from_iterator(
    yield_tokens(train_data["Tweet_content"].map(str)),
    specials=["<unk>", "<pad>"],
)
# Tokens missing from the vocabulary map to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])
# Padding value used when a batch of id lists is turned into one tensor.
totensor = ToTensor(padding_value=vocab["<pad>"])

In [37]:
## We redefine the collate function so that it will do dynamic padding
def collate_batch(batch):
    """Turn a list of ``(text, label)`` samples into a pair of tensors.

    Each text is tokenized and mapped to vocabulary indices, then the
    per-sample id lists are handed to ``totensor`` (configured with the
    "<pad>" index as its padding value) to form a single tensor.

    Returns:
        ``(token_ids, labels)`` where ``labels`` is a ``torch.int64`` tensor.
    """
    labels = [sample_label for _, sample_label in batch]
    token_ids = [vocab(tokenizer(sample_text)) for sample_text, _ in batch]
    return totensor(token_ids), torch.tensor(labels, dtype=torch.int64)

batch_size = 32

# Only the training split is shuffled; both splits are batched through collate_batch.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)

In [38]:
# Pull one batch to sanity-check the tensor shapes produced by the collate function.
X, y = next(iter(train_dataloader))
print(X.shape, y.shape, sep="\n")

torch.Size([32, 53])
torch.Size([32])


# Create Model

In [39]:
from torch import nn

In [43]:
# Stand-alone layers used to inspect tensor shapes before building the classifier.
embedding = nn.Embedding(
    num_embeddings=len(vocab),
    embedding_dim=64,
    padding_idx=vocab["<pad>"],
)
rnn = nn.RNN(input_size=64, hidden_size=64, batch_first=True)


In [49]:
X_embeded = embedding(X)
out, h = rnn(X_embeded)
# Printed in order: embedded batch, per-step RNN outputs, final hidden state.
print(X_embeded.size(), out.size(), h.size(), sep="\n")

torch.Size([32, 53, 64])
torch.Size([32, 53, 64])
torch.Size([1, 32, 64])


In [51]:
# Size of the RNN output taken at the last time step of every sequence in the batch.
out[:, -1].size()

torch.Size([32, 64])

In [68]:
# Shape of the sample batch of token ids drawn from train_dataloader.
X.shape

torch.Size([32, 53])

In [101]:
class RNNClassifier(nn.Module):
    """Bidirectional RNN text classifier: embedding -> RNN -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each token embedding.
        hidden_size: number of hidden units per RNN direction.
        num_layers: number of stacked RNN layers.
        num_class: number of output logits.
        padding_idx: index of the padding token, or ``None`` (default).
            When given, the padding embedding is excluded from gradient
            updates and ``forward`` packs the batch so padded positions are
            never fed to the RNN. Padding is assumed to be trailing
            (right-padded batches). When ``None`` the original behaviour is
            kept: every position, padding included, is processed.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers=1, num_class=2, padding_idx=None):
        super().__init__()
        self.hidden_size = hidden_size
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.rnn = nn.RNN(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )
        # Forward and backward summaries are concatenated, hence 2 * hidden_size.
        self.fc = nn.Linear(2 * hidden_size, num_class)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, num_class)`` logits."""
        embeded = self.embedding(x)

        if self.padding_idx is None:
            # Legacy path: forward direction read at the last position,
            # backward direction read at the first position.
            out, _ = self.rnn(embeded)
            last_out = torch.cat(
                [out[:, -1, :self.hidden_size], out[:, 0, self.hidden_size:]], 1
            )
            return self.fc(last_out)

        # True length of each sequence = number of non-padding tokens.
        # Clamp to 1 because packing rejects zero-length sequences; lengths
        # must live on the CPU for pack_padded_sequence.
        lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embeded, lengths, batch_first=True, enforce_sorted=False
        )
        _, h = self.rnn(packed)
        # h is (num_layers * 2, batch, hidden); the last two entries are the
        # top layer's final forward and backward states.
        last_out = torch.cat([h[-2], h[-1]], dim=1)
        return self.fc(last_out)


In [102]:
# Classifier with 50-dimensional embeddings and 50 hidden units per direction.
model = RNNClassifier(vocab_size=len(vocab), embed_dim=50, hidden_size=50).to(device)

In [103]:
# Smoke test on the sample batch. The batch is moved to the model's device
# first so the cell keeps working if `device` is switched away from the CPU.
model(X.to(device))

tensor([[ 0.3770,  0.0671],
        [ 0.4562, -0.0643],
        [ 0.1653, -0.1817],
        [-0.1681,  0.2097],
        [ 0.1134,  0.0804],
        [-0.0048, -0.3137],
        [ 0.2154,  0.0781],
        [-0.1206,  0.2077],
        [ 0.2965,  0.3851],
        [ 0.2142, -0.1757],
        [ 0.4296,  0.4270],
        [ 0.1423, -0.1823],
        [ 0.1370,  0.1301],
        [-0.1808, -0.2075],
        [ 0.1392, -0.2321],
        [ 0.4007,  0.3022],
        [ 0.4129,  0.3271],
        [-0.0824,  0.0921],
        [-0.0237,  0.2510],
        [-0.0277, -0.0968],
        [ 0.1981,  0.3207],
        [ 0.2888,  0.1027],
        [ 0.1601,  0.2366],
        [ 0.2859,  0.5328],
        [ 0.1161,  0.2492],
        [ 0.3025,  0.2953],
        [ 0.4121,  0.2291],
        [ 0.1557, -0.0648],
        [ 0.1290, -0.0882],
        [ 0.0840,  0.3903],
        [ 0.3541, -0.0856],
        [-0.0587,  0.3592]], grad_fn=<AddmmBackward0>)

In [93]:
# Run the classifier's own embedding and RNN layers on the sample batch to
# inspect the raw RNN output; this rebinds the notebook-level `out` and `h`.
# NOTE(review): this cell's execution count (93) is lower than that of the cells
# defining the model (101/102) — confirm it still runs after Restart & Run All.
out, h=model.rnn(model.embedding(X))

In [97]:
# NOTE(review): index 1 selects the second time step of `out`, although the
# variable is named `last_out`; RNNClassifier.forward reads positions -1 and 0.
# Confirm which index was intended here.
last_out = out[:,1]
last_out

tensor([[ 0.5667, -0.1960, -0.1890,  ..., -0.1727,  0.6085,  0.7286],
        [-0.5245,  0.5047, -0.2435,  ...,  0.3505, -0.2806,  0.7301],
        [-0.6323,  0.6543,  0.0531,  ..., -0.0101, -0.0350,  0.1806],
        ...,
        [ 0.1501, -0.6010, -0.6283,  ...,  0.0503,  0.6643,  0.1472],
        [-0.0785,  0.1759,  0.7491,  ..., -0.2248, -0.8514,  0.2855],
        [ 0.0837,  0.7609,  0.3100,  ...,  0.4098,  0.6401, -0.3209]],
       grad_fn=<SelectBackward0>)

# Training

In [104]:
def train(dataloader, model, loss_fn, optimizer, print_per_batches=200):
    """Run one optimisation pass over ``dataloader``.

    For each batch: forward pass, loss, gradient reset, backward pass and
    optimizer step. On every batch whose index is a multiple of
    ``print_per_batches`` the batch loss is printed together with a progress
    counter (batch index times the batch length) and the dataset size.
    """
    n_samples = len(dataloader.dataset)
    model.train()
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        batch_loss = loss_fn(model(inputs), targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if batch_idx % print_per_batches != 0:
            continue
        seen = batch_idx * len(inputs)
        print(f"loss: {batch_loss.item():>7f}  [{seen:>5d}/{n_samples:>5d}]")

def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean batch loss.

    Runs in eval mode with gradients disabled. Accuracy is the number of
    correct argmax predictions divided by the dataset size; the loss is
    averaged over the number of batches.
    """
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    model.eval()
    total_loss = 0
    n_correct = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            logits = model(inputs)
            total_loss += loss_fn(logits, targets).item()
            hits = logits.argmax(1) == targets
            n_correct += hits.type(torch.float).sum().item()
    test_loss = total_loss / n_batches
    correct = n_correct / n_samples
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    

# Cross-entropy loss on the class logits, optimised with Adam.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [105]:
# One manual optimisation step on the sample batch (forward, loss, backward, update).
# This rebinds the notebook-level `out` and `loss`.
# NOTE(review): this changes the model weights before the epoch loop runs —
# confirm the cell is meant to stay in the final notebook.
out = model(X.to(device))
loss = loss_fn(out,y.to(device))
optimizer.zero_grad()
loss.backward()
optimizer.step()

In [106]:
# Train for a fixed number of epochs, evaluating on the validation split after each one.
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(
        dataloader=train_dataloader,
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
    )
    test(dataloader=test_dataloader, model=model, loss_fn=loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.686592  [    0/43374]




loss: 0.649093  [ 6400/43374]
loss: 0.504311  [12800/43374]
loss: 0.384988  [19200/43374]
loss: 0.495392  [25600/43374]
loss: 0.362439  [32000/43374]
loss: 0.553041  [38400/43374]
Test Error: 
 Accuracy: 85.8%, Avg loss: 0.342222 

Epoch 2
-------------------------------
loss: 0.375411  [    0/43374]
loss: 0.443304  [ 6400/43374]
loss: 0.236797  [12800/43374]
loss: 0.517483  [19200/43374]
loss: 0.407685  [25600/43374]
loss: 0.530804  [32000/43374]
loss: 0.425254  [38400/43374]
Test Error: 
 Accuracy: 92.6%, Avg loss: 0.188746 

Epoch 3
-------------------------------
loss: 0.188607  [    0/43374]
loss: 0.197598  [ 6400/43374]
loss: 0.310955  [12800/43374]
loss: 0.315216  [19200/43374]
loss: 0.394037  [25600/43374]
loss: 0.133170  [32000/43374]
loss: 0.169238  [38400/43374]
Test Error: 
 Accuracy: 96.3%, Avg loss: 0.133513 

Epoch 4
-------------------------------
loss: 0.183419  [    0/43374]
loss: 0.157808  [ 6400/43374]
loss: 0.473493  [12800/43374]
loss: 0.132162  [19200/43374]
loss