# Sentiment Analysis with Embedding bag
We will use MLP to perfomr sentiment analysis for Twitter data. Instead of using TF-IDF or other features as input, we will convert each word/token of the text into a word embedding and average all embeddings from the same tweet. 

# Prepare data

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_data_path = "datasets/twitter_sentiment_analysis/twitter_training.csv"
train_data = pd.read_csv(train_data_path,header=None)
train_data.columns = ["Tweet_ID","entity","sentiment","Tweet_content"]

test_data_path = "datasets/twitter_sentiment_analysis/twitter_validation.csv"
test_data = pd.read_csv(test_data_path,header=None)
test_data.columns = ["Tweet_ID","entity","sentiment","Tweet_content"]

In [3]:
## Inlcude Only "Positive" and "Negatvie" twitts to form a binary classification problem
## Label Positve as 1 and Negative as 0
train_data = train_data[train_data.sentiment.isin(["Positive","Negative"])]
train_data["label"] = train_data.sentiment.map({"Positive":1, "Negative":0})
test_data = test_data[test_data.sentiment.isin(["Positive","Negative"])]
test_data["label"] = test_data.sentiment.map({"Positive":1, "Negative":0})

## Prepare data processing pipeline

In [78]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
##device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")  ## Only works on M-Series Mac
device = torch.device("cpu") ## currently mps does not support EmbeddingBag

In [10]:
tokenizer = get_tokenizer("basic_english")
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)
vocab = build_vocab_from_iterator(yield_tokens(train_data["Tweet_content"].map(str)), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [11]:
vocab(["hello", "world"])

[716, 216]

In [63]:
class TwitterDataset:
    def __init__(self, texts, label):
        self.texts = texts
        self.label = label

    def __len__(self):
        return len(self.label)
    def __getitem__(self, idx):
        return self.texts[idx], self.label[idx]

train_dataset = TwitterDataset(train_data['Tweet_content'].map(str).values,train_data["label"].values)
test_dataset = TwitterDataset(test_data['Tweet_content'].map(str).values,test_data["label"].values)

In [105]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for  _text, _label in batch:
        label_list.append(_label)
        processed_text = torch.tensor(vocab(tokenizer(_text)), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list,dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list=  torch.cat(text_list)
    return label_list, text_list, offsets

batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle=True, collate_fn = collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size = batch_size, shuffle=False, collate_fn = collate_batch)

## Create model

In [66]:
from torch import nn

In [111]:
class TextClassification(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
    def forward(self,x,offsets):
        embeded = self.embedding(x, offsets)
        return self.fc(embeded)
        

In [112]:
model = TextClassification(len(vocab), embed_dim = 64, num_class = 2).to(device)

## Create Train step

In [113]:
def train(dataloader, model, loss_fn, optimizer, print_per_batches=50):
    size = len(dataloader.dataset)
    model.train()
    for batch, (y, X, o) in enumerate(dataloader):
        X, y, o = X.to(device), y.to(device), o.to(device)

        pred = model(X, o)
        loss = loss_fn(pred,y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % print_per_batches ==0:
            loss, current = loss.item(), batch*len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for batch, (y, X, o) in enumerate(dataloader):
            X, y, o = X.to(device), y.to(device), o.to(device)
            pred = model(X, o)
            loss = loss_fn(pred,y)
            test_loss += loss.item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

# Train model

In [114]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-2)

In [115]:
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!") 

Epoch 1
-------------------------------
loss: 0.699312  [    0/43374]
loss: 0.604927  [69850/43374]
loss: 0.435660  [138400/43374]
loss: 0.577339  [275850/43374]
loss: 0.338536  [294400/43374]
loss: 0.405945  [359250/43374]
loss: 0.236121  [392100/43374]
loss: 0.230910  [455000/43374]
loss: 0.287862  [517600/43374]
loss: 0.300403  [634500/43374]
loss: 0.285999  [748000/43374]
loss: 0.304240  [771650/43374]
loss: 0.335285  [1026600/43374]
loss: 0.367649  [1062750/43374]
Test Error: 
 Accuracy: 96.7%, Avg loss: 0.122118 

Epoch 2
-------------------------------
loss: 0.204199  [    0/43374]
loss: 0.167257  [84300/43374]
loss: 0.155036  [139000/43374]
loss: 0.162015  [187950/43374]
loss: 0.178462  [309800/43374]
loss: 0.119915  [322500/43374]
loss: 0.184415  [434400/43374]
loss: 0.137225  [536900/43374]
loss: 0.322225  [609600/43374]
loss: 0.153902  [644850/43374]
loss: 0.126318  [805000/43374]
loss: 0.219035  [750750/43374]
loss: 0.267717  [939000/43374]
loss: 0.335995  [937950/43374]
Te

# Using nn.Embedding instead of nn.EmbeddingBag
From previous implementation, we see a warning message that `nn.EmbeddingBad` currently does not support a mps backend. We will imlementation the model using `nn.Embedding`

In [126]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.transforms import VocabTransform, ToTensor

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")  ## Only works on M-Series Mac

In [133]:
tokenizer = get_tokenizer("basic_english")
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)
vocab = build_vocab_from_iterator(yield_tokens(train_data["Tweet_content"].map(str)), specials=["<unk>", "<pad>"])
vocab.set_default_index(vocab["<unk>"])
totensor = ToTensor(padding_value=vocab["<pad>"])

In [152]:
## We redefine the collate function so that it will do dynamic padding
def collate_batch(batch):
    label_list, text_list = [], []
    for  _text, _label in batch:
        label_list.append(_label)
        processed_text = vocab(tokenizer(_text))
        text_list.append(processed_text)
    label_list = torch.tensor(label_list,dtype=torch.int64)
    text_list=  totensor(text_list)
    return  text_list, label_list

batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, shuffle=True, collate_fn = collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size = batch_size, shuffle=False, collate_fn = collate_batch)

## Create Model with `nn.Embedding`

In [147]:
from torch import nn
class TextClassification(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class, padding_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
    def forward(self,x):
        embeded = self.embedding(x)
        embeded_mean = embeded.mean(dim=1)
        return self.fc(embeded_mean)

In [148]:
model = TextClassification(len(vocab), embed_dim = 64, num_class = 2, padding_idx=vocab["<pad>"]).to(device)

## Training

In [149]:
def train(dataloader, model, loss_fn, optimizer, print_per_batches=50):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y  = X.to(device), y.to(device)

        pred = model(X)
        loss = loss_fn(pred,y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % print_per_batches ==0:
            loss, current = loss.item(), batch*len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)
            pred = model(X)
            loss = loss_fn(pred,y)
            test_loss += loss.item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [153]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)

In [154]:
epochs = 20
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!") 

Epoch 1
-------------------------------
loss: 0.050813  [    0/43374]
loss: 0.076309  [ 3200/43374]
loss: 0.148553  [ 6400/43374]
loss: 0.087332  [ 9600/43374]
loss: 0.019943  [12800/43374]
loss: 0.018043  [16000/43374]
loss: 0.058363  [19200/43374]
loss: 0.031814  [22400/43374]
loss: 0.120483  [25600/43374]
loss: 0.080587  [28800/43374]
loss: 0.212798  [32000/43374]
loss: 0.026256  [35200/43374]
loss: 0.050686  [38400/43374]
loss: 0.127219  [41600/43374]
Test Error: 
 Accuracy: 97.8%, Avg loss: 0.098561 

Epoch 2
-------------------------------
loss: 0.164729  [    0/43374]
loss: 0.113378  [ 3200/43374]
loss: 0.074571  [ 6400/43374]
loss: 0.078975  [ 9600/43374]
loss: 0.063279  [12800/43374]
loss: 0.034681  [16000/43374]
loss: 0.040806  [19200/43374]
loss: 0.057081  [22400/43374]
loss: 0.082453  [25600/43374]
loss: 0.088137  [28800/43374]
loss: 0.113274  [32000/43374]
loss: 0.061517  [35200/43374]
loss: 0.102048  [38400/43374]
loss: 0.081739  [41600/43374]
Test Error: 
 Accuracy: 97.8