# Sentiment Analysis with Multilayer Perceptrons
We will apply the vanilla Neural Network (MLP) to perform sentiment analysis. Since we are not using any NLP specific model structures, we will have to perform feature engineering for the text data. We will use TF-IDF as input for the model.

# Prepare data

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
train_data_path = "datasets/twitter_sentiment_analysis/twitter_training.csv"
train_data = pd.read_csv(train_data_path,header=None)
train_data.columns = ["Tweet_ID","entity","sentiment","Tweet_content"]

test_data_path = "datasets/twitter_sentiment_analysis/twitter_validation.csv"
test_data = pd.read_csv(test_data_path,header=None)
test_data.columns = ["Tweet_ID","entity","sentiment","Tweet_content"]

In [3]:
## Inlcude Only "Positive" and "Negatvie" twitts to form a binary classification problem
## Label Positve as 1 and Negative as 0
train_data = train_data[train_data.sentiment.isin(["Positive","Negative"])]
train_data["label"] = train_data.sentiment.map({"Positive":1, "Negative":0})
test_data = test_data[test_data.sentiment.isin(["Positive","Negative"])]
test_data["label"] = test_data.sentiment.map({"Positive":1, "Negative":0})

## Calculating TF-IDF

In [4]:
vectorizer = TfidfVectorizer(stop_words='english',max_features=5000) ## For simplicity we restrict to only 5000 features
vectorizer.fit(train_data.Tweet_content.apply(str).tolist()+test_data.Tweet_content.apply(str).tolist())

In [36]:
train_tfidf = vectorizer.transform(train_data.Tweet_content.apply(str)).todense()#.toarray().astype("float32")
test_tfidf = vectorizer.transform(test_data.Tweet_content.apply(str)).todense()#.toarray().astype("float32")

In [37]:
train_tfidf

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

# Building MLP with torch

## Create dataloader

In [38]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [50]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.int)

    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self,idx):
        return self.X[idx,], self.y[idx]

In [51]:
train_dataset = CustomDataset(train_tfidf,train_data.label.values)
test_dataset = CustomDataset(test_tfidf,test_data.label.values)
x, y = train_dataset[0]
print(x)
print(y)

tensor([0., 0., 0.,  ..., 0., 0., 0.])
tensor(1, dtype=torch.int32)


In [52]:
batch_size = 100
train_dataloader = DataLoader(train_dataset, batch_size = batch_size)
test_dataloader = DataLoader(test_dataset, batch_size = batch_size)

In [53]:
for x, y in train_dataloader:
    print(x.shape)
    print(y.shape)
    break

torch.Size([100, 5000])
torch.Size([100])


## Creating model

In [54]:
from torch import nn

In [55]:
## We define a MLP with 3 layers
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.LazyLinear(100),
            nn.ReLU(),
            nn.LazyLinear(100),
            nn.ReLU(),
            nn.LazyLinear(2)
        )
    def forward(self,x):
        return self.net(x)

In [56]:
## Use it if on a device with NVIDIA graphical card
## device = "cuda" if torch.cuda.is_available() else "cpu"  
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")  ## Only works on M-Series Mac
print(f"Using {device} device")

Using mps device


In [57]:
model = MLP().to(device)
print(model)

MLP(
  (net): Sequential(
    (0): LazyLinear(in_features=0, out_features=100, bias=True)
    (1): ReLU()
    (2): LazyLinear(in_features=0, out_features=100, bias=True)
    (3): ReLU()
    (4): LazyLinear(in_features=0, out_features=2, bias=True)
  )
)


## Create Training step

In [58]:
def train(dataloader, model, loss_fn, optimizer, print_per_batches=50):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X,y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        pred = model(X)
        loss = loss_fn(pred,y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % print_per_batches ==0:
            loss, current = loss.item(), batch*len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for batch, (X,y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)
            pred = model(X)
            loss = loss_fn(pred,y)
            test_loss += loss.item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

## Training the Model

In [59]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)

In [60]:
model.train()
X, y = next(iter(train_dataloader))
pred = model(X.to(device))
loss = loss_fn(pred, y.to(device))

loss.backward()

In [62]:
epochs = 20
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!") 

Epoch 1
-------------------------------
loss: 0.182291  [    0/43374]
loss: 0.046150  [ 5000/43374]
loss: 0.042056  [10000/43374]
loss: 0.004592  [15000/43374]
loss: 0.012610  [20000/43374]
loss: 0.008226  [25000/43374]
loss: 0.067306  [30000/43374]
loss: 0.041371  [35000/43374]
loss: 0.041528  [40000/43374]
Test Error: 
 Accuracy: 97.4%, Avg loss: 0.130582 

Epoch 2
-------------------------------
loss: 0.180515  [    0/43374]
loss: 0.038677  [ 5000/43374]
loss: 0.033585  [10000/43374]
loss: 0.003766  [15000/43374]
loss: 0.011529  [20000/43374]
loss: 0.008271  [25000/43374]
loss: 0.076177  [30000/43374]
loss: 0.041988  [35000/43374]
loss: 0.034796  [40000/43374]
Test Error: 
 Accuracy: 97.2%, Avg loss: 0.147953 

Epoch 3
-------------------------------
loss: 0.182421  [    0/43374]
loss: 0.036228  [ 5000/43374]
loss: 0.031685  [10000/43374]
loss: 0.003179  [15000/43374]
loss: 0.011233  [20000/43374]
loss: 0.008557  [25000/43374]
loss: 0.060539  [30000/43374]
loss: 0.030053  [35000/433