# Sentiment Analysis with Multilayer Perceptrons
We will apply a vanilla neural network (a multilayer perceptron, MLP) to perform sentiment analysis. Since we are not using any NLP-specific model structures, we have to perform feature engineering on the text data ourselves. We will use TF-IDF vectors as the input to the model.

# Prepare data

In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Both files are read with header=None, so the column names are supplied here.
column_names = ["Tweet_ID", "entity", "sentiment", "Tweet_content"]

train_data_path = "datasets/twitter_sentiment_analysis/twitter_training.csv"
test_data_path = "datasets/twitter_sentiment_analysis/twitter_validation.csv"

train_data = pd.read_csv(train_data_path, header=None)
test_data = pd.read_csv(test_data_path, header=None)

train_data.columns = column_names
test_data.columns = column_names

In [3]:
## Include only "Positive" and "Negative" tweets to form a binary classification problem
## Label Positive as 1 and Negative as 0
## .copy() makes each filtered frame independent of the frame it was sliced from,
## so adding the "label" column does not raise pandas' SettingWithCopyWarning.
train_data = train_data[train_data.sentiment.isin(["Positive","Negative"])].copy()
train_data["label"] = train_data.sentiment.map({"Positive":1, "Negative":0})
test_data = test_data[test_data.sentiment.isin(["Positive","Negative"])].copy()
test_data["label"] = test_data.sentiment.map({"Positive":1, "Negative":0})

## Calculating TF-IDF

In [5]:
## For simplicity we restrict the vocabulary to 5000 features
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
## Fit on the training tweets only: learning the vocabulary and IDF weights from
## the held-out tweets as well would leak test information into the features.
## apply(str) converts any non-string entries to text before tokenisation.
vectorizer.fit(train_data.Tweet_content.apply(str))

In [13]:
## toarray() yields a plain numpy.ndarray; todense() would return the legacy
## numpy.matrix type, whose use NumPy discourages.
train_tfidf = vectorizer.transform(train_data.Tweet_content.apply(str)).toarray()
test_tfidf = vectorizer.transform(test_data.Tweet_content.apply(str)).toarray()

In [14]:
# Show the dimensions of the dense TF-IDF matrix built from the training tweets.
train_tfidf.shape

(43374, 5000)

# Building MLP with torch

## Create dataloader

In [28]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [31]:
class CustomDataset(Dataset):
    """In-memory dataset pairing each feature row with its label.

    Args:
        X: Array-like of features; the first dimension indexes samples.
        y: Array-like of labels; the first dimension indexes samples.

    Raises:
        ValueError: If ``X`` or ``y`` is a scalar, or if they do not hold the
            same number of samples.
    """

    def __init__(self, X, y):
        # torch.tensor copies the data and keeps the input dtype.
        self.X = torch.tensor(X)
        self.y = torch.tensor(y)

        if self.X.dim() == 0 or self.y.dim() == 0:
            raise ValueError("X and y must have at least one dimension")
        # Fail early: a mismatch would otherwise surface as an IndexError in the
        # middle of iteration, or silently ignore the surplus labels.
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {self.X.shape[0]} and {self.y.shape[0]}"
            )

    def __len__(self) -> int:
        """Return the number of samples."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

In [32]:
# Wrap the dense TF-IDF matrices and their labels in torch datasets.
train_labels = train_data.label.to_numpy()
test_labels = test_data.label.to_numpy()
train_dataset = CustomDataset(train_tfidf, train_labels)
test_dataset = CustomDataset(test_tfidf, test_labels)

# Preview the first training sample: its feature vector, then its label.
x, y = train_dataset[0]
print(x, y, sep="\n")

tensor([0., 0., 0.,  ..., 0., 0., 0.], dtype=torch.float64)
tensor(1)


In [33]:
batch_size = 100
## Reshuffle the training samples every epoch so that consecutive batches do not
## follow the row order of the CSV file. The test loader stays sequential.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [34]:
# Pull a single batch to check the shapes the loader produces.
first_batch = next(iter(train_dataloader), None)
if first_batch is not None:
    x, y = first_batch
    print(x.shape, y.shape, sep="\n")

torch.Size([100, 5000])
torch.Size([100])
