# Sentiment Analysis with TextCNN
We will use a CNN over text embeddings to perform text classification. We can think of TextCNN as a deep version of an n-gram model.
See [Section 15.3 of D2L](https://classic.d2l.ai/chapter_natural-language-processing-applications/sentiment-analysis-cnn.html) for more details.

# Prepare data

In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.transforms import VocabTransform, ToTensor

In [7]:
# Both CSV files are read with header=None, so the column names are assigned by hand.
column_names = ["Tweet_ID", "entity", "sentiment", "Tweet_content"]

train_data_path = "datasets/twitter_sentiment_analysis/twitter_training.csv"
test_data_path = "datasets/twitter_sentiment_analysis/twitter_validation.csv"

train_data = pd.read_csv(train_data_path, header=None)
test_data = pd.read_csv(test_data_path, header=None)
for frame in (train_data, test_data):
    frame.columns = list(column_names)

# Show the distinct sentiment strings present in the training split.
train_data["sentiment"].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [10]:
# Integer class id assigned to each sentiment string.
sentiment_to_label = {"Negative": 0, "Irrelevant": 1, "Neutral": 2, "Positive": 3}

# Add the numeric "label" column to both splits using the same mapping.
for frame in (train_data, test_data):
    frame["label"] = frame["sentiment"].map(sentiment_to_label)

# Show the distinct label ids present in the training split.
train_data["label"].unique()

array([3, 2, 0, 1])

In [11]:
class TwitterDataset:
    """Map-style dataset pairing each tweet text with its label.

    Args:
        texts: Sized, indexable collection of tweet texts.
        label: Sized, indexable collection of labels aligned with ``texts``.

    Raises:
        ValueError: If ``texts`` and ``label`` do not have the same length.
    """

    def __init__(self, texts, label):
        # Fail fast: a length mismatch would otherwise only surface later as an
        # IndexError (or silently dropped samples) while iterating.
        if len(texts) != len(label):
            raise ValueError(
                f"texts and label must have the same length, "
                f"got {len(texts)} and {len(label)}"
            )
        self.texts = texts
        self.label = label

    def __len__(self):
        """Return the number of samples."""
        return len(self.label)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        return self.texts[idx], self.label[idx]

# Every tweet is passed through ``str`` so each sample is a string
# (presumably guards against non-string cells such as NaN -- confirm against the CSV).
train_texts = train_data["Tweet_content"].map(str).values
train_labels = train_data["label"].values
train_dataset = TwitterDataset(texts=train_texts, label=train_labels)

test_texts = test_data["Tweet_content"].map(str).values
test_labels = test_data["label"].values
test_dataset = TwitterDataset(texts=test_texts, label=test_labels)

In [12]:
# torchtext's "basic_english" tokenizer, shared by vocabulary building and batching.
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    """Lazily tokenize each text in ``data_iter``, yielding one token list per text."""
    yield from (tokenizer(text) for text in data_iter)


# The vocabulary is built from the training tweets only.
vocab_corpus = train_data["Tweet_content"].map(str)
vocab = build_vocab_from_iterator(
    yield_tokens(vocab_corpus),
    specials=["<unk>", "<pad>"],
)

# Tokens missing from the vocabulary resolve to the "<unk>" index.
unk_index = vocab["<unk>"]
vocab.set_default_index(unk_index)

# Batches are padded with the "<pad>" index.
pad_index = vocab["<pad>"]
totensor = ToTensor(padding_value=pad_index)

In [13]:
## We redefine the collate function so that it will do dynamic padding
def collate_batch(batch):
    """Turn a list of ``(text, label)`` samples into padded tensors.

    Each text is tokenized and mapped to vocabulary ids; ``totensor`` then pads
    the id lists of this batch into a single tensor.

    Returns:
        A ``(token_ids, labels, lengths)`` tuple, where ``labels`` and
        ``lengths`` are int64 tensors and ``lengths`` holds the unpadded token
        count of every sample.
    """
    labels = torch.tensor([label for _, label in batch], dtype=torch.int64)
    id_lists = [vocab(tokenizer(text)) for text, _ in batch]
    padded_ids = totensor(id_lists)
    lengths = torch.tensor([len(ids) for ids in id_lists], dtype=torch.int64)
    return padded_ids, labels, lengths

batch_size = 32

# Only the training split is shuffled; both splits are batched through collate_batch.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)

In [14]:
# Pull a single batch to sanity-check the shapes of the inputs and labels.
first_batch = next(iter(train_dataloader))
X, y, l = first_batch
for tensor in (X, y):
    print(tensor.shape)

torch.Size([32, 55])
torch.Size([32])


# Building textCNN

In [16]:
from torch import nn
# Pick the best available backend: CUDA GPU first, then Apple-Silicon MPS, then CPU.
# ``getattr`` keeps this working on PyTorch builds that have no ``torch.backends.mps``.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")  ## Only available on M-Series Mac
else:
    device = torch.device("cpu")

class TextCNN(nn.Module):
    """Convolutional text classifier with one max-pooled Conv1d branch per kernel width.

    Token ids are embedded, each convolution slides over the sequence axis, its
    output is max-pooled over time, and the concatenated branch features are
    passed through dropout and a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension (input channels of every convolution).
        kernel_sizes: Kernel width of each convolution branch.
        num_channels: Output channels of each branch, aligned with ``kernel_sizes``.
        n_class: Number of output logits.
        dropout: Dropout probability applied to the pooled features.

    Raises:
        ValueError: If ``kernel_sizes`` is empty or its length differs from
            ``num_channels``.
    """

    def __init__(self, vocab_size, embed_size, kernel_sizes, num_channels, n_class=4, dropout=0.5):
        super().__init__()
        kernel_sizes = list(kernel_sizes)
        num_channels = list(num_channels)
        if not kernel_sizes:
            raise ValueError("kernel_sizes must contain at least one kernel width")
        # zip() would silently drop the extras while the decoder is still sized by
        # sum(num_channels), which only fails later inside forward().
        if len(kernel_sizes) != len(num_channels):
            raise ValueError(
                f"kernel_sizes and num_channels must have the same length, "
                f"got {len(kernel_sizes)} and {len(num_channels)}"
            )
        self.n_class = n_class
        # Shortest sequence every convolution can process.
        self.min_seq_len = max(kernel_sizes)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.dropout = nn.Dropout(dropout)
        self.decoder = nn.Linear(sum(num_channels), n_class)
        self.convs = nn.ModuleList(
            [nn.Conv1d(embed_size, c, k) for c, k in zip(num_channels, kernel_sizes)]
        )
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        """Return logits of shape ``(batch, n_class)`` for ``(batch, seq_len)`` token ids."""
        # Conv1d convolves over the last axis, so move the embedding dim to the channel axis.
        embeddings = self.embedding(inputs).permute(0, 2, 1)

        # With per-batch padding a batch can be shorter than the widest kernel,
        # which makes Conv1d raise; right-pad the sequence axis with zeros.
        shortfall = self.min_seq_len - embeddings.size(-1)
        if shortfall > 0:
            embeddings = nn.functional.pad(embeddings, (0, shortfall))

        pooled = [
            torch.squeeze(self.relu(self.pool(conv(embeddings))), dim=-1)
            for conv in self.convs
        ]
        encoding = torch.cat(pooled, dim=1)
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [21]:
# Three convolution branches (kernel widths 3, 4, 5; 20 channels each) over 64-dim embeddings.
model = TextCNN(
    vocab_size=len(vocab),
    embed_size=64,
    kernel_sizes=[3, 4, 5],
    num_channels=[20, 20, 20],
)
model = model.to(device)

# Training

In [22]:
def train(dataloader, model, loss_fn, optimizer, print_per_batches=200):
    """Run one optimization pass over ``dataloader``, printing the loss periodically.

    Inputs and labels are moved to the module-level ``device`` before the
    forward pass.

    Args:
        dataloader: Iterable of ``(X, y, lengths)`` batches exposing a sized
            ``dataset`` attribute (used only for the progress display).
        model: Module called as ``model(X)``.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a tensor that supports
            ``backward()`` and ``item()``.
        optimizer: Optimizer that updates the parameters of ``model``.
        print_per_batches: Print the current loss every this many batches;
            ``0`` disables printing.
    """
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y, _lengths) in enumerate(dataloader):
        # The lengths are not used by the model, so they are not copied to the device.
        X, y = X.to(device), y.to(device)

        pred = model(X)
        batch_loss = loss_fn(pred, y)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Guard the modulo so print_per_batches=0 means "silent" instead of
        # raising ZeroDivisionError.
        if print_per_batches and batch % print_per_batches == 0:
            loss, current = batch_loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader``, print the metrics and return them.

    Inputs and labels are moved to the module-level ``device``; gradients are
    disabled for the whole pass.

    Args:
        dataloader: Sized iterable of ``(X, y, lengths)`` batches exposing a
            sized ``dataset`` attribute.
        model: Module called as ``model(X)``; its output is arg-maxed over
            dimension 1 to obtain the predicted class.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a tensor that supports
            ``item()``.

    Returns:
        ``(accuracy, avg_loss)``: the fraction of correctly classified samples
        and the loss averaged over batches. Both are ``0.0`` when the loader is
        empty.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0.0, 0
    with torch.no_grad():
        for X, y, _lengths in dataloader:
            # The lengths are not used by the model, so they are not copied to the device.
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).sum().item()
    # An empty loader reports zeros instead of raising ZeroDivisionError.
    test_loss = test_loss / num_batches if num_batches else 0.0
    correct = correct / size if size else 0.0
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return correct, test_loss
    

# Cross-entropy loss over the model's class logits, optimized with Adam.
learning_rate = 1e-3
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [23]:
epochs = 10

# Each epoch is one training pass followed by an evaluation on the held-out split.
for t in range(epochs):
    header = f"Epoch {t+1}\n-------------------------------"
    print(header)
    train(dataloader=train_dataloader, model=model, loss_fn=loss_fn, optimizer=optimizer)
    test(dataloader=test_dataloader, model=model, loss_fn=loss_fn)

print("Done!")

Epoch 1
-------------------------------
loss: 1.607651  [    0/74682]
loss: 1.340992  [ 6400/74682]
loss: 1.331274  [12800/74682]
loss: 1.212045  [19200/74682]
loss: 1.295337  [25600/74682]
loss: 1.177739  [32000/74682]
loss: 1.240867  [38400/74682]
loss: 1.367395  [44800/74682]
loss: 1.226619  [51200/74682]
loss: 1.096165  [57600/74682]
loss: 1.104173  [64000/74682]
loss: 1.133745  [70400/74682]
Test Error: 
 Accuracy: 63.0%, Avg loss: 0.969664 

Epoch 2
-------------------------------
loss: 1.166240  [    0/74682]
loss: 1.059350  [ 6400/74682]
loss: 1.082567  [12800/74682]
loss: 0.938266  [19200/74682]
loss: 1.120689  [25600/74682]
loss: 1.082127  [32000/74682]
loss: 1.001250  [38400/74682]
loss: 1.128776  [44800/74682]
loss: 0.744910  [51200/74682]
loss: 0.789417  [57600/74682]
loss: 0.872674  [64000/74682]
loss: 0.941402  [70400/74682]
Test Error: 
 Accuracy: 72.6%, Avg loss: 0.756742 

Epoch 3
-------------------------------
loss: 0.876672  [    0/74682]
loss: 0.915215  [ 6400/746