In [1]:
import torch
from torch import nn
from tqdm.notebook import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import re
from ipywidgets import interact_manual
import unicodedata
import sys

# setting path
sys.path.append("../../")
from modules.blocks import EncoderBlock
from modules.layers import PositionalEncoding, Embedding
from modules.utils import create_pad_mask

Data source: https://www.kaggle.com/datasets/saurabhshahane/twitter-sentiment-dataset

In [2]:
# data = pd.read_csv("datasets/Twitter_Data.csv")
# data.drop_duplicates(subset="clean_text", inplace=True)

In [3]:
# data.isnull().sum()

In [4]:
# data.dropna(inplace=True)
# data.isnull().sum()

In [5]:
def preprocess_text(text: str):
    """Normalise a raw tweet into a lower-cased, space-separated token string.

    The text is NFC-normalised, pushed through an ordered list of regex
    substitutions (each one sees the output of the previous one), tokenised
    with NLTK's ``word_tokenize`` and finally re-joined with single spaces.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text, lower-cased, tokens separated by one space.
    """
    # Order matters: placeholders are inserted before the whitelist filter,
    # and they only use characters that the filter keeps.
    substitutions = (
        # HTML entities such as "&amp;" are dropped entirely.
        (r"&[a-z]+?;", ""),
        (r"<[^>]+>", " _html_tag_ "),
        (
            r"[a-zA-Z0-9_-][.a-zA-Z0-9_-]*@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+){1,3}",
            " _email_ ",
        ),
        (r"https?://[a-zA-Z0-9-_&.?/%=]*", " _url_link_ "),
        (r"#[_a-zA-Z0-9]+", " _hash_tag_ "),
        (r"@[_a-zA-Z0-9]+", " _tag_ "),
        (r"[0-9][0-9.,]*", " _num_ "),
        # Whitelist filter: every character not listed here is removed
        # (that includes "-", so the "-" in the next pattern never matches).
        (r"[^\sa-zA-Z0-9.,!;:/\'^()=<>_?]", ""),
        # A run of punctuation marks collapses to its last mark, padded with
        # spaces: a repeated capture group keeps only its final repetition.
        (r"(\s*[.!?:;-]\s*)+", r" \1 "),
    )

    cleaned = unicodedata.normalize("NFC", text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    tokens = word_tokenize(cleaned)
    return " ".join(tokens).lower()

In [6]:
# Manual playground for the preprocessing function: type a tweet into the
# widget and press the button to see the cleaned text. Nothing downstream
# depends on this cell.
@interact_manual
def _(text=""):
    """Return `preprocess_text(text)` for the text entered in the widget."""
    return preprocess_text(text)

interactive(children=(Text(value='', description='text'), Button(description='Run Interact', style=ButtonStyle…

In [7]:
# tqdm.pandas()
# data["clean_text"] = data["clean_text"].progress_apply(
#     lambda x: preprocess_text(str(x))
# )
# data["category"] = data["category"].progress_apply(lambda x: int(x + 1))
# train_data, val_test_data = train_test_split(data, test_size=0.2, random_state=42)
# val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)
# train_data.to_csv("train_pr.csv", index=False)
# val_data.to_csv("val_pr.csv", index=False)
# test_data.to_csv("test_pr.csv", index=False)

In [8]:
# Load the pre-processed train/validation/test splits.
# NOTE(review): the commented-out export cell above writes "train_pr.csv",
# "val_pr.csv" and "test_pr.csv" to the working directory, while this cell
# reads them from "datasets/" — the files have to be moved there (or the
# export paths changed) for a fresh run to work.
train_data = pd.read_csv("datasets/train_pr.csv")
val_data = pd.read_csv("datasets/val_pr.csv")
test_data = pd.read_csv("datasets/test_pr.csv")

In [9]:
from collections import Counter

# Special tokens: PAD is the filler used by the collate function, UNK is the
# fallback id for words that are not in the vocabulary.
PAD = "[PAD]"
UNK = "[UNK]"


def build_vocab(df, min_freq=2):
    """Build word<->index mappings from the `clean_text` column of `df`.

    Tokens are obtained by whitespace-splitting each text. Words seen fewer
    than `min_freq` times are left out. Index 0 is always PAD and index 1 is
    always UNK; the remaining words follow in sorted order, so the mapping is
    deterministic for a given corpus.

    Args:
        df: DataFrame with a `clean_text` column of pre-processed texts.
        min_freq: Minimum number of occurrences for a word to be kept.

    Returns:
        A `(w2i, i2w)` tuple: word -> index and index -> word dictionaries.
    """
    counter = Counter()
    # Skip missing texts: an empty tweet comes back from the CSV as NaN, and
    # str(NaN) would otherwise be counted as the literal word "nan".
    for text in df.clean_text.dropna():
        counter.update(str(text).split())

    specials = (PAD, UNK)
    # Exclude the special tokens themselves so a corpus word spelled "[PAD]"
    # or "[UNK]" cannot overwrite the reserved indices 0 and 1.
    vocab = sorted(
        w for w, c in counter.items() if c >= min_freq and w not in specials
    )

    w2i = {w: i for i, w in enumerate([*specials, *vocab])}
    i2w = {i: w for w, i in w2i.items()}

    return w2i, i2w


# The vocabulary is built from the training split only; words that appear
# only in the validation/test splits are mapped to UNK by TweetDataset.
w2i, i2w = build_vocab(train_data)

In [10]:
from torch.utils.data import DataLoader, Dataset


class TweetDataset(Dataset):
    """Map-style dataset that turns rows of a tweet DataFrame into id lists.

    Each item is a `(label, token_ids, length)` tuple, where `label` comes
    from the `category` column, `token_ids` are the vocabulary indices of the
    whitespace-split `clean_text`, and `length` is the number of tokens
    before any padding or truncation.
    """

    def __init__(self, df, vocab):
        """Store the source frame and the word -> index mapping."""
        self.df = df
        self.vocab = vocab

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, index: int):
        """Return `(label, token_ids, n_tokens)` for the row at `index`."""
        label = self.df.category.values[index]

        # str(): a missing text (NaN) is stringified to "nan", so the result
        # is always a string and yields at least that one token.
        tokens = str(self.df.clean_text.values[index]).split()

        token_ids = []
        for token in tokens:
            # Words outside the vocabulary fall back to the UNK index.
            token_ids.append(self.vocab.get(token, self.vocab[UNK]))

        return label, token_ids, len(tokens)


class TweetDataCollate:
    """Collate function that pads/truncates token id lists to a fixed length.

    Takes a batch of `(label, token_ids, length)` tuples and returns three
    `LongTensor`s: labels, a `(batch, max_length)` id matrix, and the
    sequence lengths capped at `max_length`.
    """

    def __init__(self, max_length, vocab):
        """Remember the target length and the vocabulary holding the PAD id."""
        self.max_length = max_length
        self.vocab = vocab

    def __call__(self, batch):
        """Assemble one batch of tensors from a list of dataset items."""
        labels, ids, lens = [], [], []

        for label, token_ids, length in batch:
            labels.append(label)

            # Right-pad short sequences with PAD; longer ones are cut off.
            n_missing = max(0, self.max_length - len(token_ids))
            padding = [self.vocab[PAD]] * n_missing
            ids.append(token_ids[: self.max_length] + padding)

            # Reported length never exceeds the truncated sequence length.
            lens.append(min(length, self.max_length))

        return torch.LongTensor(labels), torch.LongTensor(ids), torch.LongTensor(lens)

In [12]:
# Hyper-parameters. All except batch_size are passed positionally to
# EncoderClassifier in a later cell; the meanings marked "presumably" are
# inferred from the names and should be confirmed against that class.
vocab_size = len(w2i)  # includes the PAD and UNK entries
max_length = 128  # TweetDataCollate pads/truncates every sequence to this many ids
n_heads = 8  # presumably attention heads per encoder block
d_model = 128  # presumably the embedding / model width
n_blocks = 4  # presumably the number of stacked encoder blocks
d_ff = 512  # presumably the feed-forward hidden width
d_k = d_v = d_model // n_heads  # 128 // 8 = 16
p_drop = 0.4  # presumably the dropout probability
n_classes = 3  # labels are 0/1/2 (the commented-out export cell shifts `category` by +1)
batch_size = 128  # training batch size; the val/test loaders use twice this

In [13]:
# One shared collate function pads/truncates every batch to max_length.
collate = TweetDataCollate(max_length, w2i)
# Only the training loader shuffles its data.
train_dl = DataLoader(
    TweetDataset(train_data, w2i),
    shuffle=True,
    batch_size=batch_size,
    collate_fn=collate,
)
# Validation and test loaders keep the file order and use a doubled batch size.
val_dl = DataLoader(
    TweetDataset(val_data, w2i), batch_size=batch_size * 2, collate_fn=collate
)
test_dl = DataLoader(
    TweetDataset(test_data, w2i), batch_size=batch_size * 2, collate_fn=collate
)

In [14]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# `device` is read as a global by train_and_eval and by the evaluation cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [15]:
# NOTE(review): EncoderClassifier is neither imported nor defined in the
# visible cells (the execution counter jumps from In [10] to In [12]), so on a
# fresh kernel this cell raises NameError unless the missing definition cell
# is restored.
clf = EncoderClassifier(
    vocab_size,
    n_heads,
    max_length,
    n_blocks,
    d_model,
    d_ff,
    d_k,
    d_v,
    n_classes,
    p_drop,
).to(device)

# Re-initialise every parameter with more than one dimension using Xavier
# uniform; 1-D parameters keep whatever initialisation the modules gave them.
for p in clf.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

In [16]:
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR

# Adam with weight decay over all classifier parameters.
optimizer = Adam(clf.parameters(), lr=0.0008, betas=(0.9, 0.98), weight_decay=0.001)
loss_fn = nn.CrossEntropyLoss()
# gamma = 0.999 ** 0.125 ≈ 0.999875.
# NOTE(review): train_and_eval calls scheduler.step() once per epoch, so over
# 100 epochs the learning rate shrinks by only about 1.2 %. If the decay was
# meant to apply per optimisation step, the step() call belongs inside the
# batch loop — confirm the intent.
scheduler = ExponentialLR(optimizer, 0.999**0.125)

In [17]:
import os

os.makedirs("checkpoints", exist_ok=True)


def train_and_eval(
    model,
    optimizer,
    loss_fn,
    scheduler,
    epochs,
    train_dl,
    val_dl,
    early_stopping=5,
    model_name="sample_model",
    checkpoint_dir="checkpoints",
):
    """Train `model`, validate after every epoch and keep the best checkpoint.

    Each epoch runs one pass over `train_dl`, steps the scheduler once, then
    evaluates on `val_dl`. Whenever the mean validation loss is at least as
    good as the best one so far, the model's state dict is written to
    `{checkpoint_dir}/{model_name}.pth`. Training stops early once the
    validation loss has failed to improve for `early_stopping` consecutive
    epochs. Per-epoch metrics are printed.

    Batches are expected as `(y, x, x_len)` tuples and are moved to the
    notebook-level `device` before the model is called as `model(x, x_len)`.

    Args:
        model: Module returning class logits for `(x, x_len)`.
        optimizer: Optimizer over the model's parameters.
        loss_fn: Callable `(logits, y) -> scalar loss tensor`.
        scheduler: LR scheduler, stepped once per epoch.
        epochs: Maximum number of epochs.
        train_dl: Training DataLoader (must expose `.dataset`).
        val_dl: Validation DataLoader (must expose `.dataset`).
        early_stopping: Number of non-improving epochs tolerated.
        model_name: File stem of the checkpoint.
        checkpoint_dir: Directory the checkpoint is written to; created if
            it does not exist.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = f"{checkpoint_dir}/{model_name}.pth"

    best_val_loss = float("inf")
    patience = 0

    p_bar = tqdm(total=len(train_dl))
    try:
        for epoch in range(epochs):
            train_loss = 0
            train_corr = 0
            val_loss = 0
            val_corr = 0

            model.train()
            for y, x, x_len in train_dl:
                y, x, x_len = y.to(device), x.to(device), x_len.to(device)

                logits = model(x, x_len)
                loss = loss_fn(logits, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                train_corr += (y == logits.argmax(-1)).sum().item()
                p_bar.update(1)
            scheduler.step()

            model.eval()
            with torch.inference_mode():
                for y, x, x_len in val_dl:
                    y, x, x_len = y.to(device), x.to(device), x_len.to(device)

                    logits = model(x, x_len)

                    val_loss += loss_fn(logits, y).item()
                    val_corr += (y == logits.argmax(-1)).sum().item()

            # Losses are averaged over batches, accuracies over samples.
            train_loss /= len(train_dl)
            val_loss /= len(val_dl)
            train_acc = train_corr / len(train_dl.dataset)
            val_acc = val_corr / len(val_dl.dataset)

            # `<=` is False when val_loss is NaN, so a diverged epoch counts
            # as "no improvement" instead of overwriting the best checkpoint
            # and resetting the patience counter.
            if val_loss <= best_val_loss:
                torch.save(model.state_dict(), checkpoint_path)
                patience = 0
                best_val_loss = val_loss
            else:
                patience += 1

            print(
                f"Epoch {epoch+1}:\n Train loss: {train_loss:.6f} - Train acc: {train_acc:.6f}\n Val loss: {val_loss:.6f} - Val acc: {val_acc:.6f}"
            )

            if patience >= early_stopping:
                print(
                    f"Stopped since val loss has not improved in the last {early_stopping} epochs..."
                )
                break

            p_bar.reset()
    finally:
        # Release the progress bar even on early stop or an exception.
        p_bar.close()

In [18]:
# Train for at most 100 epochs; early stopping (default patience of 5 epochs
# in train_and_eval) usually ends the run sooner. The best weights are saved
# to checkpoints/encoder_clf.pth and reloaded by the evaluation cell.
epochs = 100
model_name = "encoder_clf"
train_and_eval(
    clf, optimizer, loss_fn, scheduler, epochs, train_dl, val_dl, model_name=model_name
)

  0%|          | 0/1019 [00:00<?, ?it/s]

Epoch 1:
 Train loss: 0.505428 - Train acc: 0.837607
 Val loss: 0.236478 - Val acc: 0.938455
Epoch 2:
 Train loss: 0.251264 - Train acc: 0.933707
 Val loss: 0.204853 - Val acc: 0.940603
Epoch 3:
 Train loss: 0.236071 - Train acc: 0.935992
 Val loss: 0.203866 - Val acc: 0.942934
Epoch 4:
 Train loss: 0.232603 - Train acc: 0.936982
 Val loss: 0.190742 - Val acc: 0.943487
Epoch 5:
 Train loss: 0.226595 - Train acc: 0.938424
 Val loss: 0.189299 - Val acc: 0.945818
Epoch 6:
 Train loss: 0.224444 - Train acc: 0.938431
 Val loss: 0.201386 - Val acc: 0.947414
Epoch 7:
 Train loss: 0.223790 - Train acc: 0.938370
 Val loss: 0.192810 - Val acc: 0.945021
Epoch 8:
 Train loss: 0.223068 - Train acc: 0.938186
 Val loss: 0.182286 - Val acc: 0.946800
Epoch 9:
 Train loss: 0.221698 - Train acc: 0.939099
 Val loss: 0.189435 - Val acc: 0.946739
Epoch 10:
 Train loss: 0.221247 - Train acc: 0.938915
 Val loss: 0.175689 - Val acc: 0.948825
Epoch 11:
 Train loss: 0.220197 - Train acc: 0.939697
 Val loss: 0.19

In [19]:
from sklearn.metrics import classification_report, confusion_matrix

# Rebuild the architecture with the same hyper-parameters used for training,
# then restore the best checkpoint written by train_and_eval.
pretrained = EncoderClassifier(
    vocab_size,
    n_heads,
    max_length,
    n_blocks,
    d_model,
    d_ff,
    d_k,
    d_v,
    n_classes,
    p_drop,
)

# map_location=device lets a checkpoint saved during a CUDA run be restored
# when `device` has fallen back to the CPU; without it torch.load tries to
# put the tensors back on the GPU they were saved from.
pretrained.load_state_dict(
    torch.load(f"checkpoints/{model_name}.pth", map_location=device)
)
pretrained.to(device)
pretrained.eval()

y_true = []
y_pred = []
# Running count of correct test predictions (the name is a leftover from the
# validation loop; the value is not reported in this cell).
val_corr = 0
with torch.inference_mode():
    for y, x, x_len in test_dl:
        y, x, x_len = y.to(device), x.to(device), x_len.to(device)

        y_hat = pretrained(x, x_len).argmax(-1)
        val_corr += (y == y_hat).sum().item()
        y_true += y.tolist()
        y_pred += y_hat.tolist()

# Per-class precision/recall/F1, followed by the row-normalised confusion
# matrix (each row sums to 1, so the diagonal is the per-class recall).
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred, normalize="true"))

              precision    recall  f1-score   support

           0       0.92      0.88      0.90      3590
           1       0.99      0.97      0.98      5527
           2       0.93      0.96      0.95      7180

    accuracy                           0.95     16297
   macro avg       0.95      0.94      0.94     16297
weighted avg       0.95      0.95      0.95     16297

[[0.88133705 0.00891365 0.1097493 ]
 [0.00796092 0.97412701 0.01791207]
 [0.03356546 0.00640669 0.96002786]]
