In [1]:
import re
import sys
import unicodedata

import pandas as pd
import torch
from ipywidgets import interact_manual
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

# setting path
sys.path.append("../../")
from models import EncoderArgs, EncoderClassifier
from utils.datasets import TweetDataset
from utils.trainer import TrainingArgs, eval, train_and_val

Data source: https://www.kaggle.com/datasets/saurabhshahane/twitter-sentiment-dataset

In [None]:
# data = pd.read_csv("datasets/Twitter_Data.csv")
# data.drop_duplicates(subset="clean_text", inplace=True)

In [None]:
# data.isnull().sum()

In [None]:
# data.dropna(inplace=True)
# data.isnull().sum()

In [2]:
# Ordered (pattern, replacement) rules. Order matters: e.g. e-mail addresses are
# masked before bare "@name" mentions, and the character allow-list runs after
# the placeholder tokens (which only use letters and underscores) are inserted.
_PREPROCESS_RULES = (
    # HTML entities such as "&amp;" are dropped.
    (r"&[a-z]+?;", ""),
    # Anything shaped like "<...>" becomes a tag placeholder.
    (r"<[^>]+>", " _html_tag_ "),
    # E-mail addresses.
    (
        r"[a-zA-Z0-9_-][.a-zA-Z0-9_-]*@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+){1,3}",
        " _email_ ",
    ),
    # http/https links.
    (r"https?://[a-zA-Z0-9-_&.?/%=]*", " _url_link_ "),
    # "#hashtag" and "@mention" tokens.
    (r"#[_a-zA-Z0-9]+", " _hash_tag_ "),
    (r"@[_a-zA-Z0-9]+", " _tag_ "),
    # A digit followed by any run of digits, dots and commas.
    (r"[0-9][0-9.,]*", " _num_ "),
    # Remove every character outside the allow-list.
    (r"[^\sa-zA-Z0-9.,!;:/\'^()=<>_?]", ""),
    # A run of [.!?:;-] marks (with adjacent whitespace) is replaced by the
    # last repetition of the group, padded with spaces.
    (r"(\s*[.!?:;-]\s*)+", r" \1 "),
)


def preprocess_text(text: str):
    """Normalise a raw tweet into a lower-cased, space-separated token string.

    The text is NFC-normalised, passed through the substitution rules in
    ``_PREPROCESS_RULES`` in order, tokenised with NLTK's ``word_tokenize``,
    re-joined with single spaces and lower-cased.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    cleaned = unicodedata.normalize("NFC", text)
    for pattern, replacement in _PREPROCESS_RULES:
        cleaned = re.sub(pattern, replacement, cleaned)

    tokens = word_tokenize(cleaned)
    return " ".join(tokens).lower()

In [3]:
@interact_manual
def _(text=""):
    """Run ``preprocess_text`` on the text entered in the widget and return the result."""
    return preprocess_text(text)

interactive(children=(Text(value='', description='text'), Button(description='Run Interact', style=ButtonStyle…

In [None]:
# tqdm.pandas()
# data["clean_text"] = data["clean_text"].progress_apply(
#     lambda x: preprocess_text(str(x))
# )
# data["category"] = data["category"].progress_apply(lambda x: int(x + 1))
# train_data, val_test_data = train_test_split(data, test_size=0.2, random_state=42)
# val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)
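# NOTE: these files are written to the working directory, but the following cell
# reads them from "datasets/" - move them there (or adjust the paths) when re-running.
# train_data.to_csv("train_pr.csv", index=False)
# val_data.to_csv("val_pr.csv", index=False)
# test_data.to_csv("test_pr.csv", index=False)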

In [4]:
# Load the three preprocessed splits (train, validation, test) in that order.
train_data, val_data, test_data = map(
    pd.read_csv,
    ("datasets/train_pr.csv", "datasets/val_pr.csv", "datasets/test_pr.csv"),
)

In [5]:
from collections import Counter

# Reserved tokens; build_vocab always places them at indices 0 and 1.
PAD = "[PAD]"
UNK = "[UNK]"


def build_vocab(df, min_freq=2, text_col="clean_text"):
    """Build word <-> index lookup tables from a column of whitespace-separated text.

    Args:
        df: DataFrame containing the text column.
        min_freq: Minimum number of occurrences required for a word to be kept.
        text_col: Name of the column to read (defaults to ``"clean_text"``).

    Returns:
        A ``(w2i, i2w)`` pair. ``w2i`` maps each token to a contiguous integer
        index with ``PAD`` at 0 and ``UNK`` at 1, followed by the kept words in
        sorted order; ``i2w`` is the inverse mapping.
    """
    counter = Counter()
    for text in df[text_col]:
        # str() keeps non-string cells countable (e.g. a NaN cell becomes "nan").
        counter.update(str(text).split())

    specials = (PAD, UNK)
    # Skip corpus tokens that equal a reserved token: otherwise the dict below
    # would overwrite index 0/1 and leave gaps, so len(w2i) would no longer
    # bound the largest index.
    vocab = sorted(
        w for w, c in counter.items() if c >= min_freq and w not in specials
    )

    w2i = {w: i for i, w in enumerate([*specials, *vocab])}
    i2w = {i: w for w, i in w2i.items()}

    return w2i, i2w


# Build the vocabulary from the training split only; val/test data are not passed in.
w2i, i2w = build_vocab(train_data)

In [6]:
# Model width and head count are named once so the per-head size is derived
# from them instead of repeating the literals.
D_MODEL = 128
N_HEADS = 8

model_args = EncoderArgs(
    vocab_size=len(w2i),  # one embedding row per vocabulary entry
    n_heads=N_HEADS,
    n_blocks=4,
    d_model=D_MODEL,
    d_head=D_MODEL // N_HEADS,
    bias=False,
    p_drop=0.4,
    max_length=128,
    n_classes=3,
)

In [7]:
# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [8]:
# Seed PyTorch's global RNGs before any stochastic step so that weight
# initialisation (and later operations that draw from those generators) is
# repeatable after Restart & Run All, as far as PyTorch's seeding allows.
SEED = 42
torch.manual_seed(SEED)

clf = EncoderClassifier(model_args).to(device)

# Re-initialise every parameter with more than one dimension using
# Xavier-uniform; 1-D parameters keep whatever the module set up.
for p in clf.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

In [9]:
batch_size = 128

# Keep every Dataset under its own name instead of rebinding the *_dl names
# from Dataset to DataLoader; each loader then uses the collate function of
# the dataset it actually wraps (the test loader previously borrowed the
# validation loader's collate_fn attribute).
train_ds = TweetDataset(train_data, w2i, model_args.max_length)
val_ds = TweetDataset(val_data, w2i, model_args.max_length)
test_ds = TweetDataset(test_data, w2i, model_args.max_length)

# Only the training loader shuffles; evaluation loaders use a doubled batch size.
train_dl = DataLoader(
    train_ds,
    shuffle=True,
    batch_size=batch_size,
    collate_fn=train_ds.collate_fn,
)
val_dl = DataLoader(
    val_ds,
    batch_size=batch_size * 2,
    collate_fn=val_ds.collate_fn,
)
test_dl = DataLoader(
    test_ds,
    batch_size=batch_size * 2,
    collate_fn=test_ds.collate_fn,
)

def loss_batch_cls(
    batch,
    model,
    device,
    loss_fn,
):
    """Compute the classification loss and correct-prediction count for one batch.

    Args:
        batch: Indexable whose items 0, 1 and 2 are tensors used respectively as
            the targets, the first model input and the second model input
            (presumably sequence lengths - confirm against the collate function).
        model: Callable invoked as ``model(x, x_len)`` that returns logits.
        device: Device the three tensors are moved to before the forward pass.
        loss_fn: Callable invoked as ``loss_fn(logits, targets)``.

    Returns:
        ``(loss, {"acc": n_correct})`` where ``n_correct`` is the number of
        items whose arg-max over the last logits dimension equals the target
        (a count, not a ratio).
    """
    targets, inputs, lengths = (batch[i].to(device) for i in range(3))

    scores = model(inputs, lengths)
    loss = loss_fn(scores, targets)

    predicted = scores.argmax(-1)
    n_correct = (targets == predicted).sum().item()
    return loss, {"acc": n_correct}


loss_fn = nn.CrossEntropyLoss()


def _cls_loss_batch(batch, model, device):
    """Three-argument adapter around ``loss_batch_cls`` that supplies ``loss_fn``."""
    return loss_batch_cls(batch, model, device, loss_fn)


# Bundle the loaders, the per-batch loss callable and the run settings for the trainer.
training_args = TrainingArgs(
    loss_batch=_cls_loss_batch,
    train_dl=train_dl,
    val_dl=val_dl,
    test_dl=test_dl,
    n_epochs=100,
    early_stopping_patience=5,
    device=device,
)

In [10]:
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR

# Adam hyper-parameters collected in one place.
adam_kwargs = dict(lr=0.0008, betas=(0.9, 0.98), weight_decay=0.001)
optimizer = Adam(clf.parameters(), **adam_kwargs)

# Decay factor per scheduler step: eight steps multiply the learning rate by 0.999.
lr_gamma = 0.999**0.125
scheduler = ExponentialLR(optimizer, gamma=lr_gamma)

In [None]:
# NOTE(review): `epochs` is not passed to train_and_val below; the epoch budget
# handed to the trainer is training_args.n_epochs (also 100). Confirm whether
# this variable is still needed.
epochs = 100
# Name passed to the trainer; a later cell loads f"checkpoints/{model_name}.pth",
# so train_and_val presumably saves its checkpoint under this name - verify.
model_name = "encoder_clf"
train_and_val(
    clf,
    optimizer,
    scheduler,
    training_args,
    model_name=model_name,
)

In [12]:
# Restore the saved weights. map_location=device remaps the stored tensors so a
# checkpoint written on a GPU can still be loaded when this notebook falls back
# to the CPU.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
clf.load_state_dict(
    torch.load(f"checkpoints/{model_name}.pth", map_location=device)
)
# Evaluate on the test loader; the returned value is displayed as the cell output.
eval(
    clf,
    training_args.loss_batch,
    training_args.test_dl,
    training_args.device,
)

Evaluation:
	Test loss: 0.114752 | Test acc: 0.965883


{'test': {'loss': [0.11475180799607188],
  'metrics': [{'acc': 0.9658832914033257}]}}

In [13]:
from sklearn.metrics import classification_report, confusion_matrix

y_true = []
y_pred = []
# Running count of correct test predictions. Not read again in this cell;
# kept so any code that refers to it keeps working.
val_corr = 0

# Switch to evaluation mode explicitly so dropout is disabled here regardless
# of the mode an earlier cell left the model in.
clf.eval()
with torch.inference_mode():
    for y, x, x_len in test_dl:
        y, x, x_len = y.to(device), x.to(device), x_len.to(device)

        # Predicted class = index of the largest logit along the last dimension.
        y_hat = clf(x, x_len).argmax(-1)
        val_corr += (y == y_hat).sum().item()
        y_true.extend(y.tolist())
        y_pred.extend(y_hat.tolist())

# Per-class precision/recall/F1, then the confusion matrix normalised over the
# true labels (each row sums to 1).
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred, normalize="true"))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      3590
           1       0.99      0.98      0.99      5527
           2       0.96      0.98      0.97      7180

    accuracy                           0.97     16297
   macro avg       0.96      0.96      0.96     16297
weighted avg       0.97      0.97      0.97     16297

[[0.92284123 0.00389972 0.07325905]
 [0.01157952 0.97919305 0.00922743]
 [0.02047354 0.00236769 0.97715877]]
