In [3]:
import torch
from torch import nn
from tqdm.notebook import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import re
from ipywidgets import interact_manual
import unicodedata
import sys
 
# setting path
sys.path.append('../')
from models.transformers import Encoder
from modules.utils import create_sequence_mask

In [4]:
# Load the already-preprocessed train / validation splits from the working directory.
# NOTE(review): pandas' default NA parsing reads empty fields as float NaN, which the
# later `.split()` calls cannot handle — confirm neither file contains empty tweets.
train_data, val_data = (
    pd.read_csv(csv_path) for csv_path in ('train_pr.csv', 'val_pr.csv')
)

In [5]:
def preprocess_text(text:str):
    """Normalise a raw tweet into a lower-cased, space-separated token string.

    The text is NFC-normalised, then a fixed sequence of regex substitutions is
    applied in order: HTML entities are dropped; HTML tags, e-mail addresses,
    URLs, hashtags, @-mentions and numbers are replaced by placeholder tokens;
    characters outside the allowed set are removed; and runs of sentence
    punctuation are collapsed and padded with spaces. The result is tokenised
    with ``word_tokenize``, re-joined with single spaces and lower-cased.

    Args:
        text: The raw tweet.

    Returns:
        The cleaned, tokenised and lower-cased text.
    """
    # (pattern, replacement) pairs; the order matters because later patterns
    # operate on the output of earlier ones (e.g. e-mails before @-mentions).
    substitutions = (
        (r'&[a-z]+?;', ''),
        (r'<[^>]+>', ' _html_tag_ '),
        (r'[a-zA-Z0-9-_][.a-zA-Z0-9-_]*@([.a-zA-Z0-9_-]+){,3}', ' _email_ '),
        (r'https?://[a-zA-Z0-9-_&.?/%=]*', ' _url_link_ '),
        (r'#[_a-zA-Z0-9]+', ' _hash_tag_ '),
        (r'@[_a-zA-Z0-9]+', ' _tag_ '),
        (r'[0-9][0-9.,]*', ' _num_ '),
        (r'[^\sa-zA-Z0-9.,!;:/\'^()=<>_?]', ''),
        # A repeated group only keeps its last repetition, so a run such as
        # "!!!" collapses to a single space-padded mark.
        (r'(\s*[.!?:;-]\s*)+', r' \1 '),
    )

    cleaned = unicodedata.normalize('NFC', text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    tokens = word_tokenize(cleaned)
    return ' '.join(tokens).lower()

In [6]:
@interact_manual
def _(text=''):
    """Interactive preview: show what `preprocess_text` produces for the typed text."""
    cleaned = preprocess_text(text)
    return cleaned

interactive(children=(Text(value='', description='text'), Button(description='Run Interact', style=ButtonStyle…

In [7]:
"""
tqdm.pandas()
train_data['SentimentText'] = train_data['SentimentText'].progress_apply(preprocess_text)
train_data.to_csv('train_pr.csv',index=False)
val_data['SentimentText'] = val_data['SentimentText'].progress_apply(preprocess_text)
val_data.to_csv('val_pr.csv',index=False)
"""

"\ntqdm.pandas()\ntrain_data['SentimentText'] = train_data['SentimentText'].progress_apply(preprocess_text)\ntrain_data.to_csv('train_pr.csv',index=False)\nval_data['SentimentText'] = val_data['SentimentText'].progress_apply(preprocess_text)\nval_data.to_csv('val_pr.csv',index=False)\n"

In [8]:
from collections import Counter

# Reserved tokens: PAD always gets id 0 and UNK id 1 (see build_vocab).
PAD = "[PAD]"
UNK = "[UNK]"


def build_vocab(df, min_freq=3):
    """Build token<->id lookup tables from the ``SentimentText`` column of ``df``.

    Texts are split on whitespace and every token occurring at least
    ``min_freq`` times is kept, in sorted order, after the two reserved tokens
    ``PAD`` (id 0) and ``UNK`` (id 1).

    Corpus tokens that are textually identical to a reserved token are not added
    a second time. Otherwise the later duplicate would overwrite the reserved id
    in the mapping, leaving ids 0/1 unused and ``len(w2i)`` smaller than the
    largest id.

    Args:
        df: DataFrame with a ``SentimentText`` column of whitespace-tokenised strings.
        min_freq: Minimum corpus frequency for a token to enter the vocabulary.

    Returns:
        ``(w2i, i2w)``: token -> id and id -> token dictionaries with contiguous
        ids ``0 .. len(w2i) - 1``.
    """
    counter = Counter()
    # Iterate over the text column only; no need to materialise whole rows.
    for text in df["SentimentText"]:
        counter.update(text.split())

    specials = [PAD, UNK]
    vocab = sorted(
        w for w, c in counter.items() if c >= min_freq and w not in specials
    )

    w2i = {w: i for i, w in enumerate(specials + vocab)}
    i2w = {i: w for w, i in w2i.items()}

    return w2i, i2w


# The vocabulary is derived from the training split only.
w2i, i2w = build_vocab(df=train_data)

In [9]:
from torch.utils.data import DataLoader, Dataset


class TweetDataset(Dataset):
    """Dataset view over a DataFrame with ``Sentiment`` and ``SentimentText`` columns.

    Each item is a ``(label, token_ids, length)`` triple; padding and truncation
    are not done here.
    """

    def __init__(self, df, vocab):
        # df: frame holding the labels and whitespace-tokenised texts.
        # vocab: token -> id mapping; must contain the UNK token.
        self.df, self.vocab = df, vocab

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def _encode(self, tokens):
        """Translate tokens to ids, mapping out-of-vocabulary tokens to the UNK id."""
        return [self.vocab.get(token, self.vocab[UNK]) for token in tokens]

    def __getitem__(self, index: int):
        """Return ``(label, token_ids, n_tokens)`` for the row at position ``index``."""
        tokens = self.df["SentimentText"].values[index].split()
        label = self.df["Sentiment"].values[index]
        return label, self._encode(tokens), len(tokens)


class TweetDatasetCollate:
    """Collate function that pads / truncates every sequence to ``max_length``."""

    def __init__(self, max_length, vocab):
        # max_length: fixed sequence length of the produced batch.
        # vocab: token -> id mapping; must contain the PAD token.
        self.max_length = max_length
        self.vocab = vocab

    def __call__(self, batch):
        """Turn ``(label, token_ids, length)`` triples into three LongTensors.

        Returns:
            ``(labels, ids, lengths)`` where every row of ``ids`` has exactly
            ``max_length`` entries (truncated or right-padded with the PAD id)
            and ``lengths`` is capped at ``max_length``.
        """
        limit = self.max_length
        labels, ids, lens = [], [], []

        for label, token_ids, length in batch:
            clipped = token_ids[:limit]
            padding = [self.vocab[PAD]] * max(0, limit - len(token_ids))

            labels.append(label)
            ids.append(clipped + padding)
            lens.append(min(length, limit))

        return torch.LongTensor(labels), torch.LongTensor(ids), torch.LongTensor(lens)

In [10]:
class EncoderClassifier(nn.Module):
    """Sequence classifier: encoder -> ReLU -> masked pooling -> dropout -> linear.

    The constructor arguments other than ``n_classes`` are forwarded unchanged
    to ``Encoder``; ``d_model`` additionally sets the input width of the
    classification layer and ``p_drop`` the dropout applied before it.
    """

    def __init__(
        self,
        vocab_size: int,
        n_heads: int,
        max_length: int,
        n_blocks: int,
        d_model: int,
        d_ff: int,
        d_k: int,
        d_v: int,
        n_classes: int,
        p_drop: float,
    ):
        super().__init__()
        # Kept so that forward() can build a mask of the padded sequence length.
        self.max_length = max_length

        encoder_args = (
            vocab_size, n_heads, max_length, n_blocks, d_model, d_ff, d_k, d_v, p_drop
        )
        self.encoder = Encoder(*encoder_args)
        self.fc = nn.Linear(d_model, n_classes)
        self.dropout = nn.Dropout(p=p_drop)

    def forward(self, x: torch.Tensor, x_lengths: torch.Tensor):
        """Return class logits for a batch of padded token-id sequences.

        Args:
            x: Token ids — assumed (batch, max_length); TODO confirm against Encoder.
            x_lengths: Number of real (non-padding) tokens per sequence.
        """
        mask = create_sequence_mask(x_lengths, self.max_length)
        hidden = torch.relu(self.encoder(x, mask))

        # Zero the masked positions before pooling.
        # NOTE(review): .squeeze() drops every size-1 dim of the mask (including a
        # batch dim of 1) — confirm the mask shape returned by create_sequence_mask.
        token_weights = mask.squeeze().unsqueeze(-1)

        # NOTE(review): .mean(dim=1) averages over every position along dim 1,
        # including the zeroed ones — confirm length-normalised pooling is not intended.
        pooled = (hidden * token_weights).mean(dim=1)

        return self.fc(self.dropout(pooled))

In [11]:
# --- data / batching -------------------------------------------------------
vocab_size = len(w2i)   # includes the PAD and UNK entries
max_length = 128        # every sequence is padded / truncated to this length
batch_size = 128
n_classes = 2

# --- encoder architecture --------------------------------------------------
d_model = 128
n_heads = 8
n_blocks = 4
d_ff = 128
d_k = d_model // n_heads  # per-head key size
d_v = d_k                 # per-head value size (same as d_k)

# --- regularisation --------------------------------------------------------
p_drop = 0.4

In [12]:
# A single collate function is shared by both loaders so that padding and
# truncation are identical for training and validation batches.
collate = TweetDatasetCollate(max_length=max_length, vocab=w2i)

# Training batches are reshuffled every epoch.
train_dl = DataLoader(
    dataset=TweetDataset(df=train_data, vocab=w2i),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate,
)

# Validation keeps the dataset order and uses batches twice as large.
val_dl = DataLoader(
    dataset=TweetDataset(df=val_data, vocab=w2i),
    batch_size=batch_size * 2,
    collate_fn=collate,
)

In [13]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [14]:
# Build the classifier from the hyper-parameters above and move it to `device`.
clf = EncoderClassifier(
    vocab_size=vocab_size,
    n_heads=n_heads,
    max_length=max_length,
    n_blocks=n_blocks,
    d_model=d_model,
    d_ff=d_ff,
    d_k=d_k,
    d_v=d_v,
    n_classes=n_classes,
    p_drop=p_drop,
).to(device)

# Xavier-uniform initialisation for every parameter with more than one
# dimension; 1-D parameters keep their default values.
for p in clf.parameters():
    if p.dim() <= 1:
        continue
    nn.init.xavier_uniform_(p)

In [15]:
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR

# Adam with weight decay over all classifier parameters.
optimizer = Adam(
    params=clf.parameters(),
    lr=0.0008,
    betas=(0.9, 0.98),
    weight_decay=0.001,
)
loss_fn = nn.CrossEntropyLoss()
# The learning rate is multiplied by 0.999**0.125 on every scheduler.step().
scheduler = ExponentialLR(optimizer=optimizer, gamma=0.999**0.125)

In [16]:
import os

os.makedirs("checkpoints", exist_ok=True)

def train_and_eval(
    model,
    optimizer,
    loss_fn,
    scheduler,
    epochs,
    train_dl,
    val_dl,
    early_stopping=5,
    model_name="sample_model",
):
    """Train ``model`` with per-epoch validation, checkpointing and early stopping.

    Each epoch runs one pass over ``train_dl`` (one optimizer step per batch and
    one ``scheduler.step()`` per epoch) followed by one pass over ``val_dl``.
    Whenever the validation loss does not get worse, the model's state dict is
    written to ``checkpoints/{model_name}_{global_step}.pth``, where
    ``global_step`` is ``(epoch + 1) * len(train_dl)``. Training stops once the
    validation loss has failed to improve for ``early_stopping`` epochs in a row.

    Batches are moved to the device the model's parameters live on.

    Args:
        model: Module called as ``model(x, x_len)`` and returning class logits.
        optimizer: Optimizer over ``model``'s parameters.
        loss_fn: Callable ``loss_fn(logits, y)`` returning a scalar tensor.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        epochs: Maximum number of epochs.
        train_dl: Loader yielding ``(y, x, x_len)`` training batches.
        val_dl: Loader yielding ``(y, x, x_len)`` validation batches.
        early_stopping: Number of consecutive non-improving epochs tolerated.
        model_name: Prefix of the checkpoint file names.
    """
    # Follow the model's own device instead of depending on a notebook-level
    # `device` variable that may be stale or undefined.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cpu")

    # torch.save does not create missing directories.
    os.makedirs("checkpoints", exist_ok=True)

    best_val_loss = float("inf")
    patience = 0
    p_bar = tqdm(total=len(train_dl))

    try:
        for epoch in range(epochs):
            if epoch > 0:
                # Start the per-epoch progress display from zero again.
                p_bar.reset()

            train_loss = 0
            train_corr = 0
            val_loss = 0
            val_corr = 0

            model.train()
            for y, x, x_len in train_dl:
                y, x, x_len = y.to(device), x.to(device), x_len.to(device)

                logits = model(x, x_len)
                loss = loss_fn(logits, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                train_loss += loss.item()
                train_corr += (y == logits.argmax(-1)).sum().item()
                p_bar.update(1)
            scheduler.step()

            model.eval()
            with torch.inference_mode():
                for y, x, x_len in val_dl:
                    y, x, x_len = y.to(device), x.to(device), x_len.to(device)

                    logits = model(x, x_len)

                    val_loss += loss_fn(logits, y).item()
                    val_corr += (y == logits.argmax(-1)).sum().item()

            # Losses are averaged over batches, accuracies over samples.
            train_loss /= len(train_dl)
            val_loss /= len(val_dl)
            train_acc = train_corr / len(train_dl.dataset)
            val_acc = val_corr / len(val_dl.dataset)

            # `<=` is False when val_loss is NaN, so a diverged epoch counts
            # against patience instead of being checkpointed as the new best.
            if val_loss <= best_val_loss:
                torch.save(
                    model.state_dict(),
                    f"checkpoints/{model_name}_{(epoch+1)*len(train_dl)}.pth",
                )
                patience = 0
                best_val_loss = val_loss
            else:
                patience += 1

            print(
                f"Epoch {epoch+1}:\n Train loss: {train_loss:.6f} - Train acc: {train_acc:.6f}\n Val loss: {val_loss:.6f} - Val acc: {val_acc:.6f}"
            )

            if patience >= early_stopping:
                print(
                    f"Stopped since val loss has not improved in the last {early_stopping} epochs..."
                )
                break
    finally:
        # Release the progress bar even when training is interrupted.
        p_bar.close()

In [17]:
# Upper bound on the number of epochs; early stopping may end the run sooner.
epochs = 100
train_and_eval(
    model=clf,
    optimizer=optimizer,
    loss_fn=loss_fn,
    scheduler=scheduler,
    epochs=epochs,
    train_dl=train_dl,
    val_dl=val_dl,
)

  0%|          | 0/625 [00:00<?, ?it/s]

Epoch 1:
 Train loss: 0.564640 - Train acc: 0.722406
 Val loss: 0.509089 - Val acc: 0.762126
Epoch 2:
 Train loss: 0.509808 - Train acc: 0.764311
 Val loss: 0.492942 - Val acc: 0.769927
Epoch 3:
 Train loss: 0.499559 - Train acc: 0.769399
 Val loss: 0.492653 - Val acc: 0.766327
Epoch 4:
 Train loss: 0.495759 - Train acc: 0.770724
 Val loss: 0.488009 - Val acc: 0.769027
Epoch 5:
 Train loss: 0.493141 - Train acc: 0.771487
 Val loss: 0.487774 - Val acc: 0.765777
Epoch 6:
 Train loss: 0.493293 - Train acc: 0.772787
 Val loss: 0.492504 - Val acc: 0.764726
Epoch 7:
 Train loss: 0.490129 - Train acc: 0.773875
 Val loss: 0.484608 - Val acc: 0.769727
Epoch 8:
 Train loss: 0.489195 - Train acc: 0.774125
 Val loss: 0.483867 - Val acc: 0.772477
Epoch 9:
 Train loss: 0.489668 - Train acc: 0.774625
 Val loss: 0.481184 - Val acc: 0.771627
Epoch 10:
 Train loss: 0.488876 - Train acc: 0.774587
 Val loss: 0.493154 - Val acc: 0.760326
Epoch 11:
 Train loss: 0.487455 - Train acc: 0.775687
 Val loss: 0.48

In [18]:
from sklearn.metrics import f1_score

# Checkpoint written by train_and_eval; the numeric suffix is the value of
# (epoch + 1) * len(train_dl) at the time it was saved.
checkpoint_path = "checkpoints/sample_model_13125.pth"

# Rebuild the architecture with the same hyper-parameters used for training.
pretrained = EncoderClassifier(
    vocab_size,
    n_heads,
    max_length,
    n_blocks,
    d_model,
    d_ff,
    d_k,
    d_v,
    n_classes,
    p_drop,
)

# map_location remaps tensors that were saved from a CUDA run onto the current
# device, so the checkpoint also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
pretrained.load_state_dict(torch.load(checkpoint_path, map_location=device))
pretrained.to(device)
pretrained.eval()

# Collect predictions over the whole validation set.
y_true = []
y_pred = []
val_corr = 0
with torch.inference_mode():
    for y, x, x_len in val_dl:
        y, x, x_len = y.to(device), x.to(device), x_len.to(device)

        y_hat = pretrained(x, x_len).argmax(-1)
        val_corr += (y == y_hat).sum().item()
        y_true.extend(y.tolist())
        y_pred.extend(y_hat.tolist())

print(f"Best acc: {val_corr/len(val_dl.dataset):.6f}")
print(f"Best f1: {f1_score(y_true,y_pred):.6f}")

Best acc: 0.775578
Best f1: 0.802395
