In [1]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
from __future__ import annotations
from collections import defaultdict

import json
import copy

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils import clip_grad_norm_

from common import *
from lstm import LSTMBinaryClassifier, calc_loss

import pandas as pd

In [3]:
DATA_DIR = "../data/"
TRAIN_FN = "train_tokenized.json"
VAL_FN = "val_tokenized.json"
TEST_FN = "test_tokenized.json"
W2I_FN = "word2ind.json"

In [4]:
train_df = pd.read_json(DATA_DIR + TRAIN_FN, lines=True)
val_df = pd.read_json(DATA_DIR + VAL_FN, lines=True)
with open(DATA_DIR + W2I_FN, "r") as fin:
    w2i = json.load(fin)
train_df["tokens"] = train_df["tokens"].apply(lambda s: vectorize_tokens(s, w2i))
val_df["tokens"] = val_df["tokens"].apply(lambda s: vectorize_tokens(s, w2i))

In [11]:
from torch.nn.utils.rnn import pad_sequence
class NewspaperSarcasmDataset(Dataset):
    def __init__(self, tokens: list[NDArray], labels: list[int], w2i: defaultdict[str, int]):
        super().__init__()
        tokens = [torch.LongTensor(tk) for tk in tokens]
        self.tokens = pad_sequence(tokens, batch_first=True, padding_value=w2i[TK_END])
        self.lengths = torch.LongTensor([len(tk) for tk in tokens])
        self.labels = torch.LongTensor(labels)

    def __len__(self):
        return len(self.tokens)
    
    def __getitem__(self, i):
        return self.tokens[i], self.lengths[i], self.labels[i]

train_set = NewspaperSarcasmDataset(train_df["tokens"], train_df["is_sarcastic"], w2i)
val_set = NewspaperSarcasmDataset(val_df["tokens"], val_df["is_sarcastic"], w2i)

In [None]:
# Sanity check on forward pass
inputs, lengths, labels = train_set[0:5]
model = LSTMBinaryClassifier(len(w2i), 100, [64,10])
output, _ = model.forward(inputs, lengths)
print(output)

tensor([[ 0.0660, -0.3232],
        [ 0.0660, -0.3232],
        [ 0.0660, -0.3232],
        [ 0.0823, -0.3310],
        [ 0.0660, -0.3232]], grad_fn=<SliceBackward0>)


In [7]:
EMBED_SIZE = 100
HIDDEN_SIZE = [64, 10]
clip_value = 4.0  
BATCH_SIZE = 32
NUM_EPOCHS = 150
MIN_EPOCHS = 25
EARLY_STOP_THRESHOLD = 0.2
NOISE_SD = 1e-3
ALPHA = 5e-3
LR_GAMMA = 0.98

MODEL_OUTPUT_DIR = ""
CHKPT_INTERVAL = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device: {device}")

device: cuda


In [8]:
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=True)

In [9]:
model = LSTMBinaryClassifier(len(w2i), EMBED_SIZE, HIDDEN_SIZE).to(device)
criterion = nn.CrossEntropyLoss(reduction="sum")
optimizer = optim.Adam(model.parameters(), lr=ALPHA)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=LR_GAMMA)

In [10]:
min_loss = np.inf
for epoch in range(NUM_EPOCHS):
    print(f"epoch {epoch}:")

    # Train over training set
    running_loss = 0
    correct = 0
    for sid, (sent, lengths, labels) in enumerate(train_loader):
        # Calculate model error and  propogate loss
        loss, corr = calc_loss(model, sent, lengths, labels, criterion, device=device)
        running_loss += loss.item()
        correct += corr.item()
        loss.backward()

        # Regularization: gradient clipping and noisy gradients
        clip_grad_norm_(model.parameters(), clip_value * BATCH_SIZE)
        for layer in model.parameters():
            layer.grad += torch.randn_like(layer.grad) * NOISE_SD * BATCH_SIZE
        optimizer.step()

    # Evaluate over validation set
    val_loss = 0
    val_correct = 0
    with torch.no_grad():
        for sid, (sent, lengths, labels) in enumerate(val_loader):
            loss, corr = calc_loss(model, sent, lengths, labels, criterion, device=device)
            val_loss += loss.item()
            val_correct += corr.item()

    # Print loss and accuracy
    train_loss = running_loss / len(train_set)
    val_loss = val_loss / len(val_set)
    print(f" train_loss={train_loss:.4f}, val_loss={val_loss:.4f}")
    print(
        f" train_acc={correct / len(train_set):.4f}, val_acc={val_correct/len(val_set):.4f}"
    )

    # Early stoppage:
    if val_loss < min_loss:
        min_loss = val_loss
    elif epoch > MIN_EPOCHS and val_loss > min_loss + EARLY_STOP_THRESHOLD:
        torch.save(model.state_dict(), f"{MODEL_OUTPUT_DIR}SD_LSTM_ep{epoch}.pt")
        print(f"Early stopping at epoch {epoch}.")
    
    # Save checkpoint
    if epoch % CHKPT_INTERVAL == 0 and epoch > 0:
        torch.save(model.state_dict(), f"{MODEL_OUTPUT_DIR}SD_LSTM_ep{epoch}.pt")
    
    # Update LR
    scheduler.step()

# Save final model
torch.save(model.state_dict(), f"{MODEL_OUTPUT_DIR}SD_LSTM_final.pt")


epoch 0:
 train_loss=0.6898, val_loss=0.6912
 train_acc=0.5324, val_acc=0.5328
epoch 1:
 train_loss=0.6894, val_loss=0.6902
 train_acc=0.5347, val_acc=0.5347
epoch 2:
 train_loss=0.6896, val_loss=0.6884
 train_acc=0.5334, val_acc=0.5354
epoch 3:
 train_loss=0.6886, val_loss=0.6906
 train_acc=0.5363, val_acc=0.5317
epoch 4:
 train_loss=0.6881, val_loss=0.6893
 train_acc=0.5371, val_acc=0.5350
epoch 5:
 train_loss=0.6879, val_loss=0.6900
 train_acc=0.5374, val_acc=0.5336
epoch 6:
 train_loss=0.6886, val_loss=0.6892
 train_acc=0.5360, val_acc=0.5361
epoch 7:
 train_loss=0.6877, val_loss=0.6873
 train_acc=0.5373, val_acc=0.5363
epoch 8:
 train_loss=0.6859, val_loss=0.6903
 train_acc=0.5388, val_acc=0.5349
epoch 9:
 train_loss=0.6866, val_loss=0.6880
 train_acc=0.5379, val_acc=0.5349
epoch 10:
 train_loss=0.6853, val_loss=0.6880
 train_acc=0.5394, val_acc=0.5378
epoch 11:
 train_loss=0.6857, val_loss=0.6901
 train_acc=0.5387, val_acc=0.5338
epoch 12:
 train_loss=0.6861, val_loss=0.6891
 tra

In [16]:
test_df = pd.read_json(DATA_DIR + TEST_FN, lines=True)
test_df["tokens"] = test_df["tokens"].apply(lambda s: vectorize_tokens(s, w2i))
test_set = NewspaperSarcasmDataset(test_df["tokens"], test_df["is_sarcastic"], w2i)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)
test_loss = test_correct = 0
with torch.no_grad():
    for sid, (sent, lengths, labels) in enumerate(test_loader):
        loss, corr = calc_loss(model, sent, lengths, labels, criterion, device=device)
        test_loss += loss.item()
        test_correct += corr
print(f"{len(test_set)} samples tested.")
print(f"test_loss={test_loss/len(test_set):.4f}, test_acc={test_correct/len(test_set):.4f}")

2864 samples tested.
test_loss=0.6827, test_acc=0.5356
