In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

SEED = 515
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Load Data

In [2]:
import torchtext

NGrams = 2
train_dataset, test_dataset = torchtext.datasets.AG_NEWS(root="../assets/data", ngrams=NGrams, vocab=None)

120000lines [00:08, 14440.58lines/s]
120000lines [00:15, 7844.34lines/s]
7600lines [00:00, 8029.35lines/s]


In [3]:
train_dataset[0]

(2,
 tensor([    572,     564,       2,    2326,   49106,     150,      88,       3,
            1143,      14,      32,      15,      32,      16,  443749,       4,
             572,     499,      17,      10,  741769,       7,  468770,       4,
              52,    7019,    1050,     442,       2,   14341,     673,  141447,
          326092,   55044,    7887,     411,    9870,  628642,      43,      44,
             144,     145,  299709,  443750,   51274,     703,   14312,      23,
         1111134,  741770,  411508,  468771,    3779,   86384,  135944,  371666,
            4052]))

# Define Model

In [4]:
class TextSentiment(nn.Module):
    def __init__(self, voc_size, emb_dim, n_class):
        super(TextSentiment, self).__init__()
        # TODO: Why sparse?
        self.emb_sum = nn.EmbeddingBag(voc_size, emb_dim, sparse=True)
        self.fc = nn.Linear(emb_dim, n_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        # Use `Tensor.data` to NOT track computation history. 
        self.emb_sum.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.emb_sum(text, offsets)
        return self.fc(embedded)

In [5]:
BATCH_SIZE = 16
VOC_SIZE = len(train_dataset.get_vocab())
EMB_DIM = 32
N_CLASS = len(train_dataset.get_labels())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TextSentiment(VOC_SIZE, EMB_DIM, N_CLASS).to(device)

# Training

In [6]:
def generate_batch(batch):
    """
    Transform a batch from dataset to a concatenated tensor with offsets. 
    """
    labels = torch.tensor([entry[0] for entry in batch])
    texts = [entry[1] for entry in batch]
    offsets = [0] + [len(t) for t in texts[:-1]]
    offsets = torch.tensor(offsets).cumsum(dim=-1)
    texts = torch.cat(texts)
    return texts, offsets, labels

In [7]:
import time
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
N_EPOCHS = 5
min_valid_loss = np.inf

loss_func = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(model.parameters(), lr=4.0)
# Switch `lr = lr * gamma` every `step_size` times that `scheduler.step()` is called. 
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

train_len = int(len(train_dataset) * 0.95)
sub_train_dataset, sub_valid_dataset = random_split(train_dataset, [train_len, len(train_dataset)-train_len])
sub_train_loader = DataLoader(sub_train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
sub_valid_loader = DataLoader(sub_valid_dataset, batch_size=BATCH_SIZE*5, shuffle=False, collate_fn=generate_batch)

for epoch in range(N_EPOCHS):
    start_time = time.time()

    # Training
    train_loss = 0
    train_acc = 0
    for i, (texts, offsets, labels) in enumerate(sub_train_loader):
        texts, offsets, labels = texts.to(device), offsets.to(device), labels.to(device)
        # Forward pass
        outs = model(texts, offsets)
        # Calculate loss
        loss = loss_func(outs, labels)
        # Backward propagation
        optimizer.zero_grad()
        loss.backward()
        # Update weights
        optimizer.step()

        train_loss += loss.item()
        train_acc += (outs.argmax(dim=-1) == labels).sum().item()

    train_loss /= len(sub_train_dataset)
    train_acc /= len(sub_train_dataset)
    # Adjust the learning rate
    scheduler.step()

    # Evaluating
    model.eval()
    valid_loss = 0
    valid_acc = 0
    for i, (texts, offsets, labels) in enumerate(sub_valid_loader):
        texts, offsets, labels = texts.to(device), offsets.to(device), labels.to(device)
        
        with torch.no_grad():
            outs = model(texts, offsets)
            loss = loss_func(outs, labels)
            valid_loss += loss.item()
            valid_acc += (outs.argmax(dim=-1) == labels).sum().item()

    valid_loss /= len(sub_valid_dataset)
    valid_acc /= len(sub_valid_dataset)
    model.train()

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print(f"Epoch: {epoch + 1} | time in {mins} minutes, {secs} seconds")
    print(f"\tLoss: {train_loss:.4f} (train) | Acc: {train_acc * 100:.1f}% (train)")
    print(f"\tLoss: {valid_loss:.4f} (valid) | Acc: {valid_acc * 100:.1f}% (valid)")

Epoch: 1 | time in 0.15 minutes, 9 seconds
	Loss: 0.0261 (train) | Acc: 84.7% (train)
	Loss: 0.0037 (valid) | Acc: 90.3% (valid)
Epoch: 2 | time in 0.15 minutes, 9 seconds
	Loss: 0.0119 (train) | Acc: 93.7% (train)
	Loss: 0.0037 (valid) | Acc: 90.4% (valid)
Epoch: 3 | time in 0.15 minutes, 9 seconds
	Loss: 0.0070 (train) | Acc: 96.3% (train)
	Loss: 0.0044 (valid) | Acc: 89.9% (valid)
Epoch: 4 | time in 0.15 minutes, 9 seconds
	Loss: 0.0038 (train) | Acc: 98.1% (train)
	Loss: 0.0048 (valid) | Acc: 89.8% (valid)
Epoch: 5 | time in 0.15 minutes, 9 seconds
	Loss: 0.0022 (train) | Acc: 99.0% (train)
	Loss: 0.0055 (valid) | Acc: 89.7% (valid)


# Evaluating

In [8]:
# Testing
model.eval()
test_loss = 0
test_acc = 0
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE*5, shuffle=False, collate_fn=generate_batch)
for i, (texts, offsets, labels) in enumerate(test_loader):
    texts, offsets, labels = texts.to(device), offsets.to(device), labels.to(device)
    
    with torch.no_grad():
        outs = model(texts, offsets)
        loss = loss_func(outs, labels)
        test_loss += loss.item()
        test_acc += (outs.argmax(dim=-1) == labels).sum().item()

test_loss /= len(test_dataset)
test_acc /= len(test_dataset)
model.train()

print(f"\tLoss: {test_loss:.4f} (test) | Acc: {test_acc * 100:.1f}% (test)")

Loss: 0.0046 (test) | Acc: 90.8% (test)


In [9]:
import re
from torchtext.data.utils import get_tokenizer, ngrams_iterator

ag_news_label = {1: "World",
                 2: "Sports",
                 3: "Business",
                 4: "Sci/Tec"}

ex_text_str = "MEMPHIS, Tenn. – Four days ago, Jon Rahm was \
    enduring the season’s worst weather conditions on Sunday at The \
    Open on his way to a closing 75 at Royal Portrush, which \
    considering the wind and the rain was a respectable showing. \
    Thursday’s first round at the WGC-FedEx St. Jude Invitational \
    was another story. With temperatures in the mid-80s and hardly any \
    wind, the Spaniard was 13 strokes better in a flawless round. \
    Thanks to his best putting performance on the PGA Tour, Rahm \
    finished with an 8-under 62 for a three-stroke lead, which \
    was even more impressive considering he’d never played the \
    front nine at TPC Southwind."

tokenizer = get_tokenizer("basic_english")
vocab = train_dataset.get_vocab()
text = torch.tensor([vocab[token] for token in ngrams_iterator(tokenizer(ex_text_str), NGrams)], device=device)

with torch.no_grad():
    outs = model(text, torch.tensor([0], device=device))

print(outs)
print(ag_news_label[outs.argmax().item() + 1])

tensor([[-2.8763,  6.8487, -2.4136, -1.6231]], device='cuda:0')
Sports
