# Name: Yichen Huang
# CUID: yh3550

In [1]:
import argparse
import logging
import time

import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import DATASETS
from torchtext.prototype.transforms import load_sp_model, PRETRAINED_SP_MODEL, SentencePieceTokenizer
from torchtext.utils import download_from_url
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torchtext.vocab import GloVe, FastText

### Information and Info
- torchtext repo: https://github.com/pytorch/text/tree/main/torchtext
- torchtext documentation: https://pytorch.org/text/stable/index.html
- Please FILL IN the parts that need to be filled in for this notebook.

### Constants

In [2]:
DATASET = "AG_NEWS"
DATA_DIR = ".data"
DEVICE = "cpu"
EMBED_DIM = 300
LR = 4.0
BATCH_SIZE = 16
NUM_EPOCHS = 5
PADDING_VALUE = 0
PADDING_IDX = PADDING_VALUE
PAD = '<pad>'
UNK = '<unk>'

# Fill this in with code to make this notebook run.
FILL_IN = "FILL IN"

### Get the tokenizer
- Different models tolenize in different ways. 
    - Word2Vec / GloVe does words (WordLevel).


In [3]:
# Get the basic english tokenizer using the get_tokenizer function.
basic_english_tokenizer = get_tokenizer("basic_english")

In [4]:
# Do not remove this.
assert(len(basic_english_tokenizer("This is some text ...")) == 7)

In [5]:
# Needed later.
TOKENIZER = basic_english_tokenizer

### Get the data and get the vocabulary.

In [6]:
# This function should loop over the (label, text) data pair and tokenize the text.
# It should yield a list of the tokens for each text.
def yield_tokens(data_iter):
    for _, text in data_iter:
        yield TOKENIZER(text)

In [7]:
train_iter = DATASETS[DATASET](root=DATA_DIR, split="train")
# Use build_vocab_from_iterator to build the vocabulary.
# This function should take yield_tokens.
# The special characters are PAD and UNK.
VOCAB = build_vocab_from_iterator(yield_tokens(train_iter), specials=[PAD, UNK])

# Make the default index the same as that of the unk_token.
VOCAB.set_default_index(VOCAB[UNK])

### Get GloVe vectors

Information about pretrained vectors: 
- https://pytorch.org/text/stable/_modules/torchtext/vocab/vectors.html#GloVe
- https://github.com/pytorch/text/blob/e3799a6eecef451f6e66c9c20b6432c5f078697f/torchtext/vocab/vectors.py#L263

In [8]:
# Set GLOVE to the name='840B' GloVe vectors of dimension 300. 
GLOVE = GloVe(name='840B', dim=300)

If the embeddings are not in the token space, a zero vector will be returned.

In [9]:
# Get the vectors for all the tokens in s = "Hello, How are you?"
# Look up "get_vecs_by_tokens" for GloVe vectors.
# Add an assertion checking that the dimensions of wat you get is dimension (???, 300).
assert(GLOVE.get_vecs_by_tokens("Hello, How are you?".split()).shape == (4, 300))

In [10]:
# Let s = ""<pad> <unk> the man Man ahsdhashdahsdhash".
# What are the vectors of each token. Print this below.
s = "<pad> <unk> the man Man ahsdhashdahsdhash"
GLOVE.get_vecs_by_tokens(s.split())

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.2720, -0.0620, -0.1884,  ...,  0.1302, -0.1832,  0.1323],
        [-0.1731,  0.2066,  0.0165,  ...,  0.1666, -0.3834, -0.0738],
        [-0.5131, -0.0228, -0.0271,  ..., -0.1723, -0.0968, -0.1644],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

### Helper functions

These functions tokenize the string input and then map each token to the integer representation in the vocabulary.

In [11]:
# Return for a sentence the int tokens for that sentence.
# I.e., you pass in "a b c d" and get out [1, 2, 3, 4].
def text_pipeline(text):
    return [VOCAB[token] for token in TOKENIZER(text)]

# Return the label starting at 0. I.e. map each label to fo from 0, not 2 or whatever it starts from.
def label_pipeline(label):
    return label - 1

Nice link on collate_fn and DataLoader in PyTorch: https://python.plainenglish.io/understanding-collate-fn-in-pytorch-f9d1742647d3

In [12]:
# As we loop through batches, this function gets applied to each raw batch.
def collate_batch(batch):
    label_list, text_list = [], []
    for i in batch:
        # Get the label from {1, 2, 3, 4} to {0, 1, 2, 3}
        # Append the label to the label_list.
        label_list.append(label_pipeline(i[0]))
                
        # Return a list of ints.
        # Get a torch tensor of the sentence, this sould be a tensor of torch.int64.
        processed_text = torch.tensor(text_pipeline(i[1]))
        text_list.append(processed_text.clone().detach())
    
    # Transform the label_list into a tensor. 
    label_list = torch.tensor(label_list)
    # Pad the list of text_list tensors so they all have the same length.
    # Use batch_first = True.
    # Use padding_valid = PADDING_VALUE
    text_list = pad_sequence(text_list, batch_first=True, padding_value=PADDING_VALUE)
            
    return label_list.to(DEVICE), text_list.to(DEVICE)

### Get the data

In [13]:
train_iter = DATASETS[DATASET](root=DATA_DIR, split="train")
# Get the number of classes.
class_ = set()
for i in train_iter:
    class_.add(i[0])
num_class = len(class_)
# What are the classes?
print(f"The number of classes is {num_class}")

The number of classes is 4


In [14]:
VOCAB.get_itos()[0]

'<pad>'

### Set up the model

In [15]:
# A more complicated model. We'll explore this after we learn word embeddings.


class TextClassificationModel(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_dim,
        num_class,
        initialize_with_glove = True,
        fine_tune_embeddings = True
    ):
        super(TextClassificationModel, self).__init__()
        # Set to an embedding of (vocab_size, embed_dim) size.
        # Use padding_idx = PADDING_IDX.
        # This is so we don't get gradients for padding tokens and use 0 as the vector for these.
        self.embedding = nn.Embedding(
            vocab_size,
            embed_dim,
            padding_idx=PADDING_IDX
        )
        
        if initialize_with_glove:
            # Turn off the gradient for the embedding weight as we are going to modify it. 
            with torch.no_grad(): 
                for i in range(vocab_size):
                    # Get the token index in VOCAB.
                    token = VOCAB.get_itos()[i]
                    
                    # Modify the embedding matrix to be the GloVe vector for this token.
                    self.embedding.weight[i, :] = GLOVE.get_vecs_by_tokens(token)
                torch.enable_grad()
                # Turn on the gradient after we modify it.
                # You could do this in another way by wrapping this in @torch.no_grad decorator.
            
        
        # No fine tuning means once you intialize, these are constant.
        if not fine_tune_embeddings:
            # Turn off the gradient for the embedding weight matric if you don't fine tune them.
            #torch.no_grad()
            self.embedding.requires_grad_ = False
        
        
        # Set fc to be a linear layer of dimension (embed_dim, num_class).
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text):
        # Get the embeddings for all tokens in the batch of text.
        embedded = self.embedding(text)
        # Across dimension 1, get the mean vector. This gets the mean vector per sentence in the batch.
        # Make sure you squeeze any dimension that's 1. This should be (N, d), where N is the batch dimension and d is the word vector dimension.
        embedded_sum = torch.mean(embedded, dim=1).squeeze(1)
        
        # Run through a linear layer self.fc and also apply ReLU.
        relu = nn.ReLU()
        logits = relu(self.fc(embedded_sum))
        return logits

### Set up the 

In [16]:
# Set to be the CrossEntropyLoss.
criterion = torch.nn.CrossEntropyLoss()
# Set model to the TextClassification model.
# Turn on intialize_with_glove ad fine_tune_embeddings.
model = TextClassificationModel(num_class=num_class, vocab_size=len(VOCAB), embed_dim=EMBED_DIM, initialize_with_glove=True, fine_tune_embeddings=True)

# Set the optimizer for SGD with learning rate LR. The parameters are model.parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# Schedule the learning rate decay to go down each epoch by 1/10.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.1)

### Set up the data

In [17]:
train_iter, test_iter = DATASETS[DATASET]()
# This puts things in a nice format.
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Set num_train to 95% length of train_dataset.
# This should be an integer.
num_train = int(0.95 * len(train_dataset))
# The array below should have 2 ints in it, num_train, and the 5% left over for validation.
split_train_, split_valid_ = random_split(train_dataset, [num_train,len(train_dataset) - num_train])

# Set to a DataLoader on the training data with batch_size BATCH_SIZE and specify collate_batch.
train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_batch)

### Train the model

In [18]:
def train(dataloader, model, optimizer, criterion, epoch):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500

    for idx, (label, text) in enumerate(dataloader):
        
        # Zero the gradients.
        optimizer.zero_grad()
        
        logits = model(text)
        
        # Get the loss.
        loss = criterion(logits.float(),label)
        
        # Do back propagation.
        loss.backward()
        
        # Clip the gradients at 0.1
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        
        # Do an optimization step.
        optimizer.step()
        # Get the accuracy for this batch.
        total_acc += (logits.argmax(1) == label).sum()
        # Get the number of rows in this batch. Use labels.
        total_count += len(label)
        
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(epoch, idx, len(dataloader), total_acc / total_count)
            )
            total_acc, total_count = 0, 0

In [19]:
for idx, (label, text) in enumerate(train_dataloader):
    print(label)
    break

tensor([1, 1, 1, 2, 2, 3, 2, 3, 3, 3, 1, 3, 1, 1, 1, 2])


In [20]:
def evaluate(dataloader, model):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            logits = model(text)
            total_acc += (logits.argmax(1) == label).sum() 
            total_count += len(label)
    return total_acc / total_count

In [21]:
for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, model, optimizer, criterion, epoch)
    accu_val = evaluate(valid_dataloader, model)
    scheduler.step()
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(epoch, time.time() - epoch_start_time, accu_val)
    )
    print("-" * 59)

print("Checking the results of test dataset.")
accu_test = evaluate(test_dataloader, model)
print("test accuracy {:8.3f}".format(accu_test))

| epoch   1 |   500/ 7125 batches | accuracy    0.296
| epoch   1 |  1000/ 7125 batches | accuracy    0.423
| epoch   1 |  1500/ 7125 batches | accuracy    0.533
| epoch   1 |  2000/ 7125 batches | accuracy    0.590
| epoch   1 |  2500/ 7125 batches | accuracy    0.600
| epoch   1 |  3000/ 7125 batches | accuracy    0.620
| epoch   1 |  3500/ 7125 batches | accuracy    0.623
| epoch   1 |  4000/ 7125 batches | accuracy    0.636
| epoch   1 |  4500/ 7125 batches | accuracy    0.813
| epoch   1 |  5000/ 7125 batches | accuracy    0.855
| epoch   1 |  5500/ 7125 batches | accuracy    0.861
| epoch   1 |  6000/ 7125 batches | accuracy    0.863
| epoch   1 |  6500/ 7125 batches | accuracy    0.872
| epoch   1 |  7000/ 7125 batches | accuracy    0.871
-----------------------------------------------------------
| end of epoch   1 | time: 121.23s | valid accuracy    0.881 
-----------------------------------------------------------
| epoch   2 |   500/ 7125 batches | accuracy    0.892
| epoch 

In [22]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x127f1d370>