# ENGR 8990 - Deep Learning & Engineering Applications
## Assignment 4 - Transformer for Sentiment Classification
In this assignment, you will code a transformer model for sentiment classification.

1.   Construct a transformer encoder (you could use the one in NB13) as the backbone and add a linear classifier for sentiment classification using the IMDB dataset (note: the vocab for IMDB is different from the NMT dataset used in NB13).

2.   Train the model and display the progress, showing performance metrics (e.g., loss and accuracy).

3.   Evaluate the trained model on the test dataset.

In [None]:
import torch
from torch import nn
import torch.nn.functional as f
import numpy as np
from collections import Counter
from torchtext.datasets import IMDB
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from d2l import torch as d2l
import math

In [None]:
# Load the IMDB train and test splits and display how many reviews each one holds.
train_iter, test_iter = IMDB(split=('train', 'test'))
len(train_iter), len(test_iter)

(25000, 25000)

In [None]:
# Peek at the first training example; each item is a (label, review text) pair.
next(train_iter)

('neg',
 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [None]:
# Count every token produced by the basic English tokenizer over a fresh pass
# of the training split, then build the vocabulary from those counts.
tokenizer = get_tokenizer('basic_english')
train_iter = IMDB(split='train')
counter = Counter(
    token
    for _label, review in train_iter
    for token in tokenizer(review)
)
# The special tokens are listed in the order <unk>, <BOS>, <EOS>, <PAD>.
vocab = Vocab(counter, min_freq=10, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))

In [None]:
# Report the number of entries in the vocabulary built above.
print("The length of the new vocab is", len(vocab))

The length of the new vocab is 20439


In [None]:
# Token -> index mapping of the vocabulary; look up the index of a special token.
new_stoi = vocab.stoi
print("The index of '<BOS>' is", new_stoi['<BOS>'])

The index of '<BOS>' is 1


In [None]:
# Index -> token list of the vocabulary; look up which token sits at index 2.
new_itos = vocab.itos
print("The token at index 2 is", new_itos[2])

The token at index 2 is <EOS>


In [None]:
def text_transform(x):
    """Tokenize the text ``x`` and return its vocab ids wrapped in <BOS> ... <EOS>."""
    token_ids = [vocab[token] for token in tokenizer(x)]
    return [vocab['<BOS>'], *token_ids, vocab['<EOS>']]


def label_transform(x):
    """Map the label 'pos' to 1 and any other label to 0."""
    if x == 'pos':
        return 1
    return 0


# Show the ids produced for a short sample sentence.
print("output of the text_transform:", text_transform("here is an example"))

output of the text_transform: [1, 134, 12, 43, 467, 2]


In [None]:
def collate_batch(batch):
    """Turn a list of (label, text) pairs into padded tensors for one batch.

    Args:
        batch: Iterable of ``(label, text)`` pairs as yielded by the dataset.

    Returns:
        A tuple ``(labels, texts)`` where ``labels`` is a 1-D tensor holding
        the output of ``label_transform`` for each item, and ``texts`` is the
        result of ``pad_sequence`` over the token-id tensors.  Because
        ``pad_sequence`` is called without ``batch_first``, ``texts`` is laid
        out as (longest sequence, batch).
    """
    labels = torch.tensor([label_transform(_label) for _label, _ in batch])
    texts = [torch.tensor(text_transform(_text)) for _, _text in batch]
    # Pad with the vocabulary's own <PAD> index instead of a hard-coded number,
    # so the padding stays correct if the special tokens are ever reordered.
    return labels, pad_sequence(texts, padding_value=vocab['<PAD>'])

# Materialise each split as a list so the DataLoader can index into it.
train_iter = IMDB(split='train')
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(list(train_iter), batch_size=128, shuffle=True,
                              collate_fn=collate_batch)

test_iter = IMDB(split='test')
# Evaluation metrics are aggregated over the whole split, so shuffling adds
# nothing; a fixed order keeps evaluation runs comparable.
test_dataloader = DataLoader(list(test_iter), batch_size=128, shuffle=False,
                             collate_fn=collate_batch)

In [None]:
# Fetch a single batch from the training loader and show its label tensor.
batch = next(iter(train_dataloader), None)
if batch is not None:
    print (batch[0])

tensor([1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
        1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
        0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
        1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0])


## Your code goes here:

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to batch-first embeddings.

    Adapted from
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html

    Args:
        d_model: Size of the last (embedding) dimension of the input.
        vocab_size: Number of positions to precompute, i.e. the longest
            sequence this module can encode.  Despite the name it is a maximum
            sequence length, not a vocabulary size; the name is kept so that
            existing callers keep working.
        dropout: Dropout probability applied after the encoding is added.
    """

    def __init__(self, d_model, vocab_size=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(vocab_size, d_model)
        position = torch.arange(0, vocab_size, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the angles to the number of cosine slots that actually exist.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Leading singleton dimension lets the table broadcast over the batch.
        pe = pe.unsqueeze(0)
        # A buffer follows the module across devices but is not trained.
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``x`` plus the position table, followed by dropout.

        Args:
            x: Tensor laid out as (batch, sequence, d_model); dimension 1 is
                used as the sequence length.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the {max_len} "
                "precomputed positions"
            )
        x = x + self.pe[:, :seq_len, :]
        return self.dropout(x)


In [None]:
class Net(nn.Module):
    """Text classifier built on a PyTorch ``nn.TransformerEncoder``.

    Token ids are embedded, combined with positional encodings, passed through
    a stack of encoder layers, averaged over the sequence and projected to a
    single raw score (logit) per example.

    Args:
        embeddings: Any sized object; only ``len(embeddings)`` is used, as the
            number of rows of the embedding table (the vocabulary size).
        nhead: Number of attention heads; must divide ``dmodel``.
        dmodel: Embedding / model dimension.
        dim_feedforward: Hidden size of each encoder layer's feed-forward part.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout used by the positional encoding and encoder layers.
        activation: Activation of the encoder layers' feed-forward part.
        classifier_dropout: Dropout applied just before the linear classifier.
    """

    def __init__(
        self,
        embeddings,
        nhead=8,
        dmodel=8,
        dim_feedforward=2048,
        num_layers=5,
        dropout=0.1,
        activation="relu",
        classifier_dropout=0.1,
    ):

        super().__init__()

        vocab_size = len(embeddings)
        assert dmodel % nhead == 0, "nheads must divide evenly into d_model"

        self.emb = nn.Embedding(vocab_size, dmodel)

        # PositionalEncoding uses its `vocab_size` argument as the number of
        # positions to precompute, so the longest supported sequence equals
        # the vocabulary size here.
        self.pos_encoder = PositionalEncoding(
            d_model=dmodel,
            dropout=dropout,
            vocab_size=vocab_size,
        )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dmodel,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            # Previously accepted by __init__ but never forwarded.
            activation=activation,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_layers,
        )
        self.classifier = nn.Linear(dmodel, 1)
        self.d_model = dmodel
        self.dropout = nn.Dropout(classifier_dropout)

    def forward(self, x):
        """Return one raw classification score per example.

        Args:
            x: Integer token ids laid out as (batch, sequence).

        Returns:
            Tensor of shape (batch, 1) holding unnormalised scores; no sigmoid
            is applied here.
        """
        x = self.emb(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        # The encoder was built with the default sequence-first layout, so it
        # attends along dimension 0.  Swap to (sequence, batch, d_model) for
        # the call and back afterwards; otherwise attention would run across
        # the different reviews of a batch instead of across tokens.
        x = self.transformer_encoder(x.transpose(0, 1)).transpose(0, 1)
        # NOTE: no padding mask is used, so padded positions take part in both
        # the attention and this mean over the sequence.
        x = x.mean(dim=1)
        x = self.dropout(x)
        x = self.classifier(x)

        return x


In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Probability above which a review is classified as positive.  It lives on
# `device` so it can be compared with model outputs without a device mismatch.
threshold = torch.tensor([0.5], device=device)

# Seed before the model is built so that the weight initialisation is
# reproducible as well as the batch order.
torch.manual_seed(0)

epochs = 10
model = Net(
    vocab,
    nhead=8,  # the number of heads in the multiheadattention models
    dmodel=8,
    dim_feedforward=2048,  # the dimension of the feedforward network model in nn.TransformerEncoder
    num_layers=5,
    dropout=0.2,
    classifier_dropout=0.2,
).to(device)

# The model returns one raw score (logit) per review.  BCEWithLogitsLoss
# applies the sigmoid itself, which keeps the loss differentiable with respect
# to the model parameters.
criterion = nn.BCEWithLogitsLoss()

lr = 1e-2
optimizer = torch.optim.Adam(
    (p for p in model.parameters() if p.requires_grad), lr=lr
)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
print("starting")


def xavier_init_weights(m):
    """Xavier-initialise the weights of ``nn.Linear`` and ``nn.GRU`` modules.

    Meant to be used as ``model.apply(xavier_init_weights)``; it is defined
    here but not applied by default.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
    if isinstance(m, nn.GRU):
        for param in m._flat_weights_names:
            if "weight" in param:
                nn.init.xavier_uniform_(m._parameters[param])


num_batches = len(train_dataloader)
for epoch in range(epochs):
    # Re-enable dropout every epoch in case an evaluation pass switched it off.
    model.train()
    print("Epoch: %3d" % (epoch))
    # Read the rate from the optimizer so scheduler reductions are visible.
    print("Learning Rate: ", optimizer.param_groups[0]["lr"])
    epoch_loss = 0.0
    epoch_correct = 0
    epoch_count = 0
    for idx, (Y, X) in enumerate(train_dataloader):
        # The collate function pads to (sequence, batch); the model expects
        # (batch, sequence).
        tokens = torch.transpose(X, 0, 1).to(device)
        # Float labels on the same device as the logits, as the loss requires.
        labels = Y.to(device).float()

        # Gradients accumulate by default, so clear them for every batch.
        optimizer.zero_grad()
        logits = model(tokens).squeeze(1)
        # The loss is taken on the raw logits; thresholding first would cut
        # the graph and leave the model without any gradient.
        loss = criterion(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        # Hard 0/1 predictions are used for the accuracy metric only.
        predictions = (torch.sigmoid(logits.detach()) > threshold).float()
        correct = predictions == labels
        batch_size = correct.size(0)
        batch_correct = correct.sum().item()
        acc = batch_correct / batch_size

        epoch_loss += loss.item() * batch_size
        epoch_correct += batch_correct
        epoch_count += batch_size

        print("Predicting on batch number %3d out of %3d" % (idx, num_batches))
        print("Accuracy on batch: ", acc*100)
        print("Loss on batch: ", loss.item())

    mean_epoch_loss = epoch_loss / max(epoch_count, 1)
    mean_epoch_acc = epoch_correct / max(epoch_count, 1)
    print("Epoch %3d summary - loss: %.4f, accuracy: %.2f%%"
          % (epoch, mean_epoch_loss, mean_epoch_acc * 100))
    # Step the plateau scheduler once per epoch on the mean loss; stepping on
    # every noisy batch loss would cut the learning rate far too quickly.
    scheduler.step(mean_epoch_loss)

starting
Epoch:   0
Learning Rate:  0.01
Predicting on batch number   0 out of 196
Accuracy on batch:  50.78125
Loss on batch:  tensor(49.2188, requires_grad=True)
Predicting on batch number   1 out of 196
Accuracy on batch:  46.09375
Loss on batch:  tensor(53.9062, requires_grad=True)
Predicting on batch number   2 out of 196
Accuracy on batch:  51.5625
Loss on batch:  tensor(48.4375, requires_grad=True)
Predicting on batch number   3 out of 196
Accuracy on batch:  51.5625
Loss on batch:  tensor(48.4375, requires_grad=True)
Predicting on batch number   4 out of 196
Accuracy on batch:  50.78125
Loss on batch:  tensor(49.2188, requires_grad=True)
Predicting on batch number   5 out of 196
Accuracy on batch:  47.65625
Loss on batch:  tensor(52.3438, requires_grad=True)


KeyboardInterrupt: 

In [None]:
# Evaluate the model on the test split: mean loss and accuracy over all reviews.
model.eval()  # switch off dropout for evaluation
test_epoch_loss = 0.0
test_epoch_correct = 0
test_epoch_count = 0
with torch.no_grad():
    for Y, X in test_dataloader:
        # The collate function pads to (sequence, batch); the model expects
        # (batch, sequence).
        tokens = torch.transpose(X, 0, 1).to(device)
        # Keep the labels on the model's device; casting through
        # torch.FloatTensor would silently move them back to the CPU.
        labels = Y.to(device).float()
        logits = model(tokens).squeeze(1)
        # Score the raw model outputs rather than the thresholded predictions,
        # summing per example so the final mean is exact for a short last batch.
        loss = f.binary_cross_entropy_with_logits(logits, labels, reduction="sum")
        predictions = (torch.sigmoid(logits) > threshold.to(logits.device)).float()
        correct = predictions == labels
        test_epoch_correct += correct.sum().item()
        test_epoch_count += correct.size(0)
        test_epoch_loss += loss.item()
print("Test epoch loss: %6.4f" % (test_epoch_loss / max(test_epoch_count, 1)))
print("Test epoch accuracy: %5.2f" % (test_epoch_correct / max(test_epoch_count, 1)))


Test epoch accuracy:  0.50
