#HW 7 - Programming (Transformers)

Before you start on this programming assignment, make a copy of this file by clicking on the File button on the top left corner (right below the file name) and select "Save a Copy in Drive". Work on that copy and upload your completed transformer.ipynb file to Gradescope.

Run **ALL** the cells in the notebook sequentially. Do **not** modify any other cells or the code may break.

Cells with blanks to be filled in are marked with "TODO"s and include comments explaining what needs to be filled in.

In [None]:
import torchtext
from torchtext.data import functional as textF
from torch import nn, Tensor
from typing import Tuple
import os
import math
import torch
from torch import Tensor
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

import torch.optim as optim
import torch.utils.data as data
import math
import copy

import time
from tqdm import trange, tqdm
import torch.nn.functional as F


import os
from tempfile import TemporaryDirectory

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Currently available device is: ", device)

print("Downloading the Wikitext dataset for pretraining")
url_wiki = 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip'
# download_from_url returns the location it saved the archive to; reuse that
# instead of a hard-coded Colab path so the cell also works when the working
# directory is not /content.
wiki_archive_path = torchtext.utils.download_from_url(url_wiki)
# Extract next to the notebook; later cells read ./wikitext-103/wiki.*.tokens.
torchtext.utils.extract_archive(wiki_archive_path, './')

## Train SentencePiece tokenizer on wikitext

In [None]:
# Training the tokenizer takes about 3 minutes.
# generate_sp_model writes "<prefix>.model" / "<prefix>.vocab" into the current
# working directory; its result was previously assigned to sp_model and then
# immediately overwritten, so it is no longer bound to a name.
sp_model_prefix = 'spm'
textF.generate_sp_model('./wikitext-103/wiki.train.tokens', vocab_size = 50000, model_type='bpe', model_prefix = sp_model_prefix)
# Load the model from the same directory it was written to (relative path,
# consistent with the relative corpus path above).
sp_model = textF.load_sp_model(f"./{sp_model_prefix}.model")

## Pretraining on Wikitext

In [None]:
class Corpus(object):
    """Token-id tensors for the WikiText train and validation splits.

    Reads ``wiki.train.tokens`` and ``wiki.valid.tokens`` from ``path`` and
    converts each into a flat 1-D ``torch.long`` tensor of SentencePiece ids.

    Args:
        path: Directory containing the ``wiki.*.tokens`` files.
        tokenizer_model: Loaded SentencePiece model handed to the torchtext
            tokenizer / numericalizer factories.
        max_tokens: Soft cap on the number of tokens read per split; reading
            stops after the first line that pushes the count above it.
    """

    def __init__(self, path, tokenizer_model, max_tokens=1000000):
        self.tokenizer = textF.sentencepiece_tokenizer(tokenizer_model)
        self.numericalizer = textF.sentencepiece_numericalizer(tokenizer_model)
        self.max_tokens = max_tokens
        self.train = self.numericalize(os.path.join(path, 'wiki.train.tokens'))
        self.valid = self.numericalize(os.path.join(path, 'wiki.valid.tokens'))

    def flatten_extend(self, matrix):
        """Return the items of every row of ``matrix`` as one flat list."""
        return [item for row in matrix for item in row]

    def numericalize(self, path, max_tok=None):
        """Convert the text file at ``path`` into a 1-D tensor of token ids.

        Args:
            path: Text file to read (UTF-8).
            max_tok: Optional override of the per-file token cap; defaults to
                the ``max_tokens`` value given to the constructor.

        Returns:
            A 1-D ``torch.long`` tensor with the ids of all lines read, in order.

        Raises:
            FileNotFoundError: If ``path`` does not exist.
        """
        # Explicit exception instead of ``assert`` so the check survives ``python -O``.
        if not os.path.exists(path):
            raise FileNotFoundError(f"Corpus file not found: {path}")
        if max_tok is None:
            max_tok = getattr(self, 'max_tokens', 1000000)

        token_ids = []
        with open(path, 'r', encoding="utf8") as f:
            # Stream the file line by line: only the first ~max_tok tokens are
            # needed, so the whole corpus is never loaded into memory.
            for line in tqdm(f):
                # The numericalizer maps a batch of lines to one id list per line.
                toks = list(self.numericalizer([line]))[0]
                token_ids.extend(toks)
                if len(token_ids) > max_tok:
                    break
        # Explicit dtype keeps an empty file from producing a float tensor.
        return torch.tensor(token_ids, dtype=torch.long)

def random_batch_sampler(tokens, device, batch_size, seq_len):
    """Endlessly yield batches of randomly positioned token windows.

    Each batch is a ``(batch_size, seq_len)`` tensor whose rows are contiguous
    slices of ``tokens`` starting at uniformly drawn offsets, moved to
    ``device``. The generator never terminates.
    """
    # Exclusive upper bound for a start offset so every window fits in ``tokens``.
    start_limit = tokens.shape[0] - seq_len + 1
    while True:
        offsets = torch.randint(0, start_limit, (batch_size,)).tolist()
        windows = [tokens[begin:begin + seq_len] for begin in offsets]
        yield torch.stack(windows).to(device)


def sequential_batch_sampler(tokens, device, batch_size, seq_len):
    """Yield consecutive, non-overlapping ``(batch_size, seq_len)`` batches.

    ``tokens`` is consumed front to back in chunks of ``batch_size * seq_len``
    ids; a trailing remainder too short for a full batch is dropped. Each
    batch is moved to ``device`` before being yielded.
    """
    tokens_per_batch = batch_size * seq_len
    last_start = tokens.shape[0] - tokens_per_batch

    for begin in range(0, last_start + 1, tokens_per_batch):
        chunk = tokens[begin:begin + tokens_per_batch]
        yield chunk.view(batch_size, seq_len).to(device)

In [None]:
# Numericalize wiki.train.tokens / wiki.valid.tokens from the extracted
# WikiText directory with the SentencePiece model loaded above.
corpus = Corpus("/content/wikitext-103", sp_model)

In [None]:
batch_size = 32  # sequences per batch
seq_len = 65  # tokens per sequence; compute_loss shifts by one, leaving 64 prediction targets

# Endless generator of randomly positioned training windows.
train_sampler = random_batch_sampler(corpus.train, device, batch_size, seq_len)
# Single sequential pass over the validation tokens.
# NOTE(review): evaluate() builds its own sampler on every call, so this
# module-level one looks unused in the visible code — confirm before removing.
val_sampler = sequential_batch_sampler(corpus.valid, device, batch_size, seq_len)

### Model Definition

This cell contains the implementation for the Multihead attention for the transformer.

Lines which need to be filled in are marked with **TODOs** with comments to explain the functionality to be implemented.

**General hints**

Make sure all of the data and the model weights (layers) are on the same device. If they are not, use an appropriate method to move them.

Make sure to use the appropriate dimensions while instantiating layers

In [None]:
# FILL IN THIS BLOCK OF CODE

class MultiHeadAttention(nn.Module):
    """Multi-head attention with separate query/key/value/output projections.

    The model dimension is split evenly across ``num_heads`` heads of size
    ``d_k = d_model // num_heads``. The query/key/value projections and the
    attention computation are left as TODO placeholders (``...``) to be
    filled in by the student.
    """

    def __init__(self, d_model, num_heads):
        """Store the head geometry and create the projection layers.

        Args:
            d_model: Model dimension; must be divisible by ``num_heads``.
            num_heads: Number of attention heads.
        """
        super(MultiHeadAttention, self).__init__()
        # Ensure that the model dimension (d_model) is divisible by the number of heads
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        # Initialize dimensions
        self.d_model = d_model # Model's dimension
        self.num_heads = num_heads # Number of attention heads
        self.d_k = d_model // num_heads # Dimension of each head's key, query, and value

        # Linear layers for transforming inputs

        '''
        TODO: INSTANTIATE LINEAR LAYERS FOR CREATING QUERY, KEY AND VALUE VECTORS (Check suitable layer in torch.nn) FROM THE INPUT VECTOR.
        Think about what the input and output dimensions must be for this layer.
        '''

        self.W_q = ... # Query transformation
        self.W_k = ... # Key transformation
        self.W_v = ... # Value transformation

        # The output projection is placed on the module-level `device` at construction time.
        self.W_o = nn.Linear(d_model, d_model).to(device) # Output transformation

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # Attention over per-head tensors: scores from Q and K, optional
        # masking, softmax over the last dimension, then a weighted
        # combination of V. The score and output steps are TODO placeholders.
        # assumes Q, K, V are (batch, num_heads, seq, d_k) as produced by split_heads — TODO confirm
        # Calculate attention scores

        '''
        TODO: IMPLEMENT THE ATTENTION SCORE CALCULATION FROM THE QUERY AND KEY VECTORS
        Make sure to multiply the Query vector with the transposed version of the Key vector, to calculate the attention score.
        Use a suitable function from the torch library for this
        '''

        attn_scores = ...

        # Apply mask if provided (useful for preventing attention to certain parts like padding)
        # Entries where mask == 0 get a large negative score, so softmax gives them ~0 weight.
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

        # Softmax is applied to obtain attention probabilities
        attn_probs = torch.softmax(attn_scores, dim=-1)

        '''
        TODO: Multiply attn_probs by values to obtain the final output
        Use a suitable function from the torch library for this
        '''

        output = ...
        return output

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, num_heads, seq, d_k)``."""
        # Reshape the input to have num_heads for multi-head attention
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of ``split_heads``: ``(batch, num_heads, seq, d_k)`` back to ``(batch, seq, d_model)``."""
        # Combine the multiple heads back to original shape
        batch_size, _, seq_length, d_k = x.size()
        # contiguous() is needed because view() cannot operate on the transposed layout.
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        """Project the inputs, attend per head, then merge heads and project the result.

        Args:
            Q, K, V: Inputs to the query/key/value projections; each must be
                3-D ``(batch, seq, d_model)`` after projection, as required
                by ``split_heads``.
            mask: Optional mask forwarded to the attention computation.

        Returns:
            The output projection ``W_o`` applied to the recombined heads.
        """
        # Apply linear transformations and split heads
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        # Perform scaled dot-product attention

        '''
        TODO: COMPUTE THE ATTENTION OUTPUT USING THE QUERY, KEY AND VALUE VECTORS USING THE SUITABLE HELPER FUNCTION FROM THIS CLASS
        '''

        attn_output = ...

        # Combine heads and apply output transformation
        output = self.W_o(self.combine_heads(attn_output))
        return output

class PositionWiseFeedForward(nn.Module):
    """Feed-forward sub-layer: Linear(d_model -> d_ff), ReLU, Linear(d_ff -> d_model).

    All sub-modules are placed on the module-level ``device`` at construction.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        # Layers are created in the order fc1, fc2, relu so that parameter
        # initialisation and the state_dict layout match saved checkpoints.
        expand = nn.Linear(d_model, d_ff)
        self.fc1 = expand.to(device)
        project = nn.Linear(d_ff, d_model)
        self.fc2 = project.to(device)
        activation = nn.ReLU()
        self.relu = activation.to(device)

    def forward(self, x):
        """Apply the two linear maps with a ReLU in between."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)


class PositionalEncoding(nn.Module):
    """Learned positional embeddings added to the input, followed by dropout.

    Args:
        embed_dim: Size of each position vector (must match the input's last dim).
        max_len: Number of distinct positions in the embedding table.
        dropout: Dropout probability applied to the summed result.
    """

    def __init__(self, embed_dim, max_len=5000, dropout=0.1):
        super().__init__()
        # One trainable vector per position index.
        self.encoding = nn.Embedding(num_embeddings=max_len, embedding_dim=embed_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return ``dropout(x + position_embeddings)`` for ``x`` of shape (N, S, D)."""
        batch, steps, _ = x.shape
        # Position ids 0..S-1, repeated for every sequence in the batch.
        position_ids = torch.arange(steps).unsqueeze(0).expand(batch, steps).to(x.device)
        return self.dropout(x + self.encoding(position_ids))

class DecoderLayer(nn.Module):
    """One Transformer block: self-attention followed by a feed-forward network.

    Each sub-layer's output goes through dropout, is added back to its input
    (residual connection) and is then passed through a normalization layer
    (``norm1`` / ``norm2``). The attention layer, both normalization layers and
    the attention call in ``forward`` are TODO placeholders (``...``).
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        """Create the sub-layers.

        Args:
            d_model: Model dimension.
            num_heads: Number of attention heads for the self-attention layer.
            d_ff: Hidden size of the position-wise feed-forward network.
            dropout: Dropout probability applied to each sub-layer output.
        """
        super(DecoderLayer, self).__init__()

        '''
        #TODO: INSTANTIATE A MULTIHEADATTENTION LAYER FOR THE GIVEN d_model AND num_heads
        #Use the MultiHeadAttention class for this
        '''

        self.self_attn = ...

        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)

        '''
        #TODO: IMPLEMENT THE LAYERNORM USED IN ATTENTION CALCULATION (Check suitable layer in torch.nn)
        #This layer implements the layer normalization operation explained in the lecture
        '''

        self.norm1 = ...
        self.norm2 = ...

        # A single dropout module is shared by both residual branches.
        self.dropout = nn.Dropout(dropout).to(device)

    def forward(self, x, mask):
        # Runs the block on x; `mask` is meant to be forwarded to the
        # self-attention layer (callers in this file pass either a
        # lower-triangular matrix or None).

        '''
        TODO: COMPUTE SELF ATTENTION ON THE GIVEN INPUT "x" AND "mask" USING THE "self_attn" layer
        '''
        attn_output = ...

        # Residual connection + normalization around the attention sub-layer.
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        # Residual connection + normalization around the feed-forward sub-layer.
        x = self.norm2(x + self.dropout(ff_output))
        return x

class Transformer(nn.Module):
    """Causally masked Transformer language model built from ``DecoderLayer`` blocks.

    Args:
        src_vocab_size: Number of rows in the input token embedding table.
        tgt_vocab_size: Number of output logits per position.
        d_model: Model dimension.
        num_heads: Attention heads per layer.
        num_layers: Number of stacked ``DecoderLayer`` blocks.
        d_ff: Hidden size of each feed-forward sub-layer.
        max_seq_length: Number of positions in the positional embedding table.
        dropout: Dropout probability.
        device: Device to place the model on. ``None`` (the default) selects
            CUDA when it is available and the CPU otherwise, so the model can
            also be constructed on CPU-only machines.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, device=None):
        super(Transformer, self).__init__()
        if device is None:
            # Previously hard-coded to 'cuda', which fails without a GPU.
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.embedding = nn.Embedding(src_vocab_size, d_model).to(device)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size).to(device)
        self.dropout = nn.Dropout(dropout).to(device)

        self.device = device
        # Move every sub-module (including those built on another device) to `device`.
        self.to(device)

    def generate_mask(self, src):
        """Return a lower-triangular ``(S, S)`` mask of ones for ``src`` of shape ``(..., S)``.

        Entry ``[i, j]`` is 1 when position ``i`` may attend to position ``j``
        (``j <= i``) and 0 otherwise. The mask is created directly on
        ``src``'s device so it always matches the activations, even if the
        model was moved after construction.
        """
        seq_length = src.shape[-1]
        return torch.tril(torch.ones(seq_length, seq_length, device=src.device))

    def forward(self, src):
        """Map token ids ``(batch, seq)`` to logits ``(batch, seq, tgt_vocab_size)``."""
        src_mask = self.generate_mask(src)
        # NOTE(review): positional_encoding already applies its own dropout, so
        # dropout is applied twice here; kept as-is to match the reference behaviour.
        src_embedded = self.dropout(self.positional_encoding(self.embedding(src)))
        enc_output = src_embedded
        for layer in self.layers:
            enc_output = layer(enc_output, src_mask)

        return self.fc(enc_output)

## Pretraining (WikiText)

In [None]:
#HYPERPARAMETERS FOR PRETRAINING

src_vocab_size = 50000  # matches the SentencePiece vocab_size used to train the tokenizer
tgt_vocab_size = 50000
d_model = 300
num_heads = 2
num_layers = 3
d_ff = 300
max_seq_length = 65  # equals seq_len of the batch samplers
dropout = 0.1

# Pass the detected device explicitly: the class default is 'cuda', which
# fails on machines without a GPU even though `device` falls back to the CPU.
model = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, device=device)

# NOTE(review): compute_loss calls F.cross_entropy directly, so `criterion`
# looks unused by the visible pretraining code; kept for compatibility.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
def compute_loss(data, logits):
    """Next-token cross-entropy between ``logits`` and the shifted ``data``.

    Args:
        data: Token ids of shape ``(batch, seq)``.
        logits: Model outputs of shape ``(batch, seq, vocab)``.

    Returns:
        Scalar tensor: mean cross-entropy over all ``batch * (seq - 1)`` targets.
    """
    vocab_size = logits.size(-1)
    # The output at position t predicts token t + 1, so the last prediction
    # and the first token have no counterpart and are dropped.
    predictions = logits[:, :-1].reshape(-1, vocab_size)
    targets = data[:, 1:].reshape(-1)
    return F.cross_entropy(predictions, targets)

def train(model: nn.Module, log_interval: int = 2000) -> float:
    """Run one pass of language-model pretraining and return the mean loss.

    Uses the module-level ``corpus``, ``batch_size``, ``train_sampler``,
    ``optimizer`` and ``device``. The pass consists of
    ``len(corpus.train) // batch_size`` optimisation steps on randomly
    sampled batches.

    Args:
        model: Model to optimise; called with a batch of token ids.
        log_interval: Number of steps between progress reports.

    Returns:
        Mean training loss over all steps of the pass.

    Raises:
        ValueError: If the corpus is too small to provide a single step.
    """
    num_steps = len(corpus.train) // batch_size
    if num_steps == 0:
        raise ValueError("corpus.train has fewer tokens than batch_size; nothing to train on")

    model.train()  # turn on train mode
    iter_start_time = time.time()
    total_loss = 0.
    iters_loss = 0.

    # `cnt` is the number of completed steps (1-based), so every report
    # averages exactly `log_interval` losses. Previously the first window
    # accumulated one extra step before being divided by the interval.
    for cnt in trange(1, num_steps + 1):
        data = next(train_sampler).to(device)
        output = model(data)

        loss = compute_loss(data, output)

        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to keep updates stable.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        step_loss = loss.item()  # single host sync per step
        total_loss += step_loss
        iters_loss += step_loss
        if cnt % log_interval == 0:
            train_loss = iters_loss / log_interval
            train_ppl = math.exp(train_loss)
            elapsed = time.time() - iter_start_time
            print('\n' + '-' * 100)
            print(f'| iteration {cnt:5d} | time elapsed : {elapsed:5.2f}s | '
                f'train loss {train_loss:5.3f} | train perplexity {train_ppl:8.3f} | ')
            print('-' * 100)
            iters_loss = 0.

    return total_loss / num_steps

def evaluate(model: nn.Module) -> float:
    """Return the mean next-token loss of ``model`` over the validation split.

    Reads the module-level ``corpus``, ``batch_size``, ``seq_len`` and
    ``device``. The model is switched to evaluation mode and no gradients
    are recorded.
    """
    model.eval()  # turn on evaluation mode
    loss_sum = 0.
    batches_seen = 0
    # A fresh sequential sampler on every call, so each evaluation covers the
    # validation tokens from the start.
    batches = sequential_batch_sampler(corpus.valid, device, batch_size, seq_len)
    with torch.no_grad():
        for batch in tqdm(batches, desc="Evaluating.."):
            logits = model(batch.to(device))
            loss_sum += compute_loss(batch, logits).item()
            batches_seen += 1

    return loss_sum / batches_seen

In [None]:
# START TRAINING. The reference implementation takes around 1 hour to complete training.

epoch_start_time = time.time()
train_loss = train(model)
train_ppl = math.exp(train_loss)  # perplexity = exp(mean cross-entropy)
val_loss = evaluate(model)
val_ppl = math.exp(val_loss)
elapsed = time.time() - epoch_start_time

print('-' * 116)
# Spelling of "elapsed" corrected in the summary line.
print(f'| end of epoch | time elapsed: {elapsed:5.2f}s | '
    f'train loss {train_loss:5.3f} | train perplexity {train_ppl:8.3f} | '
    f'valid loss {val_loss:5.3f} | valid perplexity {val_ppl:8.3f} |')
print('-' * 116)

#SAVE THE PRETRAINED MODEL AS "pretrained.pt"

print("saving the pretrained model")
# Only the weights (state_dict) are saved, relative to the working directory.
torch.save(model.state_dict(), "pretrained.pt")

In [None]:
# Show the module hierarchy of the pretrained model.
print(model)

## Finetuning (Sentiment Analysis)


In [None]:
import nltk
# Fetch the NLTK movie_reviews corpus used for the sentiment task.
nltk.download('movie_reviews')

from nltk.corpus import movie_reviews
from torch.nn import functional as F
import random

# One (review_text, category) pair per file. Files are grouped by category, in
# the order returned by movie_reviews.categories(); train_test_split takes
# samples from both ends of this list, so it relies on that grouping.
# NOTE(review): presumably the categories are ['neg', 'pos'] — confirm.
documents = [(' '.join(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [None]:
def train_test_split(documents, num_samples):
    """Split ``documents`` into a balanced training set and a test set.

    ``num_samples // 2`` items are taken from the front of ``documents`` and
    the same number from the back; everything in between becomes the test
    set. Both returned lists are shuffled in place. ``documents`` itself is
    not modified.

    Assumes ``documents`` is ordered with one class first and the other
    last, as the ``neg``/``pos`` naming of the original code suggests —
    TODO confirm against how ``documents`` is built.

    Args:
        documents: Sequence of samples (a list, so it can be sliced and concatenated).
        num_samples: Total number of training samples requested.

    Returns:
        ``(train, test)`` lists.

    Raises:
        ValueError: If ``num_samples`` is negative.
    """
    if num_samples < 0:
        raise ValueError("num_samples must be non-negative")

    per_class = num_samples // 2
    # Compute the start of the tail explicitly: with per_class == 0 the slice
    # documents[-0:] would select the WHOLE list, putting every document in
    # the training set and leaving the test set empty.
    tail_start = max(len(documents) - per_class, 0)

    train = documents[:per_class] + documents[tail_start:]
    test = documents[per_class:tail_start]
    random.shuffle(train)
    random.shuffle(test)
    return train, test

# Fixed number of tokens per review: collate_fn zero-pads shorter reviews and
# truncates longer ones to this length.
sentiment_maxtokens = 64

class TextDataset(torch.utils.data.Dataset):
    """Dataset of numericalized reviews with binary sentiment labels.

    ``input_data`` is an iterable of ``(text, category)`` pairs; a category of
    ``'pos'`` maps to label 1 and anything else to 0. All texts are converted
    to SentencePiece ids once, at construction time.
    """

    def __init__(self, input_data, tokenizer_model):
        self.text = [text for text, _ in input_data]
        self.label = [int(tag == 'pos') for _, tag in input_data]
        self.tokenizer = textF.sentencepiece_tokenizer(tokenizer_model)
        self.numericalizer = textF.sentencepiece_numericalizer(tokenizer_model)
        # Materialize the numericalizer output so items can be indexed later.
        self.numerictext = list(self.numericalizer(self.text))

    def __len__(self):
        return len(self.label)

    def get_sequence_token(self, idx):
        """Return ``(token_ids, number_of_tokens)`` for sample ``idx``."""
        token_ids = self.numerictext[idx]
        return token_ids, len(token_ids)

    def get_labels(self, idx):
        """Return the integer label of sample ``idx``."""
        return self.label[idx]

    def __getitem__(self, idx):
        """Return ``(token_ids, label, number_of_tokens)`` for sample ``idx``."""
        token_ids, n_tokens = self.get_sequence_token(idx)
        return token_ids, self.get_labels(idx), n_tokens

def collate_fn(batch):
    """Collate ``(token_ids, label, length)`` samples into fixed-width tensors.

    Every sequence is cut to at most ``sentiment_maxtokens`` ids and
    right-padded with zeros to exactly that width.

    Returns:
        ``(sequences, labels)``: a ``(batch, sentiment_maxtokens)`` long
        tensor and a ``(batch,)`` long tensor.
    """
    sequences, labels, _ = zip(*batch)
    padded = torch.zeros((len(batch), sentiment_maxtokens), dtype = torch.long)

    for row, token_ids in enumerate(sequences):
        # Slicing is a no-op for short sequences and truncates long ones.
        clipped = token_ids[:sentiment_maxtokens]
        padded[row][:len(clipped)] = torch.tensor(clipped, dtype = torch.long)

    return padded, torch.tensor(labels, dtype=torch.long)

### Finetuning without pretraining

Train the model for the finetuning task for the different train dataset sizes for 20 epochs (The plot must be submitted as mentioned in the writeup)



In [None]:
#HYPERPARAMETERS FOR FINETUNING TASK WITHOUT PRETRAINING

src_vocab_size = 50000
tgt_vocab_size = 50000
d_model = 300
num_heads = 2
num_layers = 3
d_ff = 300
max_length = 256  # passed to NoPretrainingTransformer as the positional table size
lr = 1e-4
batch_size = 32  # NOTE: rebinds the module-level batch_size used by the pretraining cells
finetune_epochs = 20

#VARIOUS TRAINING SET SIZES FOR FINETUNING TASK
'''
#TODO: PLOT THE TRAINING ACCURACY vs NUM EPOCHS FOR DIFFERENT TRAINING SET SIZES
'''
# Each entry is the total number of training samples requested from train_test_split.
sizes = [16, 32, 64, 128, 256, 512]

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm

#Transformer architecture for the finetuning task without pretraining

class NoPretrainingTransformer(nn.Module):
    """Transformer classifier trained from scratch for the sentiment task.

    Token ids are embedded, combined with positional embeddings, passed
    through ``num_layers`` ``DecoderLayer`` blocks without a mask, and the
    representation at the last sequence position is mapped to ``num_classes``
    logits.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Model dimension.
        d_ff: Hidden size of each feed-forward sub-layer.
        num_heads: Attention heads per layer.
        num_layers: Number of stacked ``DecoderLayer`` blocks.
        max_length: Number of positions in the positional embedding table.
        num_classes: Number of output classes.
        dropout: Dropout probability handed to each ``DecoderLayer``.
        device: Device to place the model on. ``None`` (the default) selects
            CUDA when it is available and the CPU otherwise, so the model can
            also be constructed on CPU-only machines.
    """

    def __init__(self, vocab_size, d_model, d_ff, num_heads, num_layers, max_length, num_classes = 2, dropout = 0.1, device = None):
        super().__init__()
        if device is None:
            # Previously hard-coded to 'cuda', which fails without a GPU.
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.embedding = nn.Embedding(vocab_size, d_model).to(device)
        self.positional_encoding = PositionalEncoding(d_model, max_len = max_length)
        self.fc = nn.Linear(d_model, num_classes)
        self.layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.device = device
        # Move every sub-module (including those built on another device) to `device`.
        self.to(device)

    def forward(self, src):
        """Map token ids ``(batch, seq)`` to class logits ``(batch, num_classes)``."""
        embedded = self.embedding(src)
        output = self.positional_encoding(embedded)
        for layer in self.layers:
            # No mask: every position may attend to every other position.
            output = layer(output, None)
        # Classify from the last position only.
        # NOTE(review): for inputs shorter than the padded width this position
        # holds padding — confirm this is intended.
        output = output[:, -1]
        output = self.fc(output)
        return output

def train_classifier(model, dataset, sp_model, epochs, lr, bs):
    """Train ``model`` as a sentiment classifier and print per-epoch metrics.

    Args:
        model: Module mapping a ``(batch, seq)`` id tensor to ``(batch, classes)`` logits.
        dataset: Iterable of ``(text, category)`` pairs.
        sp_model: SentencePiece model used to numericalize the texts.
        epochs: Number of passes over ``dataset``.
        lr: Adam learning rate.
        bs: Batch size.

    Prints, after every epoch, the mean per-sample loss and the accuracy on
    the training data.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), lr=lr)
    train_dataset = TextDataset(dataset, sp_model)
    train_dataloader = DataLoader(train_dataset, num_workers=1, batch_size=bs, collate_fn=collate_fn, shuffle=True)

    # Make sure dropout is active even if the model was evaluated beforehand.
    model.train()

    # Training loop
    for epoch in range(epochs):
        total_loss_train = 0.
        total_acc_train = 0
        for train_sequence, labels in tqdm(train_dataloader):
            labels = labels.to(device)

            # Gradients accumulate across backward() calls; without this reset
            # every step would apply the sum of all previous batches' gradients.
            optimizer.zero_grad()
            predictions = model(train_sequence.to(device))
            loss = criterion(predictions, labels)

            # Calculate accuracy and loss per batch
            correct = predictions.argmax(dim=1) == labels
            total_acc_train += correct.sum().item()
            # criterion returns the batch MEAN; weight it by the batch size so
            # that dividing by the dataset size below gives a per-sample mean.
            total_loss_train += loss.item() * labels.size(0)

            # Backprop
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

        print(f'Epochs: {epoch + 1} | Loss: {total_loss_train / len(train_dataset): .3f} | Accuracy: {total_acc_train / len(train_dataset): .3f}')

def eval_classifier(model, dataset, sp_model, bs):
    """Return the classification accuracy of ``model`` on ``dataset``.

    Args:
        model: Module mapping a ``(batch, seq)`` id tensor to ``(batch, classes)`` logits.
        dataset: Iterable of ``(text, category)`` pairs.
        sp_model: SentencePiece model used to numericalize the texts.
        bs: Batch size.

    Returns:
        Fraction of samples whose arg-max prediction equals the label.
    """
    test_dataset = TextDataset(dataset, sp_model)
    test_dataloader = DataLoader(test_dataset, num_workers=1, batch_size=bs, collate_fn=collate_fn, shuffle=False)

    # Evaluate deterministically: without eval() the dropout layers stay
    # active and randomly perturb the predictions. The previous mode is
    # restored afterwards so callers can keep training.
    was_training = model.training
    model.eval()
    total_correct = 0
    try:
        with torch.no_grad():
            for sequence, labels in tqdm(test_dataloader):
                predictions = model(sequence.to(device))
                labels = labels.to(device)
                correct = predictions.argmax(dim=1) == labels
                total_correct += correct.sum().item()
    finally:
        model.train(was_training)

    return total_correct / len(test_dataset)

In [None]:
#Train loop
train_sets = {}  # training split per training-set size
models = {}  # trained model per training-set size
for sz in sizes:
    # test_set is rebound on every iteration, so after the loop it holds the
    # split produced for the LAST (largest) size; the evaluation loop below
    # uses that single test set for every model.
    train_sets[sz], test_set = train_test_split(documents, sz)
    # A freshly initialised model for each size.
    models[sz] = NoPretrainingTransformer(src_vocab_size, d_model, d_ff, num_heads, num_layers, max_length)
    print('-' * 56)
    print(f'Training on {sz} samples')
    print('-' * 56)
    train_classifier(models[sz], train_sets[sz], sp_model, finetune_epochs, lr, batch_size)

#Evaluation loop
# Gradients are not needed for evaluation.
with torch.no_grad():
    for sz in models:
        print('-' * 56)
        print(f'Evaluating model trained on {sz} samples')
        print('-' * 56)
        test_accuracy = eval_classifier(models[sz], test_set, sp_model, batch_size)
        print("\n Accuracy on test set = %.3f" % test_accuracy)

### Finetuning with pretraining

Train the model for the finetuning task for the different train dataset sizes for 20 epochs (The plot must be submitted as mentioned in the writeup)


In [None]:
def train_classifier(model, dataset, sp_model, epochs, lr, bs):
    """Finetune a per-position model as a sentiment classifier.

    ``model`` returns one output vector per sequence position; the vector at
    the last position is used as the class logits.

    Args:
        model: Module mapping a ``(batch, seq)`` id tensor to
            ``(batch, seq, classes)`` outputs.
        dataset: Iterable of ``(text, category)`` pairs.
        sp_model: SentencePiece model used to numericalize the texts.
        epochs: Number of passes over ``dataset``.
        lr: Adam learning rate.
        bs: Batch size.

    Prints, after every epoch, the mean per-sample loss and the accuracy on
    the training data.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), lr=lr)
    train_dataset = TextDataset(dataset, sp_model)
    train_dataloader = DataLoader(train_dataset, num_workers=1, batch_size=bs, collate_fn=collate_fn, shuffle=True)

    # Make sure dropout is active even if the model was evaluated beforehand.
    model.train()

    # Training loop
    for epoch in range(epochs):
        total_loss_train = 0.
        total_acc_train = 0
        for train_sequence, labels in tqdm(train_dataloader):
            labels = labels.to(device)

            # Gradients accumulate across backward() calls; without this reset
            # every step would apply the sum of all previous batches' gradients.
            optimizer.zero_grad()
            predictions = model(train_sequence.to(device))
            # Use the output at the last sequence position as class logits.
            preds = predictions[:, -1]
            loss = criterion(preds, labels)

            # Calculate accuracy and loss per batch
            correct = preds.argmax(dim=1) == labels
            total_acc_train += correct.sum().item()
            # criterion returns the batch MEAN; weight it by the batch size so
            # that dividing by the dataset size below gives a per-sample mean.
            total_loss_train += loss.item() * labels.size(0)

            # Backprop
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()

        print(f'Epochs: {epoch + 1} | Loss: {total_loss_train / len(train_dataset): .3f} | Accuracy: {total_acc_train / len(train_dataset): .3f}')

In [None]:
lr = 1e-4
batch_size = 64
finetune_epochs = 20

#Train loop
train_sets = {}  # training split per training-set size
finetuned_models = {}  # finetuned model per training-set size
for sz in sizes:
    # test_set is rebound on every iteration; after the loop it holds the
    # split produced for the last (largest) size.
    train_sets[sz], test_set = train_test_split(documents, sz)
    # Build on the detected device instead of the class default ('cuda').
    pretrained = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout, device=device)
    # Load from the same relative path the pretraining cell saved to, and map
    # the tensors onto the current device so a GPU-saved checkpoint also loads
    # on a CPU-only machine.
    pretrained.load_state_dict(torch.load('pretrained.pt', map_location=device))
    finetuned_models[sz] = pretrained
    # Replace the vocabulary-sized output layer with a 2-class head.
    finetuned_models[sz].fc = nn.Linear(d_model, 2).to(device)
    print('-' * 56)
    # train_test_split(documents, sz) returns sz training samples in total,
    # so report sz (the message previously claimed 2 * sz).
    print(f'Training on {sz} samples')
    print('-' * 56)
    train_classifier(finetuned_models[sz], train_sets[sz], sp_model, finetune_epochs, lr, batch_size)

In [None]:
def eval_classifier(model, dataset, sp_model, bs):
    """Return the classification accuracy of a per-position model on ``dataset``.

    ``model`` returns one output vector per sequence position; the vector at
    the last position is used as the class logits.

    Args:
        model: Module mapping a ``(batch, seq)`` id tensor to
            ``(batch, seq, classes)`` outputs.
        dataset: Iterable of ``(text, category)`` pairs.
        sp_model: SentencePiece model used to numericalize the texts.
        bs: Batch size.

    Returns:
        Fraction of samples whose arg-max prediction equals the label.
    """
    test_dataset = TextDataset(dataset, sp_model)
    test_dataloader = DataLoader(test_dataset, num_workers=1, batch_size=bs, collate_fn=collate_fn, shuffle=False)

    # Evaluate deterministically: without eval() the dropout layers stay
    # active and randomly perturb the predictions. The previous mode is
    # restored afterwards so callers can keep training.
    was_training = model.training
    model.eval()
    total_correct = 0
    try:
        with torch.no_grad():
            for sequence, labels in tqdm(test_dataloader):
                predictions = model(sequence.to(device))
                # Use the output at the last sequence position as class logits.
                preds = predictions[:, -1]
                labels = labels.to(device)
                correct = preds.argmax(dim=1) == labels
                total_correct += correct.sum().item()
    finally:
        model.train(was_training)

    return total_correct / len(test_dataset)

In [None]:
#Evaluation loop
# Gradients are not needed for evaluation.
with torch.no_grad():
    for sz in finetuned_models:
        print('-' * 56)
        print(f'Evaluating model trained on {sz} samples')
        print('-' * 56)
        # test_set is whatever the training loop bound last, i.e. the split
        # created for the final entry of `sizes`.
        test_accuracy = eval_classifier(finetuned_models[sz], test_set, sp_model, batch_size)
        print("\n Accuracy on test set = %.3f" % test_accuracy)