# Imports

In [None]:
# Imports
import tqdm
import math
import torch
import random
import statistics
import requests
import tarfile
import os

import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F

from torch import nn
from pprint import pprint
from tqdm.notebook import tqdm
from datasets import load_metric
from datasets import load_dataset

# transformers library
from transformers import Trainer
from transformers import pipeline
from transformers import set_seed
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from transformers import AutoModelForSequenceClassification

# pytorch
from pytorch_pretrained_bert import BertTokenizer
from pytorch_pretrained_bert import BertForMaskedLM

# textattack
from textattack.transformations import WordSwapQWERTY
from textattack.transformations import WordSwapExtend
from textattack.transformations import WordSwapContract
from textattack.transformations import WordSwapHomoglyphSwap
from textattack.transformations import CompositeTransformation
from textattack.transformations import WordSwapRandomCharacterDeletion
from textattack.transformations import WordSwapNeighboringCharacterSwap
from textattack.transformations import WordSwapRandomCharacterInsertion
from textattack.transformations import WordSwapRandomCharacterSubstitution

%load_ext tensorboard

# Helper functions

In [None]:
def set_seed(seed=None, seed_torch=True):
    """Seed the Python, NumPy and (optionally) PyTorch random number generators.

    Args:
    seed (int, optional): seed to apply; when None a random 32-bit seed is
      drawn from NumPy's global generator
    seed_torch (bool): when True, also seed PyTorch (CPU and CUDA) and switch
      cuDNN to deterministic, non-benchmarking mode

    Returns:
    None; the seed that was applied is printed.
    """
    if seed is None:
        seed = np.random.choice(2 ** 32)
    # Normalise to a built-in int: `random.seed` rejects NumPy integer scalars
    # on Python >= 3.11, which made the `seed=None` path raise a TypeError.
    seed = int(seed)
    random.seed(seed)
    np.random.seed(seed)
    if seed_torch:
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.cuda.manual_seed(seed)
        # Trade cuDNN autotuning for run-to-run reproducibility
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    print(f'Random seed {seed} has been set.')


# In case that `DataLoader` is used
def seed_worker(worker_id):
    """Seed NumPy and Python's `random` from the current PyTorch initial seed.

    Args:
    worker_id (int): worker index; accepted for the `DataLoader`
      `worker_init_fn` signature but not used
    """
    # Fold the PyTorch seed into the 32-bit range accepted by NumPy
    derived_seed = torch.initial_seed() % (2 ** 32)
    random.seed(derived_seed)
    np.random.seed(derived_seed)
    
def set_device():
    """Return "cuda" when a CUDA device is available, otherwise "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

# Fix the global random seed once for the notebook and record the device
# string ("cuda" or "cpu") returned by set_device().
SEED = 2021
set_seed(seed=SEED)
DEVICE = set_device()

In [None]:
# build tokenized train/test loaders from an already loaded dataset
def load_yelp_data(DATASET, tokenizer):
    """Subsample, tokenize and wrap the dataset splits in data loaders.

    Args:
    DATASET: mapping with 'train' and 'test' splits exposing `select`, `map`
      and `set_format` (presumably a Hugging Face `DatasetDict` - confirm
      against the caller). NOTE: its 'train' and 'test' entries are replaced
      in place by the truncated splits.
    tokenizer: callable tokenizer with a `vocab_size` attribute

    Returns:
    train_loader: loader over at most 10000 training examples (batch size 32)
    test_loader: loader over at most 5000 test examples (batch size 32)
    max_len (int): padded sequence length of `input_ids`
    vocab_size (int): `tokenizer.vocab_size`
    num_classes (int): number of distinct labels
    """
    dataset = DATASET
    # Cap the subset sizes by the split lengths so small datasets also work
    dataset['train'] = dataset['train'].select(
        range(min(10000, len(dataset['train']))))
    dataset['test'] = dataset['test'].select(
        range(min(5000, len(dataset['test']))))
    dataset = dataset.map(lambda e: tokenizer(e['text'], truncation=True,
                                            padding='max_length'), batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'label'])

    train_loader = torch.utils.data.DataLoader(dataset['train'], batch_size=32)
    test_loader = torch.utils.data.DataLoader(dataset['test'], batch_size=32)

    vocab_size = tokenizer.vocab_size

    # `input_ids` batches are (batch, sequence); dimension 0 is the batch size,
    # so the sequence length has to be read from dimension 1.
    first_batch = next(iter(train_loader))
    max_len = first_batch['input_ids'].shape[1]

    # The number of classes is a property of the label column, not of the
    # batch: prefer the declared class count and fall back to the largest
    # label id seen in the training subset.
    label_feature = dataset['train'].features['label']
    num_classes = getattr(label_feature, 'num_classes', None)
    if num_classes is None:
        num_classes = max(int(batch['label'].max()) for batch in train_loader) + 1

    return train_loader, test_loader, max_len, vocab_size, num_classes


# Fetch the archive once; later runs reuse the local copy.
url = "https://osf.io/kthjg/download"
fname = "huggingface.tar.gz"

if not os.path.exists(fname):
    print('Dataset is downloading...')
    r = requests.get(url, allow_redirects=True)
    # Fail loudly on an HTTP error instead of caching the error page as the
    # archive (the existence check above would then never retry).
    r.raise_for_status()
    # Write to a temporary name and rename afterwards so an interrupted write
    # cannot leave a truncated file under the final name.
    partial_fname = fname + ".part"
    with open(partial_fname, 'wb') as fd:
        fd.write(r.content)
    os.replace(partial_fname, fname)
    print('Download is finished.')

In [None]:
# Load the Yelp review dataset (download_mode asks for an existing copy to be reused)
DATASET = load_dataset("yelp_review_full", download_mode="reuse_dataset_if_exists")
print(type(DATASET))

# Tokenize with the cased BERT tokenizer and build the train/test loaders
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
train_loader, test_loader, max_len, vocab_size, num_classes = load_yelp_data(DATASET, tokenizer)

# Keep test example 28 (raw text and label) and the first test batch for the
# prediction demo; load_yelp_data builds unshuffled loaders with batch_size=32,
# so example 28 is also row 28 of this batch.
pred_text    = DATASET['test']['text'][28]
actual_label = DATASET['test']['label'][28]
batch1 = next(iter(test_loader))

In [None]:
def transform_sentence_for_bert(sent, masked_word = "___"):
    """
    Wraps a sentence in BERT's special tokens and replaces its single
    placeholder with [MASK]. By default the placeholder is ___.

    Args:
    sent (str): an input sentence containing the placeholder exactly once
    masked_word (str): the placeholder that marks the masked position

    Returns:
    str: sentence that could be passed to BERT

    Raises:
    AssertionError: if the placeholder does not occur exactly once
    """
    # Split on the caller-supplied placeholder (previously "___" was hard-coded
    # here, so a custom `masked_word` was silently ignored).
    splitted = sent.split(masked_word)
    assert (len(splitted) == 2), f"Missing masked word. Make sure to mark it as {masked_word}"

    return '[CLS] ' + splitted[0] + "[MASK]" + splitted[1] + ' [SEP]'


def parse_text_and_words(raw_line, mask = "___"):
    """
    Replaces the first space-separated token that contains '/' with a mask
    and returns the alternatives that token listed.

    Input: The doctor picked up his/her bag
    Output: (The doctor picked up ___ bag, ['his', 'her'])

    Args:
    raw_line (str): a line in format 'some text option1/.../optionN some text'
    mask (str): the replacement for the option1/.../optionN token
    Returns:
    str: text with mask in place of the option token
    list: the options, in their original order
    """
    tokens = raw_line.split(' ')
    # Position of the first token holding alternatives, or -1 when there is none
    slot = next((pos for pos, token in enumerate(tokens) if "/" in token), -1)
    assert(slot != -1), "No '/'-separated words"
    options = tokens[slot].split('/')
    tokens[slot] = mask
    return " ".join(tokens), options


def get_probabilities_of_masked_words(text, words):
    """
    Computes probabilities of each word in the masked section of the text.

    Every candidate is represented by its first word piece only, so a word
    that the tokenizer splits into several pieces is scored by that first piece.

    Args:
    text (str): A sentence with ___ instead of a masked word.
    words (list): array of words. The list is not modified.
    Returns:
    list: predicted probabilities for given words.
    """
    text = transform_sentence_for_bert(text)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Build a new list instead of overwriting `words` in place, so the caller
    # keeps its original candidate strings.
    first_pieces = [tokenizer.tokenize(word)[0] for word in words]
    words_idx = tokenizer.convert_tokens_to_ids(first_pieces)

    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    masked_index = tokenized_text.index('[MASK]')
    tokens_tensor = torch.tensor([indexed_tokens])

    pretrained_masked_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    pretrained_masked_model.eval()

    # Predict all tokens
    with torch.no_grad():
        predictions = pretrained_masked_model(tokens_tensor)
    # Scores for the single sentence in the batch, at the masked position
    probabilities = F.softmax(predictions[0][masked_index], dim = 0)

    return [probabilities[ix].item() for ix in words_idx]

# Attention overview

## Queries, Keys, & Values

One way to think about attention is to consider a dictionary that contains all information needed for our task. Each entry in the dictionary contains some value and the corresponding key to retrieve it. For a specific prediction, we would like to retrieve relevant information from the dictionary. Therefore, we issue a query, match it to keys in the dictionary, and return the corresponding values.

Let's compute the scaled dot product attention using its matrix form. 

\begin{equation}
\mathrm{softmax} \left( \frac{Q K^\text{T}}{\sqrt{d}} \right) V
\end{equation}

where $Q$ denotes the queries, $K$ the keys, and $V$ the values (each obtained from the embeddings, in other words the hidden states), and $d$ denotes the dimension of the query and key vectors (named `k` in the code below).

Note: the function takes an additional argument `h` (number of heads). You can assume it is 1 for now.

In [None]:
class DotProductAttention(nn.Module):
    """Scaled dot product attention with dropout on the attention weights."""
    def __init__(self, dropout, **kwargs):
        super().__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    def forward(self, queries, keys, values, b, h, t, k):
        """
        Apply scaled dot product attention independently for every head.

        The computation is identical for each head, so the head dimension is
        folded into the batch dimension and torch.bmm handles all heads at once.

        Shape of `queries`, `keys`, `values`: (b, t, h, k)

        b: batch size
        h: number of heads
        t: number of keys/queries/values (assumed equal)
        k: embedding size per head

        Returns a tensor of shape (b, t, h * k).
        """
        def fold_heads(tensor):
            # (b, t, h, k) -> (b, h, t, k) -> (b * h, t, k); .contiguous() is
            # needed because .view() requires a contiguous memory layout
            return tensor.transpose(1, 2).contiguous().view(b * h, t, k)

        folded_q = fold_heads(queries)
        folded_k = fold_heads(keys)
        folded_v = fold_heads(values)

        # Query-key similarities, scaled by sqrt(k): (b * h, t, t)
        logits = torch.bmm(folded_q, folded_k.transpose(1, 2)) / math.sqrt(k)
        # Normalise over the keys so every query's weights sum to one
        weights = self.dropout(F.softmax(logits, dim=2))

        # Weighted sum of the values, then unfold the heads again
        attended = torch.bmm(weights, folded_v).view(b, h, t, k)
        # (b, h, t, k) -> (b, t, h, k) -> (b, t, h * k)
        return attended.transpose(1, 2).contiguous().view(b, t, h * k)

# Transformers

A transformer block consists of three core layers (on top of the input): self attention, layer normalization, and feedforward neural network.

Implement the forward function below by composing the given modules (`SelfAttention`, `LayerNorm`, and `mlp`) according to the diagram below.

<img src="https://raw.githubusercontent.com/NeuromatchAcademy/course-content-dl/main/tutorials/W2D4_AttentionAndTransformers/static/transformers1.png">

In [None]:
class TransformerBlock(nn.Module):
    """One encoder block: self-attention and a feedforward network, each
    followed by a residual connection and layer normalization.

    Args:
    k (int): Attention embedding size
    heads (int): number of self-attention heads

    Attributes:
    attention: Multi-head SelfAttention layer
    norm_1, norm_2: LayerNorms applied after the attention and the mlp
    mlp: two-layer feedforward neural network with a ReLU in between
    """
    def __init__(self, k, heads):
        super().__init__()

        self.attention = SelfAttention(k, heads=heads)
        self.norm_1 = nn.LayerNorm(k)
        self.norm_2 = nn.LayerNorm(k)

        # The width of the hidden layer (twice the embedding) is an arbitrary choice
        widened = 2 * k
        self.mlp = nn.Sequential(
            nn.Linear(k, widened),
            nn.ReLU(),
            nn.Linear(widened, k),
        )

    def forward(self, x):
        # First Add & Normalize: attention output plus its own input
        after_attention = self.norm_1(self.attention(x) + x)
        # Second Add & Normalize: feedforward output plus its own input
        return self.norm_2(self.mlp(after_attention) + after_attention)

# Multi-head Attention

One powerful idea in Transformer is multi-head attention, which is used to capture different aspects of the dependence among words (e.g., syntactical vs semantic). For more info see [here](https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html#a-family-of-attention-mechanisms).

In self-attention, the queries, keys, and values are all mapped (by linear projection) from the word embeddings. Implement the mapping functions (`to_keys`, `to_queries`, `to_values`) below.

In [None]:
class SelfAttention(nn.Module):
    """Multi-head self attention layer

    Args:
    k (int): Size of attention embeddings
    heads (int): Number of attention heads
    dropout (float): dropout rate passed to the DotProductAttention layer

    Attributes:
    to_keys: bias-free linear map from k to k*heads features (keys of all heads)
    to_queries: bias-free linear map from k to k*heads features (queries of all heads)
    to_values: bias-free linear map from k to k*heads features (values of all heads)
    unify_heads: linear map from the concatenated head outputs (k*heads) back to k
    attention: DotProductAttention layer applied to the projected tensors
    """
    def __init__(self, k, heads=8, dropout=0.1):
        super().__init__()
        self.k = k
        self.heads = heads

        # Every head works with a full k-dimensional projection
        all_heads_width = k * heads
        self.to_keys = nn.Linear(k, all_heads_width, bias=False)
        self.to_queries = nn.Linear(k, all_heads_width, bias=False)
        self.to_values = nn.Linear(k, all_heads_width, bias=False)
        self.unify_heads = nn.Linear(all_heads_width, k)

        self.attention = DotProductAttention(dropout)

    def forward(self, x):
        """Implements forward pass of self-attention layer

        Args:
          x (torch.Tensor): batch x t x k sized input

        Returns:
          output of `unify_heads` applied to the attention result
        """
        b, t, k = x.size()
        h = self.heads

        # Give every head its own dimension: (b, t, h * k) -> (b, t, h, k)
        per_head_shape = (b, t, h, k)
        keys = self.to_keys(x).view(*per_head_shape)
        queries = self.to_queries(x).view(*per_head_shape)
        values = self.to_values(x).view(*per_head_shape)

        attended = self.attention(queries, keys, values, b, h, t, k)

        return self.unify_heads(attended)

# Positional Encoding

Self-attention is not sensitive to positions or word orderings. Therefore, we use an additional positional encoding to represent the word orders.

There are multiple ways to encode position. Here we want continuous-valued encodings that play the same role as a binary encoding of the position, so we use the following deterministic (as opposed to learned) positional encoding based on sinusoidal functions.

Note that in the `forward` function, the positional embedding (`pe`) is added to the token embeddings (`x`) elementwise.

In [None]:
class PositionalEncoding(nn.Module):
    """Deterministic sinusoidal positional encoding added to the embeddings.

    Source: https://pytorch.org/tutorials/beginner/transformer_tutorial.html

    Args:
    emb_size (int): embedding size of the inputs
    dropout (float): dropout rate applied after adding the encoding
    max_len (int): largest sequence length that can be encoded
    batch_first (bool): True when inputs are (batch, seq, emb), as everywhere
      in this file; False for the (seq, batch, emb) layout of the source tutorial

    Attributes:
    pe: buffer of shape (max_len, 1, emb_size) holding the encodings
    """
    def __init__(self, emb_size, dropout=0.1, max_len=512, batch_first=True):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.batch_first = batch_first

        pe = torch.zeros(max_len, emb_size)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_size, 2).float() * (-math.log(10000.0) / emb_size))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd emb_size there is one cosine column fewer than sine columns
        pe[:, 1::2] = torch.cos(position * div_term[:emb_size // 2])
        pe = pe.unsqueeze(0).transpose(0, 1)  # (max_len, 1, emb_size)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encoding of each sequence position to `x` and apply dropout.

        Args:
          x (torch.Tensor): (batch, seq, emb) if `batch_first`, else (seq, batch, emb)

        Raises:
          ValueError: if the sequence is longer than `max_len`
        """
        seq_len = x.size(1) if self.batch_first else x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f'Sequence length {seq_len} exceeds max_len {self.pe.size(0)}')

        encoding = self.pe[:seq_len]  # (seq_len, 1, emb_size)
        if self.batch_first:
            # Align positions with dimension 1 of `x`. Slicing the buffer by
            # x.size(0) would index the encoding by batch element instead and
            # leave it constant along the sequence.
            encoding = encoding.transpose(0, 1)  # (1, seq_len, emb_size)
        x = x + encoding
        return self.dropout(x)

In [None]:
class Transformer(nn.Module):
    """Transformer Encoder network for classification

    Args:
      k (int): Attention embedding size
      heads (int): Number of self attention heads
      depth (int): How many transformer blocks to include
      seq_length (int): How long an input sequence is (accepted but not used
        by this implementation)
      num_tokens (int): Size of dictionary
      num_classes (int): Number of output classes
    """
    def __init__(self, k, heads, depth, seq_length, num_tokens, num_classes):
        super().__init__()

        self.k = k
        self.num_tokens = num_tokens
        self.token_embedding = nn.Embedding(num_tokens, k)
        self.pos_enc = PositionalEncoding(k)

        # Stack `depth` identical blocks that are applied one after another
        self.transformer_blocks = nn.Sequential(
            *[TransformerBlock(k=k, heads=heads) for _ in range(depth)]
        )
        self.classification_head = nn.Linear(k, num_classes)

    def forward(self, x):
        """Forward pass for Classification Transformer network

        Args:
          x (torch.Tensor): (b, t) sized tensor of tokenized words

        Returns:
          torch.Tensor of size (b, c) with log-probabilities over classes
        """
        # Scale the embeddings by sqrt(k) before the positional encoding is added
        embedded = self.token_embedding(x) * np.sqrt(self.k)
        encoded = self.transformer_blocks(self.pos_enc(embedded))
        # Average over dimension 1 (the sequence) to get one vector per example
        pooled = encoded.mean(dim=1)
        scores = self.classification_head(pooled)

        return F.log_softmax(scores, dim=1)

In [None]:
def _mean_loss_over_loader(model, loss_fn, loader, device):
    """Return the mean loss of `model` over `loader`, computed in eval mode
    and without gradient tracking; the previous train/eval mode is restored."""
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            batch_losses = [
                loss_fn(model(batch['input_ids'].to(device)),
                        batch['label'].to(device)).item()
                for batch in loader
            ]
    finally:
        model.train(was_training)
    return statistics.mean(batch_losses)


def train(model, loss_fn, train_loader,
          n_iter=1, learning_rate=1e-4,
          test_loader=None, device='cpu',
          L2_penalty=0, L1_penalty=0):
    """Optimize the parameters of a network with Adam

    Args:
    model (nn.Module): PyTorch network whose parameters to optimize
    loss_fn: loss function called as loss_fn(model_output, labels)
    train_loader: iterable of batches, each a dict with 'input_ids' and
      'label' tensors
    n_iter (int, optional): number of passes (epochs) over train_loader
    learning_rate (float, optional): learning rate of the Adam optimizer
    test_loader (optional): iterable of batches in the same format, evaluated
      after every epoch; skipped when None
    device (optional): device the batches are moved to
    L2_penalty (float, optional): coefficient of the sum of squared parameters
      added to the training objective
    L1_penalty (float, optional): coefficient of the sum of absolute parameter
      values added to the training objective

    Returns:
    (list): mean training loss of every epoch (without penalty terms) when
      test_loader is None, otherwise the tuple
      (training losses, mean test loss of every epoch)

    Raises:
    statistics.StatisticsError: if a loader yields no batches
    """

    # Initialize PyTorch Adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Per-epoch mean losses
    train_loss = []
    test_loss = []

    for epoch in range(n_iter):
        epoch_train_loss = []
        for i, batch in enumerate(tqdm(train_loader)):
            # Compute network output and data loss for this batch
            out = model(batch['input_ids'].to(device))
            loss = loss_fn(out, batch['label'].to(device))

            # Regularizers are part of the optimized objective only; the
            # recorded loss stays comparable with the test loss.
            objective = loss
            if L2_penalty:
                objective = objective + L2_penalty * sum(
                    p.pow(2).sum() for p in model.parameters())
            if L1_penalty:
                objective = objective + L1_penalty * sum(
                    p.abs().sum() for p in model.parameters())

            # Clear previous gradients, backpropagate, update weights
            optimizer.zero_grad()
            objective.backward()
            optimizer.step()

            # .item() converts the scalar loss tensor to a Python float
            epoch_train_loss.append(loss.item())
            if i % 50 == 0:
                print(f'[Batch {i}]: train_loss: {loss.item()}')
        train_loss.append(statistics.mean(epoch_train_loss))

        # Track progress with epoch means rather than last-batch values
        if test_loader is None:
            print(f'iteration {epoch + 1}/{n_iter} | train loss: {train_loss[-1]:.3f}')
            continue

        print('Running Test loop')
        test_loss.append(_mean_loss_over_loader(model, loss_fn, test_loader, device))
        print(f'iteration {epoch + 1}/{n_iter} | train loss: {train_loss[-1]:.3f} | test_loss: {test_loss[-1]:.3f}')

    if test_loader is None:
        return train_loss
    else:
        return train_loss, test_loss

# Set random seeds for reproducibility
# (re-seeds NumPy and PyTorch with 1 right before the model is created)
np.random.seed(1)
torch.manual_seed(1)

# Use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize network with embedding size 128, 8 attention heads, and 3 layers
model = Transformer(128, 8, 3, max_len, vocab_size, num_classes).to(device)

# Initialize built-in PyTorch Negative Log Likelihood loss function
# (it expects log-probabilities, which Transformer.forward returns via log_softmax)
loss_fn = F.nll_loss

# One pass over the training data (the default n_iter=1); passing test_loader
# makes train() return both loss lists
train_loss, test_loss = train(model, loss_fn, train_loader, test_loader=test_loader,
                              device=device)

In [None]:
# Predict in eval mode so dropout does not perturb the output, then put the
# model back into whatever mode it was in.
was_training = model.training
model.eval()
with torch.no_grad():
    # Batch 1 contains all the tokenized text for the 1st batch of the test loader
    pred_batch = model(batch1['input_ids'].to(device))
model.train(was_training)

# Predicting the label for the text
print("The yelp review is → " + str(pred_text))
# Row 28 of the first batch corresponds to `pred_text`; take the most likely
# class as a plain Python int.
predicted_label28 = torch.argmax(pred_batch[28]).item()
print()
print("The Predicted Rating is → " + str(predicted_label28) + " and the Actual Rating was → " + str(actual_label))