# Import necessary libraries



## Drive + pip install

In [None]:
# Colab only: mount Google Drive at /content/drive so the project folder is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project folder on Drive; train() builds its checkpoint path from os.getcwd().
%cd "/content/drive/My Drive/DL final"

/content/drive/My Drive/DL final


In [None]:
!pip install datasets evaluate



## Other libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

import math
import os
import numpy as np

In [None]:
# Prefer the GPU when one is available; used later when moving the model and batches.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Config

In [None]:
# Every sentence is padded/truncated to this many tokens by preprocess_function.
max_token_length= 128
# Embedding / hidden width of the encoder.
d_model = 512
# Number of stacked encoder layers (passed as num_layers in init_model).
num_layer = 6
# NOTE(review): not referenced in the visible cells; presumably d_ff / d_model — confirm or remove.
factor = 4
# Attention heads per layer; MultiHeadAttention asserts that it divides d_model.
n_head = 8

# Optimizer settings handed to Adam in init_model.
learning_rate = 2e-5
weight_decay = 1e-3

# Mini-batch size for all three DataLoaders, and the dropout probability of the model.
batch_size = 32
dropout = 0.1

# Hidden width of the position-wise feed-forward sublayer.
d_ff = 2048

# Data preprocessing

In [None]:
from datasets import load_dataset
# GLUE SST-2; the "train", "validation" and "test" splits are all used further down.
dataset = load_dataset('glue', 'sst2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer

# Only the tokenizer is taken from DistilBERT; the encoder itself is built from scratch
# by the classes defined in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize the ``sentence`` field of a (batched) set of examples.

    Sequences are truncated and padded so that each one is exactly
    ``max_token_length`` tokens long. Returns whatever the module-level
    ``tokenizer`` returns for that call.
    """
    sentences = examples["sentence"]
    return tokenizer(
        sentences,
        max_length=max_token_length,
        truncation=True,
        padding='max_length',
    )

In [None]:
# Tokenize each split in batches; map() adds the tokenizer's output columns
# (including "input_ids") next to the original "sentence" / "label" columns.
tokenized_train = dataset["train"].map(preprocess_function, batched=True)
tokenized_val = dataset["validation"].map(preprocess_function, batched=True)
tokenized_test = dataset["test"].map(preprocess_function, batched=True)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [None]:
# Size the embedding table from the tokenizer itself instead of from the largest token id
# that happens to occur in the training split: a validation/test sentence containing a
# higher id would otherwise index past the end of nn.Embedding. This also avoids
# materialising the whole (num_examples x max_token_length) id matrix just to take a max.
# NOTE: the embedding shape may differ from checkpoints saved with the old train-derived
# value; such checkpoints will not load into the new model.
vocab_size = len(tokenizer)

In [None]:
# Number of target classes = largest training label + 1 (labels are assumed 0-based).
tmp_data = np.asarray(tokenized_train["label"])
num_classes = tmp_data.max() + 1
tmp_data = None  # release the scratch array

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs token-id sequences with their labels.

    ``data`` must support ``data["input_ids"]`` and ``data["label"]``; both
    results are indexed by integer position in ``__getitem__``.
    """

    def __init__(self, data):
        self.data = data["input_ids"]
        self.label = data["label"]
        # Cached once so __len__ does not have to re-measure the label column.
        self.length = len(self.label)

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        """Return ``(input_ids, label)`` for one example, both as int64 tensors."""
        token_ids, target = self.data[index], self.label[index]
        return (
            torch.tensor(token_ids, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

In [None]:
# Wrap each tokenized split so that indexing yields (input_ids, label) tensors.
# NOTE(review): the GLUE test split is usually distributed without gold labels —
# confirm the "label" values before computing any metric on torch_dataset_test.
torch_dataset_train = CustomDataset(tokenized_train)
torch_dataset_val = CustomDataset(tokenized_val)
torch_dataset_test = CustomDataset(tokenized_test)

In [None]:
# Only the training loader is shuffled. The evaluation loaders keep dataset order so that
# metrics are reproducible from run to run (batch composition no longer changes) and
# predictions stay aligned with the original examples.
torch_train_loader = DataLoader(torch_dataset_train, batch_size=batch_size, shuffle=True)
torch_val_loader = DataLoader(torch_dataset_val, batch_size=batch_size, shuffle=False)
torch_test_loader = DataLoader(torch_dataset_test, batch_size=batch_size, shuffle=False)

# Model

In [None]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    ``d_model`` is split evenly across ``n_head`` heads of width ``d_k``.
    Inputs to ``forward`` are ``(batch, seq, d_model)`` tensors and the output
    has the same shape as the query input.
    """

    def __init__(self, d_model, n_head):
        super().__init__()
        assert d_model % n_head == 0

        self.d_model = d_model
        self.n_head = n_head
        self.d_k = d_model // n_head

        # Projections for queries, keys, values and the final output mix.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        """Attend over ``K``/``V`` with queries ``Q``; positions where ``mask == 0`` are suppressed."""
        scores = (Q @ K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # A large negative score becomes (numerically) zero weight after softmax.
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = scores.softmax(dim=-1)
        return weights @ V

    def split_heads(self, x):
        """(batch, seq, d_model) -> (batch, n_head, seq, d_k)."""
        n_batch, n_tokens, _ = x.size()
        per_head = x.view(n_batch, n_tokens, self.n_head, self.d_k)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """(batch, n_head, seq, d_k) -> (batch, seq, d_model)."""
        n_batch, _, n_tokens, _ = x.size()
        merged = x.transpose(1, 2).contiguous()
        return merged.view(n_batch, n_tokens, self.d_model)

    def forward(self, Q, K, V, mask=None):
        q_heads = self.split_heads(self.W_q(Q))
        k_heads = self.split_heads(self.W_k(K))
        v_heads = self.split_heads(self.W_v(V))

        context = self.scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)

        return self.W_o(self.combine_heads(context))

In [None]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension: d_model -> d_ff -> d_model with ReLU."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [None]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to the input.

    A ``(1, max_seq_length, d_model)`` table is precomputed and registered as
    the buffer ``pe``: even feature columns hold sines, odd columns hold
    cosines of ``position * div_term``.

    Args:
        d_model: feature width of the inputs (even or odd).
        max_seq_length: longest sequence length the table covers.
    """

    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even column, so the
        # frequency vector is trimmed to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # A buffer (not a parameter): follows .to(device) but is never trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the first ``x.size(1)`` rows of the table to ``x`` (batch, seq, d_model)."""
        return x + self.pe[:, :x.size(1)]

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sublayers.

    Each sublayer's output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, n_head, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        # One dropout module is shared by both sublayers.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # Sublayer 1: self-attention (queries, keys and values are all x).
        attended = self.dropout(self.self_attn(x, x, x, mask))
        x = self.norm1(x + attended)

        # Sublayer 2: position-wise feed-forward.
        transformed = self.dropout(self.feed_forward(x))
        return self.norm2(x + transformed)

In [None]:
class TransformerEncoder(nn.Module):
    """Token embedding + sinusoidal positions + a stack of ``EncoderLayer``s.

    ``forward`` returns the per-token outputs flattened to
    ``(batch, seq_len * d_model)``.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: embedding / hidden width.
        n_head: attention heads per layer.
        num_layers: number of encoder layers.
        d_ff: hidden width of each feed-forward sublayer.
        max_seq_length: longest sequence covered by the positional table.
        dropout: dropout probability (embeddings and every sublayer).
        pad_token_id: id treated as padding when building the attention mask.
            When ``None`` (the default) the id is read from the module-level
            ``tokenizer`` at call time, which is the original behaviour.
    """

    def __init__(self, vocab_size, d_model, n_head, num_layers, d_ff, max_seq_length, dropout, pad_token_id=None):
        super(TransformerEncoder, self).__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_head = n_head
        self.num_layers = num_layers
        self.d_ff = d_ff
        self.max_seq_length = max_seq_length
        self.pad_token_id = pad_token_id
        self.dropout = nn.Dropout(dropout)

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, n_head, d_ff, dropout) for _ in range(num_layers)])

        self.positional_embedding = PositionalEncoding(d_model, max_seq_length)

    def masking(self, x):
        """Return a ``(batch, 1, 1, seq)`` boolean mask that is False at padding positions."""
        pad_id = self.pad_token_id if self.pad_token_id is not None else tokenizer.pad_token_id
        # The comparison result already lives on x's device, so no extra .to() is needed.
        return (x != pad_id).unsqueeze(1).unsqueeze(2)

    def forward(self, x):
        x_mask = self.masking(x)

        # Embeddings are scaled by sqrt(d_model) before the positional table is added.
        x = self.embedding(x) * math.sqrt(self.d_model)
        x = self.dropout(self.positional_embedding(x))

        for layer in self.encoder:
            x = layer(x, x_mask)

        # Flatten (batch, seq, d_model) -> (batch, seq * d_model) for the dense head.
        x = x.reshape(x.shape[0], -1)

        return x

# Smoke test: 32 random sequences of length 10 should come out flattened to
# (32, max_seq_length * d_model) = (32, 5120).
x = torch.randint(size=(32, 10), low=0, high=1000)

net = TransformerEncoder(vocab_size=1000, d_model=512, n_head=8, num_layers=4, d_ff=2048, max_seq_length=10, dropout=0.1)

print(net(x).shape)

torch.Size([32, 5120])


In [None]:
class TransformerEncoderClassification(nn.Module):
    """Sequence classifier: ``TransformerEncoder`` followed by a three-layer dense head.

    The encoder output is flattened to ``max_seq_length * d_model`` features,
    so inputs must be exactly ``max_seq_length`` tokens long for ``fc1`` to
    accept them.

    ``forward`` returns unnormalised class scores of shape ``(batch, num_classes)``.
    """

    def __init__(self, vocab_size=1000, d_model=512, n_head=8, num_layers=4, d_ff=2048, max_seq_length=64, dropout=0.1, num_classes=2):
        super(TransformerEncoderClassification, self).__init__()
        self.transformers_encoder = TransformerEncoder(vocab_size, d_model, n_head, num_layers, d_ff, max_seq_length, dropout)
        self.fc1 = nn.Linear(max_seq_length * d_model, d_model)
        self.fc2 = nn.Linear(d_model, 128)
        self.fc3 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.transformers_encoder(x)

        # (The per-call debug print of x.shape was removed: it flooded the training log.)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Smoke test for the classifier: 32 random sequences of length 10 -> scores of shape (32, 2).
x =  torch.randint(size=(32, 10), low=1, high=100)
net = TransformerEncoderClassification(vocab_size=1000, d_model=512, n_head=8, num_layers=4, d_ff=2048, max_seq_length=10, dropout=0.1, num_classes=2)
a = net(x)
print(a.shape)

torch.Size([32, 5120])
torch.Size([32, 2])


# Train

## Initialize, load, save model

In [None]:
def init_model():
    """Build a fresh model, loss and optimizer from the notebook-level config.

    Returns:
        ``(model, criterion, optimizer)`` — the classifier and the
        cross-entropy loss are moved to ``device``; Adam uses the configured
        ``learning_rate`` and ``weight_decay``.
    """
    hyperparams = dict(
        vocab_size=vocab_size,
        d_model=d_model,
        n_head=n_head,
        num_layers=num_layer,
        d_ff=d_ff,
        max_seq_length=max_token_length,
        dropout=dropout,
        num_classes=num_classes,
    )
    model = TransformerEncoderClassification(**hyperparams).to(device=device)

    criterion = nn.CrossEntropyLoss().to(device=device)
    optimizer = optim.Adam(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay,
    )

    return model, criterion, optimizer

def save_model(model, optimizer, epoch, path):
    """Write a training checkpoint to ``path``.

    The checkpoint is a dict with the keys ``'epoch'``, ``'model_state_dict'``
    and ``'optimizer_state_dict'``.

    When ``path`` is a filesystem path, missing parent folders are created and
    the file is written to ``<path>.tmp`` first and then moved into place, so
    an interrupted save cannot leave a truncated checkpoint behind. Any other
    kind of ``path`` (e.g. an open file object) is passed straight to
    ``torch.save``.
    """
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }

    if not isinstance(path, (str, os.PathLike)):
        torch.save(checkpoint, path)
        return

    target_dir = os.path.dirname(path)
    if target_dir:
        # torch.save does not create missing directories.
        os.makedirs(target_dir, exist_ok=True)

    tmp_path = f"{os.fspath(path)}.tmp"
    torch.save(checkpoint, tmp_path)
    os.replace(tmp_path, path)

def load(model, optimizer, path):
    """Restore ``model`` and ``optimizer`` in place from a checkpoint written by ``save_model``.

    Tensors are mapped onto the notebook-level ``device`` while loading.

    Returns:
        ``(model, optimizer, epoch)`` where ``epoch`` is the value stored
        under the checkpoint's ``"epoch"`` key.
    """
    checkpoint = torch.load(path, map_location=torch.device(device))

    # (The debug print of the state-dict type was removed.)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    epoch = checkpoint["epoch"]

    return model, optimizer, epoch

## Actual training loop

In [None]:
def summary(loader, model, criterion):
    """Evaluate ``model`` on every batch of ``loader``.

    The model is switched to eval mode (and left there) and gradients are
    disabled. Batches are moved to the device of the model's first parameter;
    a model without parameters falls back to the notebook-level ``device``.

    Args:
        loader: iterable of ``(data, label)`` tensor batches.
        model: module mapping ``data`` to class scores of shape (batch, classes).
        criterion: loss called as ``criterion(scores, label)``.

    Returns:
        ``(accuracy_percent, mean_batch_loss)`` as Python floats. The loss is
        the unweighted mean of the per-batch losses. Both values are ``nan``
        when the loader yields no samples.
    """
    model.eval()

    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else torch.device(device)

    num_correct = 0
    num_samples = 0
    num_batches = 0
    loss_total = 0.0

    with torch.no_grad():
        for data, label in loader:
            data = data.to(device=eval_device)
            label = label.to(device=eval_device)

            scores = model(data)
            pred = torch.argmax(scores, dim=1)

            # .item() keeps the running totals as plain Python numbers.
            num_correct += (pred == label).sum().item()
            num_samples += pred.shape[0]

            loss_total += criterion(scores, label).item()
            num_batches += 1

    if num_samples == 0:
        # Nothing was evaluated: the metrics are undefined rather than zero.
        return float("nan"), float("nan")

    acc = float(num_correct) / float(num_samples) * 100.0
    loss_avg = loss_total / float(num_batches)
    return acc, loss_avg

In [None]:
def train(train_loader, val_loader, num_epochs, batch_print=40):
    """Train the classifier, checkpointing as it goes and resuming if a checkpoint exists.

    A fresh model/criterion/optimizer comes from ``init_model()``. If
    ``./eval/encoder_attention.pt`` (relative to the current working
    directory) exists, model and optimizer state are restored from it and
    every epoch up to and including the stored epoch index is skipped.

    Checkpoints store the index of the last *completed* epoch. The periodic
    mid-epoch checkpoint therefore stores ``epoch - 1``, so that after an
    interruption the unfinished epoch is run again (starting from the saved
    weights) instead of being skipped.

    Args:
        train_loader: iterable of ``(data, label)`` training batches.
        val_loader: iterable of ``(data, label)`` validation batches.
        num_epochs: total number of epochs, counting already completed ones.
        batch_print: log and checkpoint every ``batch_print`` batches.

    Returns:
        dict with the lists ``"train_acc"``, ``"train_loss"``, ``"val_acc"``
        and ``"val_loss"``; one entry per epoch run in this call (epochs
        skipped because of a resume are not included).
    """
    train_acc_list = []
    train_loss_list = []

    val_acc_list = []
    val_loss_list = []

    # Index of the last fully completed epoch; -1 means "nothing done yet".
    cur_epoch = -1

    model, criterion, optimizer = init_model()

    MODEL_SAVE_PATH = os.path.join(os.getcwd(), "./eval/encoder_attention.pt")

    # torch.save does not create missing folders; make sure the target exists up front.
    os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)

    if os.path.exists(MODEL_SAVE_PATH):
        model, optimizer, cur_epoch = load(model, optimizer, path=MODEL_SAVE_PATH)

    for epoch in range(num_epochs):
        if cur_epoch >= epoch:
            continue

        correct_samples = 0
        total_samples = 0

        loss_epoch = 0.0

        print("----------------------------------------")

        model.train()

        for batch_idx, (data, label) in enumerate(train_loader):
            # Data to CUDA if possible
            data = data.to(device=device)
            label = label.to(device=device)

            optimizer.zero_grad()

            scores = model(data)
            loss = criterion(scores, label)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()

            pred = torch.argmax(scores, dim=1)

            # .item() keeps the running totals as plain Python numbers.
            current_correct = (pred == label).sum().item()
            current_size = pred.shape[0]

            correct_samples += current_correct
            total_samples += current_size

            loss_epoch += loss.item()

            if batch_idx % batch_print == batch_print - 1:
                print(f"Batch {batch_idx + 1}: Accuracy: {float(current_correct) / float(current_size) * 100.0}")
                print(f"Loss: {float(loss.item())}")
                # This epoch is not finished yet, so record the previous one as completed.
                save_model(model=model, optimizer=optimizer, epoch=epoch - 1, path=MODEL_SAVE_PATH)

        # Validation
        val_acc, val_loss = summary(val_loader, model, criterion)

        train_acc_list.append(float(correct_samples) / float(total_samples + 1e-12) * 100.0)
        # max(..., 1) avoids a ZeroDivisionError for an empty training loader.
        train_loss_list.append(float(loss_epoch) / float(max(len(train_loader), 1)))

        val_acc_list.append(val_acc)
        val_loss_list.append(val_loss)

        # End-of-epoch checkpoint: this epoch is now complete.
        save_model(model=model, optimizer=optimizer, epoch=epoch, path=MODEL_SAVE_PATH)

        cur_epoch = epoch

        print(f"Epoch {epoch + 1}:")

        print(f"Train accuracy: {train_acc_list[-1]}%")
        print(f"Train loss: {train_loss_list[-1]}")

        print(f"Val accuracy: {val_acc_list[-1]}%")
        print(f"Val loss: {val_loss_list[-1]}")

    return {
        "train_acc": train_acc_list,
        "train_loss": train_loss_list,
        "val_acc": val_acc_list,
        "val_loss": val_loss_list,
    }

In [None]:
# Train for up to 5 epochs; progress is printed and a checkpoint written every 50 batches.
train(torch_train_loader, torch_val_loader, num_epochs=5, batch_print=50)

----------------------------------------
torch.Size([32, 128])
torch.Size([32])
torch.Size([32, 65536])
torch.Size([32, 2])
torch.Size([32, 128])
torch.Size([32])
torch.Size([32, 65536])
torch.Size([32, 2])
torch.Size([32, 128])
torch.Size([32])
torch.Size([32, 65536])
torch.Size([32, 2])
torch.Size([32, 128])
torch.Size([32])
torch.Size([32, 65536])
torch.Size([32, 2])
torch.Size([32, 128])
torch.Size([32])
torch.Size([32, 65536])
torch.Size([32, 2])
torch.Size([32, 128])
torch.Size([32])
torch.Size([32, 65536])
torch.Size([32, 2])
torch.Size([32, 128])
torch.Size([32])
torch.Size([32, 65536])
torch.Size([32, 2])
torch.Size([32, 128])
torch.Size([32])
torch.Size([32, 65536])
torch.Size([32, 2])
torch.Size([32, 128])
torch.Size([32])
torch.Size([32, 65536])
torch.Size([32, 2])
torch.Size([32, 128])
torch.Size([32])
torch.Size([32, 65536])
torch.Size([32, 2])
torch.Size([32, 128])
torch.Size([32])
torch.Size([32, 65536])
torch.Size([32, 2])
torch.Size([32, 128])
torch.Size([32])
torch.S