In [1]:
# Download and unpack data
!wget https://www.dropbox.com/s/tc1qo73rrm3gt3m/CARVANA.zip  # Carvana dataset
!unzip -q CARVANA.zip
!rm -rf ./train/.DS_Store
!rm -rf ./train_masks/.DS_Store

--2022-03-07 13:47:38--  https://www.dropbox.com/s/tc1qo73rrm3gt3m/CARVANA.zip
Resolving www.dropbox.com (www.dropbox.com)... 162.125.70.18, 2620:100:6026:18::a27d:4612
Connecting to www.dropbox.com (www.dropbox.com)|162.125.70.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/tc1qo73rrm3gt3m/CARVANA.zip [following]
--2022-03-07 13:47:38--  https://www.dropbox.com/s/raw/tc1qo73rrm3gt3m/CARVANA.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucbe7aeba3b62c4d346ece644953.dl.dropboxusercontent.com/cd/0/inline/BhAv_omEPaOc0AnpPe6-LnXmrTrkSKTiXlde0ulfzDxpmSZ516DVmzaadVIDfjED7GbSuJCTRiLaB7Wtk1P3sTnc03CL6ZQHCuMu991StcrEMPVYEQR1VUBi5tbfkGBWtxKvnfDVSkgFnpRfK2ojK7d_/file# [following]
--2022-03-07 13:47:39--  https://ucbe7aeba3b62c4d346ece644953.dl.dropboxusercontent.com/cd/0/inline/BhAv_omEPaOc0AnpPe6-LnXmrTrkSKTiXlde0ulfzDxpmSZ516DVmzaadVIDfjED7GbSuJCTRiLaB7Wtk1P3sTnc03

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
# code sourse: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
import math
from typing import Tuple

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from tqdm.auto import trange

from transformer import generate_square_subsequent_mask, TransformerModel
from torch.profiler import profile, record_function, ProfilerActivity

In [3]:
train_iter = WikiText2(split="train")
tokenizer = get_tokenizer("basic_english")
vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

def data_process(raw_text_iter: dataset.IterableDataset) -> Tensor:
    """Converts raw text into a flat Tensor."""
    data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

# train_iter was "consumed" by the process of building the vocab,
# so we have to create it again
train_iter, val_iter, test_iter = WikiText2()
train_data = data_process(train_iter)
val_data = data_process(val_iter)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def batchify(data: Tensor, bsz: int) -> Tensor:
    """Divides the data into bsz separate sequences, removing extra elements
    that wouldn't cleanly fit.

    Args:
        data: Tensor, shape [N]
        bsz: int, batch size

    Returns:
        Tensor of shape [N // bsz, bsz]
    """
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
    data = data.view(bsz, seq_len).t().contiguous()
    return data.to(device)

batch_size = 20
eval_batch_size = 10
train_data = batchify(train_data, batch_size)  # shape [seq_len, batch_size]
val_data = batchify(val_data, eval_batch_size)

In [4]:
bptt = 35
def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """
    Args:
        source: Tensor, shape [full_seq_len, batch_size]
        i: int

    Returns:
        tuple (data, target), where data has shape [seq_len, batch_size] and
        target has shape [seq_len * batch_size]
    """
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

In [5]:
ntokens = len(vocab)  # size of vocabulary
emsize = 200  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # number of heads in nn.MultiheadAttention
dropout = 0.2  # dropout probability

model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)

In [6]:
import copy
import time

criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 200
    start_time = time.time()
    src_mask = generate_square_subsequent_mask(bptt).to(device)

    num_batches = len(train_data) // bptt
    i = 0
    for batch in trange(0, train_data.size(0) - 1, bptt, desc="Epoch progress: "):
        data, targets = get_batch(train_data, i)
        batch_size = data.size(0)
        if batch_size != bptt:  # only on last batch
            src_mask = src_mask[:batch_size, :batch_size]
        with record_function("forward"):
            output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)

        # feel free to comment out this 
        optimizer.zero_grad()
        with record_function("backward"):
            loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f"| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | "
                  f"lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | "
                  f"loss {cur_loss:5.2f} | ppl {ppl:8.2f}")
            total_loss = 0
            start_time = time.time()
        i += 1
            

def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            batch_size = data.size(0)
            if batch_size != bptt:
                src_mask = src_mask[:batch_size, :batch_size]
            output = model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += batch_size * criterion(output_flat, targets).item()
    return total_loss / (len(eval_data) - 1)

In [None]:
best_val_loss = float("inf")
epochs = 1
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    
    with profile(activities=[
        ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True) as prof:
        train(model)
    val_loss = evaluate(model, val_data)
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    print("-" * 89)
    print(f"| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | "
          f"valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}")
    print("-" * 89)

#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         best_model = copy.deepcopy(model)

#     scheduler.step()

Epoch progress:   0%|          | 0/2929 [00:00<?, ?it/s]

| epoch   1 |  1400/ 2928 batches | lr 5.00 | ms/batch 24.74 | loss  1.53 | ppl     4.62
| epoch   1 |  2800/ 2928 batches | lr 5.00 | ms/batch  4.31 | loss  0.68 | ppl     1.97
| epoch   1 |  4200/ 2928 batches | lr 5.00 | ms/batch  4.51 | loss  0.46 | ppl     1.58


In [None]:
stat = prof.key_averages()

In [12]:
with open("transformer_stats_cpu.txt", "w") as f:
    f.write(stat.table(sort_by="cpu_time_total", row_limit=100))

In [14]:
with open("transformer_stats_cuda.txt", "w") as f:
    f.write(stat.table(sort_by="cuda_time_total", row_limit=100))