# Simple transformer

In [13]:
# Hyperparameters
# r50k_base (the GPT-2 encoding) defines 50,257 token ids (0..50256), so the
# embedding table and the output projection must cover all of them. Sizing the
# vocabulary to 50,000 would make any id >= 50000 index out of range.
# NOTE(review): assumes the .pt files were tokenized with r50k_base - confirm.
ntokens = 50257  # vocabulary size of r50k_base (GPT-2)
emsize = 200  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in ``nn.TransformerEncoder``
nlayers = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability
batch_size = 20  # batch size
device = "cpu"  # Use cuda if GPU is available, for playing with syntax cpu is fine

In [25]:
# Imports
import torch
from torch import nn, Tensor
import torch.optim as optim
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from typing import Tuple
import math
import numpy as np

In [7]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    A table ``pe`` of shape ``[max_len, 1, d_model]`` is precomputed once and
    stored as a buffer (not a trainable parameter). Even feature indices hold
    sines and odd feature indices hold cosines of ``position * div_term``.

    Arguments:
        d_model: size of the embedding (last) dimension; may be even or odd.
        dropout: dropout probability applied to the sum of input and encoding.
        max_len: number of positions precomputed in the table.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        # One angle per (position, frequency) pair, shape [max_len, ceil(d_model / 2)].
        angles = position * div_term
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For odd ``d_model`` there is one fewer odd index than even index, so the
        # cosine block is trimmed to ``d_model // 2`` columns; for even ``d_model``
        # this slice keeps every column.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``

        Returns:
            Tensor of the same shape as ``x``: ``x`` plus the first ``seq_len``
            rows of the encoding table (broadcast over the batch dimension),
            passed through dropout.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
class TransformerModel(nn.Module):
    """Embedding -> positional encoding -> Transformer encoder stack -> vocabulary projection.

    Arguments:
        ntoken: vocabulary size (rows of the embedding, width of the output layer).
        d_model: embedding dimension used throughout the encoder.
        nhead: number of attention heads per encoder layer.
        d_hid: feedforward dimension inside each encoder layer.
        nlayers: number of stacked encoder layers.
        dropout: dropout probability for the positional encoding and encoder layers.
    """

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        # A single layer definition is handed to the stack together with the layer count.
        self.transformer_encoder = TransformerEncoder(
            TransformerEncoderLayer(d_model, nhead, d_hid, dropout),
            nlayers,
        )
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Re-initialise embedding and output weights uniformly in [-0.1, 0.1]; zero the output bias."""
        bound = 0.1
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.zeros_(self.linear.bias)
        nn.init.uniform_(self.linear.weight, -bound, bound)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``; handed to the
                encoder unchanged (including when it is ``None``).

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        # NOTE(review): no causal mask is built here when ``src_mask`` is None -
        # presumably callers doing next-token prediction must pass one; confirm.
        # Embeddings are scaled by sqrt(d_model) before the positional encoding is added.
        scaled = self.embedding(src) * math.sqrt(self.d_model)
        encoded = self.transformer_encoder(self.pos_encoder(scaled), src_mask)
        return self.linear(encoded)

### Load data, split it into batches, and define a function that prepares target sequences

In [14]:
# Load vectors
# ``map_location`` puts the loaded tensors on the configured ``device`` even if
# they were saved from another one (e.g. written on a GPU machine and read on a
# CPU-only one), instead of failing while restoring the original device.
# NOTE(review): ``torch.load`` unpickles the file - only load files from a trusted source.
train_data = torch.load("azure-docs-training.pt", map_location=device)
val_data = torch.load("azure-docs-validation.pt", map_location=device)

def create_batches(data: Tensor, batch_size: int) -> Tensor:
    """Fold a flat tensor into ``batch_size`` parallel columns.

    Trailing elements that do not fill a complete row of ``batch_size`` columns
    are dropped. The result has shape ``[data.size(0) // batch_size, batch_size]``,
    is contiguous, and is moved to the module-level ``device``.
    """
    steps = data.size(0) // batch_size
    usable = data[:steps * batch_size]
    # Each of the ``batch_size`` rows is one contiguous chunk; transposing turns chunks into columns.
    columns = usable.view(batch_size, steps).t()
    return columns.contiguous().to(device)

# Replace each flat split with its batched ``[seq_len, batch_size]`` layout.
train_data, val_data = (
    create_batches(flat_split, batch_size) for flat_split in (train_data, val_data)
)

In [20]:
# Report the two dimensions of each batched split as "<dim 0>, <dim 1>".
print(f"train_data shape: {train_data.size(0)}, {train_data.size(1)}")
print(f"val_data shape: {val_data.size(0)}, {val_data.size(1)}")

train_data shape: 3614965, 20
val_data shape: 392322, 20


In [56]:
# Build (data, target) pairs for next-token prediction: the target is the data window shifted forward by one position.
bptt = 35
def get_batch(source: Tensor, i: int) -> Tuple[Tensor, Tensor]:
    """Cut one window starting at offset ``i`` and its one-step-ahead target.

    Arguments:
        source: Tensor indexed along its first dimension.
        i: start offset of the window.

    Returns:
        ``(data, target)`` where ``data`` is ``source[i:i+n]`` and ``target`` is
        ``source[i+1:i+1+n]`` flattened to 1-D, with
        ``n = min(bptt, len(source) - 1 - i)`` so the target never runs past the end.
    """
    window = min(bptt, len(source) - 1 - i)
    stop = i + window
    inputs = source[i:stop]
    shifted = source[i + 1:stop + 1]
    return inputs, shifted.reshape(-1)

# Print example - a 20-element source gives 19 data tokens and 19 targets shifted by one, so the last target (37165) is the token that follows the last data token
# NOTE(review): train_data is laid out as [seq_len, batch_size], so train_data[1] is a single time step across the 20 parallel streams rather than 20 consecutive tokens of one stream - presumably fine for showing the shift, but confirm this is the intended demo input
print("Example get_batch(train_data[1], 0)")
print("-----------------------------------")
data, target = get_batch(train_data[1], 0)
print(f"original_data: {train_data[1]}")
print(f"data: {data}")
print(f"target: {target}")
print()

# Starting at offset 1 leaves 18 data tokens; token 37165 is still the last one to be predicted
print("Example get_batch(train_data[1], 1)")
print("-----------------------------------")
data, target = get_batch(train_data[1], 1)
print(f"original_data: {train_data[1]}")
print(f"data: {data}")
print(f"target: {target}")

Example get_batch(train_data[1], 0)
-----------------------------------
original_data: tensor([  198,    12,   220,   444,    25,  2215,   930,   734,    14,    11,
          907,    12,    12,    12,    12,  6333,  9900,   657, 25811, 37165])
data: tensor([  198,    12,   220,   444,    25,  2215,   930,   734,    14,    11,
          907,    12,    12,    12,    12,  6333,  9900,   657, 25811])
target: tensor([   12,   220,   444,    25,  2215,   930,   734,    14,    11,   907,
           12,    12,    12,    12,  6333,  9900,   657, 25811, 37165])

Example get_batch(train_data[1], 1)
-----------------------------------
original_data: tensor([  198,    12,   220,   444,    25,  2215,   930,   734,    14,    11,
          907,    12,    12,    12,    12,  6333,  9900,   657, 25811, 37165])
data: tensor([   12,   220,   444,    25,  2215,   930,   734,    14,    11,   907,
           12,    12,    12,    12,  6333,  9900,   657, 25811])
target: tensor([  220,   444,    25,  2215,   93

In [None]:

# Build the model from the hyperparameters cell and place it on ``device``.
model = TransformerModel(
    ntoken=ntokens,
    d_model=emsize,
    nhead=nhead,
    d_hid=d_hid,
    nlayers=nlayers,
    dropout=dropout,
).to(device)

In [67]:
# Show how consecutive batch numbers map to start offsets that advance by ``bptt``.
# One line is printed per window over the whole of train_data, so the output is long.
for batch, i in enumerate(range(0, len(train_data) - 1, bptt)):
    print(f"batch: {batch}, i: {i}")
    # Disabled: inspect the tensors produced for each window.
    # data, targets = get_batch(train_data, i)
    # print(f"data shape: {data.shape}")
    # print(f"data: {data}")
    # print(f"target: {targets}")


batch: 0, i: 0
batch: 1, i: 35
batch: 2, i: 70
batch: 3, i: 105
batch: 4, i: 140
batch: 5, i: 175
batch: 6, i: 210
batch: 7, i: 245
batch: 8, i: 280
batch: 9, i: 315
batch: 10, i: 350
batch: 11, i: 385
batch: 12, i: 420
batch: 13, i: 455
batch: 14, i: 490
batch: 15, i: 525
batch: 16, i: 560
batch: 17, i: 595
batch: 18, i: 630
batch: 19, i: 665
batch: 20, i: 700
batch: 21, i: 735
batch: 22, i: 770
batch: 23, i: 805
batch: 24, i: 840
batch: 25, i: 875
batch: 26, i: 910
batch: 27, i: 945
batch: 28, i: 980
batch: 29, i: 1015
batch: 30, i: 1050
batch: 31, i: 1085
batch: 32, i: 1120
batch: 33, i: 1155
batch: 34, i: 1190
batch: 35, i: 1225
batch: 36, i: 1260
batch: 37, i: 1295
batch: 38, i: 1330
batch: 39, i: 1365
batch: 40, i: 1400
batch: 41, i: 1435
batch: 42, i: 1470
batch: 43, i: 1505
batch: 44, i: 1540
batch: 45, i: 1575
batch: 46, i: 1610
batch: 47, i: 1645
batch: 48, i: 1680
batch: 49, i: 1715
batch: 50, i: 1750
batch: 51, i: 1785
batch: 52, i: 1820
batch: 53, i: 1855
batch: 54, i: 189