# Bert
Lets ***transform the game***


In [32]:
# Lets get some imports up in here
import torch
from torch import nn
import torch.nn.functional as F 
import math as m    
import numpy as np 


In [33]:
#Set a few lil' hyperparameters
sup_params = {"LAYERS": 24,
              "D_HIDDEN": 1024,
              "HEADS": 16,
              "EPOCHS": 10, 
              "LR":3e-5, # Adam optimiser
              "BATCH_SIZE": 64,
              "DROP_PROB": 0.1}
few_params = {"LAYERS": 24,
              "D_HIDDEN": 1024,
              "HEADS": 16,
              "EPOCHS": 10, 
              "LR":1e-4, # SGD optimiser
              "BATCH_SIZE": 256,
              "DROP_PROB": 0.1}

device = "cpu"

Lets build a multihead attention module

In [34]:
toy_encodings = torch.Tensor([[[0.0, 0.1, 0.2, 0.3], [1.0, 1.1, 1.2, 1.3], [2.0, 2.1, 2.2, 2.3]]]) 
# shape(toy_encodings) = [B, T, D] = (1, 3, 4)
print("Toy Encodings:\n", toy_encodings)

D_MODEL = toy_encodings.shape[-1]

toy_MHA_layer = nn.MultiheadAttention(embed_dim=D_MODEL, num_heads=2)
toy_MHA, _ = toy_MHA_layer(toy_encodings, toy_encodings, toy_encodings)
print("Toy MHA: \n", toy_MHA)
print("Toy MHA Shape: \n", toy_MHA.shape)

Toy Encodings:
 tensor([[[0.0000, 0.1000, 0.2000, 0.3000],
         [1.0000, 1.1000, 1.2000, 1.3000],
         [2.0000, 2.1000, 2.2000, 2.3000]]])
Toy MHA: 
 tensor([[[-0.0925, -0.0732, -0.0211,  0.1079],
         [-0.5046, -0.4852, -0.0248,  0.7104],
         [-0.9168, -0.8971, -0.0285,  1.3128]]], grad_fn=<AddBackward0>)
Toy MHA Shape: 
 torch.Size([1, 3, 4])


Build a layer normalisation module that includes dropout

In [35]:
class Norm(nn.Module):
    def __init__(self, d_hidden=sup_params["D_HIDDEN"], dropout=sup_params["DROP_PROB"]):
        super().__init__()
        self.layer_norm = nn.LayerNorm(d_hidden)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        ln = self.layer_norm(x)
        return self.dropout(ln)

Position-wise feed-forward network

In [40]:
class PWFFN(nn.Module):
    def __init__(self, d_hidden=sup_params["D_HIDDEN"], d_ff=4*sup_params["D_HIDDEN"], dropout=sup_params["DROP_PROB"]):
        super().__init__()

        self.ff = nn.Sequential(
            nn.Linear(d_hidden, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_hidden)
        )

    def forward(self, x):
        # shape(x) = [B x seq_len x D]

        return self.ff(x)
        # shape(ff) = [B x seq_len x D]


In [None]:
model = nn.Transformer(d_model=sup_params["D_HIDDEN"], nhead=sup_params["HEADS"])