In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from tqdm import tqdm
import math
torch.manual_seed(1234)

from transformers import AutoTokenizer
from datasets import Dataset
import psutil

# pick the compute device: prefer CUDA when PyTorch reports a usable GPU, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

#### A simple/minimal implementation of the BERT model (https://arxiv.org/pdf/1810.04805v2.pdf). 

The transformer block and multi-head attention layer implementations are based on Andrej Karpathy's GPT YouTube tutorial. Here we use a transformer encoder block, which attends over bi-directional context; this differs from the transformer decoder used in GPT, which is unidirectional (achieved via causal masking of the attention weights).

In [2]:
import math

class MultiHeadAttention(nn.Module):
    """Multi-head self-attention with no causal masking (encoder style).

    The input is projected to keys, queries and values of width
    ``total_head_size``, split into ``num_heads`` heads of width
    ``total_head_size // num_heads``, attended with
    ``F.scaled_dot_product_attention`` for all heads at once, concatenated,
    and projected back to ``embedding_dim`` so the result can be added to the
    input in a residual connection.

    Args:
        block_size: side length of the square ``tril`` buffer created below.
        embedding_dim: size C of the last dimension of the input and output.
        total_head_size: combined width H of all heads; must be divisible by ``num_heads``.
        num_heads: number of attention heads n.
        dropout_rate: dropout probability for the attention weights (training only)
            and for the projected output.
    """

    def __init__(self, block_size, embedding_dim, total_head_size, num_heads, dropout_rate):
        super().__init__()

        assert total_head_size % num_heads == 0, "head_size needs to be integer multiple of num_heads"

        # hyper-parameter bookkeeping
        self.block_size = block_size
        self.embedding_dim = embedding_dim
        self.total_head_size = total_head_size
        self.num_heads = num_heads
        self.head_size = total_head_size // num_heads
        self.dropout_rate = dropout_rate

        # bias-free input projections, created in the order key -> query -> value
        self.key = nn.Linear(embedding_dim, total_head_size, bias=False)
        self.query = nn.Linear(embedding_dim, total_head_size, bias=False)
        self.value = nn.Linear(embedding_dim, total_head_size, bias=False)
        # dropout module for the attention weights; forward() does not call it and instead
        # hands the rate to scaled_dot_product_attention through dropout_p
        self.attn_dropout = nn.Dropout(dropout_rate)

        # non-parameter buffer of lower-triangular ones; forward() never reads it
        # because this layer applies no causal mask
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', lower_triangle)

        # output projection H -> C so the result has the same width as the layer input
        self.proj = nn.Linear(total_head_size, embedding_dim)
        self.output_dropout = nn.Dropout(dropout_rate)

    def forward(self, x, attn_mask):
        """Apply self-attention to ``x`` of shape (B,T,C) and return a (B,T,C) tensor.

        ``attn_mask`` is forwarded unchanged as the ``attn_mask`` argument of
        ``F.scaled_dot_product_attention``; it is meant for masking out padding tokens.
        NOTE(review): the q/k/v tensors handed to that function are (B,n,T,h), so the
        mask presumably has to be broadcastable to (B,n,T,T) and of a dtype the
        function accepts -- confirm against how the caller builds it.
        """
        batch, seq_len, _ = x.shape

        def split_heads(projected):
            # (B,T,H) -> (B,T,n,h) -> (B,n,T,h): moving the head axis next to the batch axis
            # lets a single batched call process every head
            return projected.view(batch, seq_len, self.num_heads, self.head_size).transpose(1, 2)

        k = split_heads(self.key(x))
        q = split_heads(self.query(x))
        v = split_heads(self.value(x))

        # Manual (slower) formulation that the fused call below replaces:
        #   W = q @ k.transpose(-2,-1) / math.sqrt(self.head_size)   # (B,n,T,T)
        #   W = W.masked_fill(attn_mask == 0, float('-inf'))
        #   W = F.softmax(W, dim=-1)
        #   W = self.attn_dropout(W)
        #   out = W @ v                                              # (B,n,T,h)

        # attention-weight dropout is only active while training; is_causal stays False (bi-directional)
        attn_dropout_p = self.dropout_rate if self.training else 0
        attended = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=attn_mask,
            dropout_p=attn_dropout_p,
            is_causal=False,
        )

        # (B,n,T,h) -> (B,T,n,h); the transpose leaves the tensor non-contiguous, so make it
        # contiguous before collapsing the head axes into one: (B,T,n,h) -> (B,T,n*h)
        merged = attended.transpose(1, 2).contiguous().view(batch, seq_len, self.total_head_size)

        # project back to (B,T,C) and apply output dropout
        return self.output_dropout(self.proj(merged))
    

# a simple mlp 
class FeedForward(nn.Module):
    """Position-wise MLP: Linear(C -> 4C) -> ReLU -> Linear(4C -> C) -> Dropout.

    The hidden layer is four times wider than the input to add compute, and the
    second linear layer projects back to ``embedding_dim`` so the output can be
    added to the input in a residual connection.

    Args:
        embedding_dim: size C of the last dimension of the input and output.
        dropout_rate: dropout probability applied after the second linear layer.
    """

    def __init__(self, embedding_dim, dropout_rate):
        super().__init__()
        hidden_dim = 4 * embedding_dim
        layers = [
            nn.Linear(embedding_dim, hidden_dim),   # expand
            nn.ReLU(),
            nn.Linear(hidden_dim, embedding_dim),   # project back to the input width
            nn.Dropout(dropout_rate),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP; the last dimension of the result equals ``embedding_dim``."""
        return self.net(x)
    

# transformer encoder block with residual connection and layer norm
# Note: the original transformer uses post layer norms, here we use pre layer norms, i.e. layer norm is applied at the input
# instead of the output, this typically leads to better results in terms of training convergence speed and gradient scaling 
class TransformerBlock(nn.Module):
    """Pre-LayerNorm transformer encoder block.

    Two residual sub-layers are applied in sequence: multi-head self-attention,
    then a feed-forward network. Each sub-layer sees a layer-normalised copy of
    its input (norm at the input rather than at the output).

    Args:
        block_size: forwarded to ``MultiHeadAttention``.
        embedding_dim: size C of the last dimension of the input and output.
        head_size: combined width of all attention heads (the ``total_head_size``
            of ``MultiHeadAttention``).
        num_heads: number of attention heads.
        dropout_rate: dropout probability used by both sub-layers.
    """

    def __init__(self, block_size, embedding_dim, head_size, num_heads, dropout_rate):
        super().__init__()
        # multi-head attention sub-layer
        self.sa = MultiHeadAttention(
            block_size=block_size,
            embedding_dim=embedding_dim,
            total_head_size=head_size,
            num_heads=num_heads,
            dropout_rate=dropout_rate,
        )
        # feed-forward sub-layer
        self.ff = FeedForward(embedding_dim=embedding_dim, dropout_rate=dropout_rate)
        # layer norms applied at the input of the attention / feed-forward sub-layers
        self.ln1 = nn.LayerNorm(embedding_dim)
        self.ln2 = nn.LayerNorm(embedding_dim)

    def forward(self, x, attn_mask):
        """Return ``x`` after both residual sub-layers; ``attn_mask`` is handed to the attention layer."""
        # residual around attention, computed on the normalised input
        attention_update = self.sa(self.ln1(x), attn_mask)
        x = x + attention_update
        # residual around the feed-forward network, computed on the normalised attention output
        feed_forward_update = self.ff(self.ln2(x))
        return x + feed_forward_update
    

# BERT model with multiple transformer blocks 
class BERTModel(nn.Module):
    """Minimal BERT-style encoder: token + position embeddings, a stack of
    transformer blocks, and a tanh-pooled encoding of the first ([CLS]) position.

    Args:
        vocab_size: number of rows of the token embedding table.
        block_size: maximum input sequence length (rows of the position embedding table).
        embedding_dim: embedding / hidden size C.
        head_size: combined width of all attention heads, forwarded to each block.
        num_heads: number of attention heads per block.
        num_blocks: number of stacked transformer blocks.
        pad_token_id: token id used as ``padding_idx`` of the token embedding.
        dropout_rate: dropout probability forwarded to each block.
    """

    def __init__(self, vocab_size, block_size, embedding_dim, head_size, num_heads, num_blocks, pad_token_id, dropout_rate=0.2):
        super().__init__()

        self.vocab_size = vocab_size
        self.block_size = block_size        # block_size is just the input sequence length
        self.embedding_dim = embedding_dim
        self.head_size = head_size
        self.num_heads = num_heads
        # the attribute was originally misspelled `hum_heads`; kept as an alias so existing readers keep working
        self.hum_heads = num_heads
        self.num_blocks = num_blocks

        # ---- model parameters ----
        # token embedding layer
        self.token_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_token_id) # shape: (vocab_size,C)
        # position embedding layer
        self.pos_embedding = nn.Embedding(block_size, embedding_dim) # shape: (T,C)
        # segment embedding layer (disabled for now)
        #self.segment_embedding = nn.Embedding(2, embedding_dim)

        # stack of transformer blocks
        self.blocks = nn.ModuleList([TransformerBlock(block_size, embedding_dim, head_size, num_heads, dropout_rate) for _ in range(num_blocks)])

        # pooling transformation of CLS token
        self.pooling_linear = nn.Linear(embedding_dim, embedding_dim) # shape: (C,C)
        self.pooling_activation_fn = nn.Tanh()

        # Position indices 0..block_size-1, shape (1, block_size), kept in a buffer for fast access.
        # The tensor is passed straight to register_buffer: assigning it to self.position_idx first
        # makes register_buffer raise KeyError ("attribute 'position_idx' already exists").
        # No device is given here, so the buffer follows the module through .to(...) / .cuda()
        # together with the embedding tables instead of depending on a global `device`.
        self.register_buffer('position_idx', torch.arange(block_size).unsqueeze(0))

    def forward(self, idx, attn_mask, targets=None, segment_idx=None):
        """Encode a batch of token id sequences.

        Args:
            idx: integer token ids of shape (B,T) with T <= ``block_size``.
            attn_mask: passed unchanged to every transformer block (used there to mask
                attention to padding tokens).
            targets: accepted for interface compatibility; not used by this method.
            segment_idx: accepted for interface compatibility; not used while the
                segment embedding is disabled.

        Returns:
            Tuple ``(x, pooled_cls_encoding)``: the per-token encodings of shape (B,T,C)
            and the pooled first-position encoding of shape (B,C).

        Raises:
            ValueError: if T exceeds ``block_size``.
        """
        B, T = idx.shape
        if T > self.block_size:
            # only block_size position embeddings exist, so longer inputs cannot be encoded
            raise ValueError(f"input sequence length {T} exceeds block_size {self.block_size}")

        # get token embeddings
        token_embeds = self.token_embedding(idx) # (B,T,C)
        # positional embeddings for the first T positions
        pos_embeds = self.pos_embedding(self.position_idx[:,:T]) # (1,T,C), broadcast over the batch below

        # add sentence segment embedding (disabled for now)
        # segment_embeds = self.segment_embedding(segment_idx) # segment_idx is an integer tensor of shape (B,T) with 0's at positions
        # of the first sentence and 1's at positions of the second sentence

        x = token_embeds + pos_embeds # (B,T,C)
        # pass through transformer blocks to get encoding
        for block in self.blocks:
            x = block(x, attn_mask) # (B,T,C)

        # get CLS token encoding (first position) and apply pooling transform
        cls_encoding = x[:,0] # (B,C)
        pooled_cls_encoding = self.pooling_activation_fn(self.pooling_linear(cls_encoding)) # (B,C)

        return x, pooled_cls_encoding


#### We will use the WordPiece Tokenizer with the vocabulary used for pre-training the original BERT (https://huggingface.co/learn/nlp-course/chapter6/6?fw=pt)

In [3]:
# load the uncased WordPiece tokenizer published under the "bert-base-uncased" name
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# peek at 20 entries of the WordPiece vocabulary (iterating the vocab mapping yields its token strings)
list(tokenizer.vocab)[:20]

['freaking',
 'evenly',
 '##গ',
 'internacional',
 'inference',
 '##ious',
 'yuan',
 'atoll',
 'compilation',
 'cmll',
 'ろ',
 '##morphic',
 'trillion',
 'squire',
 '##岡',
 'irs',
 'denote',
 '##rigues',
 'comedian',
 '##shu']

In [4]:
# open the books corpus from its Arrow file on disk (path is relative to the notebook's working directory)
dataset = Dataset.from_file(filename='book_corpus_dataset/archive/train/dataset.arrow')

In [5]:
# report the resident set size (rss) of the current Python process, converted from bytes to MB
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

RAM used: 1142.43 MB


In [8]:
# check how many sentences (rows) are in the dataset
len(dataset)

74004228

In [9]:
# show the first 10 sentences: slice the rows first, then pick the 'text' field of the sliced result
dataset[:10]['text']

['the half-ling book one in the fall of igneeria series kaylee soderburg copyright 2013 kaylee soderburg all rights reserved .',
 'isbn : 1492913731 isbn-13 : 978-1492913733 for my family , who encouraged me to never stop fighting for my dreams chapter 1 summer vacations supposed to be fun , right ?',
 'i wish i had a better answer to that question .',
 'starlings , new york is not the place youd expect much to happen .',
 'its a small quiet town , the kind where everyone knows your name .',
 'its a place where your parents wouldnt even care if you stayed out late biking with your friends .',
 'only because everyone felt so safe , so comfy .',
 'they dont know the half of it .',
 'but i do .',
 'i know it all and starlings is not the place where you want to be after dark .']

In [10]:
# walk the first sentence of the dataset through the tokenizer, stage by stage
example_sentence = dataset[0]['text']

# stage 1: split the raw string into WordPiece subword strings
subwords = tokenizer.tokenize(example_sentence)

# stage 2: call the tokenizer itself to encode the sentence into token ids (plus the accompanying lists)
tokenized_sentence = tokenizer(example_sentence)

# stage 3: decode the token id sequence back into a string
decoded_sentence = tokenizer.decode(tokenized_sentence['input_ids'])

# print the raw sentence followed by the result of each stage, one per line
print(example_sentence, subwords, tokenized_sentence, decoded_sentence, sep="\n")

the half-ling book one in the fall of igneeria series kaylee soderburg copyright 2013 kaylee soderburg all rights reserved .
['the', 'half', '-', 'ling', 'book', 'one', 'in', 'the', 'fall', 'of', 'ign', '##eer', '##ia', 'series', 'kay', '##lee', 'so', '##der', '##burg', 'copyright', '2013', 'kay', '##lee', 'so', '##der', '##burg', 'all', 'rights', 'reserved', '.']
{'input_ids': [101, 1996, 2431, 1011, 17002, 2338, 2028, 1999, 1996, 2991, 1997, 16270, 11510, 2401, 2186, 10905, 10559, 2061, 4063, 4645, 9385, 2286, 10905, 10559, 2061, 4063, 4645, 2035, 2916, 9235, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


2023-12-04 21:39:36.527214: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-04 21:39:36.685585: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[CLS] the half - ling book one in the fall of igneeria series kaylee soderburg copyright 2013 kaylee soderburg all rights reserved. [SEP]


Note that applying the tokenizer returns a dictionary containing 3 lists: the token id sequence (`input_ids`), the segment ids (`token_type_ids`), and the attention mask (`attention_mask`).

In [11]:
# we can also prevent the tokenizer from returning the token_type_ids list because we won't be doing the next sentence
# prediction task and we've disabled segment embedding.
# We can also have the tokenizer return pytorch tensors; note that padding tokens have mask value zero.
# padding='max_length' only pads shorter inputs up to max_length; truncation=True is what caps longer inputs,
# so together they guarantee a fixed width of 128 tokens.
tokenized_sentence = tokenizer(
    example_sentence,
    return_token_type_ids=False,
    return_tensors='pt',
    padding='max_length',
    truncation=True,
    max_length=128,
)
tokenized_sentence

{'input_ids': tensor([[  101,  1996,  2431,  1011, 17002,  2338,  2028,  1999,  1996,  2991,
          1997, 16270, 11510,  2401,  2186, 10905, 10559,  2061,  4063,  4645,
          9385,  2286, 10905, 10559,  2061,  4063,  4645,  2035,  2916,  9235,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [11]:
# look up the ids the tokenizer assigns to its special tokens and print them one per line
print(
    f"[PAD] token id: {tokenizer.pad_token_id}",
    f"[MASK] token id: {tokenizer.mask_token_id}",
    f"[CLS] token id: {tokenizer.cls_token_id}",
    f"[SEP] token id: {tokenizer.sep_token_id}",
    f"[UNK] token id: {tokenizer.unk_token_id}",
    sep="\n",
)

[PAD] token id: 0
[MASK] token id: 103
[CLS] token id: 101
[SEP] token id: 102
[UNK] token id: 100


#### Now, let's create a data loader for serving up batches of tokenized sentences.

In [10]:
from torch.utils.data import DataLoader

block_size = 128   # fixed sequence length: every example is padded or truncated to this many tokens
batch_size = 32

# tokenize the whole corpus in batches.
# padding='max_length' only pads short sentences up to block_size; truncation=True is required to also cut
# sentences that tokenize to more than block_size tokens. Without it those examples stay over-length and
# later exceed the number of positions the model has embeddings for.
tokenized_dataset = dataset.map(
    lambda x: tokenizer(
        x['text'],
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        max_length=block_size,
    ),
    batched=True,
)

Map:   0%|          | 0/74004228 [00:00<?, ? examples/s]

In [19]:
# Set the format so that indexing the dataset only returns the 'input_ids' and 'attention_mask' columns,
# delivered as PyTorch tensors (type='torch')
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [22]:
from transformers import DataCollatorForLanguageModeling

# collator for masked language modeling: picks tokens with probability 0.15 (mlm_probability) as prediction targets
# NOTE(review): per the transformers documentation not every picked token is replaced by [MASK] (some are swapped
# for a random token or left unchanged) -- confirm against the installed version
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# pytorch dataloader serving shuffled batches of batch_size examples, assembled by the collator above
dataloader = DataLoader(
    dataset=tokenized_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collator,
)

In [28]:
# show a batch: build a fresh iterator over the dataloader and pull its first batch
# (the dataloader was created with shuffle=True, so the sentences in this batch vary between runs)
batch = next(iter(dataloader))