# BERT

In [1]:
import math
import re
from random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## 1. Data

In [2]:
import datasets
dataset = datasets.load_dataset('bookcorpus', split='train[:10000]')
dataset

Dataset({
    features: ['text'],
    num_rows: 10000
})

## 2. Preprocessing

### Tokenization and numericalization

In [3]:
import torchtext

tokenizer = torchtext.data.utils.get_tokenizer('basic_english')

In [4]:
sentences = [x.replace("\n", " ") for x in dataset['text']]
# Remove any text which, after tokenized, has tokens length > 125
sentences = [x for x in sentences if len(tokenizer(x)) <= 125]
len(sentences)

10000

In [5]:
#lower case, and clean all the symbols
text = [x.lower() for x in sentences]
text = [re.sub("[.,!?\\-]", '', x) for x in text]

In [6]:
# Tokenize the cleaned text
tokenized_sentences = [tokenizer(sentence) for sentence in text]

In [7]:
# Build vocabulary from the tokenized sentences
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_sentences, specials=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"])
vocab.set_default_index(vocab['[UNK]'])

In [8]:
# Get the size of the vocabulary
vocab_size = len(vocab)

In [9]:
# Save the vocabulary for later use
torch.save(vocab, './model/vocab.pth')

In [10]:
# Convert sentences to numerical representation using the vocabulary
token_list = [[vocab[token] for token in tokens] for tokens in tokenized_sentences]

In [11]:
# Optionally, we can save 'token_list' for training
torch.save(token_list, './model/tokenized_sentences.pth')

## 3. Data loader

We gonna make dataloader.  Inside here, we need to make two types of embeddings: **token embedding** and **segment embedding**

1. **Token embedding** - Given “The cat is walking. The dog is barking”, we add [CLS] and [SEP] >> “[CLS] the cat is walking [SEP] the dog is barking”. 

2. **Segment embedding**
A segment embedding separates two sentences, i.e., [0 0 0 0 1 1 1 1 ]

3. **Masking**
As mentioned in the original paper, BERT randomly assigns masks to 15% of the sequence. In this 15%, 80% is replaced with masks, while 10% is replaced with random tokens, and the rest 10% is left as is.  Here we specified `max_pred` 

4. **Padding**
Once we mask, we will add padding. For simplicity, here we padded until some specified `max_len`. 

Note:  `positive` and `negative` are just simply counts to keep track of the batch size.  `positive` refers to two sentences that are really next to one another.

In [12]:
batch_size = 6
max_mask   = 5 #even though it does not reach 15% yet....maybe you can set this threshold
max_len    = 256 #maximum length that my transformer will accept.....all sentence will be padded

In [13]:
def make_batch(batch_size, max_mask, max_len):
    batch = []
    half_batch_size = batch_size // 2
    positive = negative = 0
    while positive != half_batch_size or negative != half_batch_size:
        
        # Randomly choose two sentences from the pre-processed token list
        tokens_a_index, tokens_b_index = np.random.randint(len(sentences), size=2)
        tokens_a, tokens_b = token_list[tokens_a_index], token_list[tokens_b_index]
        
        # Construct input_ids by concatenating [CLS], tokens_a, [SEP], tokens_b, and [SEP]
        input_ids = [vocab['[CLS]']] + tokens_a + [vocab['[SEP]']] + tokens_b + [vocab['[SEP]']]
        
        # Create segment_ids to differentiate between the two sentences (0s for the first, 1s for the second)
        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)
        
        # Determine the number of tokens to mask based on 15% of the sequence length
        n_pred = min(max_mask, max(1, int(round(len(input_ids) * 0.15))))
        
        # Identify candidate positions for masking, excluding [CLS] and [SEP]
        candidates_masked_pos = [i for i, token in enumerate(input_ids) if token != vocab['[CLS]'] and token != vocab['[SEP]']]
        np.random.shuffle(candidates_masked_pos)
        masked_tokens, masked_pos = [], []
        
        # Apply masking with a probability distribution: 10% random token, 80% [MASK], 10% unchanged
        for pos in candidates_masked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            rand_val = np.random.random()
            if rand_val < 0.1:  # 10% chance to replace with a random token
                index = np.random.randint(4, vocab_size - 1)  # Exclude special tokens
                input_ids[pos] = vocab[vocab.get_itos()[index]]
            elif rand_val < 0.8:  # 80% chance to replace with [MASK]
                input_ids[pos] = vocab['[MASK]']
            # 10% chance to leave the token unchanged
            
        # Pad input_ids and segment_ids to the maximum sequence length
        n_pad = max_len - len(input_ids)
        input_ids.extend([0] * n_pad)  # Using vocab['[PAD]'] might be more explicit
        segment_ids.extend([0] * n_pad)
        
        # Pad masked_tokens and masked_pos to ensure uniformity
        if max_mask > n_pred:
            n_pad = max_mask - n_pred
            masked_tokens.extend([0] * n_pad)  # Consider using a placeholder other than 0 for clarity
            masked_pos.extend([0] * n_pad)
        
        # Append the processed example to the batch, marking as positive or negative based on sentence adjacency
        if tokens_a_index + 1 == tokens_b_index and positive < half_batch_size:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True])  # True for consecutive sentences
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < half_batch_size:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False])  # False for non-consecutive sentences
            negative += 1
        
    return batch


In [14]:
batch = make_batch(batch_size, max_mask, max_len)

In [15]:
len(batch)

6

In [16]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

In [17]:
input_ids.shape, segment_ids.shape, masked_tokens.shape, masked_pos.shape, isNext

(torch.Size([6, 256]),
 torch.Size([6, 256]),
 torch.Size([6, 5]),
 torch.Size([6, 5]),
 tensor([0, 0, 0, 1, 1, 1]))

In [18]:
masked_tokens

tensor([[   5,    5,   14,   70,    0],
        [   6,   12,  298,   72,    0],
        [  31,    5,  421,   10,    0],
        [  87,   68,  746,    6,    0],
        [ 358, 1185,   57,   10,    0],
        [  13,    6,    0,    0,    0]])

## 4. Model

Recall that BERT only uses the encoder.

BERT has the following components:

- Embedding layers
- Attention Mask
- Encoder layer
- Multi-head attention
- Scaled dot product attention
- Position-wise feed-forward network
- BERT (assembling all the components)

## 4.1 Embedding

<img src = "../figures/BERT_embed.png" width=500>

In [19]:
class Embedding(nn.Module):
    """
    Embedding layer that combines token embeddings, position embeddings, and segment embeddings,
    followed by layer normalization. Designed for use in transformer models.
    """
    def __init__(self, vocab_size, max_len, n_segments, d_model, device):
        """
        Initializes the embedding layer.

        Args:
            vocab_size (int): Size of the vocabulary.
            max_len (int): Maximum length of the input sequences.
            n_segments (int): Number of distinct segments or types of tokens.
            d_model (int): Dimensionality of the embeddings.
            device (torch.device): The device (CPU/GPU) where the tensors will be allocated.
        """
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # Embedding layer for token IDs.
        self.pos_embed = nn.Embedding(max_len, d_model)     # Embedding layer for positional encoding.
        self.seg_embed = nn.Embedding(n_segments, d_model)  # Embedding layer for segment IDs.
        self.norm = nn.LayerNorm(d_model)  # Layer normalization to stabilize the inputs to the subsequent layers.
        self.device = device  # Specifies the device for tensor allocation.

    def forward(self, x, seg):
        """
        Forward pass of the embedding layer.

        Args:
            x (Tensor): Input tensor with token IDs. Shape: (batch_size, sequence_length).
            seg (Tensor): Segment ID tensor to indicate different parts of the input. Shape: (batch_size, sequence_length).

        Returns:
            Tensor: The combined embeddings with normalization applied. Shape: (batch_size, sequence_length, d_model).
        """
        seq_len = x.size(1)  # Determine the sequence length from the input tensor.
        
        # Generate a position tensor, move it to the specified device, and match its shape with the input tensor.
        pos = torch.arange(seq_len, dtype=torch.long).to(self.device)
        pos = pos.unsqueeze(0).expand_as(x)  # Shape transformation to match `x`.
        
        # Compute the sum of token embeddings, position embeddings, and segment embeddings.
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        
        # Apply layer normalization to the combined embeddings before returning.
        return self.norm(embedding)


## 4.2 Attention mask

In [20]:
def get_attn_pad_mask(seq_q, seq_k, device):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1).to(device)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

### Testing the attention mask

In [21]:
print(get_attn_pad_mask(input_ids, input_ids, 'cpu').shape)

torch.Size([6, 256, 256])


## 4.3 Encoder

The encoder has two main components: 

- Multi-head Attention
- Position-wise feed-forward network

First let's make the wrapper called `EncoderLayer`

In [22]:
class EncoderLayer(nn.Module):
    """
    Represents a single layer within a transformer encoder.
    
    This layer consists of two main components:
    1. Multi-head self-attention mechanism.
    2. Position-wise feed-forward network.
    
    Each encoder layer processes the input sequence using self-attention and then applies a position-wise feed-forward neural network to the result.
    """
    def __init__(self, n_heads, d_model, d_ff, d_k, device):
        """
        Initializes the encoder layer with specified parameters.
        
        Args:
            n_heads (int): Number of attention heads.
            d_model (int): Dimensionality of the model's output space.
            d_ff (int): Dimensionality of the feed-forward network's inner layer.
            d_k (int): Dimensionality of the key/query vectors in the attention mechanism.
            device (torch.device): Device (CPU/GPU) on which the computations will be executed.
        """
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention(n_heads, d_model, d_k, device)  # Initializes the multi-head self-attention.
        self.pos_ffn = PoswiseFeedForwardNet(d_model, d_ff)  # Initializes the position-wise feed-forward network.

    def forward(self, enc_inputs, enc_self_attn_mask):
        """
        Forward pass of the encoder layer.
        
        Args:
            enc_inputs (Tensor): Input tensor to the encoder layer. Shape: (batch_size, sequence_length, d_model).
            enc_self_attn_mask (Tensor): Mask tensor for self-attention mechanism to ignore specific positions within the input. Shape: (batch_size, sequence_length).
        
        Returns:
            Tuple[Tensor, Tensor]: A tuple containing:
                - The output of the encoder layer after processing `enc_inputs`. Shape: (batch_size, sequence_length, d_model).
                - The attention weights from the multi-head self-attention mechanism. Shape varies based on implementation.
        """
        # Applies self-attention to the input. The same tensor `enc_inputs` is used as queries, keys, and values.
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        
        # Applies the position-wise feed-forward network to the output of the self-attention mechanism.
        enc_outputs = self.pos_ffn(enc_outputs)
        
        return enc_outputs, attn


Let's define the scaled dot attention, to be used inside the multihead attention

In [23]:
class ScaledDotProductAttention(nn.Module):
    """
    Implements the scaled dot-product attention mechanism.

    The attention function used here is the dot product of queries and keys, scaled by the square root of the dimensionality of keys, followed by application of a softmax function to obtain the weights on the values.
    """
    def __init__(self, d_k, device):
        """
        Initializes the ScaledDotProductAttention layer.

        Args:
            d_k (int): Dimensionality of the key vectors. It is used to scale the dot product of the queries and keys.
            device (torch.device): Specifies the device for computation. This is important for
                                   transferring the scaling factor to the same device as the input tensors.
        """
        super(ScaledDotProductAttention, self).__init__()
        # Scale factor for the dot products, sqrt(d_k), moved to the specified device.
        self.scale = torch.sqrt(torch.FloatTensor([d_k])).to(device)

    def forward(self, Q, K, V, attn_mask):
        """
        Forward pass of the scaled dot-product attention.

        Args:
            Q (Tensor): Queries. Shape: [batch_size, n_heads, len_q, d_k].
            K (Tensor): Keys. Shape: [batch_size, n_heads, len_k, d_k].
            V (Tensor): Values. Shape: [batch_size, n_heads, len_v(=len_k), d_v].
            attn_mask (Tensor): An attention mask to prevent attention to certain positions. This is important for masking out padding tokens. Shape: [batch_size, n_heads, len_q, len_k].

        Returns:
            Tuple[Tensor, Tensor]: A tuple containing:
                - The context tensor after applying attention to the value vectors. Shape: [batch_size, n_heads, len_q, d_v].
                - The attention weights. Shape: [batch_size, n_heads, len_q, len_k].
        """
        # Calculate the dot products of Q and K, scale them, and apply the attention mask.
        scores = torch.matmul(Q, K.transpose(-1, -2)) / self.scale
        scores.masked_fill_(attn_mask, -1e9)  # Apply attention mask with a large negative number to softmax.

        # Apply softmax to get the attention weights.
        attn = nn.Softmax(dim=-1)(scores)

        # Multiply the attention weights with the value vectors to get the context.
        context = torch.matmul(attn, V)

        return context, attn


Let's define the parameters first

In [24]:
n_layers = 6    # number of Encoder of Encoder Layer
n_heads  = 8    # number of heads in Multi-Head Attention
d_model  = 768  # Embedding Size
d_ff = 768 * 4  # 4*d_model, FeedForward dimension
d_k = d_v = 64  # dimension of K(=Q), V
n_segments = 2

Here is the Multiheadattention.

In [25]:
class MultiHeadAttention(nn.Module):
    """
    Implements the Multi-Head Attention mechanism, a crucial component in Transformers. 
    This mechanism allows the model to jointly attend to information from different representation subspaces at different positions.
    """
    def __init__(self, n_heads, d_model, d_k, device):
        """
        Initializes the MultiHeadAttention layer.
        
        Args:
            n_heads (int): Number of attention heads.
            d_model (int): Dimensionality of the model's output space.
            d_k (int): Dimensionality of the key/query vectors in each attention head.
            device (torch.device): Device (CPU/GPU) on which the computations will be executed.
        """
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_k = d_k
        self.d_v = d_k  # For simplicity, size of 'v' is made equal to that of 'k'.
        # Linear transformations for queries, keys, and values
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_k * n_heads)  # Note: d_v is used as d_k for consistency.
        self.device = device
        # Output linear transformation
        self.fc = nn.Linear(n_heads * d_k, d_model).to(device)
        # Layer normalization
        self.layer_norm = nn.LayerNorm(d_model).to(device)

    def forward(self, Q, K, V, attn_mask):
        """
        Forward pass of the MultiHeadAttention layer.
        
        Args:
            Q (Tensor): Queries tensor. Shape: [batch_size, len_q, d_model].
            K (Tensor): Keys tensor. Shape: [batch_size, len_k, d_model].
            V (Tensor): Values tensor. Shape: [batch_size, len_v(=len_k), d_model].
            attn_mask (Tensor): Tensor indicating positions to be masked with negative infinity for softmax. Shape: [batch_size, len_q, len_k].
        
        Returns:
            Tuple[Tensor, Tensor]: A tuple containing:
                - Output tensor after applying multi-head attention and residual connection followed by layer normalization. Shape: [batch_size, len_q, d_model].
                - Attention weights across the heads. Shape: [batch_size, n_heads, len_q, len_k].
        """
        residual, batch_size = Q.clone(), Q.size(0)
        # Prepare query, key, value tensors for attention mechanism
        q_s = self.W_Q(Q).view(batch_size, -1, self.n_heads, self.d_k).transpose(1,2)
        k_s = self.W_K(K).view(batch_size, -1, self.n_heads, self.d_k).transpose(1,2)
        v_s = self.W_V(V).view(batch_size, -1, self.n_heads, self.d_v).transpose(1,2)
        # Repeat attn_mask for each attention head
        attn_mask = attn_mask.unsqueeze(1).repeat(1, self.n_heads, 1, 1)

        # Apply scaled dot product attention
        context, attn = ScaledDotProductAttention(self.d_k, self.device)(q_s, k_s, v_s, attn_mask)
        # Concatenate heads and apply final linear transformation
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * self.d_v)
        output = self.fc(context)
        # Apply layer normalization and residual connection
        return self.layer_norm(output + residual), attn


Here is the PoswiseFeedForwardNet.

In [26]:
import torch.nn as nn
import torch.nn.functional as F

class PoswiseFeedForwardNet(nn.Module):
    """
    Implements a position-wise feedforward network as described in the Transformer model architecture.
    This network is applied to each position separately and identically. It consists of two fully-connected layers
    with a GELU non-linearity between them.
    """
    def __init__(self, d_model, d_ff):
        """
        Initializes the position-wise feedforward network.

        Args:
            d_model (int): The number of expected features in the input (model dimension).
            d_ff (int): The dimensionality of the feed-forward network's inner layer.
        """
        super(PoswiseFeedForwardNet, self).__init__()
        # First fully connected layer increases dimensionality from d_model to d_ff
        self.fc1 = nn.Linear(d_model, d_ff)
        # Second fully connected layer decreases dimensionality back from d_ff to d_model
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """
        Forward pass through the position-wise feedforward network.

        Args:
            x (Tensor): The input tensor with shape [batch_size, sequence_length, d_model].

        Returns:
            Tensor: The output tensor with the same shape as the input, [batch_size, sequence_length, d_model].
        """
        # Apply the first linear transformation followed by a GELU non-linearity
        # Then apply the second linear transformation to project the dimensions back
        return self.fc2(F.gelu(self.fc1(x)))


## 4.4 Putting them together

In [27]:
class BERT(nn.Module):
    def __init__(self, n_layers, n_heads, d_model, d_ff, d_k, n_segments, vocab_size, max_len, device):
        super(BERT, self).__init__()
        self.params = {'n_layers': n_layers, 'n_heads': n_heads, 'd_model': d_model,
                       'd_ff': d_ff, 'd_k': d_k, 'n_segments': n_segments,
                       'vocab_size': vocab_size, 'max_len': max_len}
        self.embedding = Embedding(vocab_size, max_len, n_segments, d_model, device)
        self.layers = nn.ModuleList([EncoderLayer(n_heads, d_model, d_ff, d_k, device) for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, d_model)
        self.activ = nn.Tanh()
        self.linear = nn.Linear(d_model, d_model)
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, 2)
        # decoder is shared with embedding layer
        embed_weight = self.embedding.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))
        self.device = device

    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids, self.device)
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)
        # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]
        
        # 1. predict next sentence
        # it will be decided by first token(CLS)
        h_pooled   = self.activ(self.fc(output[:, 0])) # [batch_size, d_model]
        logits_nsp = self.classifier(h_pooled) # [batch_size, 2]

        # 2. predict the masked token
        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]
        h_masked = torch.gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
        h_masked  = self.norm(F.gelu(self.linear(h_masked)))
        logits_lm = self.decoder(h_masked) + self.decoder_bias # [batch_size, max_pred, n_vocab]

        return logits_lm, logits_nsp
    
    def get_last_hidden_state(self, input_ids, segment_ids):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids, self.device)
        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask)

        return output

## 5. Training

In [28]:
# Check if CUDA (GPU support) is available and set PyTorch to use it, otherwise use the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Set a fixed seed for PyTorch's random number generator for reproducibility
SEED = 1234
torch.manual_seed(SEED)

# Ensure that CUDA's convolution operations are deterministic
# This may impact performance, but it ensures reproducibility
torch.backends.cudnn.deterministic = True


cpu


In [29]:
# Transformer model configuration parameters.

n_layers = 6    # Encoder layers: Increases depth for learning complex patterns.
n_heads  = 8    # Attention heads: Allows simultaneous focus on different sequence parts.
d_model  = 768  # Embedding dimension: Affects model capacity and representation.
d_ff = 768 * 4  # FeedForward dimension: Inner layer dimension, typically 4x d_model.
d_k = d_v = 64  # Key/query and value vector dimensions in attention mechanism.
n_segments = 2  # Segment types: Used for tasks with multiple sequences (e.g., NSP).

In [30]:
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim

# Assuming model configuration parameters and the make_batch function are defined elsewhere

# Setting up training parameters and the model
num_epoch = 3
num_step = 250  # Number of training steps per epoch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model, loss criterion, and optimizer
model = BERT(n_layers, n_heads, d_model, d_ff, d_k, n_segments, vocab_size, max_len, device).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(num_epoch):
    epoch_loss = 0
    # Generate batches for each epoch to introduce variability
    batch = [make_batch(batch_size, max_mask, max_len) for _ in tqdm(range(num_step), desc=f"Preparing batches for Epoch {epoch+1}")]

    for step in tqdm(range(num_step), desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        # Unpack the batch data and move tensors to the appropriate device
        input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch[step]))
        input_ids, segment_ids = input_ids.to(device), segment_ids.to(device)
        masked_tokens, masked_pos = masked_tokens.to(device), masked_pos.to(device)
        isNext = isNext.to(device)

        # Forward pass through the model
        logits_lm, logits_nsp = model(input_ids, segment_ids, masked_pos)

        # Compute losses for both MLM and NSP tasks
        loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens)  # Masked Language Model Loss
        loss_nsp = criterion(logits_nsp, isNext)  # Next Sentence Prediction Loss
        loss = loss_lm + loss_nsp

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Log the average loss for the epoch
    print(f'Epoch: {epoch + 1:02d}, Loss: {epoch_loss / num_step:.6f}')


Preparing batches for Epoch 1: 100%|██████████| 250/250 [18:27<00:00,  4.43s/it]
Epoch 1: 100%|██████████| 250/250 [15:24<00:00,  3.70s/it]


Epoch: 01, Loss: 28.975339


Preparing batches for Epoch 2: 100%|██████████| 250/250 [19:19<00:00,  4.64s/it]
Epoch 2: 100%|██████████| 250/250 [15:04<00:00,  3.62s/it]


Epoch: 02, Loss: 9.930248


Preparing batches for Epoch 3: 100%|██████████| 250/250 [19:37<00:00,  4.71s/it]
Epoch 3: 100%|██████████| 250/250 [14:33<00:00,  3.50s/it]

Epoch: 03, Loss: 8.619430





In [31]:
# save model
save_path = './model/bert.pt'
torch.save([model.params, model.state_dict()], save_path)

## 6. Inference


In [32]:
params, state = torch.load(save_path)
model = BERT(**params, device=device).to(device)
model.load_state_dict(state)

<All keys matched successfully>

In [35]:
# Assuming necessary imports, model initialization, and vocab setup are done previously

# Prepare input tensors from the first batch and move them to the configured device
# This step involves unpacking the input_ids, segment_ids, masked_tokens, masked_pos, and isNext from the batch
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(batch[0][0]))
input_ids = input_ids.to(device)
segment_ids = segment_ids.to(device)
masked_tokens = masked_tokens.to(device)
masked_pos = masked_pos.to(device)
isNext = isNext.to(device)

# Display the input sequence excluding padding for clarity
print([vocab.get_itos()[w.item()] for w in input_ids[0] if vocab.get_itos()[w.item()] != '[PAD]'])

# Forward pass through the model to get predictions for masked tokens and NSP
logits_lm, logits_nsp = model(input_ids, segment_ids, masked_pos)
# logits_lm shape explanation: (batch_size, max_mask, vocab_size) -> (1, 5, vocab_size)
# logits_nsp shape explanation: (batch_size, binary_output) -> (1, 2) for the NSP task

# Processing the output logits for the masked language modeling (MLM) task
# The `.max(2)[1]` operation retrieves the indices of the maximum values along the vocabulary dimension,
# which correspond to the predicted token IDs
logits_lm = logits_lm.data.cpu().max(2)[1][0].data.numpy() 
# Note: zeros in masked_tokens are padding added to match `max_mask`

# Display actual and predicted masked tokens for comparison
print('masked tokens (words): ', [vocab.get_itos()[pos.item()] for pos in masked_tokens[0]])
print('masked tokens list: ', [pos.item() for pos in masked_tokens[0]])
print('predicted masked tokens (words): ', [vocab.get_itos()[pos.item()] for pos in logits_lm])
print('predicted masked tokens list: ', [pos for pos in logits_lm])

# Processing the output logits for the NSP task
# The `.max(1)[1]` operation retrieves the index of the maximum value along the binary output dimension,
# indicating the model's prediction for the NSP task (0 for "no", 1 for "yes")
logits_nsp = logits_nsp.data.cpu().max(1)[1][0].data.numpy()
print('NSP prediction:', logits_nsp)

# Comparing the actual NSP label with the model's prediction
print('Actual isNext:', bool(isNext.item()))
print('Predicted isNext:', bool(logits_nsp))


['[CLS]', 'regardless', 'if', '[MASK]', 'agree', 'or', 'understand', '[MASK]', '[MASK]', "'", '[SEP]', 'pesh', 'turned', 'to', 'megan', 'and', 'gave', 'her', 'a', 'sheepish', 'smile', '[SEP]']
masked tokens (words):  ["'", 'it', 'we', '[PAD]', '[PAD]']
masked tokens list:  [5, 23, 46, 0, 0]
predicted masked tokens (words):  ['[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
predicted masked tokens list:  [0, 0, 0, 0, 0]
NSP prediction: 0
Actual isNext: False
Predicted isNext: False


Trying a bigger dataset should be able to see the difference.