## Transformers from Scratch



In [1]:
import sys
import numpy as np
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoConfig, AutoModel, AutoTokenizer
from math import sqrt
from torch import nn
from loguru import logger
from typing import Optional
import matplotlib.pyplot as plt
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def scaled_dot_product_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask: Optional[torch.Tensor] = None,
    dropout: Optional[nn.Dropout] = None
) -> torch.Tensor:
    """
    Compute scaled dot product attention weights.

    Args:
        query: Tensor with shape [batch_size, seq_length_q, depth_q].
        key: Tensor with shape [batch_size, seq_length_k, depth_k].
        value: Tensor with shape [batch_size, seq_length_v, depth_v].
        mask: Optional tensor with shape [batch_size, seq_length_q, seq_length_k],
            containing values to be masked. Default is None.

    Returns:
        Tensor with shape [batch_size, seq_length_q, depth_v].
    """

    dim_k = query.size(-1)
    logger.debug(f"query_size: {query.size()}")
    logger.debug(f"key: {key.transpose(-2, -1).size()}")
    # TODO
    # Compute the attention scores.
    # Scale the dot product simialrity between the query and the key tensors.
    scores = torch.matmul(query, key.transpose(-2, -1)) / sqrt(dim_k)
    if mask is not None:
        mask = mask.unsqueeze(1)
        scores = scores.masked_fill(mask == 0, float("-inf"))
    # TODO:
    # Compute attention weights w.
    # Apply the softmax function to the scaled similarity matrix.
    weights = F.softmax(scores, dim=-1)

    if dropout is not None:
        weights = dropout(weights)
    # TODO:
    # Update the token embeddings.
    # Multiply the attention weights w by the value vectors V
    # to obtain a weighted sum of the values.
    return torch.matmul(weights, value)

#### 1.2. Multi-head Attention
- The multi-head attention is an extension of the self-attention mechanism. It enhances the modeling capability by performing multiple attention computations in parallel, with different learned linear projections.
$$ \begin{matrix}
\text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1,...,\text{head}_h)W^O\\
\text{where}~\text{head}_i = \text{Attention}(QW_i^Q, KW_i^K,VW_i^V)
\end{matrix} $$
- Steps:
  1. It applies linear transformations to the query, key, and value tensors using the learned linear layers self.q, self.k, and self.v, respectively. This projects the tensors to the appropriate dimensions for attention computation.
  2. The attention scores are computed by performing matrix multiplication between the query and key tensors.
  3. The attention scores are scaled by dividing them by the square root of the head dimension (`self.head_dim`).
  4. If a mask is provided, the attention scores are masked by setting the scores corresponding to masked positions to negative infinity.
  5. The attention scores are passed through a softmax activation function along the last dimension (`dim=-1`).
  6. The attention probabilities are used to weight the value tensor.
  7. The resulting attention output is transposed and reshaped to match the original shape.
  8. Finally, the attention output is passed through the `self.output_linear` linear layer, which applies another linear transformation to the output representation.

In [3]:

class MultiHeadAttention(nn.Module):
    """
    Multi-head attention module.

    Args:
        config: Configuration for the multi-head attention.
    """
    def __init__(self, config) -> None:
        super().__init__()
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        logger.debug(f"hidden_dim: {self.embed_dim}")
        logger.debug(f"num_heads: {self.num_heads}")

        assert self.embed_dim % self.num_heads == 0
        self.head_dim = self.embed_dim // self.num_heads
        logger.debug(f"head_dim: {self.head_dim}")

        self.q = nn.Linear(self.embed_dim, self.head_dim * self.num_heads)
        self.k = nn.Linear(self.embed_dim, self.head_dim * self.num_heads)
        self.v = nn.Linear(self.embed_dim, self.head_dim * self.num_heads)
        self.output_linear = nn.Linear(self.embed_dim, self.embed_dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
            self,
            query: torch.Tensor,
            key: torch.Tensor,
            value: torch.Tensor,
            mask: Optional[torch.Tensor] = None
        ) -> torch.Tensor:
            """
            Perform a forward pass of the multi-head attention.

            Args:
                query: Query tensor of shape [batch_size, seq_len, embed_dim].
                key: Key tensor of shape [batch_size, seq_len, embed_dim].
                value: Value tensor of shape [batch_size, seq_len, embed_dim].
                mask: Optional mask tensor. Default is None.

            Returns:
                Tensor of shape [batch_size, seq_len, embed_dim],
                representing the output of the multi-head attention.
            """
            # TODO:
            # Apply linear transformations to the query, key, and value tensors
            q = self.q(query)
            k = self.k(key)
            v = self.v(value)
            logger.debug(f"q_size: {q.size()}")
            logger.debug(f"k_size: {k.size()}")
            logger.debug(f"v_size: {v.size()}")

            # Reshape and transpose tensors for matrix multiplication
            q = q.view(q.size(0), -1, self.num_heads, self.head_dim).transpose(1, 2)
            k = k.view(k.size(0), -1, self.num_heads, self.head_dim).transpose(1, 2)
            v = v.view(v.size(0), -1, self.num_heads, self.head_dim).transpose(1, 2)

            logger.debug(f"qT_size: {q.size()}")
            logger.debug(f"kT_size: {k.size()}")
            logger.debug(f"vT_size: {v.size()}")
            # TODO:
            # Calculate the attention scores using the
            # scaled_dot_product_attention function defined earlier

            attn_scores = scaled_dot_product_attention(q, k, v, mask, self.dropout)
            attn_scores = attn_scores.transpose(1, 2).contiguous()
            attn_scores = attn_scores.view(attn_scores.size(0), -1, self.embed_dim)
            logger.debug(f"attn_scores: {attn_scores.size()}")

            output = self.output_linear(attn_scores)
            logger.debug(f"output_size: {output.size()}")
            return output

In [4]:
model_ckpt = "bert-base-uncased"
config = AutoConfig.from_pretrained(model_ckpt)

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)

text = "The quick brown fox jumps over the lazy dog"
inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False)
inputs_embeds = token_emb(inputs.input_ids)

query = key = value = inputs_embeds

multihead_attn = MultiHeadAttention(config)
attn_output = multihead_attn(query, key, value)

[32m2025-02-11 13:45:05.514[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m12[0m - [34m[1mhidden_dim: 768[0m
[32m2025-02-11 13:45:05.516[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m13[0m - [34m[1mnum_heads: 12[0m
[32m2025-02-11 13:45:05.516[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m17[0m - [34m[1mhead_dim: 64[0m
[32m2025-02-11 13:45:05.535[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mforward[0m:[36m50[0m - [34m[1mq_size: torch.Size([1, 9, 768])[0m
[32m2025-02-11 13:45:05.536[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mforward[0m:[36m51[0m - [34m[1mk_size: torch.Size([1, 9, 768])[0m
[32m2025-02-11 13:45:05.536[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mforward[0m:[36m52[0m - [34m[1mv_size: torch.Size([1, 9, 768])[0m
[32m2025-02-11 13:45:05.537[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mforward[0m:[36m59[0m - [34m[1mqT_size: torch.

### 2. The Feed-Forward Layer
- The feed-forward layer is a type of neural network layer that processes the input data independently at each position in the input sequence, without considering the dependencies between different positions. This means that the computations for different positions can be parallelized, making the Transformer architecture highly efficient for sequence processing tasks.

- The feed-forward layer in Transformers typically consists of two linear transformations with a non-linear activation function in between. The input to the feed-forward layer is a tensor representing the hidden states of the previous layer or the input embeddings.

- The input to the feed-forward layer is a tensor representing the hidden states of the previous layer or the input embeddings. The feed-forward layer is a critical component of Transformers as it helps capture local patterns and dependencies in the input data.



In [5]:

class FeedForward(nn.Module):
    """
    Feed-forward layer module.

    Args:
        config: Configuration for the feed-forward layer.
    """
    def __init__(self, config) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Perform a forward pass of the feed-forward layer.

        Args:
            x: Input tensor of shape [batch_size, seq_len, hidden_dim].

        Returns:
            Tensor of shape [batch_size, seq_len, hidden_dim],
            representing the output of the feed-forward layer.
        """
        x = self.linear_1(x)
        x = self.relu(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        logger.debug(f"ff_output_size: {x.size()}")
        return x

In [6]:
feed_forward = FeedForward(config)
ff_outputs = feed_forward(attn_output)

[32m2025-02-11 13:45:05.638[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mforward[0m:[36m30[0m - [34m[1mff_output_size: torch.Size([1, 9, 768])[0m


### 3. Positional Embeddings
- The purpose of positional embeddings is to provide the model with a representation that encodes the relative positions of tokens within the sequence. This allows the model to differentiate between tokens based on their position, even though all tokens initially have the same embeddings.

- In the original Transformer model, the positional embeddings used to encode the sequential order of tokens are learned as part of the model training process. The positional embeddings are initialized with fixed sinusoidal functions of different frequencies and then fine-tuned during training.

- Steps:
  1. The constructor of the Embedding class defines two embedding layers `self.token_embeddings` and `self.position_embeddings`. These layers are initialized with different vocabulary sizes and hidden sizes.
  2. In the forward method, position IDs are created using `torch.arange(seq_length).unsqueeze(0)`.  This creates a tensor of sequential integers from 0 to seq_lenght-1 and unsqueezes it to have a shape of [1, seq_lenght]. These position IDs represent the positions of the tokens in the input sequence.
  3. The token embeddings for the input sequence are obtained by passing `input_ids` to `self.token_embeddings`. This maps each token ID to its corresponding embedding vector. On the other hand, the position embeddings for the input sequence are obtained by passing `position_ids` to `self.position_embeddings`. This maps each position ID to its corresponding embedding vector.
  4. The token embeddings and position embeddings are added element-wise (`token_embeddings` + `position_embeddings`) to create the combined embeddings. This operation incorporates both the token information and the positional information of each token in the input sequence.



In [29]:
class Embeddings(nn.Module):
    """
    Embeddings layer module.
    Combines a token embedding layer that projects the `input_ids` to a dense hidden state
    with the positional embedding that does the same for `position_ids`.
    The resulting embedding is simply the sum of both embeddings.

    Args:
        config: Configuration for the embeddings layer.
    """
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout()

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """
        Perform a forward pass of the embeddings layer.

        Args:
            input_ids: Input tensor of shape [batch_size, seq_len].

        Returns:
            Tensor of shape [batch_size, seq_len, hidden_dim],
            representing the embeddings of the input.

        Notes:
            1. Create position IDs for input sequence.
            2. Create token and position embeddings.
            3. Combine token and position embeddings.
        """
        logger.debug(f"input_size: {input_ids.size()}")
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0)
        token_embeddings = self.token_embeddings(input_ids)
        logger.debug(f"token_embd_size: {token_embeddings.size()}")
        position_embeddings = self.position_embeddings(position_ids)
        logger.debug(f"position_embd_size: {token_embeddings.size()}")

        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        logger.debug(f"embd_size: {token_embeddings.size()}")

        return embeddings

## The Encoder
### 1. TransformerEncoderBlock
- With the previously defined components, we can now define the TransformerEncoderBlock class. It is responsible for performing one layer of the encoder in a Transformer model.
- Steps:
  1. Layer Normalization: The input tensor `x` is first passed through a layer normalization operation using `self.layer_norm_1`. This operation normalizes the activations across the hidden dimension of x to have zero mean and unit variance. The result is stored in hidden_state.
  2. Attention with Skip Connection: The attention mechanism is applied to `hidden_state` using `self.attention`. This attention operation takes hidden_state as the input and produces an attention-based output. The output is then element-wise added (`+`) to the original input tensor `x`. This skip connection allows the model to directly incorporate the original input along with the attention-based output.
  3. Feed-Forward Layer with Skip Connection: The output of the previous step is passed through another layer normalization operation `self.layer_norm_2` to normalize the activations. Then, the result is passed through the feed-forward layer self.feed_forward. The output of the feed-forward layer is again element-wise added (`+`) to the input tensor from the previous step (`x`). This skip connection allows the model to combine the information from the original input with the transformed output from the feed-forward layer.

- In summary, the skip connections enable the model to incorporate the original input tensor x into the output of each layer. By adding the transformed outputs to the original input, the model can retain important information from the input and facilitate the flow of gradients during training. The skip connections help in addressing the vanishing gradient problem and make it easier to train deep Transformer architectures by ensuring the model has access to the original input information at each layer.

In [8]:
class TransformerEncoderBlock(nn.Module):
    """
    Transformer Encoder block module.

    Args:
        config: Configuration for the encoder block.
    """

    def __init__(self, config) -> None:
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor]=None) -> torch.Tensor:
        """
        Perform a forward pass of the transformer encoder block.

        Args:
            x: Input tensor of shape [batch_size, seq_len, hidden_dim].
            mask: Optional mask tensor. Default is None.

        Returns:
            Tensor of shape [batch_size, seq_len, hidden_dim],
            representing the output of the encoder block.
        """
        logger.debug(f"encoder_block_input_size: {x.size()}")
        # TODO:
        # Normalize input tensor x
        hidden_state = self.layer_norm_1(x)
        # TODO:
        # Apply the attention mechanism to the hidden_state using self.attention
        # Add the output to the original input tensor (skip connection)

        attention_output = self.attention(hidden_state, hidden_state, hidden_state, mask)
        x = x + self.dropout(attention_output)
        # TODO:
        # Normalize the activations using self.layer_norm_2
        # Pass it to the feed-forward layer
        # Add the output of the feed_forward layer to the input tensor from
        # the previous step (skip connection)
        x = self.layer_norm_2(x)
        x = x + self.feed_forward(x)

        x = self.dropout(x)
        logger.debug(f"encoder_block_output_size: {x.size()}")
        return x

In [9]:
encoder_layer = TransformerEncoderBlock(config)
_ = encoder_layer(inputs_embeds)

[32m2025-02-11 13:45:05.728[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m12[0m - [34m[1mhidden_dim: 768[0m
[32m2025-02-11 13:45:05.735[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m13[0m - [34m[1mnum_heads: 12[0m
[32m2025-02-11 13:45:05.739[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m__init__[0m:[36m17[0m - [34m[1mhead_dim: 64[0m
[32m2025-02-11 13:45:05.814[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mforward[0m:[36m29[0m - [34m[1mencoder_block_input_size: torch.Size([1, 9, 768])[0m
[32m2025-02-11 13:45:05.817[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mforward[0m:[36m50[0m - [34m[1mq_size: torch.Size([1, 9, 768])[0m
[32m2025-02-11 13:45:05.818[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mforward[0m:[36m51[0m - [34m[1mk_size: torch.Size([1, 9, 768])[0m
[32m2025-02-11 13:45:05.819[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mforward[0m:[36m52[0m - [34m

### 2. TransformerEncoder
- Finally, putting everything together, we can now define the TransoformerEncoder class. It is responsible for processing the input sequence using multiple stacked Transformer Encoder Blocks.

In [10]:
class TransformerEncoder(nn.Module):
    """
    Transformer Encoder module.

    Args:
        config: Configuration for the encoder.
    """
    def __init__(self, config) -> None:
        super().__init__()
        self.embeddings = Embeddings(config)
        self.layers = nn.ModuleList([TransformerEncoderBlock(config) for _ in range(config.num_hidden_layers)])

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Perform a forward pass of the transformer encoder.

        Args:
            x: Input tensor of shape [batch_size, seq_len].
            mask: Optional mask tensor. Default is None.

        Returns:
            Tensor of shape [batch_size, seq_len, hidden_dim],
            representing the output of the encoder.
        """
        x = self.embeddings(x)
        for layer in self.layers:
            x = layer(x, mask)

        return x

In [11]:
logger.remove()
logger.add(sys.stderr, level="INFO")
encoder = TransformerEncoder(config)
encoder_output = encoder(inputs.input_ids)
encoder_output.size()

torch.Size([1, 9, 768])

In [12]:
encoder_output

tensor([[[ 1.1345,  0.0775,  0.5740,  ...,  0.1227, -0.1416, -0.4897],
         [-0.6721,  4.9926,  1.5509,  ...,  0.7134,  1.1228,  0.0332],
         [ 0.2856, -1.5069,  0.1794,  ...,  2.3478,  0.0000,  0.1848],
         ...,
         [-0.2085, -0.3087,  0.5413,  ..., -0.4112,  0.0000,  0.8980],
         [-0.2039,  0.0180,  0.2897,  ...,  0.2765,  1.0491, -0.1463],
         [-0.1788,  4.2346,  1.7248,  ..., -0.5369,  0.5729, -1.4365]]],
       grad_fn=<MulBackward0>)

## The Decoder
-  The main difference between the decoder and encoder is that the decoder has two attention sublayers.

- The first attention sublayer, known as the self-attention sublayer, allows the decoder to attend to its own previously generated tokens, capturing dependencies and relationships within the output sequence.

- The second attention sublayer is the encoder-decoder attention, which allows the decoder to attend to the encoded representations produced by the encoder, incorporating contextual information from the input sequence.

- **Mask** is applied in the self-attention mechanism to enforce the causality constraint during the decoding process. Since the decoder generates the target sequence autoregressively, each position in the target sequence should only attend to previous positions and not future positions. If you recall the scaled_dot_product_attention function, we set the upper values to infinity. This guarantees that the attention weights are all zero once we take the softmax over the scores (as e^-∞=0).

### 1. The Decoder Block
- Similarly to the TransformerEncoderBlock, the TransformerDecoderBlock is responsible for performing one layer of the decoder in a Transformer model:

In [13]:
class TransformerDecoderBlock(nn.Module):
    """
    Transformer Decoder layer module.

    Args:
        config: Configuration for the decoder layer.
    """

    def __init__(self, config, ) -> None:
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_3 = nn.LayerNorm(config.hidden_size)
        self.self_attention = MultiHeadAttention(config)
        self.encoder_decoder_attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        x: torch.Tensor,
        encoder_output: torch.Tensor,
        source_mask: Optional[torch.Tensor] = None,
        target_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """
        Perform a forward pass of the transformer decoder block.

        Args:
            x: Input tensor of shape [batch_size, seq_len, hidden_dim].
            encoder_output: Output tensor from the encoder of shape [batch_size, seq_len, hidden_dim].
            source_mask: Optional source mask tensor. Default is None.
            target_mask: mask: Optional target mask tensor. Default is None.
        Returns:
            Tensor of shape [batch_size, seq_len, hidden_dim],
            representing the output of the decoder block.
        """
        logger.debug(f"decoder_block_input_size: {x.size()}")

        # TODO:
        # First attention sublayer, attending to its own previously generated
        # tokens. mask 필요: 디코더가 미래 단어를 참조하지 못하도록 차단
        hidden_state = self.layer_norm_1(x)
        attn_1_out = self.self_attention(hidden_state, hidden_state, hidden_state, target_mask)
        x = x + self.dropout(attn_1_out)
        # TODO:
        # Second attention sublayer, attending to the encoded representations
        # from the encoder.
        # mask 필요 없음: 인코더 출력을 전체적으로 봐도 됨. 인코더-디코더 간의 Attention에서는 미래 정보 제한이 필요 없음
        x = self.layer_norm_2(x)
        attn_2_out = self.encoder_decoder_attention(x, encoder_output, encoder_output, source_mask)
        x = x + self.dropout(attn_2_out)
        x = self.layer_norm_3(x)

        feed_forward_output = self.feed_forward(x)
        x = x + self.dropout(feed_forward_output)
        logger.debug(f"decoder_block_output_size: {x.size()} ")

        return x

### 2. TransformerDecoder


In [14]:
class TransformerDecoder(nn.Module):
    def __init__(self, config) -> None:
        """
        Transformer Decoder module.

        Args:
            config: Configuration object for the decoder.
            mask: Masking object for attention layers.
        """
        super().__init__()
        self.embeddings = Embeddings(config)
        self.layers = nn.ModuleList([TransformerDecoderBlock(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
        input_ids: torch.Tensor,
        encoder_output: torch.Tensor,
        source_mask: torch.Tensor = None,
        target_mask: torch.Tensor = None
    ) -> torch.Tensor:
        """
        Perform a forward pass of the transformer decoder.

        Args:
            x: Input tensor of shape [batch_size, tgt_len].

        Returns:
            Tensor of shape [batch_size, tgt_len, vocab_size],
            representing the predicted probabilities over the vocabulary.
        """
        x = self.embeddings(input_ids)
        for layer in self.layers:
            x = layer(x, encoder_output, source_mask=source_mask, target_mask=target_mask)
        return x

In [15]:
logger.remove()
logger.add(sys.stderr, level="INFO")
seq_len = inputs.input_ids.size(-1)
mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0)
encoder = TransformerEncoder(config)
encoder_output = encoder(inputs.input_ids)
decoder = TransformerDecoder(config)
output = decoder(inputs.input_ids, encoder_output, target_mask=mask)
output.size()

torch.Size([1, 9, 768])

## The Transformer
- With all the required components now in place, we can proceed to define and implement this model

In [16]:
class EncoderDecoder(nn.Module):
    """
    Encoder-Decoder model that combines the TransformerEncoder and TransformerDecoder.

    Args:
        encoder_config: Configuration for the encoder.
        decoder_config: Configuration for the decoder.
    """
    def __init__(
        self,
        config,
    ) -> None:
        super().__init__()
        self.encoder = TransformerEncoder(config)
        self.decoder = TransformerDecoder(config)
        self.fc = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        target_ids: torch.Tensor,
        source_mask: Optional[torch.Tensor] = None,
        target_mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Perform a forward pass of the encoder-decoder model.

        Args:
            input_ids: Input tensor of shape [batch_size, src_len].
            target_ids: Target tensor of shape [batch_size, tgt_len].

        Returns:
            Tensor of shape [batch_size, tgt_len, vocab_size],
            representing the predicted probabilities over the vocabulary.
        """
        encoder_output = self.encoder(input_ids)
        decoder_output = self.decoder(
            target_ids,
            encoder_output,
            source_mask=source_mask,
            target_mask=target_mask
        )
        x = self.fc(decoder_output)  # Apply linear layer to transform to vocab_size

        return x

### Masking
The mask used in the Transformer model should have a specific shape and values to ensure proper masking during the attention mechanism. Here's how you can define the mask:

- Padding Mask: The padding mask is used to mask out padding tokens in the input sequences. It should have a shape of (batch_size, seq_length) and contain 1 where the padding tokens are present and 0 for the non-padding tokens. This mask ensures that the padding tokens do not contribute to the attention scores.

- Future Mask: The future mask is used to prevent attending to future positions in the self-attention mechanism. It should have a shape of (seq_length, seq_length) and have 1 for positions that can be attended and 0 for positions that should be masked or ignored.

- Combined Mask: To create the final mask, you need to combine the padding mask and the future mask. This can be done by applying logical operations, such as element-wise multiplication or logical OR, to the two masks.



<p align="center"><img src="https://miro.medium.com/v2/resize:fit:1400/format:webp/1*gIU1WTNJle6N0tw6P-C4OA.png" width = "400" ></p>

In [17]:
def create_mask(batch_size: int, seq_length: int) -> torch.Tensor:
    """
    Create a lower triangular mask with ones below the diagonal.

    Args:
        batch_size: The batch size.
        seq_length: The length of the sequence.

    Returns:
        The mask tensor with shape (batch_size, seq_length, seq_length).
    """
    mask = torch.tril(torch.ones(seq_length, seq_length))
    mask = mask.unsqueeze(0).expand(batch_size, seq_length, seq_length)  # Expand the mask along the batch dimension

    return mask

### Recap

<p align="center"><img src="https://cpm0722.github.io/assets/images/2021-01-28-Transformer-in-pytorch/qkv_vector.png" width = "600" ></p>


<p align="center"><img src="https://cpm0722.github.io/assets/images/2021-01-28-Transformer-in-pytorch/attention_score_scalar.png" width = "600" ></p>


<p align="center"><img src="https://cpm0722.github.io/assets/images/2021-01-28-Transformer-in-pytorch/qkv_matrix_1.png" width = "600" ></p>

<p align="center"><img src="https://cpm0722.github.io/assets/images/2021-01-28-Transformer-in-pytorch/attention_score_vector.png" width = "600" ></p>


<p align="center"><img src="https://cpm0722.github.io/assets/images/2021-01-28-Transformer-in-pytorch/attention_vector.png" width = "600" ></p>

<p align="center"><img src="https://cpm0722.github.io/assets/images/2021-01-28-Transformer-in-pytorch/qkv_matrix_2.png" width = "600" ></p>


<p align="center"><img src="https://cpm0722.github.io/assets/images/2021-01-28-Transformer-in-pytorch/attention_matrix.png" width = "600" ></p>

output의 shape는 모두 동일할지라도, Q, K, V 의 실제 값들은 모두 다르다

<p align="center"><img src="https://cpm0722.github.io/assets/images/2021-01-28-Transformer-in-pytorch/qkv_fc_layer.png" width = "600" ></p>


Multi-head attention

-  Scaled Dot-Product Attention에서는 Q, K , V
를 위해 FC layer가 총 3개 필요했었는데, 이를 h회 수행한다고 했으므로 3*h 개의 FC layer가 필요함

<p align="center"><img src="https://cpm0722.github.io/assets/images/2021-01-28-Transformer-in-pytorch/multi_head_attention_concat.png" width = "600" ></p>


## Testing the Transformer!

<p align="center"><img src="https://miro.medium.com/v2/resize:fit:1400/format:webp/1*l46NMJUjXc7apz4xlsS61A.png" width = "600" ></p>

In [18]:
class TransformerConfig():
    """
    Configuration class for the Transformer model.

    Args:
        hidden_size: Size of the hidden state.
        intermediate_size: Size of the intermediate layer in the feed-forward network.
        num_hidden_layers: Number of hidden layers in the Transformer.
        vocab_size: Size of the vocabulary.
        max_position_embeddings: Maximum number of positional embeddings.
        hidden_dropout_prob: Dropout probability for the hidden layers.
        num_attention_heads: Number of attention heads in the multi-head attention.
    """
    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        num_hidden_layers: int,
        vocab_size: int,
        max_position_embeddings: int,
        hidden_dropout_prob: float,
        num_attention_heads: int
    ):
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_dropout_prob = hidden_dropout_prob
        self.num_attention_heads = num_attention_heads

In [19]:
# Set up hyperparameters and configuration
config = TransformerConfig(
    hidden_size=512,
    intermediate_size=2048,
    num_hidden_layers=6,
    vocab_size=100,
    max_position_embeddings=512,
    hidden_dropout_prob=0.1,
    num_attention_heads=8
)

In [20]:
# Define some fake data
batch_size = 128  # 배치 크기 설정
source_length = 10  # 입력 시퀀스 길이 설정
target_length = 12  # 출력 시퀀스 길이 설정

source_ids =  torch.randint(0, config.vocab_size, (batch_size, source_length)) # TODO: (16, 10) 크기의 랜덤한 입력 데이터 생성
target_ids =  torch.randint(0, config.vocab_size, (batch_size, target_length)) # TODO: (16, 12) 크기의 랜덤한 출력 데이터 생성

source_ids.size(), target_ids.size()

(torch.Size([128, 10]), torch.Size([128, 12]))

In [21]:
source_mask = create_mask(batch_size, source_length)# TODO: implement mask creation function
target_mask = create_mask(batch_size, target_length)# TODO: implement mask creation function
source_mask.size(), target_mask.size()

(torch.Size([128, 10, 10]), torch.Size([128, 12, 12]))

In [22]:
encoder = TransformerEncoder(config)
encoder_output = encoder(source_ids) # 인코더에 입력 데이터 전달
print(f"encoder_output_size: {encoder_output.size()}") # (16, 10, hidden_size) 출력
decoder = TransformerDecoder(config)
output = decoder(source_ids, encoder_output, source_mask=source_mask)
print(f"decoder_output_size: {output.size()}")  # (16, 10, hidden_size) 출력

encoder_output_size: torch.Size([128, 10, 512])
decoder_output_size: torch.Size([128, 10, 512])


In [23]:
# Define the EncoderDecoder model
encoder_decoder = EncoderDecoder(config)
output = encoder_decoder(source_ids, target_ids, target_mask=target_mask)
print("Output Shape:", output.shape)
print("Source Input Shape:", source_ids.shape)  # (16, 10) 출력
# encoder_output = encoder(source_ids)

Output Shape: torch.Size([128, 12, 100])
Source Input Shape: torch.Size([128, 10])


In [24]:
target_ids.size()

torch.Size([128, 12])

### Training


In [25]:
class RandomDataset(torch.utils.data.Dataset):
    """
    Provides random data copy dataset for training.

    Args:
        vocabulary_size: The vocabulary size.
        batch_size: The batch size.
        num_samples: The number of samples.
        sample_length: The length of each sample.
    """

    def __init__(self, vocabulary_size: int, batch_size: int, num_samples: int, sample_length: int):
        self.samples = list()

        for i in range(batch_size * num_samples):
            data = torch.randint(1, vocabulary_size, (sample_length + 1,))  # +1 to avoid length mismatch
            data[0] = 1
            source = torch.autograd.Variable(data[:-1], requires_grad=False)
            target = torch.autograd.Variable(data[1:], requires_grad=False)

            sample = {
                'source': source,
                'target': target,
                'target_y': target,  # Ensure it matches target length
                'source_mask': (source != 0).unsqueeze(-2),
                'target_mask': self.make_std_mask(target, 0),
                'tokens_count': (target != 0).sum()
            }

            self.samples.append(sample)

    def __len__(self) -> int:
        """
        Get the number of samples in the dataset.

        Returns:
            The number of samples.
        """
        return len(self.samples)

    def __getitem__(self, idx: int) -> dict:
        """
        Get a sample from the dataset.

        Args:
            idx: The index of the sample to retrieve.

        Returns:
            A dictionary containing the source, target, target_y, source_mask, target_mask, and tokens_count.
        """
        return self.samples[idx]

    @staticmethod
    def make_std_mask(target: torch.Tensor, pad: int) -> torch.Tensor:
        """
        Create a mask to hide padding and future words.

        Args:
            target (torch.Tensor): The target tensor.
            pad (int): The padding value.

        Returns:
            torch.Tensor: The mask tensor.
        """
        target_mask = (target != pad)
        target_mask = target_mask & torch.autograd.Variable(
            RandomDataset.subsequent_mask(target.size(-1)).type_as(target_mask.data))

        return target_mask

    @staticmethod
    def subsequent_mask(size: int) -> torch.Tensor:
        """
        Mask out subsequent positions.

        Args:
            size: The size of the mask.

        Returns:
            torch.Tensor: The subsequent mask tensor.
        """
        attn_shape = (size, size)
        subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
        return torch.from_numpy(subsequent_mask) == 0

In [26]:
batch_size = 128
num_samples = 1000
samples_len = 10
train_set = RandomDataset(config.vocab_size, batch_size, num_samples, samples_len)
train_loader = torch.utils.data.DataLoader(train_set, batch_size)

In [31]:
model = EncoderDecoder(config)
model

EncoderDecoder(
  (encoder): TransformerEncoder(
    (embeddings): Embeddings(
      (token_embeddings): Embedding(100, 512)
      (position_embeddings): Embedding(512, 512)
      (layer_norm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.5, inplace=False)
    )
    (layers): ModuleList(
      (0): TransformerEncoderBlock(
        (layer_norm_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (layer_norm_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attention): MultiHeadAttention(
          (q): Linear(in_features=512, out_features=512, bias=True)
          (k): Linear(in_features=512, out_features=512, bias=True)
          (v): Linear(in_features=512, out_features=512, bias=True)
          (output_linear): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): FeedForward(
          (linear_1): Linear(in_features=512, out_features

In [32]:
import torch
from tqdm import tqdm

model = EncoderDecoder(config).to(device)

# Initialize parameters.
for p in model.parameters():
    if p.dim() > 1:
        torch.nn.init.xavier_uniform_(p)

model.train()

optimizer = torch.optim.Adam(model.parameters())
loss_function = torch.nn.CrossEntropyLoss()

current_loss = 0.0
counter = 0

# tqdm을 사용하여 progress bar를 추가합니다.
for i, batch in enumerate(tqdm(train_loader, desc="Training Progress", ncols=100)):
    # 데이터도 GPU로 이동
    source = batch['source'].to(device)
    target = batch['target'].to(device)
    source_mask = batch['source_mask'].to(device)
    target_mask = batch['target_mask'].to(device)
    target_y = batch['target_y'].to(device)

    # position_ids나 다른 텐서들이 CPU에 있을 수 있으므로 해당 텐서도 GPU로 이동
    position_ids = torch.arange(source.size(1), device=device).unsqueeze(0)  # 예시
    batch['position_ids'] = position_ids

    with torch.set_grad_enabled(True):
        out = model.forward(source, target, source_mask, target_mask)
        loss = loss_function(out.contiguous().view(-1, out.size(-1)), target_y.contiguous().view(-1).long())
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        current_loss += loss.item()
        counter += 1

        if counter % 100 == 0:
            print("Batch: %d; Loss: %f" % (i + 1, current_loss / counter))
            current_loss = 0.0
            counter = 0


Training Progress:  10%|████                                     | 100/1000 [00:27<04:03,  3.70it/s]

Batch: 100; Loss: 4.839642


Training Progress:  20%|████████▏                                | 200/1000 [00:54<03:37,  3.68it/s]

Batch: 200; Loss: 4.687149


Training Progress:  30%|████████████▎                            | 300/1000 [01:21<03:09,  3.70it/s]

Batch: 300; Loss: 4.671585


Training Progress:  40%|████████████████▍                        | 400/1000 [01:48<02:43,  3.67it/s]

Batch: 400; Loss: 4.652823


Training Progress:  50%|████████████████████▌                    | 500/1000 [02:15<02:16,  3.67it/s]

Batch: 500; Loss: 4.635049


Training Progress:  60%|████████████████████████▌                | 600/1000 [02:45<01:49,  3.67it/s]

Batch: 600; Loss: 4.621610


Training Progress:  70%|████████████████████████████▋            | 700/1000 [03:12<01:21,  3.68it/s]

Batch: 700; Loss: 4.614254


Training Progress:  80%|████████████████████████████████▊        | 800/1000 [03:40<00:54,  3.68it/s]

Batch: 800; Loss: 4.608254


Training Progress:  90%|████████████████████████████████████▉    | 900/1000 [04:07<00:27,  3.66it/s]

Batch: 900; Loss: 4.606917


Training Progress: 100%|████████████████████████████████████████| 1000/1000 [04:34<00:00,  3.64it/s]

Batch: 1000; Loss: 4.604630





In [35]:
# inference
src = torch.tensor([[0, 2, 5, 6, 4, 3, 9, 5, 2, 9, 10, 1]])
trg = torch.tensor([[0]])
print(src.shape, trg.shape)
out = model.forward(src.to(device), trg.to(device))
out

torch.Size([1, 12]) torch.Size([1, 1])


tensor([[[-5.1863,  0.4589,  0.2204,  0.3417,  0.2435,  0.0502, -0.1730,
           0.4358,  0.3653,  0.4224,  0.0095,  0.1332,  0.2602, -0.1964,
           0.2211,  0.1413,  0.2812,  0.0622,  0.1738,  0.6793,  0.0913,
           0.1623,  0.2148,  0.5131,  0.2322, -0.0073,  0.2067,  0.0390,
           0.1210,  0.3781,  0.0927,  0.3638, -0.0176,  0.0437,  0.3765,
          -0.0105, -0.0474,  0.3564, -0.0599,  0.1322,  0.1855,  0.2391,
           0.2340, -0.1156,  0.3410,  0.4357, -0.1607,  0.0582,  0.1494,
          -0.0765,  0.0339,  0.2254,  0.2118,  0.0364,  0.3456,  0.2008,
           0.3556, -0.0720, -0.3504,  0.0287,  0.1312,  0.2217,  0.6383,
          -0.0321,  0.1059,  0.1555,  0.0279, -0.1671,  0.0205,  0.1103,
          -0.0877,  0.4064,  0.1032,  0.1070,  0.0849, -0.1501,  0.1310,
           0.2176, -0.0758,  0.0571, -0.0313,  0.0227,  0.0815,  0.3571,
          -0.0710,  0.1529,  0.2885,  0.2498,  0.1252, -0.0137,  0.1849,
           0.2626, -0.1972, -0.0943,  0.6266,  0.03

In [36]:
test_batch_size = 4
test_num_samples = 10
test_sample_length = 10

test_set = RandomDataset(config.vocab_size, test_batch_size, test_num_samples, test_sample_length)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=test_batch_size)

model.eval()

with torch.no_grad():
    batch = next(iter(test_loader))

    source = batch['source'].to(device)
    source_mask = batch['source_mask'].to(device)

    # 디코더 입력을 위한 초기 토큰 생성 (시작 토큰 <s>로 가정, ID=1)
    target_start = torch.full((test_batch_size, 1), 1, dtype=torch.long, device=device)

    for _ in range(test_sample_length - 1):
        out = model.forward(source, target_start, source_mask, None)
        next_token = torch.argmax(out[:, -1, :], dim=-1, keepdim=True)  # 마지막 토큰 예측
        target_start = torch.cat([target_start, next_token], dim=1)  # 새로운 토큰 추가

print(target_start)

# target_start가 토큰 ID 텐서라고 가정
reconstructed_text = tokenizer.decode(target_start[0], skip_special_tokens=True)

print(reconstructed_text)


tensor([[ 1, 94, 94, 94, 94, 94, 94, 94, 94, 94],
        [ 1, 94, 94, 94, 94, 94, 94, 94, 94, 94],
        [ 1, 94, 94, 94, 94, 94, 94, 94, 94, 94],
        [ 1, 94, 94, 94, 94, 94, 94, 94, 94, 94]], device='cuda:0')
[unused0] [unused93] [unused93] [unused93] [unused93] [unused93] [unused93] [unused93] [unused93] [unused93]
