## Example of Attention without Mask

In [None]:
import sys
sys.path.append('../')
import torch
import torch.nn as nn
from transformer.layer import MultiHeadAttention
# Demo configuration: one sequence in the batch (batch size 1), four tokens
# per sequence, a single attention head (h = 1), and a key/query dimension
# (d_k) of 64.
batch_size, h, seq_len, d_k = 1, 1, 4, 64

# Seed the RNG first so the random input below is reproducible.
torch.manual_seed(68)
input_tensor = torch.rand(batch_size, h, seq_len, d_k)

# Self-attention: query, key and value are independent copies of one input.
query_k, key_k, value_k = (input_tensor.clone() for _ in range(3))

# Run attention with no mask.
# NOTE(review): a freshly built nn.Dropout is in training mode, so if
# attention() applies it to the scores, some entries may be zeroed and the
# survivors scaled by 1 / (1 - 0.1); rows then need not sum to 1.
output, attention_scores = MultiHeadAttention.attention(
    query_k,
    key_k,
    value_k,
    d_k,
    mask=None,
    dropout=nn.Dropout(0.1),
)

# Report the result shapes, then the score matrix itself.
for report_line in (
    f"Output Shape: {output.shape}",
    f"Attention Scores Shape: {attention_scores.shape}",
    f"Attention Scores:\n{attention_scores}",
):
    print(report_line)

Output Shape: torch.Size([1, 1, 4, 64])
Attention Scores Shape: torch.Size([1, 1, 4, 4])
Attention Scores:
tensor([[[[0.3822, 0.2345, 0.2504, 0.2440],
          [0.1791, 0.4443, 0.2518, 0.2360],
          [0.1597, 0.0000, 0.4912, 0.2500],
          [0.1708, 0.2163, 0.2745, 0.4495]]]])


## Example of Attention with Mask

In [None]:
import sys
sys.path.append('../')
import torch
import torch.nn as nn
from transformer.layer import MultiHeadAttention
from transformer.mask import create_decoder_mask
# Demo configuration: one sequence in the batch (batch size 1), four tokens
# per sequence, a single attention head (h = 1), and a key/query dimension
# (d_k) of 64.
batch_size = 1
h = 1
seq_len = 4
d_k = 64

# Seed the RNG first so the random input below is reproducible.
torch.manual_seed(68)
input_tensor = torch.rand(batch_size, h, seq_len, d_k)

# Self-attention: query, key and value are independent copies of one input.
query_k = input_tensor.clone()
key_k = input_tensor.clone()
value_k = input_tensor.clone()

# Build the decoder mask from a toy token-id sequence whose last two
# positions are padding (id 0).
# NOTE(review): create_decoder_mask presumably combines a causal
# (look-ahead) mask with a padding mask -- confirm in transformer.mask.
decoder_input_ids = torch.tensor([2, 68, 0, 0])
pad_token_id = torch.tensor([0])  # padding token ID (defined once, next to its use)
decoder_mask = create_decoder_mask(decoder_input_ids, pad_token_id, seq_len)

# Run attention with the decoder mask.
# NOTE(review): a freshly built nn.Dropout is in training mode, so if
# attention() applies it to the scores, some entries may be zeroed and the
# survivors scaled by 1 / (1 - 0.1); rows then need not sum to 1.
output, attention_scores = MultiHeadAttention.attention(
    query_k,
    key_k,
    value_k,
    d_k,
    mask=decoder_mask,
    dropout=nn.Dropout(0.1),
)

print(f"Decoder Mask:\n{decoder_mask}\n")
print(f"Output Shape: {output.shape}")
print(f"Attention Scores Shape: {attention_scores.shape}")
print(f"Attention Scores:\n{attention_scores}")

Decoder Mask:
tensor([[[1, 0, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 0, 0]]], dtype=torch.int32)

Output Shape: torch.Size([1, 1, 4, 64])
Attention Scores Shape: torch.Size([1, 1, 4, 4])
Attention Scores:
tensor([[[[1.1111, 0.0000, 0.0000, 0.0000],
          [0.3191, 0.7920, 0.0000, 0.0000],
          [0.4796, 0.0000, 0.0000, 0.0000],
          [0.4902, 0.6209, 0.0000, 0.0000]]]])
