## Usage of Attention without Mask

In [58]:
import sys
sys.path.append('../')
import torch
from transformer.layers import MultiHeadAttention
# Demo setup: a mini-batch holding a single sequence (batch_size = 1).
# The sequence has 4 words (seq_len = 4).
# We use a single attention head (h = 1); the key/query dimension (d_k) is 64.
batch_size = 1
h = 1
seq_len = 4
d_k = 64

# Self-attention demo: query, key and value are independent copies of the same
# random input of shape (batch_size, h, seq_len, d_k).
input_tensor = torch.rand(batch_size, h, seq_len, d_k)
query_k = input_tensor.clone()
key_k = input_tensor.clone()
value_k = input_tensor.clone()

# Call the attention function without a mask.
# The dropout module is reached through `torch.nn` because this cell only
# imports `torch`; a bare `nn` name is undefined on a fresh kernel.
# NOTE(review): a freshly built Dropout module is in training mode and rescales
# kept entries by 1 / (1 - p); if the returned scores are post-dropout, their
# rows will not sum to exactly 1 -- confirm against MultiHeadAttention.attention.
output, attention_scores = MultiHeadAttention.attention(
    query_k,
    key_k,
    value_k,
    d_k,
    mask=None,
    dropout=torch.nn.Dropout(0.1),
)

print(f"Output Shape: {output.shape}")
print(f"Attention Scores Shape: {attention_scores.shape}")
print(f"Attention Scores:\n{attention_scores}")

Output Shape: torch.Size([1, 1, 4, 64])
Attention Scores Shape: torch.Size([1, 1, 4, 4])
Attention Scores:
tensor([[[[0.4294, 0.2395, 0.2545, 0.1876],
          [0.1694, 0.5158, 0.2546, 0.1713],
          [0.2125, 0.3007, 0.4212, 0.1767],
          [0.2100, 0.2712, 0.2369, 0.3930]]]])


## Example of Attention with Mask

In [56]:
import sys
sys.path.append('../')
import torch
from transformer.layers import MultiHeadAttention
# Demo setup: a mini-batch holding a single sequence (batch_size = 1).
# The sequence has 4 words (seq_len = 4).
# We use a single attention head (h = 1); the key/query dimension (d_k) is 64.
batch_size = 1
h = 1
seq_len = 4
d_k = 64

# Self-attention demo: query, key and value are independent copies of the same
# random input of shape (batch_size, h, seq_len, d_k).
input_tensor = torch.rand(batch_size, h, seq_len, d_k)
query_k = input_tensor.clone()
key_k = input_tensor.clone()
value_k = input_tensor.clone()

# Causal mask: True on and below the diagonal, False strictly above it.
# Sized from seq_len so it stays in sync with the input tensor.
causal_mask = torch.triu(torch.ones((1, seq_len, seq_len)), diagonal=1) == 0
# Padding mask over the last dimension; it must have exactly seq_len entries.
# NOTE(review): presumably 1 marks a real token and 0 a padded one -- confirm.
padding_mask = torch.tensor([1, 1, 0, 0])
# The (seq_len,) padding mask broadcasts against the (1, seq_len, seq_len)
# causal mask; int64 & bool promotes to an integer 0/1 tensor.
decoder_mask = padding_mask & causal_mask

# Call the attention function with the combined mask.
# The dropout module is reached through `torch.nn` because this cell only
# imports `torch`; a bare `nn` name is undefined on a fresh kernel.
# NOTE(review): a freshly built Dropout module is in training mode, so it can
# zero entries and rescale the rest by 1 / (1 - p); if the returned scores are
# post-dropout, a row may not sum to 1 -- confirm against the implementation.
output, attention_scores = MultiHeadAttention.attention(
    query_k,
    key_k,
    value_k,
    d_k,
    mask=decoder_mask,
    dropout=torch.nn.Dropout(0.1),
)

print(f"Decoder Mask:\n{decoder_mask}\n")
print(f"Output Shape: {output.shape}")
print(f"Attention Scores Shape: {attention_scores.shape}")
print(f"Attention Scores:\n{attention_scores}")

Decoder Mask:
tensor([[[1, 0, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 0, 0],
         [1, 1, 0, 0]]])

Output Shape: torch.Size([1, 1, 4, 64])
Attention Scores Shape: torch.Size([1, 1, 4, 4])
Attention Scores:
tensor([[[[1.1111, 0.0000, 0.0000, 0.0000],
          [0.4744, 0.6367, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000, 0.0000],
          [0.6133, 0.4978, 0.0000, 0.0000]]]])
