# Self-attention

$$ 
\text{Attention}(Q,K,V) = \text{softmax}(\frac{QK^T}{\sqrt{d_k}})V
$$

In [1]:
import math
import torch
import torch.nn as nn
from typing import Optional, Tuple
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
# Transformers
from transformers import AutoTokenizer, AutoModel, utils
from bertviz import head_view
utils.logging.set_verbosity_error()  # Suppress standard warnings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are all linear projections of the same input,
    and the attention scores are scaled by ``sqrt(embed_dim)``.
    """

    def __init__(self, embed_dim: int, dropout: float = 0.1):
        """
        Initialize the SelfAttention module.

        Args:
            embed_dim (int): The embedding dimension.
            dropout (float): Dropout probability applied to the attention
                weights (after the softmax).
        """
        super(SelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.att_dropout = nn.Dropout(dropout)

        self.q_linear = nn.Linear(embed_dim, embed_dim)
        self.k_linear = nn.Linear(embed_dim, embed_dim)
        self.v_linear = nn.Linear(embed_dim, embed_dim)

    def forward(
        self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Perform self-attention operation.

        Args:
            x (torch.Tensor): The input tensor, expected as
                [batch_size, seq_len, embed_dim].
            mask (Optional[torch.Tensor]): Optional mask broadcastable to
                [batch_size, seq_len, seq_len]. Positions where ``mask == 0``
                are excluded from attention.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: ``(output, attention)`` where
            ``output`` is [batch_size, seq_len, embed_dim] and ``attention``
            is [batch_size, seq_len, seq_len]. The returned attention weights
            are the post-dropout values, so in training mode their rows need
            not sum to 1.
        """
        # [batch_size, seq_len, embed_dim]
        Q = self.q_linear(x)
        K = self.k_linear(x)
        V = self.v_linear(x)

        # [batch_size, seq_len, seq_len]
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.embed_dim)

        # Masked positions get -inf so the softmax assigns them zero weight.
        # NOTE: a row that is masked everywhere becomes NaN after the softmax.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        # Normalize over the key dimension, then apply dropout to the weights
        attention = self.att_dropout(torch.softmax(scores, dim=-1))

        # att * value computation -> [batch_size, seq_len, embed_dim]
        # att -> [batch_size, seq_len, seq_len]
        return torch.matmul(attention, V), attention

def test_self_attention():
    """Smoke-test SelfAttention on a random batch.

    Builds a SelfAttention layer, pushes one random input through it, prints
    the input/output/attention shapes and asserts that they are as expected.

    Returns:
        The SelfAttention instance that was exercised.
    """
    # Problem size
    batch_size, seq_length, embed_dim = 2, 10, 64
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create the module before sampling the input
    self_attention = SelfAttention(embed_dim).to(device)
    x = torch.randn(batch_size, seq_length, embed_dim).to(device)

    # Forward pass
    output, attention = self_attention(x)

    # Report the shapes
    print(f"Input shape: {x.shape}")
    print(f"Output shape: {output.shape}")
    print(f"Attention shape: {attention.shape}")

    # The output keeps the input shape; attention is one seq x seq map per sample
    expected_attention_shape = (batch_size, seq_length, seq_length)
    assert output.shape == x.shape, "Output shape does not match input shape"
    assert attention.shape == expected_attention_shape, "Attention shape is incorrect"

    return self_attention

def visualize_attention(attention_matrix: torch.Tensor):
    """Plot an attention matrix as an annotated heatmap.

    Args:
        attention_matrix (torch.Tensor): Attention weights to display, with
            queries on the rows and keys on the columns. Presumably a single
            2-D [seq_len, seq_len] matrix (e.g. one sample of a batch) --
            confirm against the caller.
    """
    # Move to host memory first: Tensor.numpy() raises for tensors that live
    # on a CUDA device.
    weights = attention_matrix.detach().cpu().numpy()

    plt.figure(figsize=(10, 8))
    sns.heatmap(weights, annot=True, cmap='viridis')
    plt.title("Self-Attention Visualization")
    plt.xlabel("Key")
    plt.ylabel("Query")
    plt.show()

In [3]:
# Smoke-test the self-attention module
model = test_self_attention()

Input shape: torch.Size([2, 10, 64])
Output shape: torch.Size([2, 10, 64])
Attention shape: torch.Size([2, 10, 10])


In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load a pretrained tokenizer and model, then visualize the attention weights
# huggingface-cli download --resume-download google-bert/bert-base-uncased --local-dir .\bert-base-uncased\
model_name = "bert-base-uncased"
input_text = "The cat sat on the mat"

# Load the tokenizer and the full pretrained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
pretrained_model = AutoModel.from_pretrained(model_name)

# Read the embedding dimension from the model config
embed_dim = pretrained_model.config.hidden_size
print(f"The embedding dimension is: {embed_dim}")

# Initialize the custom SelfAttention layer. eval() disables dropout so the
# visualized weights are the actual softmax output rather than a randomly
# zeroed and rescaled copy of it.
model = SelfAttention(embed_dim=embed_dim).eval()

tokenized = tokenizer.encode(input_text, return_tensors='pt')  # Tokenize input text
tokens = tokenizer.convert_ids_to_tokens(tokenized[0])  # Convert input ids to token strings

# Inference only: no autograd graph is needed for either model
with torch.no_grad():
    inputs = pretrained_model(tokenized).last_hidden_state
    output, attention = model(inputs)  # Run model & retrieve its attention weights

# Reshape [1, seq_len, seq_len] to (1, 1, 1, seq_len, seq_len): a single layer and a single head
seq_len = len(tokenized[0])
attention = attention.unsqueeze(1).view(1, 1, 1, seq_len, seq_len)
head_view(attention, tokens)  # Display the attention head view



The embedding dimension is: 768


<IPython.core.display.Javascript object>

# Multi-head Attention

$$
\text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, ..., \text{head}_h)W^O
$$

$$
\text{where } \text{head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
$$

$$
\text{Where the projections are parameter matrices } W_i^Q \in \mathbb{R}^{d_{\text{model}} \times d_k},\ W_i^K \in \mathbb{R}^{d_{\text{model}} \times d_k},\ W_i^V \in \mathbb{R}^{d_{\text{model}} \times d_v} \text{ and } W^O \in \mathbb{R}^{hd_v \times d_{\text{model}}}
$$

In [5]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with separate Q/K/V projections.

    The embedding is split evenly across ``num_heads`` heads of size
    ``embed_dim // num_heads``; each head attends independently and the
    concatenated result goes through a final output projection.
    """

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.1):
        """
        Initialize the MultiHeadAttention module.

        Args:
            embed_dim (int): The embedding dimension.
            num_heads (int): The number of attention heads. Must divide
                ``embed_dim`` evenly.
            dropout (float): Dropout probability applied to the attention
                weights (after the softmax).

        Raises:
            AssertionError: If ``embed_dim`` is not divisible by ``num_heads``.
        """
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.att_dropout = nn.Dropout(dropout)

        assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"

        self.q_linear = nn.Linear(embed_dim, embed_dim)
        self.k_linear = nn.Linear(embed_dim, embed_dim)
        self.v_linear = nn.Linear(embed_dim, embed_dim)

        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(
        self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Perform multi-head attention operation.

        Args:
            x (torch.Tensor): The input tensor of shape [batch_size, seq_len, embed_dim].
            mask (Optional[torch.Tensor]): The mask tensor of shape [batch_size, seq_len, seq_len].
                Positions where ``mask == 0`` are excluded from attention; the
                same mask is applied to every head.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: ``(output, attention)`` where
            ``output`` is [batch_size, seq_len, embed_dim] and ``attention``
            is [batch_size, num_heads, seq_len, seq_len]. The returned
            attention weights are the post-dropout values.
        """
        batch_size, seq_len, _ = x.size()

        # Linear projections
        Q = self.q_linear(x)
        K = self.k_linear(x)
        V = self.v_linear(x)

        # Split into multiple heads -> [batch_size, num_heads, seq_len, head_dim]
        Q = Q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = K.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = V.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention -> [batch_size, num_heads, seq_len, seq_len]
        att_weights = Q @ K.transpose(-2, -1) / math.sqrt(self.head_dim)

        # Mask (opt.): unsqueeze adds a head axis so the mask broadcasts over heads
        if mask is not None:
            att_weights = att_weights.masked_fill(mask.unsqueeze(1) == 0, float('-inf'))

        # Softmax over the key dimension, then dropout on the weights
        attention = self.att_dropout(torch.softmax(att_weights, dim=-1))

        # Attention output
        out = attention @ V

        # Concatenate heads; contiguous() is required before view() after the transpose
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, self.embed_dim)

        # Final linear projection
        return self.out_proj(out), attention

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch

model_name = "bert-base-uncased"
input_text = "The cat sat on the mat"
num_heads = 8

# Load the tokenizer and the full pretrained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
pretrained_model = AutoModel.from_pretrained(model_name)

# Read the embedding dimension from the model config
embed_dim = pretrained_model.config.hidden_size
print(f"The embedding dimension is: {embed_dim}")

# Initialize the custom MultiHeadAttention layer. eval() disables dropout so
# the visualized weights are the actual softmax output rather than a randomly
# zeroed and rescaled copy of it.
model = MultiHeadAttention(embed_dim=embed_dim, num_heads=num_heads).eval()

tokenized = tokenizer.encode(input_text, return_tensors='pt')  # Tokenize input text
tokens = tokenizer.convert_ids_to_tokens(tokenized[0])  # Convert input ids to token strings

# Inference only: no autograd graph is needed for either model
with torch.no_grad():
    inputs = pretrained_model(tokenized).last_hidden_state
    output, attention = model(inputs)  # Run model & retrieve its attention weights

# Reshape [1, num_heads, seq_len, seq_len] to (1, 1, num_heads, seq_len, seq_len): a single layer with num_heads heads
seq_len = len(tokenized[0])
attention = attention.unsqueeze(1).view(1, 1, num_heads, seq_len, seq_len)
head_view(attention, tokens)  # Display the attention head view



The embedding dimension is: 768


<IPython.core.display.Javascript object>

# 效率优化

QKV 投影的时候，可以合并为一个大矩阵进行计算

In [7]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with a fused QKV projection.

    Q, K and V are produced by one ``embed_dim -> 3 * embed_dim`` linear
    layer and then separated, instead of three independent projections.
    """

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.1):
        """
        Initialize the MultiHeadAttention module.

        Args:
            embed_dim (int): The embedding dimension.
            num_heads (int): The number of attention heads. Must divide
                ``embed_dim`` evenly.
            dropout (float): Dropout probability applied to the attention
                weights (after the softmax).

        Raises:
            AssertionError: If ``embed_dim`` is not divisible by ``num_heads``.
        """
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.att_dropout = nn.Dropout(dropout)

        assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"

        self.qkv_linear = nn.Linear(embed_dim, embed_dim * 3)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def forward(
        self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Perform multi-head attention operation.

        Args:
            x (torch.Tensor): The input tensor of shape [batch_size, seq_len, embed_dim].
            mask (Optional[torch.Tensor]): The mask tensor of shape [batch_size, seq_len, seq_len].
                Positions where ``mask == 0`` are excluded from attention; the
                same mask is applied to every head.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: ``(output, attention)`` where
            ``output`` is [batch_size, seq_len, embed_dim] and ``attention``
            is [batch_size, num_heads, seq_len, seq_len]. The returned
            attention weights are the post-dropout values.
        """
        batch_size, seq_len, _ = x.size()

        # Fused linear projection -> [batch_size, seq_len, 3 * embed_dim]
        QKV = self.qkv_linear(x)
        # An int passed to torch.split is the size of each chunk, not the number
        # of chunks, so split by embed_dim to get exactly three tensors.
        Q, K, V = torch.split(QKV, self.embed_dim, dim=-1)

        # Split into multiple heads -> [batch_size, num_heads, seq_len, head_dim]
        Q = Q.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        K = K.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        V = V.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention -> [batch_size, num_heads, seq_len, seq_len]
        att_weights = Q @ K.transpose(-2, -1) / math.sqrt(self.head_dim)

        # Mask (opt.): unsqueeze adds a head axis so the mask broadcasts over heads
        if mask is not None:
            att_weights = att_weights.masked_fill(mask.unsqueeze(1) == 0, float('-inf'))

        # Softmax over the key dimension, then dropout on the weights
        attention = self.att_dropout(torch.softmax(att_weights, dim=-1))

        # Attention output
        out = attention @ V

        # Concatenate heads; contiguous() is required before view() after the transpose
        out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, self.embed_dim)

        # Final linear projection
        return self.out_proj(out), attention

# 现成的轮子

- torch.nn.functional.scaled_dot_product_attention
- torch.nn.MultiheadAttention