In [19]:
from torch import nn
import torch
import math
import torch.nn.functional as F
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are all derived from the same input through
    three bias-free linear projections of width ``d_model``.

    Args:
        d_model: Width of each token encoding. It is also the output width of
            every projection and the value whose square root scales the
            similarity scores.
        row_dim: One of the two dimensions swapped when transposing the keys.
        col_dim: The other swapped dimension; the softmax is also taken over
            this dimension.
    """

    def __init__(self, d_model: int, row_dim=0, col_dim=1):
        super().__init__()
        self.d_model = d_model
        self.row_dim = row_dim
        self.col_dim = col_dim
        # Creation order (q, k, v) decides which seeded random weights each
        # projection receives, so keep it stable.
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

    def forward(self, token_encodings):
        """Return the attention-weighted values computed from ``token_encodings``."""
        queries = self.W_q(token_encodings)
        keys = self.W_k(token_encodings)
        values = self.W_v(token_encodings)

        # Query/key similarities, scaled by sqrt(d_model).
        keys_t = keys.transpose(self.row_dim, self.col_dim)
        scaled_sims = (queries @ keys_t) / math.sqrt(self.d_model)

        # Normalise along col_dim, then mix the values with those weights.
        attention_percents = torch.softmax(scaled_sims, dim=self.col_dim)
        return attention_percents @ values

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

In [20]:
# 3x2 input: one row per token encoding, two features per row.
encoding_matrix = torch.tensor(
    [[1.16, 0.23],
     [0.57, 1.36],
     [4.41, -2.16]],
    dtype=torch.float32,
)

# Seed right before construction so the randomly initialised projection
# weights (and therefore the displayed output) are reproducible.
torch.manual_seed(42)
selfAttention = SelfAttention(d_model=2, row_dim=0, col_dim=1)
selfAttention(encoding_matrix)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

# 实现Masked Self Attention

In [24]:
from torch import nn
import torch
import math
import torch.nn.functional as F
class MaskedSelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention with an optional mask.

    Args:
        d_model: Width of each token encoding. It is also the output width of
            every projection and the value whose square root scales the
            similarity scores.
        row_dim: One of the two dimensions swapped when transposing the keys.
        col_dim: The other swapped dimension; the softmax is also taken over
            this dimension.
    """

    def __init__(self, d_model: int, row_dim=0, col_dim=1):
        super().__init__()
        self.d_model = d_model
        self.row_dim = row_dim
        self.col_dim = col_dim
        # Creation order (q, k, v) decides which seeded random weights each
        # projection receives, so keep it stable.
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

    def forward(self, token_encodings, mask=None):
        """Return the attention-weighted values for ``token_encodings``.

        Args:
            token_encodings: Input passed through the q, k and v projections.
            mask: Optional boolean tensor; positions where it is ``True`` have
                their scaled similarity replaced by ``-1e9`` before the
                softmax, which drives their attention weight to ~0.
        """
        queries = self.W_q(token_encodings)
        keys = self.W_k(token_encodings)
        values = self.W_v(token_encodings)

        keys_t = keys.transpose(self.row_dim, self.col_dim)
        scaled_sims = (queries @ keys_t) / math.sqrt(self.d_model)
        if mask is not None:
            scaled_sims = scaled_sims.masked_fill(mask, -1e9)

        attention_percents = torch.softmax(scaled_sims, dim=self.col_dim)
        return attention_percents @ values
    
    
# 3x2 input: one row per token encoding, two features per row.
encoding_matrix = torch.tensor(
    [[1.16, 0.23],
     [0.57, 1.36],
     [4.41, -2.16]],
    dtype=torch.float32,
)

# Square boolean mask that is True strictly above the main diagonal; those
# are the positions MaskedSelfAttention overwrites before the softmax.
num_tokens = encoding_matrix.size(0)
mask = ~torch.tril(torch.ones(num_tokens, num_tokens)).bool()

# Seed right before construction so the randomly initialised projection
# weights (and therefore the displayed output) are reproducible.
torch.manual_seed(42)
maskedSelfAttention = MaskedSelfAttention(d_model=2, row_dim=0, col_dim=1)
maskedSelfAttention(encoding_matrix, mask=mask)

tensor([[ 0.6038,  0.7434],
        [-0.0062,  0.6072],
        [ 3.4989,  2.2427]], grad_fn=<MmBackward0>)

# 实现Multi Head Attention

In [26]:
from torch import nn
import torch
import math
import torch.nn.functional as F
class Attention(nn.Module):
    """Single-head scaled dot-product attention with separate q/k/v inputs.

    Args:
        d_model: Width of each token encoding. It is also the output width of
            every projection and the value whose square root scales the
            similarity scores.
        row_dim: One of the two dimensions swapped when transposing the keys.
        col_dim: The other swapped dimension; the softmax is also taken over
            this dimension.
    """

    def __init__(self, d_model: int, row_dim=0, col_dim=1):
        super().__init__()
        self.d_model = d_model
        self.row_dim = row_dim
        self.col_dim = col_dim
        # Creation order (q, k, v) decides which seeded random weights each
        # projection receives, so keep it stable.
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

    def forward(self, token_encodings_q, token_encodings_k, token_encodings_v, mask=None):
        """Attend from the query encodings over the key/value encodings.

        Args:
            token_encodings_q: Input to the query projection.
            token_encodings_k: Input to the key projection.
            token_encodings_v: Input to the value projection.
            mask: Optional boolean tensor; positions where it is ``True`` have
                their scaled similarity replaced by ``-1e9`` before the
                softmax.
        """
        queries = self.W_q(token_encodings_q)
        keys = self.W_k(token_encodings_k)
        values = self.W_v(token_encodings_v)

        keys_t = keys.transpose(self.row_dim, self.col_dim)
        scaled_sims = (queries @ keys_t) / math.sqrt(self.d_model)
        if mask is not None:
            scaled_sims = scaled_sims.masked_fill(mask, -1e9)

        attention_percents = torch.softmax(scaled_sims, dim=self.col_dim)
        return attention_percents @ values
    

# The same three 2-feature rows feed the query, key and value inputs, so this
# call is equivalent to plain self-attention on that matrix.
token_rows = [
    [1.16, 0.23],
    [0.57, 1.36],
    [4.41, -2.16],
]
encoding_for_q = torch.tensor(token_rows, dtype=torch.float32)
encoding_for_k = torch.tensor(token_rows, dtype=torch.float32)
encoding_for_v = torch.tensor(token_rows, dtype=torch.float32)

# Seed right before construction so the randomly initialised projection
# weights (and therefore the displayed output) are reproducible.
torch.manual_seed(42)
attention = Attention(d_model=2, row_dim=0, col_dim=1)
attention(encoding_for_q, encoding_for_k, encoding_for_v)

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<MmBackward0>)

In [35]:
class MultiHeadAttention(nn.Module):
    """Run several independent ``Attention`` heads and concatenate their outputs.

    Every head is constructed with the same ``d_model``, ``row_dim`` and
    ``col_dim``; the per-head results are joined along ``col_dim``.

    Args:
        d_model: Encoding width handed to every head.
        num_heads: Number of ``Attention`` heads to create; must be >= 1.
        row_dim: Forwarded to every head.
        col_dim: Forwarded to every head; also the dimension along which the
            head outputs are concatenated.

    Raises:
        ValueError: If ``num_heads`` is smaller than 1.
    """

    def __init__(self
                 ,d_model: int
                 ,num_heads: int
                 ,row_dim = 0
                 ,col_dim = 1
                ):
        super(MultiHeadAttention, self).__init__()
        if num_heads < 1:
            # Fail at construction instead of later, when torch.cat would be
            # handed an empty list.
            raise ValueError(f"num_heads must be at least 1, got {num_heads}")
        self.d_model = d_model
        self.num_heads = num_heads
        self.row_dim = row_dim
        self.col_dim = col_dim

        # nn.ModuleList (not a plain Python list) registers each head as a
        # submodule, so the head weights show up in parameters() and
        # state_dict() and follow .to() / .cuda() / .train() / .eval().
        self.heads = nn.ModuleList([
            Attention(d_model=d_model, row_dim=row_dim, col_dim=col_dim)
            for _ in range(num_heads)
        ])

    def forward(self, token_encodings_q,token_encodings_k,token_encodings_v, mask=None):
        """Apply every head to the same inputs and concatenate along ``col_dim``.

        Args:
            token_encodings_q: Passed to each head as its query input.
            token_encodings_k: Passed to each head as its key input.
            token_encodings_v: Passed to each head as its value input.
            mask: Optional mask forwarded unchanged to each head.

        Returns:
            The head outputs concatenated along ``col_dim``.
        """
        head_outputs = [
            head(token_encodings_q, token_encodings_k, token_encodings_v, mask)
            for head in self.heads
        ]
        return torch.cat(head_outputs, dim=self.col_dim)

# The same three 2-feature rows feed the query, key and value inputs.
token_rows = [
    [1.16, 0.23],
    [0.57, 1.36],
    [4.41, -2.16],
]
encoding_for_q = torch.tensor(token_rows, dtype=torch.float32)
encoding_for_k = torch.tensor(token_rows, dtype=torch.float32)
encoding_for_v = torch.tensor(token_rows, dtype=torch.float32)

# Seed right before construction so the randomly initialised head weights
# (and therefore the displayed output) are reproducible.
torch.manual_seed(42)
multiattention = MultiHeadAttention(d_model=2, num_heads=1, row_dim=0, col_dim=1)

result = multiattention(encoding_for_q, encoding_for_k, encoding_for_v)
result

tensor([[1.0100, 1.0641],
        [0.2040, 0.7057],
        [3.4989, 2.2427]], grad_fn=<CatBackward0>)

In [15]:
# Scratch cell: draw a 3x3 tensor with torch.randn and display it.
# No seed is set in this cell, so the values differ from run to run.
a = torch.randn(3, 3)
a

tensor([[-0.3267, -0.2788, -0.4220],
        [-1.3323, -0.3639,  0.1513],
        [-0.3514, -0.7906, -0.0915]])

In [14]:
# Scratch cell: lower-triangular 3x3 matrix of ones (entries above the main
# diagonal are zero).
torch.tril(torch.ones(3, 3))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [16]:
# Build a 3x3 boolean mask that is True strictly above the main diagonal:
# start from a lower-triangular matrix of ones and flag its zero entries.
lower_triangle = torch.tril(torch.ones(3, 3))
mask = lower_triangle == 0
mask

tensor([[False,  True,  True],
        [False, False,  True],
        [False, False, False]])