In [9]:
import torch
from torch import nn
import torch.functional as F
import math

In [10]:
X = torch.randn(128, 64, 512) # Batch, Time, Dimension=d_model(embedding 后 词向量应该是多少维就是多少)
print(X.shape)

torch.Size([128, 64, 512])


In [11]:
d_model = 512 # QKV应该是多少维度
n_head = 8      # 有多少头

In [12]:
class multi_head_attention(nn.Module):
    def __init__(self, d_model, n_head):
        super().__init__()

        self.n_head = n_head
        self.d_model = d_model
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_combine = nn.Linear(d_model, d_model) # multi_head 多写一个combine
        self.softmax = nn.Softmax(dim = -1)

    def forward(self, q, k, v, mask = None):
        batch, time, dimension = q.shape

        # 子qkv的维度，也就是长度
        n_d = self.d_model // self.n_head
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)
        q = q.view(batch, time, self.n_head, n_d).permute(0, 2, 1, 3)
        k = k.view(batch, time, self.n_head, n_d).permute(0, 2, 1, 3)
        v = v.view(batch, time, self.n_head, n_d).permute(0, 2, 1, 3)

        score = q @ k.transpose(2, 3) / math.sqrt(n_d) 
        if mask is not None:
            #mask = torch.tril(torch.ones(time, time, dtype=bool))
            score = score.masked_fill(mask == 0, float('-inf'))# mask == 0的地方用-inf表示
        score = self.softmax(score) @ v
        score = score.permute(0, 2, 1, 3).contiguous().view(batch, time, dimension)
        output = self.w_combine(score)
        return output
    
attention = multi_head_attention(d_model=d_model, n_head=n_head)
output = attention(X,X,X)

# print(output, output.shape)


In [13]:
attention.w_q.weight.shape

torch.Size([512, 512])

# Token Embedding

In [None]:
class TokenEmbedding(nn.Embedding):
    def __init__(self, vocab_size, d_model):
        super().__init__(vocab_size, d_model, padding_idx=1)

# Position Embedding
$$
PE_{(pos, 2i)} = \sin (\frac{pos}{10000^{2i/ d_{model}}} ) \\
PE_{(pos, 2i + 1)} = \cos (\frac{pos}{10000^{2i/ d_{model}}} )
$$

In [6]:
class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, maxlen, device):
        super().__init__()
        # require 
        # TODO construction
        self.encoding = torch.zeros(maxlen, d_model, device=device)
        print(self.encoding.shape)
        self.encoding.requires_grad_(False)
        pos = torch.arange(0, maxlen, device=device)
        pos = pos.float().unsqueeze(1)
        print(pos.shape)
        _2i = torch.arange(0, d_model, 2, device=device)

        self.encoding[:, 0::2] = torch.sin(pos / 10000 ** (_2i / d_model) )
        self.encoding[:, 1::2] = torch.cos(pos / 10000 ** (_2i / d_model) )

    def forward(self, x):
        seq_len = x.shape[1]
        return self.encoding[:seq_len, :]
        

In [7]:
positon = PositionalEmbedding(512, 1000, 'cpu')


torch.Size([1000, 512])
torch.Size([1000, 1])




**归一化操作（如减去均值、除以标准差）会改变数据的分布，可能会丢失一些对模型有用的信息。**例如：
- 某些特征的原始方差可能本身就很大，归一化后会被压缩，导致信息损失。
- 模型可能需要保留数据中的某些特定分布特性。

通过引入可学习的参数 $γ$ 和 $β$，模型可以根据数据的特性自动调整归一化后的表示：

- $γ$可以放大或缩小特征的范围。初始化为全 1，即不改变归一化后的尺度。

- $β$ 可以调整特征的中心位置。初始化为全 0，即不改变归一化后的偏移。

In [None]:
class LayerNorm(nn.Module):
    def __init__(self, d_model, eps = 1e10):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim = True)
        var = x.var(-1, unbiased = False, keepdim = True)
        out = (x - mean) / torch.sqrt(var + self.eps)
        out = self.gamma * out + self.beta  
        return out

FFN

In [None]:
class PositionWiseForward(nn.Module):
    def __init__(self,d_model, d_hidden, dropout = 0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_hidden)
        self.fc2 = nn.Linear(d_hidden, d_model)
        self.droupout = nn.Dropout(dropout)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.droupout(x)
        x = self.fc2(x)
        return x

# Total Embedding

In [None]:
class TransformerEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_len, drop_prob, device):
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        self.pos_emb = PositionalEmbedding(d_model, max_len, device=device)
        self.drop_out = nn.Dropout(drop_prob)

    def forward(self, x):
        tok_emb = self.tok_emb(x)
        pos_emb = self.pos_emb(x)
        return self.drop_out(tok_emb + pos_emb)

In [14]:
# Encoder Layer

In [None]:
class EmcoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        super().__init__()

        self.attention = multi_head_attention(d_model, n_head)
        self.norm1 = LayerNorm(d_model)
        self.drop1 = nn.Dropout(drop_prob)
        self.ffn = PositionWiseForward(d_model, ffn_hidden, drop_prob)
        self.norm2 = LayerNorm(d_model)
        self.drop2 = nn.Dropout(drop_prob)

    def forward(self, x , mask = None):
        _x = x # 为了残差连接
        x = self.attention(x, x, x, mask)

        x = self.drop1(x)
        x = self.norm1(x + _x)

        _x = x
        x = self.ffn(x)
        x = self.drop2(x)
        x = self.norm2(x + _x)
        return x

