## Positional Encoding


In [20]:
import torch
import torch.nn as nn
class PositionEncoding(nn.Module):

    def __init__(self,hidden_dim,seq_len):
        super().__init__()
        # hidden_dim就是每个seq的编码长度
        # 与X大小一致 其为seq_len,hidden_dim
        self.encoding = torch.zeros(seq_len,hidden_dim)

        pos = torch.arange(0,seq_len).float() #[seq_len]
        pos = pos.unsqueeze(dim=1)
        # 得到一个 [seq_len,1]

        # 得到hidden_dim的索引i
        _2i = torch.arange(0,hidden_dim,step=2).float()
        # 2*i

        self.encoding[:,0::2] = torch.sin(pos/(10000**(_2i/hidden_dim)))
        self.encoding[:,1::2] = torch.cos(pos/(10000**(_2i/hidden_dim)))

        # 此时encoding还没有加上batch_size的维度
        self.encoding = self.encoding.unsqueeze(0)

    def forward(self,X):

        batch_size,seq_len,_ = X.shape
        print(self.encoding)
        return X + self.encoding[:,:seq_len,:]

X = torch.rand(3,2,4)
net = PositionEncoding(hidden_dim=4,seq_len=2)
y = net(X)
print(y)
print(X+net.encoding)

tensor([[[0.0000, 1.0000, 0.0000, 1.0000],
         [0.8415, 0.5403, 0.0100, 0.9999]]])
tensor([[[0.2848, 1.5087, 0.7941, 1.8575],
         [1.0497, 0.5462, 0.8766, 1.8510]],

        [[0.4574, 1.6655, 0.1240, 1.0548],
         [1.4170, 1.2497, 1.0021, 1.8364]],

        [[0.5931, 1.8079, 0.6458, 1.6025],
         [0.8469, 0.5648, 0.4028, 1.7824]]])
tensor([[[0.2848, 1.5087, 0.7941, 1.8575],
         [1.0497, 0.5462, 0.8766, 1.8510]],

        [[0.4574, 1.6655, 0.1240, 1.0548],
         [1.4170, 1.2497, 1.0021, 1.8364]],

        [[0.5931, 1.8079, 0.6458, 1.6025],
         [0.8469, 0.5648, 0.4028, 1.7824]]])


In [21]:
torch.nn.Embedding??

[0;31mInit signature:[0m
[0mtorch[0m[0;34m.[0m[0mnn[0m[0;34m.[0m[0mEmbedding[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mnum_embeddings[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0membedding_dim[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpadding_idx[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_norm[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnorm_type[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m2.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mscale_grad_by_freq[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msparse[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0m_weigh

In [22]:
import torch
#torch.zeros??
a = torch.arange(0,10)
print(a.shape)
a = a.unsqueeze(dim=1)
a.shape
# torch.unsqueeze??

torch.Size([10])


torch.Size([10, 1])

## Scale Dot Product Attention


In [None]:
import math
class scaleDotProductAttention(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.softMax = nn.Softmax(dim=-1)


    def forward(self,X,q,k,v,mask=None):
        batch_size,seq_len,hidden_dim = X.shape
        ## 计算Q,K,V
        Q = q(X)
        K = k(X)
        V = v(X)
        # Q(batch_size,seq_len,hidden_dim)@K(batch_size,seq_len,hidden_dim)
        # 为了相乘，K要进行转置
        ##计算attention_weight
        attention_weight = Q@K.transpose(-1,-2)/math.sqrt(hidden_dim)
        #print(attention_weight.shape)
        if mask is not None:
            attention_weight = attention_weight.masked_fill(mask==0,float("-inf"))
        
        attention_weight = self.softMax(attention_weight)
        
        # 最终得到乘积
        attention_scores = attention_weight @ V
        return attention_scores

X = torch.rand(2,3,4)
mask = torch.Tensor([
    [1,1,1],
    [1,1,0],
])

Query = nn.Linear(4,4)
Key = nn.Linear(4,4)
Value = nn.Linear(4,4)

# (2,4) (2,)  (2,3,4)
# 首先需要加一个dim，然后repeat
#print(mask.unsqueeze(dim=1).shape)
## 记住mask的shape是(batch_size，seq_len,seq_len)
mask = mask.unsqueeze(dim=1).repeat(1,3,1)
#print(mask)
#print(mask.shape)
net = scaleDotProductAttention()
out = net(X,Query,Key,Value,mask=mask)
out.shape

torch.Size([2, 3, 4])

In [67]:
import math
class scaleDotProductAttentionV2(nn.Module):
    def __init__(self,hidden_dim) -> None:
        super().__init__()
        self.softMax = nn.Softmax(dim=-1)
        self.hidden_dim = hidden_dim

    def forward(self,q,k,v,mask=None):
        
        ## 计算Q,K,V
        Q = q
        K = k
        V = v
        # Q(batch_size,seq_len,hidden_dim)@K(batch_size,seq_len,hidden_dim)
        # 为了相乘，K要进行转置
        ##计算attention_weight
        attention_weight = Q@K.transpose(-1,-2)/math.sqrt(self.hidden_dim)
        #print(attention_weight.shape)
        if mask is not None:
            attention_weight = attention_weight.masked_fill(mask==0,float("-inf"))
        
        attention_weight = self.softMax(attention_weight)
        
        # 最终得到乘积
        attention_scores = attention_weight @ V
        return attention_scores

X = torch.rand(2,3,4)
mask = torch.Tensor([
    [1,1,1],
    [1,1,0],
])

Query = nn.Linear(4,4)
Key = nn.Linear(4,4)
Value = nn.Linear(4,4)

Query = Query(X)
Key = Key(X)
Value = Value(X)
# (2,4) (2,)  (2,3,4)
# 首先需要加一个dim，然后repeat
#print(mask.unsqueeze(dim=1).shape)
## 记住mask的shape是(batch_size，seq_len,seq_len)
mask = mask.unsqueeze(dim=1).repeat(1,3,1)
#print(mask)
#print(mask.shape)
net = scaleDotProductAttentionV2(4)
out = net(Query,Key,Value,mask=mask)
out.shape

torch.Size([2, 3, 4])

## Multi-head Attention

In [69]:
## 然后是根据单头写多头
class MultiHeadAttention(nn.Module):
    def __init__(self,head_nums,hidden_dim) -> None:
        super().__init__()
        self.head_nums = head_nums
        self.hidden_dim = hidden_dim
        # 将一段hidden_dim分成多个head
        self.Key = nn.Linear(hidden_dim,hidden_dim)
        self.Query = nn.Linear(hidden_dim,hidden_dim)
        self.Value = nn.Linear(hidden_dim,hidden_dim)
        self.singleHeads = scaleDotProductAttentionV2(hidden_dim)
        # 这个再看
        self.out_proj = nn.Linear(hidden_dim,hidden_dim)
        
    def forward(self,X,mask=None):
        batch_size,seq_len,hidden_dim = X.shape

        # 计算Q K V
        Q = self.Query(X)
        K = self.Key(X)
        V = self.Value(X)
        # (batch_size,seq_len,hidden_dim)
        # =>(batch_size,num_heads,seq_len,hidden_dim//)
        # 将计算结果分为多个头

        Q = Q.view(batch_size,seq_len,self.head_nums,-1).transpose(-2,-3)
        K = K.view(batch_size,seq_len,self.head_nums,-1).permute(0,2,1,3)
        V = V.view(batch_size,seq_len,self.head_nums,-1).permute(0,2,1,3)

        #对每个头进行传播
        out1 = self.singleHeads(Q,K,V,mask)
        # 将其重新变成 (batch_size,seq_len,num_heads,hidden_dim)
        out1 = out1.transpose(1,2).contiguous()

        out2 = out1.view(batch_size,seq_len,-1)
        # 连接起来之后，记得proj
        output = self.out_proj(out2)
        return output

head_nums = 2
hidden_dim = 4
X = torch.rand(2,3,4)
# (2,2,3,2)
mask = torch.tensor([
    [1,1,0],
    [1,0,0]
])
# print(mask.unsqueeze(1))
mask = mask.unsqueeze(dim=1).repeat(1,3,1)
#print(mask.shape)

multihead = MultiHeadAttention(head_nums=head_nums,hidden_dim=hidden_dim)
out = multihead(X)
out.shape

torch.Size([2, 3, 4])

## Layer Norm


In [30]:
class layerNorm(nn.Module):
    def __init__(self,hidden_model,eps=1e-12) -> None:
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(hidden_model))
        self.beta = nn.Parameter(torch.ones(hidden_model))
        self.eps = eps
    def forward(self,X):
        mean = X.mean(dim=-1,keepdim=True) # keep same size 求和那个dim变为1
        var = X.var(dim=-1,keepdim=True)  # keep_dim=False，求和dim就会退化

        output = (X-mean)/torch.sqrt(var+self.eps)
        #print("out1:",output)
        output = self.gamma*output + self.beta
        return output

X = torch.randn(2,5,4)
LN = layerNorm(4)
out = LN(X)
print(out.mean(dim=-1,keepdim=True))

tensor([[[2.0000],
         [2.0000],
         [2.0000],
         [2.0000],
         [2.0000]],

        [[2.0000],
         [2.0000],
         [2.0000],
         [2.0000],
         [2.0000]]], grad_fn=<MeanBackward1>)


## Position Feed Forward

$$
    FFN(x) = max(0,xW_1+b_1)W_2 +b_2
    
$$

In [17]:
class FFN(nn.Module):
    def __init__(self,d_model,d_ff,dropout_rate=0.1) -> None:
        super().__init__()
        self.linear1 = nn.Linear(d_model,d_ff) # 给一个隐藏层的大小即可
        self.linear2 = nn.Linear(d_ff,d_model)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p=dropout_rate)
    
    def forward(self,X):
        out1 = self.linear1(X)
        out2 = self.relu(out1)
        # 先失活再通过第二个linear
        out3 = self.drop(out2)
        output = self.linear2(out3)
        return output
    
# 输入输出dim
d_model = 64
#前馈隐藏层的dim
d_ff = 256


ffn = FFN(d_model,d_ff)

# test
X = torch.rand(2,16,64)
out = ffn(X)
out.shape
# 这个也不改变shape

torch.Size([2, 16, 64])

## Encoder Decoder

## 

## 