# Part 1: Model Architecture

### 逐步实验transformer
#### 从【Inference】部分开始对模型内容进行逐步实验
#### 首先计算memory
#### 即encoder最后的输出结果
#### 实验 memory = test_model.encode(src, src_mask)

In [1]:
# 模型结构定义，用于参照
import copy
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Build an encoder-decoder Transformer from hyperparameters.

    Args:
        src_vocab: Size of the source vocabulary.
        tgt_vocab: Size of the target vocabulary.
        N: Number of stacked encoder layers and decoder layers.
        d_model: Width of the embeddings and of every sub-layer output.
        d_ff: Hidden width of the position-wise feed-forward network.
        h: Number of attention heads.
        dropout: Dropout probability passed to the sub-modules.

    Returns:
        An ``EncoderDecoder`` whose parameters with more than one dimension
        are Xavier/Glorot-uniform initialised.
    """
    c = copy.deepcopy
    # Build the shared prototypes from the arguments (not from literals) so
    # that non-default hyperparameters are actually honoured.
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)

    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    )

    # Initialize parameters with Glorot / fan_avg.
    # Reference: https://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf
    # Only weight matrices (dim > 1) are re-initialised; biases and the
    # 1-D LayerNorm parameters keep their constructor values.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

In [69]:
'''
大致流程说明
1. 先实现 memory = test_model.encode(src, src_mask)，其中memory就是模型encoder输出的结果

2. class EncoderDecoder给出encode实现如下:
def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)

    2.1 首先对输入src进行Embedding
    src_embed = nn.Sequential(Embeddings(d_model, src_vocab), c(position))
    就是先对src进行Embedding,
    然后再对Embedding的输出进行位置编码 position = PositionalEncoding(d_model=512, dropout=0.1)
    
    2.2 对2.1的输出进行encoder
    其中encoder = Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N=6)

    参看Encoder类的实现，其实就是N个EncoderLayer的堆叠，把2.1中的输出经过N个EncoderLayer最后进行norm
    
    2.3 EncoderLayer类的实现:
    分成2个部分: attention部分和feed_forward部分
    
        2.3.1 MultiHeadedAttention的实现
        norm(x) -> MultiHeadedAttention(x,x,x,mask) -> dropout -> +x [残差连接]
        
        2.3.2 feedforward类
        norm(x) -> PositionwiseFeedForward(x) -> dropout -> +x [残差连接]
        
        
        可以看出MultiHeadedAttention和feedforward结构类似，除了第二步使用的layer不一样，上述结构都通过
        辅助类 SublayerConnection实现
        
        需要注意的地方:
        在EncoderLayer的MultiHeadedAttention中,由于所有q, k, v都是来自src_emb, 所以q = k = v = x
        
        DecoderLayer的MultiHeadedAttention的区别在DecoderLayer部分说明
    

'''


'\n大致流程说明\n1. 先实现 memory = test_model.encode(src, src_mask)，其中memory就是模型encoder输出的结果\n\n2. class EncoderDecoder给出encode实现如下:\ndef encode(self, src, src_mask):\n        return self.encoder(self.src_embed(src), src_mask)\n\n    2.1 首先对输入src进行Embedding\n    src_embed = nn.Sequential(Embeddings(d_model, src_vocab), c(position))\n    就是先对src进行Embedding,\n    然后再对Embedding的输出进行位置编码 position = PositionalEncoding(d_model=512, dropout=0.1)\n    \n    2.2 对2.1的输出进行encoder\n    其中encoder = Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N=6)\n\n    参看Encoder类的实现，其实就是N个EncoderLayer的堆叠，把2.1中的输出经过N个EncoderLayer最后进行norm\n    \n    2.3 EncoderLayer类的实现:\n    分成2个部分: attention部分和feed_forward部分\n    \n        2.3.1 MultiHeadedAttention的实现\n        norm(x) -> MultiHeadedAttention(x,x,x,mask) -> dropout -> +x [残差连接]\n        \n        2.3.2 feedforward类\n        norm(x) -> PositionwiseFeedForward(x) -> dropout -> +x [残差连接]\n        \n        \n        可以看出MultiHeadedAttention和feedforward结构类似，除了第二步

#### 实现Embedding和PositionalEncoding

In [None]:
# test_model = make_model(11,11,2)
# 其他模型参数: d_model=512, d_ff=2048, h=8, dropout=0.1
# src_vocab = 11
# tgt_vocab = 11
# train the model to memorize the numbers of 1 to 10

In [2]:
# src
import torch
import torch.nn as nn
src = torch.LongTensor([[1,2,3,4,5,6,7,8,9,10]]) # src
print(src)
# batch=1 
print(src.shape)

tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]])
torch.Size([1, 10])


In [3]:
# 创建mask
src_mask = torch.ones(1,1,10)
print(src_mask)
print(src_mask.shape) # 为什么这么设计

tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]])
torch.Size([1, 1, 10])


#### Embedding

In [4]:
# memory = test_model.encoder(src_emb(src), src_mask)
# memory为encoder的结果
import math
class Embeddings(nn.Module):
    """Token-id embedding lookup whose output is scaled by sqrt(d_model).

    Args:
        d_model: Width of each embedding vector.
        vocab: Number of distinct token ids.
    """

    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        """Return ``lut(x) * sqrt(d_model)`` for a tensor of token ids ``x``."""
        # Q: why scale the embeddings?
        # https://analyticsindiamag.com/why-data-scaling-is-important-in-machine-learning-how-to-effectively-do-it/
        # Scale by this instance's own width; reading a module-level
        # ``d_model`` would use whatever value happens to be global.
        return self.lut(x) * math.sqrt(self.d_model)

In [5]:
# embedding_test
embedding = nn.Embedding(11,512)
embedding_output = embedding(src)
print(embedding_output.shape)

torch.Size([1, 10, 512])


In [6]:
embedding_output = embedding_output * math.sqrt(512)
print(embedding_output.shape)

torch.Size([1, 10, 512])


#### positional encoding

In [7]:
# nn.Sequential(embedding, position)
# position = PositionalEncoding(512,dropout=0.1)
# 因为缺少位置信息
# inject information about relative and absolute
# position of the tokens

# positional encoding
# 公式
# PE(pos,2i) = sin(pos/10000**(2*i/d_model))
# PE(pos,2i+1) = cos(pos/10000**(2*i/d_model))

# for any fixed offset k
# PE(pos+k) can be represented as a linear function of PEpos

# apply Pdrop=0.1 to the result
# 注意max_length的设定

# 还有其他提供位置信息的方法
# 例如: rotary embedding


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    Args:
        d_model: Width of the vectors being encoded (even or odd).
        dropout: Dropout probability applied to the summed result.
        max_len: Number of positions precomputed; ``forward`` slices the
            first ``x.size(1)`` of them.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once, in log space.
        pe = torch.zeros(max_len, d_model)  # [max_len, d_model]
        position = torch.arange(0, max_len).unsqueeze(1)  # [max_len, 1]
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )  # [ceil(d_model / 2)]
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so drop the last frequency; for even d_model this slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        pe = pe.unsqueeze(0)  # [1, max_len, d_model]
        # Registered as a buffer: stored on the module, not a parameter.
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``."""
        # Slice the first x.size(1) positions of the precomputed table.
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [8]:
# positional encoding test
max_len = 5000 #句子长度
d_model = 512
pe = torch.zeros(5000, 512)
print(pe.shape)

torch.Size([5000, 512])


In [9]:
position = torch.arange(0,5000)
print(position)
print(position.shape)
position = position.unsqueeze(1)
print(position.shape)

tensor([   0,    1,    2,  ..., 4997, 4998, 4999])
torch.Size([5000])
torch.Size([5000, 1])


In [10]:
div_term = torch.exp(
            torch.arange(0, 512, 2) * -(math.log(10000.0) / 512)
        )
print(div_term.shape)

torch.Size([256])


In [11]:
print((position * div_term).shape)

torch.Size([5000, 256])


In [12]:
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)

print(pe.shape)

torch.Size([5000, 512])


In [13]:
pe = pe.unsqueeze(0)
print(pe.shape)

torch.Size([1, 5000, 512])


In [14]:
print(embedding_output.size(1))  # 1.10,512

test = pe[:,:embedding_output.size(1)] # 截取前10个
print(test.shape)

10
torch.Size([1, 10, 512])


In [15]:
embedding_position = embedding_output + pe[:,:embedding_output.size(1)]
print(embedding_position.shape)

torch.Size([1, 10, 512])


In [16]:
# unsqueeze函数的使用方法
# torch.unsqueeze(input,dim)
# squeeze 和 unsqueeze的参数都是指维度
x = torch.tensor([1,2,3,4])
torch.unsqueeze(x,0)

tensor([[1, 2, 3, 4]])

In [17]:
torch.unsqueeze(x,1)

tensor([[1],
        [2],
        [3],
        [4]])

In [18]:
# 各种形状变换
dropout = nn.Dropout(p=0.1)
embedding_position = dropout(embedding_position)
print(embedding_position.shape)

torch.Size([1, 10, 512])


In [19]:
import pandas as pd
import altair as alt
RUN_EXAMPLES = True
def is_interactive_notebook():
    """Report whether this module is being executed as ``__main__``."""
    running_as_main = __name__ == "__main__"
    return running_as_main


def show_example(fn, args=()):
    """Call ``fn(*args)`` and return its result when examples are enabled.

    The call only happens when the module runs as ``__main__`` and
    ``RUN_EXAMPLES`` is truthy; otherwise nothing is called and ``None`` is
    returned. The default for ``args`` is an immutable empty tuple rather
    than a shared mutable list.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)


def execute_example(fn, args=()):
    """Call ``fn(*args)`` for its side effects when examples are enabled.

    The call only happens when the module runs as ``__main__`` and
    ``RUN_EXAMPLES`` is truthy. The result of ``fn`` is discarded. The
    default for ``args`` is an immutable empty tuple rather than a shared
    mutable list.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    """Optimizer stand-in whose ``step`` and ``zero_grad`` do nothing.

    The base-class constructor is deliberately not called; the only state
    set up is a single parameter group with a learning rate of 0.
    """

    def __init__(self):
        self.param_groups = [{"lr": 0}]

    def step(self):
        """No-op update."""
        return None

    def zero_grad(self, set_to_none=False):
        """No-op; ``set_to_none`` is accepted and ignored."""
        return None


class DummyScheduler:
    """Learning-rate scheduler stand-in whose ``step`` does nothing."""

    def step(self):
        """No-op schedule update."""
        return None
    
    
def example_positional():
    """Chart the positional-encoding values of dimensions 4-7 over 100 positions.

    A ``PositionalEncoding(20, 0)`` is applied to an all-zero input, so the
    plotted values are the encodings themselves. Returns an interactive
    Altair line chart.
    """
    encoder = PositionalEncoding(20, 0)
    encoded = encoder.forward(torch.zeros(1, 100, 20))

    # One long-format frame per plotted dimension, stacked afterwards.
    frames = []
    for dim in [4, 5, 6, 7]:
        frame = pd.DataFrame(
            {
                "embedding": encoded[0, :, dim],
                "dimension": dim,
                "position": list(range(100)),
            }
        )
        frames.append(frame)
    data = pd.concat(frames)

    chart = alt.Chart(data).mark_line().properties(width=800)
    chart = chart.encode(x="position", y="embedding", color="dimension:N")
    return chart.interactive()


show_example(example_positional)

#### 实验EncoderLayer类

#### encoder
#### 实验LayerNorm

In [20]:
# Encoder(EncoderLayer(d_model=512, c(attn), c(ff), dropout=0.1),N=6)
# c = copy.deepcopy

# layer normalization
# 原理看论文
class LayerNorm(nn.Module):
    """Normalise the last dimension, then apply a learned scale and shift.

    Args:
        features: Size of the last dimension of the input.
        eps: Constant added to the standard deviation before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # scale, starts at 1
        self.b_2 = nn.Parameter(torch.zeros(features))  # shift, starts at 0
        self.eps = eps

    def forward(self, x):
        """Return ``a_2 * (x - mean) / (std + eps) + b_2`` over the last dim."""
        centered = x - x.mean(-1, keepdim=True)
        # ``Tensor.std`` is called with its default correction here.
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

In [21]:
# layer normalization test
# d_model = 512
# attn = MultiHeadedAttention(h=8, d_model=512)
# ff = PositionwiseFeedForward(d_model=512, d_ff=2048)
# Encoder(EncoderLayer(d_model, attn, ff, dropout),N=6)
# size = d_model = 512

a_2 = nn.Parameter(torch.ones(512))
a_2.shape

torch.Size([512])

In [22]:
b_2 = nn.Parameter(torch.zeros(512))
b_2.shape

torch.Size([512])

In [23]:
mean = embedding_position.mean(-1,keepdim=True)
print(mean.shape)
std = embedding_position.std(-1,keepdim=True)
print(std.shape)

torch.Size([1, 10, 1])
torch.Size([1, 10, 1])


In [24]:
eps = 1e-6
embedding_position = a_2 * (embedding_position - mean) / (std + eps) + b_2
embedding_position.shape

torch.Size([1, 10, 512])

#### 实验MultiHeadedAttention

In [25]:
# 转换x -> embedding_position
# 多头注意力
import copy
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    copies = nn.ModuleList()
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return copies

In [26]:
x = embedding_position
x.shape

torch.Size([1, 10, 512])

In [27]:
# attn = MultiHeadedAttention(h=8, d_model=512)
# 传入参数 (x,x,x,mask)
# x -> [1,10,512]
# mask = src_mask
class MultiHeadedAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        h: Number of heads; must divide ``d_model``.
        d_model: Model width. Each head works on ``d_model // h`` features.
        dropout: Dropout probability applied to the attention weights.

    Attributes:
        attn: Attention weights of the most recent ``forward`` call
            (``None`` before the first call).
    """

    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0

        # We assume d_v == d_k.
        self.d_k = d_model // h
        self.h = h
        # The first three linears project query, key and value; the last
        # one projects the concatenated heads.
        self.linears = clones(nn.Linear(d_model, d_model), 4)

        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value`` and return the projection.

        ``mask`` (optional) marks blocked positions with 0. A 3-D mask gets a
        head axis inserted at dim 1 so it broadcasts over all heads; a mask
        that already has that axis (4-D) is used as is.
        """
        if mask is not None and mask.dim() == 3:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)

        nbatches = query.size(0)

        # Do all the linear projections in batch, then split d_model into
        # h heads of d_k: [batch, seq, d_model] -> [batch, h, seq, d_k].
        query, key, value = [
            lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for lin, x in zip(self.linears, (query, key, value))
        ]

        # Apply attention on all the projected vectors in batch. The mask
        # must reach this step, otherwise padding/causal masks have no effect.
        x, self.attn = self._masked_attention(query, key, value, mask)

        # "Concat" the heads back: [batch, h, seq, d_k] -> [batch, seq, h * d_k].
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)

        del query
        del key
        del value
        return self.linears[-1](x)

    def _masked_attention(self, query, key, value, mask):
        """Scaled dot-product attention over per-head tensors, honouring ``mask``."""
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # Blocked positions get a large negative score, i.e. ~0 weight
            # after the softmax.
            scores = scores.masked_fill(mask == 0, -1e9)
        p_attn = self.dropout(scores.softmax(dim=-1))
        return torch.matmul(p_attn, value), p_attn

In [28]:
x.shape

torch.Size([1, 10, 512])

In [30]:
# multiheaded attention 
h = 8
d_model = 512
#print(d_model//h) #64
d_k = d_model // h
print(d_k)

64


In [31]:
# query = key = value = x -> [1,10,512]
src_mask.shape

torch.Size([1, 1, 10])

In [32]:
src_mask = src_mask.unsqueeze(1)
src_mask.shape

torch.Size([1, 1, 1, 10])

In [33]:
# query和key的维度一样，value一个维度
# 实验中q, k, v维度一样
query = key = value = x # [1,10,512]
n_batches = query.size(0)
print(n_batches)

1


In [34]:
linears = clones(nn.Linear(512,512),4)
print(linears)

ModuleList(
  (0): Linear(in_features=512, out_features=512, bias=True)
  (1): Linear(in_features=512, out_features=512, bias=True)
  (2): Linear(in_features=512, out_features=512, bias=True)
  (3): Linear(in_features=512, out_features=512, bias=True)
)


In [35]:
test = list(zip(linears, (query,key,value)))
print(len(test))
#print(test[0])
test_zero = list(test[0])
print(test_zero[0])
print(test_zero[1].shape)
print(test_zero[1]==value)

3
Linear(in_features=512, out_features=512, bias=True)
torch.Size([1, 10, 512])
tensor([[[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]]])


In [36]:
# zip的用法
a = [1,2,3]
b = [4,5,6]
zipped = zip(a,b)
print(zipped)
temp = list(zipped)
print(temp)

<zip object at 0x00000214E1803B40>
[(1, 4), (2, 5), (3, 6)]


In [37]:
temp = (query,key,value)
print(type(temp))
print(len(temp))

<class 'tuple'>
3


In [38]:
# 简单示例
# 实现 lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
linear_temp = nn.Linear(512,512)
linear_output = linear_temp(query)
print(linear_output.shape)
linear_view = linear_output.view(1,-1,8,64)
print(linear_view.shape)
linear_transpose = linear_view.transpose(1,2)
print(linear_transpose.shape)
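# 这里做transpose是为了把head维度放到token维度之前: [batch, token_num, h, d_k] -> [batch, h, token_num, d_k]
# 这样后面的matmul作用在最后两个维度上, 每个head各自独立计算 [token_num, token_num] 的attention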

torch.Size([1, 10, 512])
torch.Size([1, 10, 8, 64])
torch.Size([1, 8, 10, 64])


In [39]:
query, key, value = [
            lin(x).view(1, -1, 8, 64).transpose(1, 2)
            for lin, x in zip(linears, (query, key, value))
        ]

In [40]:
# 检查query, key, value形状
print(query.shape)
print(key.shape)
print(value.shape) # batch, h, token_num, d_k

torch.Size([1, 8, 10, 64])
torch.Size([1, 8, 10, 64])
torch.Size([1, 8, 10, 64])


#### attention的实验

In [41]:
def attention(query, key, value, mask=None, dropout=None):
    """Compute scaled dot-product attention.

    Args:
        query, key, value: Tensors whose last dimension is the feature
            dimension; ``query`` and ``key`` must share it.
        mask: Optional tensor broadcastable to the score matrix; positions
            where ``mask == 0`` are excluded from attention.
        dropout: Optional callable applied to the attention weights.

    Returns:
        A tuple ``(weighted_values, attention_weights)``.
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # A large negative score becomes ~0 weight after the softmax.
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [42]:
# 实现x, self.attn = attention(
#            query, key, value, mask=mask, dropout=self.dropout
#        )

d_k = query.size(-1)
print(d_k) # torch.Size([1, 8, 10, 64]
key_trans = key.transpose(-2,-1) # 这里的维度是指trans之前的维度
print(key_trans.shape)
scores = torch.matmul(query,key_trans)
print(scores.shape)
scores = scores / math.sqrt(d_k)
print(scores.shape)

64
torch.Size([1, 8, 64, 10])
torch.Size([1, 8, 10, 10])
torch.Size([1, 8, 10, 10])


In [43]:
# encode不需要mask
# masked_fill的具体用法
scores_mask = scores.masked_fill(src_mask==0,-1e9)

In [44]:
print(scores_mask.shape)

torch.Size([1, 8, 10, 10])


In [45]:
# Softmax over the masked scores, so the masking step above takes effect.
p_attn = scores_mask.softmax(dim=-1)
print(p_attn.shape)

torch.Size([1, 8, 10, 10])


In [46]:
p_attn = dropout(p_attn)
x_attn = torch.matmul(p_attn,value)

In [47]:
print(x_attn.shape)
print(p_attn.shape)

torch.Size([1, 8, 10, 64])
torch.Size([1, 8, 10, 10])


In [48]:
# "concat" using a view and apply a final layer 
# 实验 x = (x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k))
x_attn_trans = x_attn.transpose(1,2) # [1,8,10,64]
print(x_attn_trans.shape) # 恢复到原来的形状

torch.Size([1, 10, 8, 64])


In [49]:
# contiguous用法
# https://stackoverflow.com/questions/48915810/what-does-contiguous-do-in-pytorch
x_attn_trans = x_attn_trans.contiguous()
print(x_attn_trans.shape)

torch.Size([1, 10, 8, 64])


In [50]:
x_attn_trans = x_attn_trans.view(1,-1,8*64)
print(x_attn_trans.shape)

torch.Size([1, 10, 512])


In [51]:
x_attn_trans = linears[-1](x_attn_trans)
print(x_attn_trans.shape)

torch.Size([1, 10, 512])


#### SublayerConnection最后两步实践

In [52]:
# dropout + residual connection: input + dropout(sublayer output).
# The residual must add the sub-layer's input `x`, not the attention output
# to itself. NOTE: in this walkthrough `x` is the already-normalised
# embedding, because the un-normalised tensor was overwritten earlier.
dropout = nn.Dropout(0.1)
sublayer1_output = x + dropout(x_attn_trans)
print(sublayer1_output.shape)

torch.Size([1, 10, 512])


In [None]:
'''
在获得sublayer1的输出之后，传入sublayer2
sublayer2: norm(x) -> PositionwiseFeedForward(x) -> dropout -> +x [残差连接]
'''

#### sublayer2的实验

In [53]:
# First step of sublayer 2 is the layer norm (not dropout): norm(x).
sublayer2_norm = LayerNorm(512)(sublayer1_output)
print(sublayer2_norm.shape)

torch.Size([1, 10, 512])


In [54]:
# 实现PositionwiseFeedForward类
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: ``w_2(dropout(relu(w_1(x))))``.

    Args:
        d_model: Input and output width.
        d_ff: Hidden width between the two linear layers.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = self.w_1(x).relu()
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [55]:
w_1 = nn.Linear(512,2048)
w_2 = nn.Linear(2048,512)

position_temp = w_1(sublayer2_norm).relu()
print(position_temp.shape)

torch.Size([1, 10, 2048])


In [56]:
pos_temp = dropout(position_temp)
print(pos_temp.shape)

torch.Size([1, 10, 2048])


In [57]:
sublayer2_pos = w_2(pos_temp)
print(sublayer2_pos.shape)

torch.Size([1, 10, 512])


In [58]:
# Last two steps of sublayer 2: dropout + residual connection.
# The residual adds the input of sublayer 2 (the output of sublayer 1),
# not the feed-forward output to itself.
dropout = nn.Dropout(0.1)
sublayer2_output = sublayer1_output + dropout(sublayer2_pos)
print(sublayer2_output.shape)

torch.Size([1, 10, 512])


In [59]:
# 以上就是单个EncoderLayer实验的过程
# EncoderLayer类实现如下
class EncoderLayer(nn.Module):
    """One encoder layer: a self-attention sub-layer followed by a feed-forward sub-layer.

    Args:
        size: Feature width handed to each ``SublayerConnection``; also
            exposed as ``self.size``.
        self_attn: Attention module called as ``self_attn(x, x, x, mask)``.
        feed_forward: Module applied in the second sub-layer.
        dropout: Dropout probability for both ``SublayerConnection`` wrappers.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Run ``x`` through self-attention, then feed-forward (Figure 1, left)."""

        def self_attention(normed):
            # Query, key and value all come from the same tensor.
            return self.self_attn(normed, normed, normed, mask)

        attn_block, ff_block = self.sublayer
        x = attn_block(x, self_attention)
        return ff_block(x, self.feed_forward)

In [60]:
# EncoderLayer中的SublayerConnection实现过程如下
class SublayerConnection(nn.Module):
    """Residual wrapper: ``x + dropout(sublayer(norm(x)))``.

    For code simplicity the layer norm is applied before the sub-layer
    rather than after the residual sum.

    Args:
        size: Feature width passed to ``LayerNorm``.
        dropout: Dropout probability applied to the sub-layer output.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply ``sublayer`` (a callable of one tensor) with a residual connection."""
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

In [61]:
# Encoder的实现
class Encoder(nn.Module):
    """Stack of ``N`` copies of ``layer`` followed by a final ``LayerNorm``.

    Args:
        layer: Prototype layer; it is cloned ``N`` times and must expose
            ``layer.size`` for the final norm.
        N: Number of clones in the stack.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Pass ``x`` (with ``mask``) through every layer in order, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)
    
# Encoder = 6个EncoderLayer层 + 1个LayerNorm层
# 在上面得到第一个EncoderLayer层的输出sublayer2_ouput之后
# 把它传入第2个EncoderLayer层
# 第2个EncoderLayer层的输出再当作输入传入下一个EncoderLayer
# 依次类推
# 在得到最后一个EncoderLayer的输出之后进行LayerNorm
# 就得到了Encoder的最后结果即前文提到的memory

##### 创建简单Encoder相关类获得memory

In [62]:
# 创建EncoderTest
# 用于简单获得memory
class EncoderTest(nn.Module):
    """Encoder-only wrapper used to obtain ``memory`` without a decoder.

    Args:
        encoder: Callable invoked as ``encoder(embedded_src, src_mask)``.
        src_embed: Callable that embeds the raw source tensor.
    """

    def __init__(self, encoder, src_embed):
        super().__init__()
        self.encoder = encoder
        self.src_embed = src_embed

    def forward(self, src, src_mask):
        """Alias for :meth:`encode`."""
        return self.encode(src, src_mask)

    def encode(self, src, src_mask):
        """Embed ``src`` and run it through the encoder with ``src_mask``."""
        embedded = self.src_embed(src)
        return self.encoder(embedded, src_mask)

In [63]:
# 创建make_encoder模型
# 
def make_encoder(src_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Build an encoder-only model (``EncoderTest``) from hyperparameters.

    Args:
        src_vocab: Size of the source vocabulary.
        N: Number of stacked encoder layers.
        d_model: Model width.
        d_ff: Hidden width of the feed-forward network.
        h: Number of attention heads.
        dropout: Dropout probability for the feed-forward network, the
            positional encoding and the sub-layer connections. The attention
            module is built with its own default.

    Returns:
        The model, with every parameter of more than one dimension
        Xavier-uniform initialised.
    """
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)

    layer = EncoderLayer(d_model, copy.deepcopy(attn), copy.deepcopy(ff), dropout)
    encoder = Encoder(layer, N)
    src_embed = nn.Sequential(Embeddings(d_model, src_vocab), copy.deepcopy(position))
    model = EncoderTest(encoder, src_embed)

    # Only weight matrices (dim > 1) are re-initialised.
    weight_matrices = (p for p in model.parameters() if p.dim() > 1)
    for weight in weight_matrices:
        nn.init.xavier_uniform_(weight)
    return model

In [64]:
print(src.shape)
print(src_mask.shape)

torch.Size([1, 10])
torch.Size([1, 1, 1, 10])


In [65]:
# Pass N explicitly: the default is N=6, so a bare make_encoder(11) would not
# build the 2-layer model this experiment intends.
test_model = make_encoder(11, N=2)
# Experiment: memory = test_model.encode(src, src_mask)
memory = test_model.encode(src,src_mask)
print(memory.shape)

torch.Size([1, 10, 512])


#### 实验decode

In [None]:
'''
在上述encode部分已经获得了encoder的结果memory

接下来继续试验decode

实验inference_test()中的
out = test_model.decode(
            memory, src_mask, ys, subsequent_mask(ys.size(1)).type_as(src.data)
        )
        
与EncoderDecoder的作用相同
class EncoderDecoder(nn.Module):
    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)


其中:
decoder = Decoder(DecoderLayer(d_model=512,c(attn),c(attn),c(ff), dropout=0.1),N)
tgt_embed = nn.Sequential(Embeddings(d_model=512, tgt_vocab), c(position))
'''

In [136]:
# Decoder的结构与Encoder类似
# 就是6个连续DecoderLayer + 1个LayerNorm
# 区别是多了memory -> encoder的输出结果
# 以及src_mask和tgt_mask
class Decoder(nn.Module):
    """Stack of ``N`` copies of ``layer`` followed by a final ``LayerNorm``.

    Args:
        layer: Prototype decoder layer; it is cloned ``N`` times and must
            expose ``layer.size`` for the final norm.
        N: Number of clones in the stack.
    """

    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run ``x`` through every layer, then normalise.

        ``memory``, ``src_mask`` and ``tgt_mask`` are forwarded unchanged to
        each layer. The third parameter is named ``src_mask`` so the body uses
        the caller's argument rather than a same-named global.
        """
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

In [68]:
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, attention over ``memory``, feed-forward.

    Args:
        size: Feature width handed to each ``SublayerConnection``; also
            exposed as ``self.size``.
        self_attn: Attention module called as ``self_attn(x, x, x, tgt_mask)``.
        src_attn: Attention module called as
            ``src_attn(x, memory, memory, src_mask)``.
        feed_forward: Module applied in the third sub-layer.
        dropout: Dropout probability for the three ``SublayerConnection`` wrappers.
    """

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Apply the three sub-layers in order (Figure 1, right)."""

        def masked_self_attention(normed):
            # Query, key and value all come from the decoder input.
            return self.self_attn(normed, normed, normed, tgt_mask)

        def attend_to_memory(normed):
            # Decoder state is the query; memory supplies keys and values.
            return self.src_attn(normed, memory, memory, src_mask)

        x = self.sublayer[0](x, masked_self_attention)
        x = self.sublayer[1](x, attend_to_memory)
        return self.sublayer[2](x, self.feed_forward)

In [69]:
# DecoderLayer与EncoderLayer大部分类似
# 每个sublayer层依然是 norm(x) -> sublayer -> dropout  -> +x [残差]

# 不一样的地方
# 1. DecoderLayer有3个sublayer, EncoderLayer有2个，多了一个attention
# 2. 多出的层为encoder-decoder attention, 即decoder的x(来自tgt)作为query, memory作为key和value的attention
#    就是上述self.src_attn(x, m, m, src_mask)

In [71]:
print(memory.shape)

torch.Size([1, 10, 512])


In [73]:
print(src.data)
print(src)

tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]])
tensor([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]])


In [74]:
src_mask = torch.ones(1,1,10)
print(src_mask.shape)

torch.Size([1, 1, 10])


In [78]:
ys = torch.zeros(1,1)
print(ys.shape)
ys = ys.type_as(src)
print(ys.shape)
print(ys)

torch.Size([1, 1])
torch.Size([1, 1])
tensor([[0]])


In [77]:
# subsequent mask
# ensures that predictions for position i can 
# depend only on the known ouputs at positions less than i
def subsequent_mask(size):
    """Return a ``(1, size, size)`` boolean mask that hides later positions.

    Entry ``[0, i, j]`` is ``True`` when ``j <= i`` and ``False`` above the
    main diagonal.
    """
    ones = torch.ones((1, size, size))
    # Strictly-upper triangle marks the positions to hide.
    hidden = torch.triu(ones, diagonal=1).type(torch.uint8)
    return hidden == 0

In [80]:
ys_size = ys.size(1)
print(ys_size)

1


In [81]:
attn_shape = (1,ys_size, ys_size)
print(attn_shape)

(1, 1, 1)


In [82]:
subsequent_test = torch.ones(attn_shape)
subsequent_test

tensor([[[1.]]])

In [83]:
subsequent_test = torch.triu(subsequent_test, diagonal=1)
subsequent_test

tensor([[[0.]]])

In [87]:
# torch.triu基本用法
# torch.triu(input, diagonal=0):
# input: the input tensor
# diagonal: the diagonal to consider
subsequent_test = subsequent_test.type(torch.uint8)
print(subsequent_test)
print(subsequent_test.shape)

tensor([[[0]]], dtype=torch.uint8)
torch.Size([1, 1, 1])


In [88]:
subsequent_test==0

tensor([[[True]]])

In [91]:
# decode传入参数:
# memory, src_mask, ys, subsequent_mask(ys_size(1))
# def decode(self, memory, src_mask, tgt, tgt_mask):
#       return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

# 就是
# self.decoder(tgt_embed(ys), memory, src_mask, subsequent_mask(ys.size(1)))
print(ys)
print(ys.shape)

tensor([[0]])
torch.Size([1, 1])


In [93]:
print(subsequent_mask(ys.size(1)).shape)

torch.Size([1, 1, 1])


In [94]:
print(memory.shape)

torch.Size([1, 10, 512])


In [95]:
print(src_mask.shape)

torch.Size([1, 1, 10])


In [98]:
# tgt_emb和src_emb过程类似
# 直接进行shape测试
# tgt_vocab=11
decoder_position = PositionalEncoding(512,0.1)
tgt_emb = nn.Sequential(Embeddings(512,11),decoder_position)

In [100]:
decoder_embedding = tgt_emb(ys)
print(decoder_embedding.shape)

torch.Size([1, 1, 512])


In [102]:
# 忽略掉SublayerConnection的norm步骤
# 直接attention
decoder_attention = MultiHeadedAttention(h=8,d_model=512,dropout=0.1)
x_decoder_attention = decoder_attention(decoder_embedding,decoder_embedding,decoder_embedding,subsequent_mask(ys.size(1)))
x_decoder_attention.shape

torch.Size([1, 1, 512])

In [104]:
# 再看encoder-decoder attention
# 这一步的详细转换再补充
# x为[1,1,512], memory [1,10,512] -> output[1,1,512]
encoder_decoder_attention = MultiHeadedAttention(h=8,d_model=512,dropout=0.1)
x_encoder_decoder_attention = encoder_decoder_attention(x_decoder_attention,memory,memory,src_mask)
x_encoder_decoder_attention.shape

torch.Size([1, 1, 512])

In [106]:
# 因为inference的ys只有一个元素
# 就不再进行SublayerConnection的dropout和残差连接
# 可以假设下列代码的输出shape为[1,1,512]
#test_model.decode(
#            memory, src_mask, ys, subsequent_mask(ys.size(1))

In [107]:
# Back to inference_test(): `out` stands in for the decoder output.
# Keep it floating point: casting it to src's integer dtype would truncate
# the activations before they reach the generator.
out = x_encoder_decoder_attention
print(out.shape)

torch.Size([1, 1, 512])


In [108]:
# 实验 prob = test_model.generator(out[:, -1])
# generator = Generator(d_model=512, tgt_vocab=11)
# standard linear + softmax generation step
class Generator(nn.Module):
    """Standard linear + log-softmax generation step.

    Args:
        d_model: Width of the input features.
        vocab: Number of output classes (vocabulary size).
    """

    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Return log-probabilities over the vocabulary along the last dim."""
        # Resolve log_softmax through ``nn.functional`` so the class does not
        # depend on a separate ``from ... import log_softmax`` having run.
        return nn.functional.log_softmax(self.proj(x), dim=-1)

In [114]:
print(out.shape)
print(out[:,-1].shape)

torch.Size([1, 1, 512])
torch.Size([1, 512])


In [117]:
proj = nn.Linear(512,11)
generator_test = proj(out[:,-1].type(torch.float32))
print(generator_test.shape)

torch.Size([1, 11])


In [122]:
from torch.nn.functional import log_softmax,pad
prob = log_softmax(generator_test,dim=-1)
print(prob.shape)

torch.Size([1, 11])


In [123]:
print(prob)

tensor([[-2.3889, -2.3669, -2.4077, -2.3771, -2.4408, -2.3702, -2.4429, -2.3906,
         -2.3710, -2.3855, -2.4396]], grad_fn=<LogSoftmaxBackward0>)


In [124]:
print(torch.max(prob,dim=1))

torch.return_types.max(
values=tensor([-2.3669], grad_fn=<MaxBackward0>),
indices=tensor([1]))


In [128]:
_, next_word = torch.max(prob,dim=1)
#print(next_word)
next_word = next_word.data[0]
print(next_word)

tensor(1)


In [129]:
print(ys)

tensor([[0]])


In [130]:
ys = torch.cat(
            [ys, torch.empty(1, 1).type_as(src.data).fill_(next_word)], dim=1
        )
print(ys) 
# ys的初始值为tensor([[0]])，再添加最新获得结果tensor(1), 类似一共循环9次
# 获得类似tensor([[0, 4, 1, 4, 1, 4, 1, 4, 1, 4]])的结果

tensor([[0, 1]])


#### 用inference_test查验结果
#### 全部的transformer过程

In [132]:
class EncoderDecoder(nn.Module):
    """Generic encoder-decoder container.

    Args:
        encoder: Called as ``encoder(src_embed(src), src_mask)``.
        decoder: Called as ``decoder(tgt_embed(tgt), memory, src_mask, tgt_mask)``.
        src_embed: Embeds the raw source input.
        tgt_embed: Embeds the raw target input.
        generator: Stored as ``self.generator``; not invoked by this class.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src``, then decode ``tgt`` against the encoder output."""
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed ``src`` and run the encoder."""
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and run the decoder against ``memory``."""
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

In [134]:
def make_model(
    src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
):
    """Construct a full encoder-decoder model from hyperparameters.

    Args:
        src_vocab: Size of the source vocabulary.
        tgt_vocab: Size of the target vocabulary.
        N: Number of encoder layers and of decoder layers.
        d_model: Model width.
        d_ff: Hidden width of the feed-forward network.
        h: Number of attention heads.
        dropout: Dropout probability for the feed-forward network, the
            positional encoding and the sub-layer connections.

    Returns:
        An ``EncoderDecoder`` with every parameter of more than one dimension
        Xavier-uniform initialised.
    """
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)

    # Every component gets its own deep copy of the shared prototypes.
    dup = copy.deepcopy
    encoder = Encoder(EncoderLayer(d_model, dup(attn), dup(ff), dropout), N)
    decoder = Decoder(
        DecoderLayer(d_model, dup(attn), dup(attn), dup(ff), dropout), N
    )
    src_embed = nn.Sequential(Embeddings(d_model, src_vocab), dup(position))
    tgt_embed = nn.Sequential(Embeddings(d_model, tgt_vocab), dup(position))
    generator = Generator(d_model, tgt_vocab)
    model = EncoderDecoder(encoder, decoder, src_embed, tgt_embed, generator)

    # Initialize parameters with Glorot / fan_avg (weight matrices only).
    weight_matrices = (p for p in model.parameters() if p.dim() > 1)
    for weight in weight_matrices:
        nn.init.xavier_uniform_(weight)
    return model

In [139]:
def inference_test():
    """Greedy-decode nine tokens from a freshly built, untrained model and print them.

    The source is the fixed sequence 1..10 and decoding starts from a single
    0 token.
    """
    test_model = make_model(11, 11, 2)
    test_model.eval()
    src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
    src_mask = torch.ones(1, 1, 10)

    memory = test_model.encode(src, src_mask)
    ys = torch.zeros(1, 1).type_as(src)

    for _ in range(9):
        tgt_mask = subsequent_mask(ys.size(1)).type_as(src.data)
        out = test_model.decode(memory, src_mask, ys, tgt_mask)
        # Only the last position is used to pick the next token.
        log_probs = test_model.generator(out[:, -1])
        next_word = torch.max(log_probs, dim=1)[1].data[0]
        appended = torch.empty(1, 1).type_as(src.data).fill_(next_word)
        ys = torch.cat([ys, appended], dim=1)

    print("Example Untrained Model Prediction:", ys)


def run_tests():
    """Run ``inference_test`` ten times in a row."""
    remaining = 10
    while remaining > 0:
        inference_test()
        remaining -= 1


show_example(run_tests)

Example Untrained Model Prediction: tensor([[ 0,  1,  8,  1,  2,  4, 10, 10, 10, 10]])
Example Untrained Model Prediction: tensor([[0, 4, 1, 4, 1, 4, 1, 4, 1, 4]])
Example Untrained Model Prediction: tensor([[0, 5, 3, 3, 3, 3, 3, 3, 3, 3]])
Example Untrained Model Prediction: tensor([[0, 6, 5, 7, 1, 5, 7, 1, 5, 7]])
Example Untrained Model Prediction: tensor([[0, 1, 3, 9, 9, 9, 9, 9, 9, 9]])
Example Untrained Model Prediction: tensor([[0, 3, 0, 3, 0, 3, 0, 3, 0, 3]])
Example Untrained Model Prediction: tensor([[0, 1, 1, 6, 4, 1, 1, 1, 1, 1]])
Example Untrained Model Prediction: tensor([[ 0,  1,  4,  8,  9, 10, 10, 10, 10, 10]])
Example Untrained Model Prediction: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Example Untrained Model Prediction: tensor([[ 0, 10, 10, 10, 10, 10, 10, 10, 10, 10]])
