# Assignment 8

### 1.复习课上内容， 阅读相应论文。

In [7]:
import torch.nn as nn
import torch
import numpy as np
from torch.autograd import Variable
import math
import torch.nn.functional as F

In [8]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: ``softmax(Q @ K^T * scale) @ V``.

    Args:
        attention_dropout: dropout probability applied to the attention
            weights after the softmax.
    """

    def __init__(self, attention_dropout=0.0):
        super(ScaledDotProductAttention, self).__init__()
        self.dropout = nn.Dropout(attention_dropout)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, scale=None, attn_mask=None):
        """Compute the attention-weighted combination of ``v``.

        Args:
            q: queries; the last two dimensions are (len_q, dim).
            k: keys; the last two dimensions are (len_k, dim).
            v: values; the last two dimensions are (len_k, dim_v).
            scale: optional factor multiplied into the raw scores.
            attn_mask: optional mask broadcastable to the score tensor;
                non-zero / True entries are excluded from the softmax.

        Returns:
            Tensor whose last two dimensions are (len_q, dim_v).
        """
        # Raw similarity scores Q @ K^T.
        attention = torch.matmul(q, k.transpose(-2, -1))
        if scale is not None:
            attention = attention * scale
        if attn_mask is not None:
            # Out-of-place fill so the score tensor is not mutated; the mask
            # is cast to bool because masked_fill requires a boolean mask.
            attention = attention.masked_fill(attn_mask.to(torch.bool), -np.inf)
        attention = self.softmax(attention)
        attention = self.dropout(attention)
        return torch.matmul(attention, v)

    # Backward-compatible alias for the original (misspelled) method name.
    forword = forward
    
    

In [39]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention followed by a residual connection and LayerNorm.

    Args:
        d_modl: model (embedding) dimension; must be divisible by
            ``num_heads``.
        num_heads: number of attention heads.
        dropout: dropout probability passed to the inner
            ``ScaledDotProductAttention``.

    Raises:
        ValueError: if ``d_modl`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_modl=512, num_heads=8, dropout=0.0):
        super(MultiHeadAttention, self).__init__()
        if d_modl % num_heads != 0:
            raise ValueError(
                "d_modl (%d) must be divisible by num_heads (%d)"
                % (d_modl, num_heads)
            )
        # Each head works on an equal slice of the model dimension.
        self.dim_per_head = d_modl // num_heads
        self.num_heads = num_heads
        self.linear_k = nn.Linear(d_modl, d_modl)
        self.linear_v = nn.Linear(d_modl, d_modl)
        self.linear_q = nn.Linear(d_modl, d_modl)

        self.dot_product_attention = ScaledDotProductAttention(dropout)

        self.linear_final = nn.Linear(d_modl, d_modl)
        self.norm = nn.LayerNorm(d_modl)

    def forward(self, keys, values, queries, attn_mask=None):
        """Attend from ``queries`` to ``keys``/``values``.

        Args:
            keys: tensor of shape (batch, len_k, d_modl).
            values: tensor of shape (batch, len_k, d_modl).
            queries: tensor of shape (batch, len_q, d_modl).
            attn_mask: optional mask of shape (batch, len_q, len_k); it is
                given a head dimension before being passed on.

        Returns:
            Tensor of shape (batch, len_q, d_modl): LayerNorm of the input
            queries plus the projected attention output.
        """
        residual = queries
        batch_size = keys.size(0)

        keys = self.linear_k(keys)
        values = self.linear_v(values)
        queries = self.linear_q(queries)

        # Split the model dimension into heads:
        # (batch, len, d_modl) -> (batch, heads, len, dim_per_head).
        keys = keys.view(
            batch_size, -1, self.num_heads, self.dim_per_head).transpose(1, 2)
        values = values.view(
            batch_size, -1, self.num_heads, self.dim_per_head).transpose(1, 2)
        queries = queries.view(
            batch_size, -1, self.num_heads, self.dim_per_head).transpose(1, 2)

        if attn_mask is not None:
            # Add a head dimension and share the same mask across all heads.
            attn_mask = attn_mask.unsqueeze(1).expand(
                -1, self.num_heads, -1, -1)

        # Scale the scores by 1 / sqrt(dim_per_head).
        scale = keys.size(-1) ** -0.5

        context = self.dot_product_attention(
            queries, keys, values, scale, attn_mask)

        # Concatenate the heads back together; transpose yields a
        # non-contiguous tensor, so contiguous() is required before view().
        context = context.transpose(1, 2).contiguous().view(
            batch_size, -1, self.num_heads * self.dim_per_head)

        # The final linear layer mixes information across the concatenated
        # heads and maps back to the model dimension.
        return self.norm(residual + self.linear_final(context))
        
        
        

In [11]:
class PositionalWiseFeedForward(nn.Module):
    """Two-layer feed-forward sublayer with residual connection and LayerNorm.

    Args:
        d_model: size of the input and output features.
        ffn_dim: size of the hidden layer between the two linear maps.
        dropout: dropout probability applied to the sublayer output before
            it is added back to the input.
    """

    def __init__(self, d_model=512, ffn_dim=2048, dropout=0.0):
        super().__init__()
        self.w1 = nn.Linear(d_model, ffn_dim)
        self.w2 = nn.Linear(ffn_dim, d_model)
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        """Return ``LayerNorm(x + dropout(w2(relu(w1(x)))))``."""
        hidden = F.relu(self.w1(x))
        projected = self.dropout(self.w2(hidden))
        # Residual connection followed by layer normalization.
        return self.norm(x + projected)

In [12]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward sublayer.

    Args:
        d_model: feature size passed to both sublayers.
        num_heads: number of attention heads.
        ffn_dim: hidden size of the feed-forward sublayer.
        dropout: dropout probability passed to both sublayers.
    """

    # NOTE(review): the ffn_dim default of 2018 differs from the 2048 used by
    # the other classes in this file and looks like a typo; it is kept
    # unchanged here because it is part of the public signature -- confirm.
    def __init__(self, d_model=512, num_heads=8, ffn_dim=2018, dropout=0.0):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = PositionalWiseFeedForward(d_model, ffn_dim, dropout)

    def forward(self, x, attn_mask=None):
        """Apply self-attention over ``x`` and then the feed-forward sublayer."""
        # Self-attention: keys, values and queries are all the same input.
        return self.feed_forward(self.attention(x, x, x, attn_mask))
    
    

In [31]:
class Encoder(nn.Module):
    """Stack of ``EncoderLayer`` modules with positional encoding and a final LayerNorm.

    Args:
        vocab_size: accepted for interface compatibility; not used by this
            class (the token embedding is supplied to ``forward``).
        max_seq_len: forwarded to ``PositionalEncoding``.
        num_layers: number of stacked encoder layers.
        d_model: feature size.
        num_heads: number of attention heads per layer.
        ffn_dim: hidden size of each layer's feed-forward sublayer.
        dropout: dropout probability passed to the layers and to
            ``PositionalEncoding``.
    """

    def __init__(self,
                 vocab_size,
                 max_seq_len,
                 num_layers=6,
                 d_model=512,
                 num_heads=8,
                 ffn_dim=2048,
                 dropout=0.0):
        super().__init__()
        layers = [
            EncoderLayer(d_model, num_heads, ffn_dim, dropout)
            for _ in range(num_layers)
        ]
        self.encoder_layers = nn.ModuleList(layers)
        # NOTE(review): PositionalEncoding is not defined in the visible part
        # of this file; it is presumably provided elsewhere -- confirm.
        self.pos_embedding = PositionalEncoding(d_model, max_seq_len, dropout)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seq_embedding):
        """Encode the token ids ``x``.

        Args:
            x: token ids; id 0 is treated as padding when building the mask.
            seq_embedding: callable mapping token ids to embeddings.

        Returns:
            The layer-normalized output of the last encoder layer.
        """
        hidden = self.pos_embedding(seq_embedding(x))
        # Self-attention mask that hides padded key positions.
        mask = padding_mask(x, x)

        for layer in self.encoder_layers:
            hidden = layer(hidden, mask)

        return self.norm(hidden)
        
        
        

In [32]:
def padding_mask(seq_k, seq_q):
    """Build a mask marking padded key positions for every query position.

    Args:
        seq_k: key token ids of shape (batch, len_k); id 0 is padding.
        seq_q: query token ids; only its length along dimension 1 is used.

    Returns:
        Boolean tensor of shape (batch, len_q, len_k) that is True wherever
        the key token id equals 0.
    """
    query_len = seq_q.size(1)
    # (batch, len_k) -> (batch, 1, len_k), then repeat along the query axis.
    is_pad = seq_k.eq(0).unsqueeze(1)
    return is_pad.expand(-1, query_len, -1)

In [40]:
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, encoder-decoder attention, feed-forward.

    Args:
        d_model: feature size passed to both sublayers.
        num_heads: number of attention heads.
        ffn_dim: hidden size of the feed-forward sublayer.
        dropout: dropout probability passed to both sublayers.
    """

    def __init__(self,
                 d_model,
                 num_heads=8,
                 ffn_dim=2048,
                 dropout=0.0):
        super(DecoderLayer, self).__init__()

        # NOTE(review): the same attention module (and therefore the same
        # weights) is used for both self-attention and encoder-decoder
        # attention in forward(); confirm whether that sharing is intended.
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = PositionalWiseFeedForward(d_model, ffn_dim, dropout)

    def forward(self, dec_inputs, enc_outputs, self_attn_mask=None, context_attn_mask=None):
        """Run one decoder layer.

        Args:
            dec_inputs: decoder-side input features.
            enc_outputs: encoder output used as keys and values for the
                encoder-decoder attention.
            self_attn_mask: optional mask for the decoder self-attention.
            context_attn_mask: optional mask for the encoder-decoder attention.

        Returns:
            The output of the feed-forward sublayer.
        """
        # Self-attention over the decoder inputs.
        dec_output = self.attention(dec_inputs, dec_inputs, dec_inputs, self_attn_mask)
        # Encoder-decoder attention: keys/values from the encoder, queries
        # from the decoder.
        dec_output = self.attention(enc_outputs, enc_outputs, dec_output, context_attn_mask)
        dec_output = self.feed_forward(dec_output)
        return dec_output
    
class Decoder(nn.Module):
    """Stack of ``DecoderLayer`` modules followed by a projection to the vocabulary.

    Args:
        vocab_size: size of the vocabulary (embedding rows / output logits).
        max_seq_len: forwarded to ``PositionalEncoding``.
        device: device the look-ahead mask is moved to in ``forward``.
        num_layers: number of stacked decoder layers.
        d_model: feature size.
        num_heads: number of attention heads per layer.
        ffn_dim: hidden size of each layer's feed-forward sublayer.
        dropout: dropout probability passed to the decoder layers.
    """

    def __init__(self,
                 vocab_size,
                 max_seq_len,
                 device,
                 num_layers=6,
                 d_model=512,
                 num_heads=8,
                 ffn_dim=2048,
                 dropout=0.0,
                 ):
        super().__init__()
        self.device = device
        self.num_layers = num_layers

        layers = [
            DecoderLayer(d_model, num_heads, ffn_dim, dropout)
            for _ in range(num_layers)
        ]
        self.decoder_layers = nn.ModuleList(layers)

        # This embedding is created here, but forward() uses the
        # ``seq_embedding`` argument rather than this attribute.
        self.seq_embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        # NOTE(review): PositionalEncoding is not defined in the visible part
        # of this file; it is presumably provided elsewhere -- confirm.
        self.pos_embedding = PositionalEncoding(d_model, max_seq_len)
        self.linear = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, inputs, enc_output, seq_embedding, context_attn_mask=None):
        """Decode ``inputs`` against the encoder output.

        Args:
            inputs: decoder token ids of shape (batch, seq_len); id 0 is
                treated as padding.
            enc_output: encoder output passed to every decoder layer.
            seq_embedding: callable mapping token ids to embeddings.
            context_attn_mask: optional mask for encoder-decoder attention.

        Returns:
            Output of the final linear projection (last dimension is
            ``vocab_size``).
        """
        embedded = seq_embedding(inputs)
        hidden = embedded + self.pos_embedding(embedded)

        # A position is masked if it is padding OR lies in the future.
        pad_mask = padding_mask(inputs, inputs)
        look_ahead_mask = sequence_mask(inputs).to(self.device)
        self_attn_mask = (pad_mask + look_ahead_mask) > 0

        for layer in self.decoder_layers:
            hidden = layer(hidden, enc_output, self_attn_mask, context_attn_mask)

        return self.linear(hidden)
    
        

In [37]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer sharing one token embedding.

    Args:
        vocab_size: size of the shared vocabulary.
        max_len: maximum sequence length, forwarded to encoder and decoder.
        device: device forwarded to the decoder.
        num_layers: number of layers in both the encoder and the decoder.
        stack_layers: accepted for interface compatibility; not used.
        d_model: feature size.
        num_heads: number of attention heads.
        ffn_dim: hidden size of the feed-forward sublayers.
        dropout: dropout probability forwarded to encoder and decoder.
    """

    def __init__(self,
                 vocab_size,
                 max_len,
                 device,
                 num_layers=6,
                 stack_layers=6,
                 d_model=512,
                 num_heads=8,
                 ffn_dim=2048,
                 dropout=0.2):
        super(Transformer, self).__init__()

        self.device = device

        self.encoder = Encoder(vocab_size, max_len, num_layers, d_model, num_heads, ffn_dim, dropout)
        self.decoder = Decoder(vocab_size, max_len, device, num_layers, d_model, num_heads, ffn_dim, dropout)
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Not used in forward(); the decoder applies its own output projection.
        self.linear = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, src_seq, dec_tgt, dec_in):
        """Run the encoder on ``src_seq`` and the decoder on ``dec_tgt``.

        Args:
            src_seq: source token ids of shape (batch, src_len); id 0 is
                treated as padding.
            dec_tgt: decoder token ids of shape (batch, tgt_len).
            dec_in: accepted for interface compatibility; not used.
                NOTE(review): the decoder is fed ``dec_tgt`` as in the
                original code -- confirm whether ``dec_in`` was intended.

        Returns:
            The decoder output (logits over the vocabulary).
        """
        # Encoder-decoder attention mask: keys come from the source sequence
        # and queries from the decoder sequence, so the mask has shape
        # (batch, tgt_len, src_len) and hides padded source positions.
        context_attn_mask_dec = padding_mask(src_seq, dec_tgt)

        en_output = self.encoder(src_seq, self.embedding)
        dec_output = self.decoder(dec_tgt, en_output, self.embedding, context_attn_mask_dec)

        return dec_output

In [18]:
# Toy batch of four token-id sequences, right-padded with 0 to length 6.
inputs = torch.tensor(
    [
        [1, 2, 3, 0, 0, 0],
        [3, 4, 0, 0, 0, 0],
        [3, 0, 0, 0, 0, 0],
        [4, 5, 6, 7, 0, 0],
    ]
)

In [19]:
# Display the padding mask for the toy batch: True marks key positions whose
# token id is 0.
padding_mask(inputs,inputs)

tensor([[[False, False, False,  True,  True,  True],
         [False, False, False,  True,  True,  True],
         [False, False, False,  True,  True,  True],
         [False, False, False,  True,  True,  True],
         [False, False, False,  True,  True,  True],
         [False, False, False,  True,  True,  True]],

        [[False, False,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True]],

        [[False,  True,  True,  True,  True,  True],
         [False,  True,  True,  True,  True,  True],
         [False,  True,  True,  True,  True,  True],
         [False,  True,  True,  True,  True,  True],
         [False,  True,  True,  True,  True,  True],
         [False,  True,  True,  True,  True,  True]],

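        [[False, False, False, False,  True,  True],
         [False, False, False, False,  True,  True],
         [False, False, False, False,  True,  True],
         [False, False, False, False,  True,  True],
         [False, False, False, False,  True,  True],
         [False, False, False, False,  True,  True]]])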

In [38]:
def sequence_mask(seq):
    """Build a look-ahead mask that marks positions after the current one.

    Args:
        seq: 2-D tensor of shape (batch, seq_len); only its size is used.

    Returns:
        uint8 tensor of shape (batch, seq_len, seq_len) whose entry
        [b, i, j] is 1 when j > i and 0 otherwise.
    """
    batch_size, seq_len = seq.size()
    # Strictly upper-triangular ones: diagonal=1 keeps the main diagonal at 0.
    upper = torch.ones((seq_len, seq_len), dtype=torch.uint8).triu(diagonal=1)
    return upper.unsqueeze(0).expand(batch_size, -1, -1)

In [21]:
# Display the look-ahead mask for the toy batch: 1 marks positions j > i.
sequence_mask(inputs)

tensor([[[0, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1],
         [0, 0, 0, 1, 1, 1],
         [0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0]],

        [[0, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1],
         [0, 0, 0, 1, 1, 1],
         [0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0]],

        [[0, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1],
         [0, 0, 0, 1, 1, 1],
         [0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0]],

        [[0, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1],
         [0, 0, 0, 1, 1, 1],
         [0, 0, 0, 0, 1, 1],
         [0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0]]], dtype=torch.uint8)

### 2. 回答以下理论题目

#### 1.  What is autoencoder?

自编码器，人工神经网络的一种，通常用于降维或者特征提取

#### 2. What are the differences between greedy search and beam search?

greedy search 在每一步只保留概率最高的一个词，计算量和内存占用都很小，但容易陷入局部最优；beam search 使用beam size参数来限制在每一步保留下来的可能性词的数量，同时保留多条候选序列，能找到相对最优解，不一定能找到全局最优解

#### 3. What is the intuition of attention mechanism?

在输入文本中寻找对整句话意思起到重要作用的词语，给每个词分配相应的权重比例

#### 4. What is the disadvantage of word embedding introduced in previous lectures ?

词向量不能解决一词多义的问题

#### 5. What is the architecture of ELMo model. (A brief description is enough)

ELMo 使用双向LSTM语言模型（前向和后向两个多层LSTM），根据已知的上下文来推算下一个词的概率分布，再把各层的隐状态加权组合，作为每个词与上下文相关的表示

#### 6. Compared to RNN,  what is the advantage of Transformer ?

Transformer 基于自注意力机制，可以对整个序列并行计算，不像RNN那样必须按时间步顺序处理；序列中任意两个位置可以直接交互，更容易捕捉长距离依赖

#### 7. Why we use layer normalization instead of batch normalization in Transformer ?

batch normalization 适合输入数据分布比较接近，Transformer中每层输入的数据是不一样的，Layer Normalization考虑了输入的所有维度

#### 8. Why we need position embedding in Transformer ?

自注意力本身不包含词序信息，对输入顺序不敏感，因此需要利用position embedding为模型注入位置信息来处理时序问题

#### 9. Briefly describe what is self-attention and what is multi-head attention?

self-attention：一句话中的每个词语对于有助于理解整句话含义的权重比例

multi-head attention:由多个attention进行计算，不同的头有不同维度的信息，多头操作融合了多种信息

#### 10. What is the basic unit of GPT model?

- Masked Multi Self Attention 
- Layer Norm  
- Feed Forward 
- Layer Norm

#### 11. Briefly describe how to use GPT in other NLP tasks?

先通过无标签的文本去训练生成语言模型，再根据具体的NLP任务（如文本蕴涵、QA、文本分类等），来通过有标签的数据对模型进行fine-tuning。

#### 12. What is masked language model in BERT ?

随机去掉句子中的部分token，然后根据上下文预测被去掉的token值

#### 13. What are the inputs of BERT ?

- Word Vector
- Positional Encoding
- Segment Embedding

#### 14. Briefly describe how to use BERT in other NLP tasks.

- 分类任务：对文本最后的向量进行二分类概率计算
- 文本标注：利用文本中每个词对应的输出向量对词进行标注

#### 15. What are the differences between these three models: GPT, BERT, GPT2.

- GPT： 使用transformer中去掉中间Encoder-Decoder Attention层的decoder，单向语言模型，pre-training的结构
- BERT：使用transformer的encoder，双向语言模型，pre-training + fine-tuning的结构，不能做生成式任务
- GPT2：和GPT结构一样，参数更多，训练数据更大