In [35]:
from email.header import decode_header
from math import sqrt
import torch
import torch.nn as nn
import numpy as np
import math

from pkg_resources import require
from torch.nn.functional import layer_norm

In [31]:

class Config:
    """Container for the hyper-parameters used throughout this notebook."""

    def __init__(self):
        # Number of attention heads.
        self.num_heads = 2
        # Width of the model.
        self.d_model = 20

        # The model width has to split evenly across the heads.
        assert self.d_model % self.num_heads == 0

        # Width of a single head (keys/queries and values use the same size).
        per_head_width = self.d_model // self.num_heads
        self.d_k = per_head_width
        self.d_v = per_head_width

        # Vocabulary size.
        self.vocab_size = 6

        # Padding size.
        self.padding_size = 30
        # Index of the unknown-word token.
        self.UNK = 5
        # Index of the padding token.
        self.PAD = 4

        # Sequence length.
        self.N = 6
        # Dropout rate.
        self.p = 0.1


# Module-level instance of the configuration.
config = Config()

In [26]:
# Multi-head attention
class Multi_Head_Attention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries are projected from ``x``; keys and values are projected from
    ``y`` (or from ``x`` when ``y`` is omitted, i.e. self-attention).

    Args:
        d_model: size of the last dimension of ``x`` and ``y``.
        d_k: total width of the query/key projections (split across heads).
        d_v: total width of the value projection (split across heads).
        num_heads: number of heads; must divide both ``d_k`` and ``d_v``.

    Raises:
        AssertionError: if ``d_k`` or ``d_v`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, d_k, d_v, num_heads):
        super(Multi_Head_Attention, self).__init__()
        self.num_heads = num_heads
        self.d_k = d_k
        self.d_v = d_v

        # Every head must receive the same number of features.
        assert d_k % num_heads == 0 and d_v % num_heads == 0

        # Linear projections for queries, keys and values.
        self.q = nn.Linear(d_model, d_k)
        self.k = nn.Linear(d_model, d_k)
        self.v = nn.Linear(d_model, d_v)
        # Output projection: maps the concatenated heads back to d_model so
        # the result can be added to the input in a residual connection.
        self.o = nn.Linear(d_v, d_model)

        # Width of a single head.
        self.head_d_k = d_k // num_heads
        self.head_d_v = d_v // num_heads
        # Scaling factor applied to the raw dot products.
        self.scale = 1 / sqrt(self.head_d_k)

    def generate_mask(self, dim):
        """Return a ``(dim, dim)`` boolean lower-triangular matrix.

        ``True`` marks the positions a query is allowed to attend to
        (its own position and everything before it).
        """
        return torch.ones(dim, dim).tril().bool()

    def forward(self, x, y=None, require_mask=False):
        """Attend from ``x`` to ``y``.

        Args:
            x: tensor of shape ``(batch, query_len, d_model)``.
            y: tensor of shape ``(batch, key_len, d_model)``; defaults to ``x``.
            require_mask: when true, a query at position ``i`` may only
                attend to keys at positions ``<= i``.

        Returns:
            Tensor of shape ``(batch, query_len, d_model)``.
        """
        if y is None:
            y = x
        batch_size, query_len, _ = x.shape
        key_len = y.shape[1]

        # (batch, heads, length, head_width)
        Q = self.q(x).view(batch_size, query_len, self.num_heads, self.head_d_k).transpose(1, 2)
        K = self.k(y).view(batch_size, key_len, self.num_heads, self.head_d_k).transpose(1, 2)
        V = self.v(y).view(batch_size, key_len, self.num_heads, self.head_d_v).transpose(1, 2)

        scores = torch.matmul(Q, K.transpose(-1, -2)) * self.scale

        if require_mask:
            # The mask has to be applied BEFORE the softmax: disallowed
            # positions get -inf so that they end up with zero probability.
            allowed = self.generate_mask(max(query_len, key_len))[:query_len, :key_len]
            scores = scores.masked_fill(~allowed.to(scores.device), float("-inf"))

        attention = torch.softmax(scores, dim=-1)

        output = torch.matmul(attention, V)
        # Merge the heads back into one feature axis.
        output = output.transpose(1, 2).contiguous().view(batch_size, query_len, self.d_v)

        return self.o(output)

In [33]:
class Embedding(nn.Module):
    """Token embedding that first pads/truncates every sequence.

    Each input sequence is cut or padded to ``config.padding_size`` tokens
    and then looked up in an ``nn.Embedding`` of width ``config.d_model``.
    Positional information is NOT added here.

    Args:
        vocab_size: number of rows in the embedding table; defaults to
            ``config.vocab_size``.
    """

    def __init__(self, vocab_size=None):
        super(Embedding, self).__init__()
        if vocab_size is None:
            vocab_size = config.vocab_size
        self.embedding = nn.Embedding(vocab_size, config.d_model, padding_idx=config.PAD)

    def forward(self, x):
        """Embed a batch of token-id sequences.

        Args:
            x: iterable of token-id sequences (e.g. a list of lists). The
                input is not modified.

        Returns:
            Tensor of shape ``(len(x), config.padding_size, config.d_model)``.
        """
        target_len = config.padding_size
        padded = []
        for seq in x:
            # Work on a copy so the caller's data is left untouched.
            tokens = list(seq)[:target_len]
            # Short sequences are filled with the padding token, which is the
            # index registered as ``padding_idx`` in the embedding table.
            tokens = tokens + [config.PAD] * (target_len - len(tokens))
            padded.append(tokens)

        ids = torch.tensor(padded, dtype=torch.long, device=self.embedding.weight.device)
        # Keeps a well-formed 2-D shape even for an empty batch.
        ids = ids.reshape(len(padded), target_len)
        return self.embedding(ids)

In [36]:
class PositionEmbedding(nn.Module):
    """Sinusoidal positional encoding.

    For position ``pos`` and column ``j`` with pair index ``i = j // 2``::

        PE[pos, 2i]   = sin(pos / 10000 ** (2i / d_model))
        PE[pos, 2i+1] = cos(pos / 10000 ** (2i / d_model))

    so the two columns of each pair share one frequency.

    Args:
        d_model: model width used as the denominator of the exponent.
    """

    def __init__(self, d_model):
        super(PositionEmbedding, self).__init__()
        self.d_model = d_model

    def forward(self, seq_len, embedding_dim):
        """Build the encoding table.

        Args:
            seq_len: number of positions (rows).
            embedding_dim: number of columns.

        Returns:
            A ``float64`` tensor of shape ``(seq_len, embedding_dim)``; cast
            it to the dtype of the tensor it is combined with if needed.
        """
        positions = torch.arange(seq_len, dtype=torch.float64).unsqueeze(1)
        columns = torch.arange(embedding_dim)
        # ``column - column % 2`` equals 2 * (column // 2): an even column and
        # the odd column that follows it get the same exponent.
        exponent = (columns - columns % 2).to(torch.float64) / self.d_model
        angles = positions / torch.pow(10000.0, exponent)
        # Even columns carry the sine, odd columns the cosine.
        return torch.where(columns % 2 == 0, torch.sin(angles), torch.cos(angles))

In [37]:
class Feed_Forward(nn.Module):
    """Two linear layers with a ReLU in between.

    Args:
        input_dim: size of the input and output features.
        hidden_dim: size of the intermediate layer.
    """

    def __init__(self, input_dim, hidden_dim=2048):
        super().__init__()
        # Expand to the hidden width, then project back to the input width.
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        """Apply ``linear2(relu(linear1(x)))``."""
        hidden = torch.relu(self.linear1(x))
        return self.linear2(hidden)

In [38]:
class Add_Norm(nn.Module):
    """Residual connection followed by layer normalization.

    Computes ``LayerNorm(x + Dropout(sublayer(x, **kwargs)))``.

    Args:
        d_model: size of the feature (last) axis that is normalized;
            defaults to ``config.d_model``.
        p: dropout probability; defaults to ``config.p``.
    """

    def __init__(self, d_model=None, p=None):
        super(Add_Norm, self).__init__()
        if d_model is None:
            d_model = config.d_model
        if p is None:
            p = config.p
        self.dropout = nn.Dropout(p)
        # Created once so its gain/bias are registered parameters; it
        # normalizes over the feature axis only, independent of sequence length.
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, x, sublayer, **kwargs):
        """Run ``sublayer`` on ``x`` and wrap it in residual + norm.

        Args:
            x: input tensor whose last axis has size ``d_model``.
            sublayer: callable invoked as ``sublayer(x, **kwargs)``; its
                result must be addable to ``x``.
            **kwargs: forwarded unchanged to ``sublayer``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        sub_output = sublayer(x, **kwargs)
        # Dropout acts on the sub-layer output before the residual add.
        return self.layer_norm(x + self.dropout(sub_output))

In [39]:
class Encoder(nn.Module):
    """Encoder block: self-attention and feed-forward, each wrapped in Add_Norm.

    All sizes are read from the module-level ``config``.
    """

    def __init__(self):
        super(Encoder, self).__init__()
        self.positional_encoding = PositionEmbedding(config.d_model)
        # Multi_Head_Attention divides the widths it receives by num_heads,
        # while config.d_k / config.d_v are per-head sizes, so they are
        # multiplied back to the full width here.
        self.multi_head_attention = Multi_Head_Attention(
            config.d_model,
            config.d_k * config.num_heads,
            config.d_v * config.num_heads,
            config.num_heads,
        )
        self.feed_forward = Feed_Forward(config.d_model)
        # One Add_Norm per sub-layer.
        self.add_norm = Add_Norm()
        self.add_norm_ff = Add_Norm()

    def forward(self, x):
        """Encode ``x``.

        Args:
            x: tensor whose second axis is the sequence and whose last axis
                has size ``config.d_model``. The input is not modified.

        Returns:
            The output of the feed-forward Add_Norm stage.
        """
        position = self.positional_encoding(x.size()[1], config.d_model)
        # Out-of-place add, with the table matched to x's dtype and device.
        x = x + position.to(dtype=x.dtype, device=x.device)
        output = self.add_norm(x, self.multi_head_attention, y=x)
        output = self.add_norm_ff(output, self.feed_forward)
        return output

In [None]:
class Decode(nn.Module):
    """Decoder block: masked self-attention, cross-attention, feed-forward.

    Each sub-layer is wrapped in its own Add_Norm. All sizes are read from
    the module-level ``config``.
    """

    def __init__(self):
        super(Decode, self).__init__()
        self.positional_encoding = PositionEmbedding(config.d_model)
        # Multi_Head_Attention divides the widths it receives by num_heads,
        # while config.d_k / config.d_v are per-head sizes, so they are
        # multiplied back to the full width here.
        key_width = config.d_k * config.num_heads
        value_width = config.d_v * config.num_heads
        # Masked self-attention over the decoder input.
        self.multi_head_attention = Multi_Head_Attention(
            config.d_model, key_width, value_width, config.num_heads
        )
        # Separate weights for attention over the encoder output.
        self.cross_attention = Multi_Head_Attention(
            config.d_model, key_width, value_width, config.num_heads
        )
        self.feed_forward = Feed_Forward(config.d_model)
        # One Add_Norm per sub-layer.
        self.add_norm = Add_Norm()
        self.add_norm_cross = Add_Norm()
        self.add_norm_ff = Add_Norm()

    def forward(self, x, encoder_output):
        """Decode ``x`` while attending to ``encoder_output``.

        Args:
            x: decoder input; second axis is the sequence, last axis has
                size ``config.d_model``. The input is not modified.
            encoder_output: tensor passed as ``y`` to the cross-attention.

        Returns:
            The output of the feed-forward Add_Norm stage.
        """
        position = self.positional_encoding(x.size()[1], config.d_model)
        # Out-of-place add, with the table matched to x's dtype and device.
        x = x + position.to(dtype=x.dtype, device=x.device)
        output = self.add_norm(x, self.multi_head_attention, y=x, require_mask=True)
        # Feed the self-attention result (not the raw input) into the
        # cross-attention stage so the first sub-layer is not discarded.
        output = self.add_norm_cross(output, self.cross_attention, y=encoder_output, require_mask=False)
        output = self.add_norm_ff(output, self.feed_forward)
        return output

In [45]:
class Transformer_layers(nn.Module):
    """One Encoder paired with one Decode block.

    The input and the output are both 2-tuples, which lets several instances
    be chained inside ``nn.Sequential``.
    """

    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decode()

    def forward(self, x):
        """Run the encoder, then the decoder on top of the encoder's result.

        Args:
            x: a pair ``(encoder_input, decoder_input)``.

        Returns:
            A pair ``(encoder_output, decoder_output)``.
        """
        source, target = x
        memory = self.encoder(source)
        decoded = self.decoder(target, memory)
        return (memory, decoded)

class Transformer(nn.Module):
    """Embeds both inputs, runs ``N`` stacked layers and projects the result.

    Args:
        N: number of ``Transformer_layers`` stacked in sequence.
        vocab_size: vocabulary size; must equal ``config.vocab_size``.
        output_dim: size of the final linear projection.

    Raises:
        ValueError: if ``vocab_size`` differs from ``config.vocab_size``.
    """

    def __init__(self, N, vocab_size, output_dim):
        super(Transformer, self).__init__()
        # Embedding is built with its defaults, which read the vocabulary
        # size from the module-level ``config``; a different value here would
        # otherwise be silently ignored, so reject it explicitly.
        if vocab_size != config.vocab_size:
            raise ValueError(
                "vocab_size=%r does not match config.vocab_size=%r"
                % (vocab_size, config.vocab_size)
            )
        self.embedding_input = Embedding()
        self.embedding_output = Embedding()
        self.output_dim = output_dim
        self.linear = nn.Linear(config.d_model, output_dim)
        self.softmax = nn.Softmax(dim=-1)
        self.model = nn.Sequential(*[Transformer_layers() for _ in range(N)])

    def forward(self, x):
        """Run the full model.

        Args:
            x: a pair ``(encoder_tokens, decoder_tokens)``; each element is
                passed to an ``Embedding`` instance.

        Returns:
            The softmax (over the last axis) of the linear projection of the
            final decoder output.
        """
        x_input, x_output = x
        x_input = self.embedding_input(x_input)
        x_output = self.embedding_output(x_output)
        # Only the decoder half of the last layer's output is used.
        _, output = self.model((x_input, x_output))
        output = self.linear(output)
        output = self.softmax(output)
        return output