# 本文我们将采用带有注意力机制的seq2seq，用于金融数据的预测

这里是模型图

# 导入包

In [1]:
from __future__ import unicode_literals, print_function, division

from io import open
import unicodedata
import string
import re
import datetime
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.nn import Parameter

# Transformer

![avatar](2.png)

## Scaled dot-product attention
对于查询向量Q，键K，值V，我们可以得到Scaled dot-product attention:
$$Attention(Q,K,V)=softmax(\frac{QK^T}{\sqrt{d_k}})V$$

In [None]:
class Scaled_Dot_Product_Attention(nn.Module):
    """Scaled dot-product attention: softmax(Q K^T * scale) V."""

    def __init__(self, attention_dropout=0.0):
        super(Scaled_Dot_Product_Attention, self).__init__()
        self.dropout = nn.Dropout(attention_dropout)
        # Normalise over the key axis so every query's weights sum to 1.
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, scale=None, attn_mask=None):
        """Compute attention weights and the attended context.

        Args:
            q: queries, shape [B, L_q, D].
            k: keys, shape [B, L_k, D].
            v: values, shape [B, L_k, D_v].
            scale: optional float multiplier applied to the raw scores
                (typically 1 / sqrt(D)); skipped when None or 0.
            attn_mask: optional tensor of shape [B, L_q, L_k]; non-zero /
                True entries mark positions that must receive no attention.

        Returns:
            (context, attention): context has shape [B, L_q, D_v] and
            attention has shape [B, L_q, L_k].

        Note:
            A query row whose keys are all masked yields NaN weights,
            because softmax over a row of -inf is undefined.
        """
        # Batched Q @ K^T gives one score per (query, key) pair.
        attention = torch.bmm(q, k.transpose(1, 2))
        if scale:
            attention = attention * scale
        # `if attn_mask:` is ambiguous for multi-element tensors, so test
        # against None explicitly.
        if attn_mask is not None:
            mask = attn_mask.to(device=attention.device, dtype=torch.bool)
            # -inf scores become exactly 0 after the softmax.
            attention = attention.masked_fill(mask, float("-inf"))
        attention = self.softmax(attention)
        attention = self.dropout(attention)
        # Weighted sum of the values.
        context = torch.bmm(attention, v)
        return context, attention

In [4]:
# Demo cell: raw attention scores for random q/k/v tensors of shape (5, 3, 10).
from torch.autograd import Variable
q = Variable(torch.randn(5, 3, 10))
k = Variable(torch.randn(5, 3, 10))
v = Variable(torch.randn(5, 3, 10))
# Batched q @ k^T; the cell output below shows the resulting size.
attention = torch.bmm(q, k.transpose(1, 2))
attention.size()

torch.Size([5, 3, 3])

In [5]:
# Demo cell: normalise the scores from the previous cell over the last axis.
softmax = nn.Softmax(dim=2)
dropout = nn.Dropout(0.2)  # created here but not applied in this cell
attention1 = softmax(attention)  # (5, 3, 3) tensor; entries along the last axis sum to 1
attention1.size()

torch.Size([5, 3, 3])

In [6]:
# Demo cell: weight the values by the attention weights to obtain the context.
context = torch.bmm(attention1, v)
context.size()

torch.Size([5, 3, 10])

## Multi-head attention
所谓多头注意力，就是把Q、K、V分别线性投影到h个子空间，在每个子空间上并行地做一次上面的点积注意力，再把h个结果拼接起来。两者的关系可以表示为：
![avatar](4.png)

In [None]:
class Multi_Head_Attention(nn.Module):
    """Multi-head attention followed by dropout, a residual add and LayerNorm."""

    def __init__(self, model_dim=512, num_heads=8, dropout=0.0):
        """Build the projections.

        Args:
            model_dim: feature size of the inputs and of the output.
            num_heads: number of attention heads.
            dropout: dropout rate used on the attention weights and on the
                projected output.
        """
        super(Multi_Head_Attention, self).__init__()

        self.dim_per_head = model_dim // num_heads
        self.num_heads = num_heads
        self.linear_k = nn.Linear(model_dim, self.dim_per_head * num_heads)
        self.linear_v = nn.Linear(model_dim, self.dim_per_head * num_heads)
        self.linear_q = nn.Linear(model_dim, self.dim_per_head * num_heads)

        self.dot_product_attention = Scaled_Dot_Product_Attention(dropout)
        # Input width is the concatenated head width; it equals model_dim
        # whenever model_dim is divisible by num_heads.
        self.linear_final = nn.Linear(self.dim_per_head * num_heads, model_dim)
        self.dropout = nn.Dropout(dropout)
        # Layer norm is applied after the residual connection.
        self.layer_norm = nn.LayerNorm(model_dim)

    def _split_heads(self, x, batch_size):
        """Reshape [B, L, H*D] to [B*H, L, D], keeping each head's features together."""
        x = x.view(batch_size, -1, self.num_heads, self.dim_per_head)
        # Move the head axis in front of the sequence axis before flattening;
        # a plain view() would interleave sequence positions with heads.
        return x.transpose(1, 2).reshape(
            batch_size * self.num_heads, -1, self.dim_per_head)

    def _merge_heads(self, x, batch_size):
        """Inverse of _split_heads: [B*H, L, D] back to [B, L, H*D]."""
        x = x.view(batch_size, self.num_heads, -1, self.dim_per_head)
        return x.transpose(1, 2).reshape(
            batch_size, -1, self.num_heads * self.dim_per_head)

    def forward(self, key, value, query, attn_mask=None):
        """Apply multi-head attention.

        Args:
            key: tensor of shape [B, L_k, model_dim].
            value: tensor of shape [B, L_k, model_dim].
            query: tensor of shape [B, L_q, model_dim].
            attn_mask: optional mask of shape [B, L_q, L_k]; non-zero / True
                entries are excluded from attention.

        Returns:
            (output, attention): output has shape [B, L_q, model_dim];
            attention has shape [B * num_heads, L_q, L_k], with the heads of
            one sample stored contiguously along the first axis.
        """
        # Kept for the residual connection.
        residual = query

        dim_per_head = self.dim_per_head
        num_heads = self.num_heads
        batch_size = key.size(0)

        # Linear projections, then split into heads.
        key = self._split_heads(self.linear_k(key), batch_size)
        value = self._split_heads(self.linear_v(value), batch_size)
        query = self._split_heads(self.linear_q(query), batch_size)

        # Explicit None test: truthiness of a multi-element tensor raises.
        if attn_mask is not None:
            # Each sample's mask is repeated for its num_heads consecutive rows.
            attn_mask = attn_mask.repeat_interleave(num_heads, dim=0)

        # Scale by 1 / sqrt(d_k), where d_k is the per-head feature size.
        scale = dim_per_head ** -0.5
        context, attention = self.dot_product_attention(
            query, key, value, scale, attn_mask)

        # Concatenate the heads again.
        context = self._merge_heads(context, batch_size)

        # Final linear projection and dropout.
        output = self.linear_final(context)
        output = self.dropout(output)

        # Residual connection followed by layer normalisation.
        output = self.layer_norm(residual + output)

        return output, attention

In [None]:
# Demo cell: run multi-head attention on random (5, 3, 512) inputs.
mha = Multi_Head_Attention(dropout=0.1)
query = Variable(torch.randn(5, 3, 512))
key = Variable(torch.randn(5, 3, 512))
value = Variable(torch.randn(5, 3, 512))
# Argument order of forward() is (key, value, query).
output, attention = mha(key,value,query)

In [29]:
# Size of the attended output from the previous cell.
output.size()

torch.Size([5, 3, 512])

In [30]:
# Size of the attention weights; the first axis is batch * num_heads.
attention.size()

torch.Size([40, 3, 3])

## Layer normalization
Layer normalization 是在每一个样本自身的特征维度上计算均值和方差（而不是像Batch normalization那样在一个批次上计算）：
$$LN(x_i)=\alpha\times\frac{x_i-\mu_L}{\sqrt{\sigma^2_L+\epsilon}}+\beta$$

In [None]:
class LayerNorm(nn.Module):
    """Hand-written layer normalisation (PyTorch also ships nn.LayerNorm)."""

    def __init__(self, features, epsilon=1e-6):
        """Create the learnable gain and bias.

        Args:
            features: size of the last (model) dimension.
            epsilon: small constant added to the standard deviation so the
                division never hits zero.
        """
        super(LayerNorm, self).__init__()
        # Gain, initialised to one.
        self.gamma = nn.Parameter(torch.ones(features))
        # Bias, initialised to zero.
        self.beta = nn.Parameter(torch.zeros(features))
        self.epsilon = epsilon

    def forward(self, x):
        """Normalise `x` over its last dimension.

        Args:
            x: input tensor whose last axis has `features` entries.

        Returns:
            Tensor of the same shape as `x`.
        """
        # Statistics are taken per position over the feature axis; note that
        # epsilon is added to the standard deviation, not to the variance.
        centred = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.epsilon
        return self.gamma * centred / spread + self.beta

## mask 
顾名思义就是掩码，在我们这里的意思大概就是对某些值进行掩盖，使其不产生效果。

Transformer模型里面涉及两种mask。分别是padding mask和sequence mask。其中，padding mask在所有的scaled dot-product attention里面都需要用到，而sequence mask只有在decoder的self-attention里面用到。
### Padding mask
同一个批次中各个输入序列的长度往往是不一样的，因此我们要对输入序列进行对齐：具体来说，就是在较短的序列后面填充0。这些填充位置本身没有任何意义，attention机制不应该把注意力放在这些位置上，所以需要对它们做一些处理。

具体的做法是，把这些位置的值加上一个非常大的负数(可以是负无穷)，这样的话，经过softmax，这些位置的概率就会接近0！
### sequence mask
sequence mask是为了使得decoder不能看见未来的信息。也就是对于一个序列，在time_step为t的时刻，我们的解码输出应该只能依赖于t时刻之前的输出，而不能依赖t之后的输出。因此我们需要想一个办法，把t之后的信息给隐藏起来。

那么具体怎么做呢？也很简单：产生一个上三角矩阵，上三角的值全为1，下三角的值全为0，对角线也是0。把这个矩阵作用在每一个序列上，就可以达到我们的目的啦。

In [None]:
def padding_mask(seq_k, seq_q):
    """Build a mask marking the padded (id 0) key positions.

    Args:
        seq_k: key token ids, shape [B, L_k].
        seq_q: query token ids, shape [B, L_q]; only its length is used.

    Returns:
        Tensor of shape [B, L_q, L_k] that is set wherever the key token
        equals the `PAD` id 0.
    """
    query_len = seq_q.size(1)
    is_pad = seq_k.eq(0)  # [B, L_k]
    # Broadcast the per-key flags across every query position.
    return is_pad.unsqueeze(1).expand(-1, query_len, -1)

def sequence_mask(seq):
    """Build the look-ahead mask that hides future positions from a decoder.

    Args:
        seq: tensor of shape [B, L]; only its shape and device are used.

    Returns:
        uint8 tensor of shape [B, L, L] on the same device as `seq`, with 1
        strictly above the diagonal (future positions) and 0 elsewhere.
    """
    batch_size, seq_len = seq.size()
    # Create the mask on the input's device so it can be combined with other
    # masks derived from `seq` without a device mismatch.
    mask = torch.triu(
        torch.ones((seq_len, seq_len), dtype=torch.uint8, device=seq.device),
        diagonal=1)
    mask = mask.unsqueeze(0).expand(batch_size, -1, -1)  # [B, L, L]
    return mask

# Positional encoding


In [1]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding with an all-zero row for `PAD`."""

    def __init__(self, d_model, max_seq_len):
        """Pre-compute the encoding table.

        Args:
            d_model: model (feature) dimension.
            max_seq_len: longest sequence length that must be supported.
        """
        super(PositionalEncoding, self).__init__()

        # angle[pos, j] = pos / 10000 ** (2 * (j // 2) / d_model).
        # Built with torch in float64 so no numpy import is required.
        positions = torch.arange(max_seq_len, dtype=torch.float64).unsqueeze(1)
        dims = torch.arange(d_model, dtype=torch.float64)
        paired_dims = dims - dims % 2  # equals 2 * (j // 2)
        table = positions / torch.pow(
            torch.tensor(10000.0, dtype=torch.float64), paired_dims / d_model)
        # Even columns use sin, odd columns use cos.
        table[:, 0::2] = torch.sin(table[:, 0::2])
        table[:, 1::2] = torch.cos(table[:, 1::2])

        # Row 0 is an all-zero vector reserved for padded positions, so real
        # positions are looked up with indices starting at 1.
        pad_row = torch.zeros(1, d_model, dtype=torch.float64)
        table = torch.cat((pad_row, table)).float()

        # Frozen embedding with max_seq_len + 1 rows (the +1 is the PAD row).
        self.position_encoding = nn.Embedding.from_pretrained(table, freeze=True)

    def forward(self, input_len):
        """Return the positional encodings for a batch of sequences.

        Args:
            input_len: integer tensor of shape [B, 1] (or [B]) holding the
                true length of every sequence in the batch.

        Returns:
            Tensor of shape [B, max(input_len), d_model]; positions past a
            sequence's own length receive the zero `PAD` encoding.
        """
        lengths = input_len.reshape(-1).long()
        max_len = int(lengths.max())
        # Position ids 1..max_len, shared by every row of the batch.
        steps = torch.arange(1, max_len + 1, device=lengths.device).unsqueeze(0)
        steps = steps.expand(lengths.size(0), -1)
        # Positions beyond a sequence's length map to index 0 (PAD).
        input_pos = steps.masked_fill(steps > lengths.unsqueeze(1), 0)
        return self.position_encoding(input_pos)

NameError: name 'nn' is not defined

# Position-wise Feed-Forward network

In [None]:
class PositionalWiseFeedForward(nn.Module):
    """Position-wise feed-forward sub-layer with residual add and LayerNorm."""

    def __init__(self, model_dim=512, ffn_dim=2048, dropout=0.0):
        """Build the two point-wise projections.

        Args:
            model_dim: input and output feature size.
            ffn_dim: width of the hidden projection.
            dropout: dropout rate applied to the projected output.
        """
        super(PositionalWiseFeedForward, self).__init__()
        # Kernel-size-1 convolutions act as per-position linear layers.
        self.w1 = nn.Conv1d(model_dim, ffn_dim, 1)
        # The second projection must map ffn_dim back to model_dim so that
        # the residual add in forward() has matching shapes.
        self.w2 = nn.Conv1d(ffn_dim, model_dim, 1)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(model_dim)

    def forward(self, x):
        """Apply the feed-forward network.

        Args:
            x: tensor of shape [B, L, model_dim].

        Returns:
            Tensor of shape [B, L, model_dim].
        """
        # Conv1d expects channels first: [B, model_dim, L].
        output = x.transpose(1, 2)
        output = self.w2(F.relu(self.w1(output)))
        output = self.dropout(output.transpose(1, 2))

        # Residual connection followed by layer normalisation.
        output = self.layer_norm(x + output)
        return output

# Encoder

In [None]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward network."""

    # NOTE(review): the ffn_dim default of 2018 looks like a typo for 2048
    # (the value used by the other classes); it is left unchanged here so
    # callers relying on the default keep the same model size.
    def __init__(self, model_dim=512, num_heads=8, ffn_dim=2018, dropout=0.0):
        super(EncoderLayer, self).__init__()

        # The attention class defined in this file is Multi_Head_Attention.
        self.attention = Multi_Head_Attention(model_dim, num_heads, dropout)
        self.feed_forward = PositionalWiseFeedForward(model_dim, ffn_dim, dropout)

    def forward(self, inputs, attn_mask=None):
        """Run the layer.

        Args:
            inputs: tensor passed as key, value and query of the attention.
            attn_mask: optional attention mask forwarded to the attention
                module.

        Returns:
            (output, attention): the layer output and the attention weights.
        """
        # Self-attention: key, value and query are all the layer input. The
        # caller's mask is forwarded (not the module-level padding_mask
        # function).
        context, attention = self.attention(inputs, inputs, inputs, attn_mask)

        # Feed-forward network.
        output = self.feed_forward(context)

        return output, attention


class Encoder(nn.Module):
    """Stack of EncoderLayer modules on top of token and position embeddings."""

    def __init__(self,
               vocab_size,
               max_seq_len,
               num_layers=6,
               model_dim=512,
               num_heads=8,
               ffn_dim=2048,
               dropout=0.0):
        super(Encoder, self).__init__()

        layers = []
        for _ in range(num_layers):
            layers.append(EncoderLayer(model_dim, num_heads, ffn_dim, dropout))
        self.encoder_layers = nn.ModuleList(layers)

        # Index 0 is reserved for padding, hence vocab_size + 1 rows.
        self.seq_embedding = nn.Embedding(vocab_size + 1, model_dim, padding_idx=0)
        self.pos_embedding = PositionalEncoding(model_dim, max_seq_len)

    def forward(self, inputs, inputs_len):
        """Encode a batch of token id sequences.

        Args:
            inputs: token ids, passed to the embedding and to padding_mask.
            inputs_len: sequence lengths, passed to the positional encoding.

        Returns:
            (output, attentions): the final layer output and a list with the
            attention weights of every layer, in layer order.
        """
        output = self.seq_embedding(inputs)
        output += self.pos_embedding(inputs_len)

        # The same padding mask is shared by all layers.
        pad_mask = padding_mask(inputs, inputs)

        attentions = []
        for layer in self.encoder_layers:
            output, layer_attention = layer(output, pad_mask)
            attentions.append(layer_attention)

        return output, attentions


# Decoder

In [None]:
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, encoder-decoder attention, feed-forward."""

    def __init__(self, model_dim, num_heads=8, ffn_dim=2048, dropout=0.0):
        super(DecoderLayer, self).__init__()

        # The attention class defined in this file is Multi_Head_Attention.
        # Note that this single module is used for both attention steps in
        # forward(), so they share weights.
        self.attention = Multi_Head_Attention(model_dim, num_heads, dropout)
        self.feed_forward = PositionalWiseFeedForward(model_dim, ffn_dim, dropout)

    def forward(self,
              dec_inputs,
              enc_outputs,
              self_attn_mask=None,
              context_attn_mask=None):
        """Run the layer.

        Args:
            dec_inputs: decoder-side input tensor.
            enc_outputs: encoder output used as key and value of the
                context attention.
            self_attn_mask: optional mask for the self-attention step.
            context_attn_mask: optional mask for the context attention step.

        Returns:
            (dec_output, self_attention, context_attention).
        """
        # Self-attention: key, value and query are all the decoder input.
        dec_output, self_attention = self.attention(
          dec_inputs, dec_inputs, dec_inputs, self_attn_mask)

        # Context attention: the query is the self-attention output, key and
        # value are the encoder outputs.
        dec_output, context_attention = self.attention(
          enc_outputs, enc_outputs, dec_output, context_attn_mask)

        # Feed-forward network.
        dec_output = self.feed_forward(dec_output)

        return dec_output, self_attention, context_attention


class Decoder(nn.Module):
    """Stack of DecoderLayer modules on top of token and position embeddings."""

    def __init__(self,
               vocab_size,
               max_seq_len,
               num_layers=6,
               model_dim=512,
               num_heads=8,
               ffn_dim=2048,
               dropout=0.0):
        super(Decoder, self).__init__()

        self.num_layers = num_layers

        layers = []
        for _ in range(num_layers):
            layers.append(DecoderLayer(model_dim, num_heads, ffn_dim, dropout))
        self.decoder_layers = nn.ModuleList(layers)

        # Index 0 is reserved for padding, hence vocab_size + 1 rows.
        self.seq_embedding = nn.Embedding(vocab_size + 1, model_dim, padding_idx=0)
        self.pos_embedding = PositionalEncoding(model_dim, max_seq_len)

    def forward(self, inputs, inputs_len, enc_output, context_attn_mask=None):
        """Decode a batch of target sequences against the encoder output.

        Args:
            inputs: target token ids.
            inputs_len: target sequence lengths for the positional encoding.
            enc_output: encoder output handed to every layer.
            context_attn_mask: optional mask for the context attention.

        Returns:
            (output, self_attentions, context_attentions), where the two
            lists hold one entry per layer, in layer order.
        """
        output = self.seq_embedding(inputs)
        output += self.pos_embedding(inputs_len)

        # A position is masked when it is padding or lies in the future.
        pad_part = padding_mask(inputs, inputs)
        future_part = sequence_mask(inputs)
        self_attn_mask = torch.gt((pad_part + future_part), 0)

        self_attentions = []
        context_attentions = []
        for layer in self.decoder_layers:
            output, layer_self_attn, layer_ctx_attn = layer(
                output, enc_output, self_attn_mask, context_attn_mask)
            self_attentions.append(layer_self_attn)
            context_attentions.append(layer_ctx_attn)

        return output, self_attentions, context_attentions

# Transformer

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder Transformer with a softmax output over the target vocabulary."""

    def __init__(self,
               src_vocab_size,
               src_max_len,
               tgt_vocab_size,
               tgt_max_len,
               num_layers=6,
               model_dim=512,
               num_heads=8,
               ffn_dim=2048,
               dropout=0.2):
        super(Transformer, self).__init__()

        self.encoder = Encoder(src_vocab_size, src_max_len, num_layers, model_dim,
                               num_heads, ffn_dim, dropout)
        self.decoder = Decoder(tgt_vocab_size, tgt_max_len, num_layers, model_dim,
                               num_heads, ffn_dim, dropout)

        self.linear = nn.Linear(model_dim, tgt_vocab_size, bias=False)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, src_seq, src_len, tgt_seq, tgt_len):
        """Run the full model.

        Args:
            src_seq: source token ids, shape [B, L_src].
            src_len: source sequence lengths.
            tgt_seq: target token ids, shape [B, L_tgt].
            tgt_len: target sequence lengths.

        Returns:
            (output, enc_self_attn, dec_self_attn, ctx_attn): output holds
            the softmax-normalised scores over the target vocabulary; the
            other three are per-layer lists of attention weights.
        """
        # In the context attention the queries come from the target and the
        # keys from the source, so the mask must be [B, L_tgt, L_src] and
        # flag the padded *source* positions: padding_mask(seq_k, seq_q).
        context_attn_mask = padding_mask(src_seq, tgt_seq)

        output, enc_self_attn = self.encoder(src_seq, src_len)

        output, dec_self_attn, ctx_attn = self.decoder(
          tgt_seq, tgt_len, output, context_attn_mask)

        output = self.linear(output)
        output = self.softmax(output)

        return output, enc_self_attn, dec_self_attn, ctx_attn

# 构建基于GRU的编码器
 编码器结构图:

In [None]:
'''
在金融数据中，词嵌入层是否需要，Embedding的含义我需要搞清楚
'''
# Encoder 部分


class EncoderRNN(nn.Module):
    """Multi-layer GRU encoder that consumes raw feature sequences (no embedding)."""

    def __init__(self, input_size, hidden_size,num_layers,seq_len,batch_num):
        """Store the configuration and build the GRU.

        Args:
            input_size: number of input features per time step.
            hidden_size: number of GRU hidden units.
            num_layers: number of stacked GRU layers.
            seq_len: sequence length (stored only).
            batch_num: batch size, used to size the initial hidden state.
        """
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.batch_num = batch_num
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers)

    def forward(self, Input, hidden):
        """Run the GRU over a whole sequence.

        Args:
            Input: tensor of shape (seq_len, batch_num, input_size).
            hidden: initial hidden state of shape
                (num_layers, batch_num, hidden_size).

        Returns:
            (output, hidden): output has shape
            (seq_len, batch_num, hidden_size); hidden has shape
            (num_layers, batch_num, hidden_size).
        """
        return self.gru(Input, hidden)

    def initHidden(self):
        """Return an all-zero initial hidden state on the module-level `device`."""
        shape = (self.num_layers, self.batch_num, self.hidden_size)
        return torch.zeros(*shape, device=device)


# Decoder部分


class DecoderRNN(nn.Module):
    """Single-layer GRU decoder followed by a linear output projection."""

    def __init__(self, hidden_size, output_size):
        """Build the GRU and the output layer.

        Args:
            hidden_size: GRU input size and number of hidden units.
            output_size: feature size produced by the decoder.
        """
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        # Input size and hidden size of the GRU are both hidden_size.
        self.gru = nn.GRU(hidden_size, hidden_size)
        # Maps the GRU output to the requested output size.
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, Input, hidden):
        """Run the GRU and project its first time step.

        Args:
            Input: tensor of shape (seq_len, batch, hidden_size).
            hidden: hidden state of shape (1, batch, hidden_size).

        Returns:
            (output, hidden): output is the projection of the first time
            step, shape (batch, output_size); hidden is the new GRU state.
        """
        output, hidden = self.gru(Input, hidden)
        # Only the first time step is projected.
        output = self.out(output[0])
        return output, hidden

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) state on the module-level `device`."""
        return torch.zeros(1,1,self.hidden_size, device=device)

# Attention 部分
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over a fixed-size buffer of encoder outputs."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        """Build the layers.

        Args:
            hidden_size: GRU input size and number of hidden units.
            output_size: size of the decoder output (and of the embedding
                vocabulary).
            dropout_p: dropout rate applied to the embedded input.
            max_length: number of encoder output slots attended over.
        """
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        width = self.hidden_size
        self.embedding = nn.Embedding(self.output_size, width)
        # Scores one weight per encoder slot from [embedded ; hidden].
        self.attn = nn.Linear(width * 2, self.max_length)
        # Merges [embedded ; attended context] back to hidden_size.
        self.attn_combine = nn.Linear(width * 2, width)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(width, width)
        self.out = nn.Linear(width, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Args:
            input: index tensor for the current decoder input token.
            hidden: GRU hidden state; hidden[0] is concatenated with the
                embedded input.
            encoder_outputs: 2-D tensor of encoder outputs with one row per
                attention slot.

        Returns:
            (output, hidden, attn_weights): log-softmax scores, the new
            hidden state and the attention weights.
        """
        token_vec = self.dropout(self.embedding(input).view(1, 1, -1))

        # Attention weights over the encoder slots.
        scores = self.attn(torch.cat((token_vec[0], hidden[0]), 1))
        attn_weights = F.softmax(scores, dim=1)
        # Batched matrix product yields the weighted sum of encoder outputs.
        context = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        merged = torch.cat((token_vec[0], context[0]), 1)
        gru_input = F.relu(self.attn_combine(merged).unsqueeze(0))
        output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(output[0]), dim=1)
        return log_probs, hidden, attn_weights

# 构建基于GRU的解码器
解码器结构图:

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Task: translate French into English.


SOS_token = 0  # index of the start-of-sentence marker
EOS_token = 1  # index of the end-of-sentence marker

# 辅助类


class Lang:
    """Vocabulary of one language: word/index mappings plus word counts."""

    def __init__(self, name):
        self.name = name
        self.word2index = {}  # word -> index
        self.index2word = {0: "SOS", 1: "EOS"}  # index -> word
        self.word2count = {}  # word -> number of occurrences seen
        self.n_words = 2  # vocabulary size, counting SOS and EOS

    def addSentence(self, sentence):
        """Register every space-separated token of `sentence`."""
        for token in sentence.split(" "):
            self.addWord(token)

    def addWord(self, word):
        """Add `word` to the vocabulary, or bump its count if already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        # New word: it takes the next free index.
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1
 # Turn a Unicode string to plain ASCII
def unicodeToAscii(s):
    """Strip combining marks (accents) from `s` after NFD decomposition."""
    decomposed = unicodedata.normalize('NFD', s)
    # Category 'Mn' (non-spacing mark) covers the combining accents.
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# 小写，修剪和删除非字母字符
def normalizeString(s):
    """Lower-case, trim and de-accent `s`, keeping only letters and `.!?`."""
    cleaned = unicodeToAscii(s.lower().strip())
    # Put a space in front of sentence punctuation so it becomes its own token.
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    # Collapse every run of other characters into a single space.
    return re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)
# 加载文件
def readLangs(lang1, lang2, reverse=False):
    """Read the tab-separated `<lang1>-<lang2>.txt` corpus into sentence pairs.

    Args:
        lang1: name of the language in the first column of the file.
        lang2: name of the language in the second column of the file.
        reverse: when True, swap each pair so lang2 becomes the input
            language and lang1 the output language.

    Returns:
        (input_lang, output_lang, pairs): two empty Lang vocabularies and the
        list of normalised sentence pairs.
    """
    print("Reading lines.......")
    # Read the whole file and split it into lines; the context manager makes
    # sure the file handle is closed again.
    with open(r"E://DeepLearning//jupyter_code//dataset//corpus//translation_data//%s-%s.txt" % (lang1, lang2), encoding='utf-8') as corpus:
        lines = corpus.read().strip().split("\n")

    # Split every line into its tab-separated sentences and normalise them.
    pairs = [[normalizeString(s) for s in l.split("\t")] for l in lines]
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

# There are many example sentences, so to train quickly the data set is
# trimmed to relatively short sentences: at most 10 words, counting the
# closing punctuation mark.

MAX_LENGTH = 10

# English sentence prefixes a pair must start with to be kept.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


def filterPair(p):
    """Return True when both sentences are short and p[1] starts with a known prefix."""
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    """Keep only the pairs accepted by filterPair, preserving their order."""
    return list(filter(filterPair, pairs))
def prepareData(lang1, lang2, reverse=False):
    """Load the corpus, filter it and build both vocabularies.

    Returns:
        (input_lang, output_lang, pairs) with the vocabularies populated from
        the filtered pairs.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for source_sentence, target_sentence in ((p[0], p[1]) for p in pairs):
        input_lang.addSentence(source_sentence)
        output_lang.addSentence(target_sentence)
    print("Counted words:")
    for vocab in (input_lang, output_lang):
        print(vocab.name, vocab.n_words)
    return input_lang, output_lang, pairs


# Load the eng-fra corpus reversed, so every pair is [French, English].
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)    
# print("pairs:\n", pairs)  pairs = [French, English]
print(random.choice(pairs))


# Encoder 部分


class EncoderRNN(nn.Module):
    """Token-level encoder: an embedding followed by a single-layer GRU."""

    def __init__(self, input_size, hidden_size):
        """Build the layers.

        Args:
            input_size: vocabulary size of the embedding.
            hidden_size: embedding width and number of GRU hidden units.
        """
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode one token.

        Args:
            input: index tensor holding a single token id.
            hidden: GRU hidden state of shape (1, 1, hidden_size).

        Returns:
            (output, hidden), both of shape (1, 1, hidden_size).
        """
        # Reshape the embedding to (seq_len=1, batch=1, hidden_size).
        step_input = self.embedding(input).view(1, 1, -1)
        return self.gru(step_input, hidden)

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) state on the module-level `device`."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(*shape, device=device)


# Decoder部分


class DecoderRNN(nn.Module):
    """Plain token-level decoder: embedding, ReLU, GRU, linear, log-softmax."""

    def __init__(self, hidden_size, output_size):
        """Build the layers.

        Args:
            hidden_size: embedding width and number of GRU hidden units.
            output_size: size of the output vocabulary.
        """
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        Args:
            input: index tensor holding a single token id.
            hidden: GRU hidden state of shape (1, 1, hidden_size).

        Returns:
            (output, hidden): log-probabilities of shape (1, output_size)
            and the new hidden state.
        """
        step_input = F.relu(self.embedding(input).view(1, 1, -1))
        gru_out, hidden = self.gru(step_input, hidden)
        log_probs = self.softmax(self.out(gru_out[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) state on the module-level `device`."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(*shape, device=device)

# Attention 部分
class AttnDecoderRNN(nn.Module):
    """Token-level GRU decoder with attention over the encoder outputs."""

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        """Build the layers.

        Args:
            hidden_size: embedding width and number of GRU hidden units.
            output_size: size of the output vocabulary.
            dropout_p: dropout rate applied to the embedded input.
            max_length: number of encoder output slots attended over.
        """
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        doubled = self.hidden_size * 2
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # One attention score per encoder slot, from [embedded ; hidden].
        self.attn = nn.Linear(doubled, self.max_length)
        # Projects [embedded ; attended context] back to hidden_size.
        self.attn_combine = nn.Linear(doubled, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Args:
            input: index tensor for the current decoder input token.
            hidden: GRU hidden state; hidden[0] is concatenated with the
                embedded input.
            encoder_outputs: 2-D tensor of encoder outputs with one row per
                attention slot.

        Returns:
            (output, hidden, attn_weights): log-probabilities, the new hidden
            state and the attention weights.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention weights over the encoder slots.
        query = torch.cat((embedded[0], hidden[0]), 1)
        attn_weights = F.softmax(self.attn(query), dim=1)
        # Weighted sum of encoder outputs via a batched matrix product.
        attn_applied = torch.bmm(
            attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        combined = torch.cat((embedded[0], attn_applied[0]), 1)
        gru_in = F.relu(self.attn_combine(combined).unsqueeze(0))
        gru_out, hidden = self.gru(gru_in, hidden)

        scores = self.out(gru_out[0])
        return F.log_softmax(scores, dim=1), hidden, attn_weights

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) state on the module-level `device`."""
        shape = (1, 1, self.hidden_size)
        return torch.zeros(*shape, device=device)

# 训练模型

# 准备训练数据

def indexesFromSentence(lang, sentence):
    """Map every space-separated word of `sentence` to its index in `lang`."""
    lookup = lang.word2index
    return [lookup[token] for token in sentence.split(" ")]

def tensorFromSentence(lang, sentence):
    """Convert `sentence` to a column tensor of word indices ending with EOS."""
    # EOS marks the end of the sequence for the encoder; SOS is what the
    # decoder starts from.
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)

def tensorsFromPair(pair):
    """Return the (input, target) index tensors for one sentence pair."""
    # pair[0] is the French sentence, pair[1] the English one.
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )


# 开始训练

# Probability of using teacher forcing for a training sample: with teacher
# forcing the true target token of the previous step is fed to the decoder
# as its next input, instead of the decoder's own previous prediction.
tearcher_forcing_ratio = 0.5  

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimisation step on a single (input, target) pair.

    Args:
        input_tensor: column tensor of input token indices.
        target_tensor: column tensor of target token indices.
        encoder: encoder module providing initHidden() and hidden_size.
        decoder: attention decoder called as
            decoder(input, hidden, encoder_outputs).
        encoder_optimizer: optimiser of the encoder parameters.
        decoder_optimizer: optimiser of the decoder parameters.
        criterion: loss applied to each decoder output / target token.
        max_length: number of rows in the encoder output buffer.

    Returns:
        The summed loss divided by the target length.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Encoder pass: store the output of every step in a fixed-size buffer.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for ei in range(input_length):
        step_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = step_output[0, 0]

    # Decoder pass: start from SOS and the encoder's final hidden state.
    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    teacher_forced = random.random() < tearcher_forcing_ratio

    loss = 0
    for di in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if teacher_forced:
            # Feed the ground-truth token as the next input.
            decoder_input = target_tensor[di]
            continue

        # Otherwise feed the decoder's own best guess and stop at EOS.
        _, best = decoder_output.topk(1)
        decoder_input = best.squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length


# 辅助函数------记录时间

import time
import math

def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return "%dm %ds" % (minutes, seconds)

def timeSince(since, percent):
    """Return 'elapsed (- remaining)' given a start time and the fraction done."""
    elapsed = time.time() - since
    # Estimated total duration minus what has already passed.
    remaining = elapsed / (percent) - elapsed
    return "%s (- %s)" % (asMinutes(elapsed), asMinutes(remaining))


# 整个训练过程如下：
 # 开启定时器
 # 初始化优化器和loss函数
 # 创建training pairs
 # 开始训练并绘图

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train for `n_iters` randomly drawn pairs, logging and plotting the loss.

    Args:
        encoder: encoder module.
        decoder: decoder module.
        n_iters: number of training samples to draw and train on.
        print_every: print the running average loss every this many steps.
        plot_every: record an average loss point every this many steps.
        learning_rate: learning rate of both SGD optimisers.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # reset every print_every steps
    plot_loss_total = 0  # reset every plot_every steps

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # Draw all training samples up front.
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step in range(1, n_iters + 1):
        sample = training_pairs[step - 1]
        input_tensor, target_tensor = sample[0], sample[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, step / n_iters * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)


# 绘制loss曲线

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

%matplotlib inline

def showPlot(points):
    """Plot the recorded loss values with y-axis ticks every 0.2.

    Args:
        points: sequence of loss values, one per plotting interval.
    """
    # plt.subplots() already creates the figure; a preceding plt.figure()
    # would only leave an extra empty figure behind.
    fig, ax = plt.subplots()
    # This locator puts ticks at regular intervals.
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)


# 测试阶段--------测试阶段整体与训练阶段类似，但是测试阶段，不用给出target_tensor,只是将decoder网络上一时刻的预测值作为下一时刻的输入值
# 当预测值是EOS时，则停止预测

def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Translate `sentence` greedily, without teacher forcing.

    Args:
        encoder: trained encoder module.
        decoder: trained attention decoder module.
        sentence: input sentence made of words known to `input_lang`.
        max_length: size of the encoder output buffer and maximum number of
            decoding steps.

    Returns:
        (decoded_words, attentions): the predicted words (ending with
        '<EOS>' when the decoder emitted it) and the attention weights of
        the executed decoding steps.
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        # Encoder pass.
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        for ei in range(input_length):
            step_output, encoder_hidden = encoder(input_tensor[ei],
                                                  encoder_hidden)
            encoder_outputs[ei] += step_output[0, 0]

        # Decoder pass: start from SOS and the encoder's final hidden state.
        decoder_input = torch.tensor([[SOS_token]], device=device)
        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            log_probs, decoder_hidden, step_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = step_attention.data
            _, topi = log_probs.data.topk(1)
            token_id = topi.item()
            if token_id == EOS_token:
                # Stop as soon as the decoder predicts end-of-sentence.
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[token_id])

            # The prediction becomes the next decoder input.
            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

# 随机地从训练集中选择pairs,然后在测试集上进行评估

def evaluateRandomly(encoder, decoder, n=10):
    """Print input, target and prediction for `n` randomly chosen pairs."""
    for _ in range(n):
        sample = random.choice(pairs)
        print('输入:>', sample[0])
        print('目标:=', sample[1])
        predicted_words, _ = evaluate(encoder, decoder, sample[0])
        print('预测:<', ' '.join(predicted_words))
        print('')

# Actual training run

hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# Train on 75000 sampled pairs, printing progress every 5000 steps.
trainIters(encoder1, attn_decoder1, 75000, print_every=5000)

# Print predictions for a few random pairs.
evaluateRandomly(encoder1, attn_decoder1)

# Attention visualisation

output_words, attentions = evaluate(
    encoder1, attn_decoder1, "je suis trop froid .")
# Trailing semicolon suppresses the notebook's echo of the return value.
plt.matshow(attentions.numpy());

# 增加坐标轴，更加清楚的可视化

def showAttention(input_sentence, output_words, attentions):
    """Draw the attention matrix with input words on x and output words on y.

    Args:
        input_sentence: space-separated input sentence.
        output_words: list of predicted words.
        attentions: attention weights as a tensor convertible with .numpy().
    """
    # Figure with a colour bar.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    heatmap = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(heatmap)

    # Axis labels; the leading '' fills the first tick label slot.
    x_labels = [''] + input_sentence.split(' ') + ['<EOS>']
    y_labels = [''] + output_words
    ax.set_xticklabels(x_labels, rotation=90)
    ax.set_yticklabels(y_labels)

    # Show a label at every tick.
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    """Decode ``input_sentence``, print the result and plot its attention map.

    Uses the module-level ``encoder1`` / ``attn_decoder1`` models.

    Args:
        input_sentence: Sentence passed to ``evaluate`` and used for labels.
    """
    decoded, attn_weights = evaluate(encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    decoded_text = ' '.join(decoded)
    print('output =', decoded_text)
    showAttention(input_sentence, decoded, attn_weights)

# benchmark
基础的结构为：

![avatar](5.png)

堆栈的两层结构为：

![avatar](8.png)

# 一种一对多的含注意力机制的模型
提出两种结构，分别为Ⅰ型和Ⅱ型。Ⅰ型相对简单，Ⅱ型较为复杂。两者的模型结构图如下：

![avatar](3.png)

![avatar](7.png)

模型中的GA_cell是gru_attention_cell,通过在单元中引入注意力机制来获得其他任务中的信息。

对应的不包含注意力机制的模型如下：

![avatar](6.png)

In [17]:
class seq2seq_rnn(nn.Module):
    """Benchmark GRU encoder-decoder with two stacked GRU layers per side.

    The encoder compresses the input sequence into two final hidden states
    (``h1`` of width ``hidden_size`` and ``h2`` of width ``output_size``).
    The decoder is fed ``h2`` repeated ``result_nums`` times and is
    initialised with those same two states.

    Args:
        input_size: Number of features per input time step.
        hidden_size: Width of the first encoder / first decoder GRU.
        output_size: Number of features per predicted step.
        result_nums: Number of steps the decoder emits.
        dropout: Dropout probability applied between the two stacked GRU
            layers of the encoder and of the decoder (training mode only).
    """

    def __init__(self,input_size,hidden_size,output_size=1,result_nums=2,dropout=0.1):
        super(seq2seq_rnn, self).__init__()
        self.input_size = input_size  # features per input step
        self.hidden_size = hidden_size  # width of the first GRU on each side
        self.output_size = output_size  # features per output step
        self.result_nums = result_nums  # number of decoder output steps
        self.dropout = dropout

        # nn.GRU only applies its ``dropout`` argument between its own
        # internal layers, so with num_layers=1 it is ignored and PyTorch
        # emits a UserWarning. The GRUs are therefore built without it and
        # dropout is applied explicitly between the stacked layers instead.
        # nn.Dropout has no parameters, so state_dict keys are unchanged.
        self.encoder_layer1 = nn.GRU(input_size, hidden_size, 1)
        self.encoder_layer2 = nn.GRU(hidden_size, output_size, 1)
        self.decoder_layer1 = nn.GRU(output_size, hidden_size, 1)
        self.decoder_layer2 = nn.GRU(hidden_size, output_size, 1)
        self.inter_layer_dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run the encoder-decoder.

        Args:
            x: Input tensor of shape [seq_len, batch, input_size].

        Returns:
            Tensor of shape [result_nums, batch, output_size].
        """
        # x: (seq_len, batch, hidden_size), h1: (1, batch, hidden_size)
        x, h1 = self.encoder_layer1(x)
        # h2: (1, batch, output_size)
        _, h2 = self.encoder_layer2(self.inter_layer_dropout(x))
        # Decoder input is the final encoder state repeated for every
        # output step: (result_nums, batch, output_size)
        out = h2.repeat(self.result_nums, 1, 1)
        y, _ = self.decoder_layer1(out, h1)  # (result_nums, batch, hidden_size)
        y, _ = self.decoder_layer2(self.inter_layer_dropout(y), h2)
        return y  # (result_nums, batch, output_size)

In [18]:
# Smoke test for the benchmark model. Following the [seq_len, batch,
# input_size] layout documented on seq2seq_rnn.forward: 3 steps, batch of 5,
# 10 input features.
x = torch.randn(3,5,10)
# input_size=10, hidden_size=20; output_size and result_nums keep defaults.
rnn = seq2seq_rnn(10,20)
# Single forward pass; the result is not inspected in this cell.
y = rnn(x)

In [None]:
class OSMG(nn.Module):
    """One independent ``seq2seq_rnn`` per task, all fed the same input.

    This is the attention-free multi-task baseline: the per-task models do
    not exchange any information.

    Args:
        input_size: Number of features per input time step.
        hidden_size: Hidden width passed to every ``seq2seq_rnn``.
        task_nums: Number of tasks, i.e. number of sub-models.
        result_nums: Number of output steps each sub-model produces.
        bias: Stored on the module; not used by this class.
        batch_first: Stored on the module; not used by this class.
        dropout: Dropout probability passed to every ``seq2seq_rnn``.
    """

    __constants__ = [ 'input_size', 'hidden_size', 'task_nums', 'bias',
                     'batch_first', 'dropout']

    def __init__(self, input_size, hidden_size, task_nums, result_nums,
                 bias=True, batch_first=False,
                 dropout=0.):
        super(OSMG, self).__init__()
        self.input_size = input_size  # features per input step
        self.hidden_size = hidden_size  # hidden-state width
        self.task_nums = task_nums  # number of tasks
        self.result_nums = result_nums  # output steps per task
        self.bias = bias
        self.batch_first = batch_first
        self.dropout = dropout
        # Keyword arguments are required here: seq2seq_rnn's third positional
        # parameter is ``output_size``, so passing result_nums/dropout
        # positionally would bind them to output_size/result_nums.
        self.models = nn.ModuleList([
            seq2seq_rnn(input_size, hidden_size,
                        result_nums=result_nums, dropout=dropout)
            for _ in range(task_nums)
        ])

    def forward(self,x):
        """Run every task model on ``x``.

        Args:
            x: Input tensor of shape [seq_len, batch, input_size].

        Returns:
            List with one output tensor per task, in task order.
        """
        return [model(x) for model in self.models]

OSMGIAM-I的Encoder_base结构

![avatar](9.png)

Decoder_base结构

![avatar](10.png)

In [None]:
class Encoder_base(nn.Module):
    """Two stacked single-layer GRUs that return only their final states.

    Args:
        input_size: Number of features per input time step.
        hidden_size: Width of the first GRU.
        output_size: Width of the second GRU.
        dropout: Dropout probability applied to the first GRU's outputs
            before they enter the second GRU (training mode only).
    """

    def __init__(self,input_size,hidden_size,output_size,dropout):
        super(Encoder_base,self).__init__()
        self.input_size = input_size  # features per input step
        self.hidden_size = hidden_size  # width of layer1
        self.output_size = output_size  # width of layer2
        self.dropout = dropout

        # A single-layer nn.GRU ignores its ``dropout`` argument and warns
        # when it is non-zero, so dropout is applied explicitly between the
        # two layers instead. nn.Dropout adds no state_dict entries.
        self.layer1 = nn.GRU(input_size, hidden_size, 1)
        self.layer2 = nn.GRU(hidden_size, output_size, 1)
        self.inter_layer_dropout = nn.Dropout(dropout)

    def forward(self,x):
        """Encode a sequence.

        Args:
            x: Input tensor of shape [seq_len, batch, input_size].

        Returns:
            Tuple ``(h1, h2)``: final state of layer1 with shape
            (1, batch, hidden_size) and of layer2 with shape
            (1, batch, output_size).
        """
        # x: (seq_len, batch, hidden_size), h1: (1, batch, hidden_size)
        x, h1 = self.layer1(x)
        # h2: (1, batch, output_size)
        _, h2 = self.layer2(self.inter_layer_dropout(x))
        return h1, h2

In [None]:
class Decoder_base(nn.Module):
    """Two stacked single-layer GRUs decoding from externally supplied states.

    The input and first-layer widths are doubled relative to the encoder
    widths (``2*output_size`` and ``2*hidden_size``).

    Args:
        hidden_size: Base hidden width; layer1 uses ``2*hidden_size``.
        output_size: Width of the decoder output; the input uses
            ``2*output_size``.
        dropout: Dropout probability applied to layer1's outputs before they
            enter layer2 (training mode only).
    """

    def __init__(self,hidden_size,output_size,dropout):
        super(Decoder_base,self).__init__()
        self.hidden_size = hidden_size  # base hidden width
        self.output_size = output_size  # output feature width
        self.dropout = dropout

        # A single-layer nn.GRU ignores its ``dropout`` argument and warns
        # when it is non-zero, so dropout is applied explicitly between the
        # two layers instead. nn.Dropout adds no state_dict entries.
        self.layer1 = nn.GRU(2*output_size, 2*hidden_size, 1)
        self.layer2 = nn.GRU(2*hidden_size, output_size, 1)
        self.inter_layer_dropout = nn.Dropout(dropout)

    def forward(self,x,h1,h2):
        """Decode a sequence.

        Args:
            x: Input tensor of shape [seq_len, batch, 2*output_size].
            h1: Initial state for layer1, shape [1, batch, 2*hidden_size].
            h2: Initial state for layer2, shape [1, batch, output_size].

        Returns:
            Tensor of shape [seq_len, batch, output_size].
        """
        x, _ = self.layer1(x, h1)  # (seq_len, batch, 2*hidden_size)
        x, _ = self.layer2(self.inter_layer_dropout(x), h2)
        return x  # (seq_len, batch, output_size)

In [None]:
class OSMGIAM1(nn.Module):
    """Type-I multi-task model: per-task encoders/decoders joined by attention.

    Every task owns an ``Encoder_base`` and a ``Decoder_base``. After
    encoding, dot-product attention across the task axis mixes the final
    encoder states of all tasks; each decoder then receives its own state
    concatenated with the attended context.

    Args:
        input_size: Number of features per input time step.
        hidden_size: Width of the first encoder GRU.
        output_size: Width of the second encoder GRU / decoder output.
        task_nums: Number of tasks.
        result_nums: Number of output steps each decoder produces.
        bias: Stored on the module; not used by this class.
        batch_first: Stored on the module; not used by this class.
        dropout: Dropout probability passed to the attention module and to
            every encoder and decoder.
    """

    __constants__ = [ 'input_size', 'hidden_size', 'task_nums', 'bias',
                     'batch_first', 'dropout']

    def __init__(self, input_size, hidden_size, output_size, task_nums, result_nums,
                 bias=True, batch_first=False,
                 dropout=0.):
        super(OSMGIAM1, self).__init__()
        self.input_size = input_size  # features per input step
        self.hidden_size = hidden_size  # hidden-state width
        self.output_size = output_size  # output-state width
        self.task_nums = task_nums  # number of tasks
        self.result_nums = result_nums  # output steps per task
        self.bias = bias
        self.batch_first = batch_first
        self.dropout = dropout
        self.dot_product_attention = Scaled_Dot_Product_Attention(dropout)
        self.encoders = nn.ModuleList(
            [Encoder_base(input_size, hidden_size, output_size, dropout)
             for _ in range(task_nums)])
        self.decoders = nn.ModuleList(
            [Decoder_base(hidden_size, output_size, dropout)
             for _ in range(task_nums)])

    def forward(self,x):
        """Encode ``x`` per task, attend across tasks, then decode per task.

        Args:
            x: Input tensor of shape [seq_len, batch, input_size].

        Returns:
            List with one tensor of shape [result_nums, batch, output_size]
            per task, in task order.
        """
        if len(self.encoders) == 0:
            return []

        # Each encoder yields h1: (1, batch, hidden) and h2: (1, batch, output).
        # Concatenating along dim 0 builds the per-task stacks directly from
        # the encoder outputs, so they inherit the device and dtype of the
        # model instead of a CPU/float32 buffer filled in place.
        states = [encoder(x) for encoder in self.encoders]
        H1 = torch.cat([h1 for h1, _ in states], dim=0)  # (task, batch, hidden)
        H2 = torch.cat([h2 for _, h2 in states], dim=0)  # (task, batch, output)

        # The attention module is batch-major, so attend over the task axis
        # on (batch, task, feature) views. No scale factor is passed.
        H1_bt = H1.permute(1, 0, 2)
        H2_bt = H2.permute(1, 0, 2)
        context1, _ = self.dot_product_attention(H1_bt, H1_bt, H1_bt)
        context2, _ = self.dot_product_attention(H2_bt, H2_bt, H2_bt)
        context1 = context1.permute(1, 0, 2)  # (task, batch, hidden)
        context2 = context2.permute(1, 0, 2)  # (task, batch, output)

        # Own state concatenated with the cross-task context.
        dec_h1 = torch.cat([H1, context1], dim=2)  # (task, batch, 2*hidden)
        dec_in = torch.cat([H2, context2], dim=2)  # (task, batch, 2*output)

        Y = []
        for j, decoder in enumerate(self.decoders):
            # Same decoder input repeated for every output step:
            # (result_nums, batch, 2*output)
            out = dec_in[j].unsqueeze(0).repeat(self.result_nums, 1, 1)
            h1_j = dec_h1[j].unsqueeze(0)  # (1, batch, 2*hidden)
            h2_j = H2[j].unsqueeze(0)  # (1, batch, output)
            Y.append(decoder(out, h1_j, h2_j))  # (result_nums, batch, output)
        return Y

## GA_cell
![avatar](1.png)

\begin{equation}
\begin{array}{l}
r_{t, k}=\sigma\left(W_{r} x_{t, k}+U_{r} h_{t, k-1}+A_{r} a_{t, k}+b_{r}\right) \\
z_{t, k}=\sigma\left(W_{z} x_{t, k}+U_{z} h_{t, k-1}+A_{z} a_{t, k}+b_{z}\right) \\
\widetilde{h}_{t, k}=\tanh\left(W x_{t, k}+U\left[r_{t, k} \odot h_{t, k-1}\right]+A a_{t, k}+b\right) \\
h_{t, k}=\left(1-z_{t, k}\right) \odot h_{t, k-1}+z_{t, k} \odot \widetilde{h}_{t, k}
\end{array}
\end{equation}

In [None]:
import numbers

class OSMGIAM2(nn.Module):
    """Type-II multi-task recurrent layer built from GRU-attention cells.

    Every task keeps its own GRU-style gates with an extra attention input.
    After each time step, dot-product attention across the task axis of the
    new hidden states produces the attention input for the next step.

    Args:
        input_size: Number of features per input time step.
        hidden_size: Width of every task's hidden state.
        num_tasks: Number of tasks (stored as ``self.task_nums``).
        bias: Stored on the module; bias terms are always created.
        batch_first: Stored on the module; inputs are always treated as
            [seq_len, batch, input_size].
        dropout: Dropout probability for ``self.dropout`` and for the
            attention module. ``self.dropout`` is not used in ``forward``.
    """

    __constants__ = [ 'input_size', 'hidden_size', 'task_nums', 'bias',
                     'batch_first', 'dropout']

    def __init__(self, input_size, hidden_size, num_tasks,
                 bias=True, batch_first=False,
                 dropout=0.):
        super(OSMGIAM2, self).__init__()
        self.input_size = input_size  # features per input step
        self.hidden_size = hidden_size  # hidden-state width
        # The constructor argument is ``num_tasks``; the attribute keeps the
        # ``task_nums`` name used by the rest of the class.
        self.task_nums = num_tasks
        self.bias = bias
        self.batch_first = batch_first
        self.dropout = nn.Dropout(dropout)
        self.dot_product_attention = Scaled_Dot_Product_Attention(dropout)

        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()

        # Input-to-hidden weights, one matrix per task.
        self.W_r = Parameter(torch.empty(num_tasks, hidden_size, input_size))
        self.W_z = Parameter(torch.empty(num_tasks, hidden_size, input_size))
        self.W_h = Parameter(torch.empty(num_tasks, hidden_size, input_size))

        # Hidden-to-hidden weights.
        self.U_r = Parameter(torch.empty(num_tasks, hidden_size, hidden_size))
        self.U_z = Parameter(torch.empty(num_tasks, hidden_size, hidden_size))
        self.U_h = Parameter(torch.empty(num_tasks, hidden_size, hidden_size))

        # Attention-to-hidden weights.
        self.A_r = Parameter(torch.empty(num_tasks, hidden_size, hidden_size))
        self.A_z = Parameter(torch.empty(num_tasks, hidden_size, hidden_size))
        self.A_h = Parameter(torch.empty(num_tasks, hidden_size, hidden_size))

        # Biases.
        self.b_r = Parameter(torch.empty(num_tasks, hidden_size))
        self.b_z = Parameter(torch.empty(num_tasks, hidden_size))
        self.b_h = Parameter(torch.empty(num_tasks, hidden_size))

        self.reset_parameters()

    def reset_parameters(self):
        """Initialise all weights and biases from U(-1/sqrt(H), 1/sqrt(H)).

        ``H`` is ``hidden_size``. Without this step the parameters would
        hold whatever was in the freshly allocated memory.
        """
        bound = self.hidden_size ** -0.5 if self.hidden_size > 0 else 0.0
        for param in (self.W_r, self.W_z, self.W_h,
                      self.U_r, self.U_z, self.U_h,
                      self.A_r, self.A_z, self.A_h,
                      self.b_r, self.b_z, self.b_h):
            nn.init.uniform_(param, -bound, bound)

    def forward(self,x):
        """Run the GRU-attention recurrence over the whole sequence.

        Args:
            x: Input tensor of shape [seq_len, batch, input_size].

        Returns:
            Tuple ``(H, A)``, both of shape (task_nums, batch, hidden_size):
            the final hidden states and the attention context computed from
            them.
        """
        seq_len, b, _ = x.size()
        # H holds the previous hidden state of every task and A the previous
        # attention context. They are allocated like ``x`` so the layer works
        # on any device.
        H = x.new_zeros(self.task_nums, b, self.hidden_size)
        A = x.new_zeros(self.task_nums, b, self.hidden_size)

        # Transposed weights so states can stay in (task, batch, feature)
        # layout; hoisted because they do not change across time steps.
        W_r_t, W_z_t, W_h_t = (w.transpose(1, 2) for w in (self.W_r, self.W_z, self.W_h))
        U_r_t, U_z_t, U_h_t = (u.transpose(1, 2) for u in (self.U_r, self.U_z, self.U_h))
        A_r_t, A_z_t, A_h_t = (a.transpose(1, 2) for a in (self.A_r, self.A_z, self.A_h))
        # (task, 1, hidden) so the bias broadcasts over the batch axis.
        b_r, b_z, b_h = (bias.unsqueeze(1) for bias in (self.b_r, self.b_z, self.b_h))

        for t in range(seq_len):
            x_t = x[t]  # (batch, input), shared by all tasks
            # matmul broadcasts x_t over the task axis -> (task, batch, hidden)
            r = self.sigmoid(torch.matmul(x_t, W_r_t) + torch.bmm(H, U_r_t)
                             + torch.bmm(A, A_r_t) + b_r)
            z = self.sigmoid(torch.matmul(x_t, W_z_t) + torch.bmm(H, U_z_t)
                             + torch.bmm(A, A_z_t) + b_z)
            # The reset gate multiplies each task's OWN previous state.
            h_tilde = self.tanh(torch.matmul(x_t, W_h_t) + torch.bmm(r * H, U_h_t)
                                + torch.bmm(A, A_h_t) + b_h)
            H = (1 - z) * H + z * h_tilde  # (task, batch, hidden)

            # Attend across tasks on a batch-major view, then restore the
            # task-major layout for the next step.
            H_bt = H.permute(1, 0, 2)  # (batch, task, hidden)
            A, _ = self.dot_product_attention(H_bt, H_bt, H_bt)
            A = A.permute(1, 0, 2)  # (task, batch, hidden)
        return H, A

In [None]:
# Stand-alone nn.GRU demo: input_size=10, hidden_size=20, 6 stacked layers.
gru = nn.GRU(10,20,6)
Input = Variable(torch.randn(5, 4, 10))#Input shape: (seq_len, batch, input_size)
h0 = Variable(torch.randn(6, 4, 20)) #initial hidden state h0, shape: (num_layers, batch, hidden_size)
# out collects the per-step outputs, h the final hidden state of each layer.
out,h = gru(Input, h0)

In [None]:
# Scratch version of the GA-cell recurrence. It expects task_nums, b,
# hidden_size, seq_len, x, the gate weights/biases, sigmoid, tanh and
# dot_product_attention to be defined in the notebook session already.
H = torch.zeros(task_nums,b,hidden_size)  # previous hidden state per task
A = torch.zeros(task_nums,b,hidden_size)  # previous attention context per task
for i in range(seq_len):
    H_temp = torch.ones(task_nums,b,hidden_size)  # filled with the new states below
    for j in range(task_nums):
        r_j = sigmoid(torch.mm(W_r[j],x[i].T) + torch.mm(U_r[j],H[j].T) + torch.mm(A_r[j],A[j].T) + b_r[j].repeat(b,1).T) # r_j:(hidden_size,batch)
        z_j = sigmoid(torch.mm(W_z[j],x[i].T) + torch.mm(U_z[j],H[j].T) + torch.mm(A_z[j],A[j].T) + b_z[j].repeat(b,1).T) # z_j:(hidden_size,batch)
        # The reset gate multiplies task j's own previous state (H[j], not H[0]).
        h_tilde = tanh(torch.mm(W_h[j],x[i].T) + torch.mm(U_h[j],r_j.mul(H[j].T)) + torch.mm(A_h[j],A[j].T) + b_h[j].repeat(b,1).T) # h_tilde:(hidden_size,batch)
        h_j = (1-z_j).mul(H[j].T) + z_j.mul(h_tilde) # h_j:(hidden_size,batch)
        H_temp[j] = h_j.T
    H = H_temp #(task_nums,b,hidden_size)
    H_temp = H_temp.permute(1,0,2) # (b,task_nums,hidden_size)
    A,_ = dot_product_attention(H_temp,H_temp,H_temp) # A:(b,task_nums,hidden_size)
    A = A.permute(1,0,2) #(task_nums,b,hidden_size)

In [None]:
#订单付款时间，拆分成：month,day,weekday,hour,minute,second
import datetime
def time2multi(x):
    """Split a ``'YYYY-MM-DD HH:MM:SS'`` timestamp string into six fields.

    Args:
        x: Timestamp string in the format ``%Y-%m-%d %H:%M:%S``.

    Returns:
        ``pd.Series`` holding, in order: month, day, weekday (Monday is 0),
        hour, minute, second.
    """
    parsed = datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    fields = [
        parsed.month,
        parsed.day,
        parsed.weekday(),
        parsed.hour,
        parsed.minute,
        parsed.second,
    ]
    return pd.Series(fields)
