Attention（注意力）机制如果浅层的理解，跟他的名字非常匹配。他的核心逻辑就是「从关注全部到关注重点」。

Attention（注意力）

In [9]:
import torch
from torch import nn
import torch.nn.functional as F

In [10]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size,
                 num_layers=1, bidirectional=False, batch_size=1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.batch_size = batch_size

        self.embedding = nn.Embedding(input_size, embedding_size)

        self.gru = nn.GRU(embedding_size, hidden_size, num_layers,
                          bidirectional=bidirectional)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        directions = 2 if self.bidirectional else 1
        return torch.zeros(
            self.num_layers * directions,
            self.batch_size,
            self.hidden_size,
            device=DEVICE
        )


class AttnDecoderRNN(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, dropout_p=0):
        super(AttnDecoderRNN, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p

        self.embedding = nn.Embedding(
            num_embeddings=output_size,
            embedding_dim=embedding_size
        )
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(embedding_size, hidden_size)
        self.attn = nn.Linear(hidden_size, hidden_size)
        # hc: [hidden, context]
        self.Whc = nn.Linear(hidden_size * 2, hidden_size)
        # s: softmax
        self.Ws = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        gru_out, hidden = self.gru(embedded, hidden)

        attn_prod = torch.mm(self.attn(hidden)[0], encoder_outputs.t())
        attn_weights = F.softmax(attn_prod, dim=1)
        context = torch.mm(attn_weights, encoder_outputs)

        # hc: [hidden: context]
        hc = torch.cat([hidden[0], context], dim=1)
        out_hc = F.tanh(self.Whc(hc))
        output = F.log_softmax(self.Ws(out_hc), dim=1)

        return output, hidden, attn_weights

Attention（注意力）

In [12]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

In [21]:
class Encoder(nn.Module):
    
    def __init__(self,input_size,hidden_size,bidirectional= True):
        
        super(Encoder,self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        
        self.lstm = nn.LSTM(input_size,hidden_size,bidirectional = bidirectional)
        
        
    def forward(self,input,hidden):
        
        output , hidden = self.lstm(input.view((1, 1, self.input_size)),hidden)
        
        return output,hidden
    
    
    def init_hidden(self):
        
         return (torch.zeros(1 + int(self.bidirectional), 1, self.hidden_size),
      torch.zeros(1 + int(self.bidirectional), 1, self.hidden_size))
        
        
        
        
class AttentionDecoder(nn.Module):
    
    
    def __init__(self,input_size,output_size,vocab_size):
        
        super(AttentionDecoder,self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.attn = nn.Linear(hidden_size + output_size, 1)
        self.lstm = nn.LSTM(hidden_size + vocab_size, output_size)
        self.final = nn.Linear(output_size, vocab_size)
        
        
    def init_hidden(self):
    return torch.zeros(1, 1, self.output_size,
      torch.zeros(1, 1, self.output_size))    
        
        
    def forward(self, decoder_hidden, encoder_outputs, input):
    
    weights = []
    for i in range(len(encoder_outputs)):
      print(decoder_hidden[0][0].shape)
      print(encoder_outputs[0].shape)
      weights.append(self.attn(torch.cat((decoder_hidden[0][0], 
                                          encoder_outputs[i]), dim = 1)))
    normalized_weights = F.softmax(torch.cat(weights, 1), 1)
    
    attn_applied = torch.bmm(normalized_weights.unsqueeze(1),
                             encoder_outputs.view(1, -1, self.hidden_size))
    
    input_lstm = torch.cat((attn_applied[0], input[0]), dim = 1) #if we are using embedding, use embedding of input here instead
    
    output, hidden = self.lstm(input_lstm.unsqueeze(0), decoder_hidden)
    
    output = self.final(output[0])
    
    return output, hidden, normalized_weights       
        
             

IndentationError: expected an indented block (<ipython-input-21-9db717cd86f8>, line 45)

In [22]:
bidirectional = True
c = Encoder(10, 20, bidirectional)
a, b = c.forward(torch.randn(10), c.init_hidden())
print(a.shape)
print(b[0].shape)
print(b[1].shape)

x = AttentionDecoder(20 * (1 + bidirectional), 25, 30)
y, z, w = x.forward(x.init_hidden(), torch.cat((a,a)), torch.zeros(1,1, 30)) #Assuming <SOS> to be all zeros
print(y.shape)
print(z[0].shape)
print(z[1].shape)
print(w)

NameError: name 'Encoder' is not defined

https://github.com/pytorch/translate/tree/master/pytorch_translate/attention

In [None]:
import torch.nn as nn

class BaseAttention(nn.Module):
    
    def __init__(self,decoder_hidden_state_dim,context_dim):
        
        super().__init__()
        self.decoder_hidden_state_dim = decoder_hidden_state_dim
        self.context_dim = context_dim
        
    def forward(self,decoder_state, source_hids, src_lengths):
        
        """
        Input
            decoder_state: bsz x decoder_hidden_state_dim
            source_hids: srclen x bsz x context_dim
            src_lengths: bsz x 1, actual sequence lengths
        Output
            output: bsz x context_dim
            attn_scores: max_src_len x bsz
        """
        
        raise NotImplementedError
        

In [23]:
class SelfAttention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.projection = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(True),
            nn.Linear(64, 1)
        )

    def forward(self, encoder_outputs):
        # (B, L, H) -> (B , L, 1)
        energy = self.projection(encoder_outputs)
        weights = F.softmax(energy.squeeze(-1), dim=1)
        # (B, L, H) * (B, L, 1) -> (B, H)
        outputs = (encoder_outputs * weights.unsqueeze(-1)).sum(dim=1)
        return outputs, weights

class AttnClassifier(nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim):
        super().__init__()
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.attention = SelfAttention(hidden_dim)
        self.fc = nn.Linear(hidden_dim, 1)
        
        
    def set_embedding(self, vectors):
        self.embedding.weight.data.copy_(vectors)
        
        
    def forward(self, inputs, lengths):
        batch_size = inputs.size(1)
        # (L, B)
        embedded = self.embedding(inputs)
        # (L, B, E)
        packed_emb = nn.utils.rnn.pack_padded_sequence(embedded, lengths)
        out, hidden = self.lstm(packed_emb)
        out = nn.utils.rnn.pad_packed_sequence(out)[0]
        out = out[:, :, :self.hidden_dim] + out[:, :, self.hidden_dim:]
        # (L, B, H)
        embedding, attn_weights = self.attention(out.transpose(0, 1))
        # (B, HOP, H)
        outputs = self.fc(embedding.view(batch_size, -1))
        # (B, 1)
        return outputs, attn_weights

In [None]:
class SelfAttention(nn.Module):
    """    
    scores each element of the sequence
    with a linear layer and uses the normalized 
    scores to compute a context over the sequence.   
    """  
    def __init__(self, d_hid, dropout=0.): 
        super().__init__()        
        self.scorer = nn.Linear(d_hid, 1)       
        self.dropout = nn.Dropout(dropout)   
        
    def forward(self, input_seq, lens):   
        batch_size, seq_len, feature_dim = input_seq.size()   
        input_seq = self.dropout(input_seq)      
        scores = self.scorer(input_seq.contiguous().view(-1, feature_dim)).view(batch_size, seq_len)     
        max_len = max(lens)      
        for i, l in enumerate(lens):          
            if l < max_len:               
                scores.data[i, l:] = -np.inf      
                scores = F.softmax(scores, dim=1)      
        context = scores.unsqueeze(2).expand_as(input_seq).mul(input_seq).sum(1)      
        return context # 既然命名为context就应该是整句的表示 

https://cloud.tencent.com/developer/article/1455815

In [25]:
#%%

from keras.preprocessing import sequence
from keras.datasets import imdb
from matplotlib import pyplot as plt
import pandas as pd

from keras import backend as K
from keras.engine.topology import Layer

class Self_Attention(Layer):

    def __init__(self, output_dim, **kwargs):
        self.output_dim = output_dim
        super(Self_Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # 为该层创建一个可训练的权重
        #inputs.shape = (batch_size, time_steps, seq_len)
        self.kernel = self.add_weight(name='kernel',
                                      shape=(3,input_shape[2], self.output_dim),
                                      initializer='uniform',
                                      trainable=True)

        super(Self_Attention, self).build(input_shape)  # 一定要在最后调用它

    def call(self, x):
        WQ = K.dot(x, self.kernel[0])
        WK = K.dot(x, self.kernel[1])
        WV = K.dot(x, self.kernel[2])

        print("WQ.shape",WQ.shape)

        print("K.permute_dimensions(WK, [0, 2, 1]).shape",K.permute_dimensions(WK, [0, 2, 1]).shape)

        QK = K.batch_dot(WQ,K.permute_dimensions(WK, [0, 2, 1]))

        QK = QK / (64**0.5)

        QK = K.softmax(QK)

        print("QK.shape",QK.shape)

        V = K.batch_dot(QK,WV)

        return V

    def compute_output_shape(self, input_shape):

        return (input_shape[0],input_shape[1],self.output_dim)

max_features = 20000

print('Loading data...')

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
#标签转换为独热码
y_train, y_test = pd.get_dummies(y_train),pd.get_dummies(y_test)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

#%%数据归一化处理

maxlen = 64

print('Pad sequences (samples x time)')

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)

x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

print('x_train shape:', x_train.shape)

print('x_test shape:', x_test.shape)

#%%

batch_size = 32
from keras.models import Model
from keras.optimizers import SGD,Adam
from keras.layers import *
from Attention_keras import Attention,Position_Embedding


S_inputs = Input(shape=(64,), dtype='int32')

embeddings = Embedding(max_features, 128)(S_inputs)


O_seq = Self_Attention(128)(embeddings)


O_seq = GlobalAveragePooling1D()(O_seq)

O_seq = Dropout(0.5)(O_seq)

outputs = Dense(2, activation='softmax')(O_seq)


model = Model(inputs=S_inputs, outputs=outputs)

print(model.summary())
# try using different optimizers and different optimizer configs
opt = Adam(lr=0.0002,decay=0.00001)
loss = 'categorical_crossentropy'
model.compile(loss=loss,

             optimizer=opt,

             metrics=['accuracy'])

#%%
print('Train...')

h = model.fit(x_train, y_train,

         batch_size=batch_size,

         epochs=5,

         validation_data=(x_test, y_test))

plt.plot(h.history["loss"],label="train_loss")
plt.plot(h.history["val_loss"],label="val_loss")
plt.plot(h.history["acc"],label="train_acc")
plt.plot(h.history["val_acc"],label="val_acc")
plt.legend()
plt.show()

#model.save("imdb.h5")

ModuleNotFoundError: No module named 'keras'

https://www.jianshu.com/p/4d52edda1d76

### https://github.com/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb


http://nlp.seas.harvard.edu/2018/04/03/attention.html

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy

import random
import math
import os
import time

ModuleNotFoundError: No module named 'torchtext'