In [52]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import pathlib
from pprint import pprint
import math

#pytorch
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.utils.data import Dataset,DataLoader,random_split
from torchvision import transforms
from torch.nn import Module
from torchvision import models
from PIL import Image

# Transformer Encoder and Decoder

## 1. nn.TransformerEncoderの使い方

[reference]
https://discuss.pytorch.org/t/how-to-add-padding-mask-to-nn-transformerencoder-module/63390/2


### The required shapes are shown in nn.Transformer.forward

[入力 shape]:
1. Eembeddingされたtensor (Sequence length, Batch_Size, Embedding dim)
2. src_mask: (Sequence_lehgh,Seqnence_length)
3. src_key_padding_mask: (Batch_size,Sequence_length)

#### batch_firstではないことに注意

### 1. 基本：maskなし

In [57]:
import torch, torch.nn as nn

# shape:(Sequence length, Batch_Size, Embedding dim)
q = torch.randn(3,1,10)

# embedding size: 10, multi attention head :one head
attn = nn.MultiheadAttention(10, 1) 

#maskなしでの出力の場合は以下で
output,atten_map=attn(q,q,q)
print(output.shape,atten_map.shape)

torch.Size([3, 1, 10]) torch.Size([1, 3, 3])


### 2. maskを設定する場合

In [58]:
#三角行列を用いたmask
def src_mask(sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

In [59]:
#attention_mapを設定
#sequence_len:3のため引数は3
#src_mask: (Sequence_lehgh,Seqnence_length)

# attention output weights
attn(q, q, q, attn_mask=src_mask(3))[1]

tensor([[[1.0000, 0.0000, 0.0000],
         [0.4852, 0.5148, 0.0000],
         [0.3130, 0.3614, 0.3256]]], grad_fn=<DivBackward0>)

### 3.padding_indexを指定してmask指定したい場合
 

src_key_padding_maskのshape: (Batch_size,Sequence_length)

In [60]:
#padding_indexの部分をTrueとしてboolを作成
#sequence_length:3で index3を学習させたくない場合以下の様に指定
src_key_padding_mask = torch.tensor([[0, 0, 1]]).bool()
print(src_key_padding_mask)

tensor([[False, False,  True]])


In [61]:
attn(q, q, q, attn_mask=src_mask(3), key_padding_mask=src_key_padding_mask)[1]

tensor([[[1.0000, 0.0000, 0.0000],
         [0.4852, 0.5148, 0.0000],
         [0.4641, 0.5359, 0.0000]]], grad_fn=<DivBackward0>)

### example

In [62]:
#sampleのsetenceデータを作成
sample_sentence=torch.LongTensor([1,2,3,5,4,3,0,0,0,0]).unsqueeze(0)

#embedding paddingを0で指定
embedding=nn.Embedding(6,5,padding_idx=0)

#embedding
embed=embedding(sample_sentence)
print(embed.shape)
print(embed)

torch.Size([1, 10, 5])
tensor([[[ 0.7567,  1.1566, -0.9552,  0.2868, -0.2045],
         [ 0.4830,  0.0679, -0.7548, -1.0602, -0.5305],
         [-1.6067,  0.1129, -1.3658,  0.1027,  0.0240],
         [ 1.1009, -0.4258, -0.1315, -0.9095,  1.4193],
         [ 0.4125,  1.4098,  0.1352,  0.9740, -0.4609],
         [-1.6067,  0.1129, -1.3658,  0.1027,  0.0240],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]],
       grad_fn=<EmbeddingBackward>)


####　key_padding_maskはbool値で指定可


In [63]:
# padding_indexを学習させないようにするにはkeypaddingを設定する必要がある。
# 上記データの場合 sample_sentenceに格納されているデータは、0,1,2,3,4,5
print(sample_sentence)
padding_mask= (sample_sentence==0).bool()
print(padding_mask)

tensor([[1, 2, 3, 5, 4, 3, 0, 0, 0, 0]])
tensor([[False, False, False, False, False, False,  True,  True,  True,  True]])


In [64]:
inputs=embed.transpose(0,1)
inputs.shape
#sequence_length,batch_size,embedding_dimsの順番に変更する。

torch.Size([10, 1, 5])

In [65]:
#三角行列を用いたmask
def src_mask(sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

In [66]:
attn = nn.MultiheadAttention(5, 1) # embedding size 10, one head

attn(inputs,inputs,inputs, 
     attn_mask=src_mask(10),
     key_padding_mask=padding_mask)[1]

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.5885, 0.4115, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.4047, 0.4630, 0.1323, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.2140, 0.2022, 0.2418, 0.3421, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.1686, 0.1086, 0.2827, 0.0654, 0.3747, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.1835, 0.2099, 0.0600, 0.3218, 0.1648, 0.0600, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000,
          0.0000, 0.0000],
         [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000,
          0.0000,

### 上記をまとめて

In [67]:
def mha_output(input_sentence):
    
    """
    inputs_sentenceにはstoiで変換されたtorch.LongTensorを入力
    """
    
    #embedding:
    embed=embedding(input_sentence)
    
    #mask: paddingの項目をTrueのboolを
    mask_pad=0
    key_padding_mask=(input_sentence==mask_pad).bool()
    
    #入力がbatch_firstではないため注意
    #sequence_length,batch_size,embedding_dimsの順番に変更する。
    embed=embed.transpose(0,1)
    seq_len=embed.shape[0]

    #attention output
    output,atten_mask=attn(embed,embed,embed,
                            attn_mask=src_mask(seq_len),
                            key_padding_mask=key_padding_mask)
    
    return output,atten_mask

In [68]:
output,atten_mask=mha_output(sample_sentence)
print(output.shape)
print(atten_mask.shape)

torch.Size([10, 1, 5])
torch.Size([1, 10, 10])


## 2. nn.TransformerEncoder()の使い方

https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoder.html

### 2.1 基本

In [85]:
#基本的な使い方
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
src = torch.rand(10, 32, 512)
out = transformer_encoder(src)
print(out.shape)

torch.Size([10, 32, 512])


### 2.2 maskを設定したい場合

In [82]:
#sampleのsetenceデータを作成
sample_sentence=torch.LongTensor([1,2,3,5,4,3,0,0,0,0]).unsqueeze(0)

#embedding paddingを0で指定
embedding=nn.Embedding(6,5,padding_idx=0)

embed=embedding(sample_sentence)
print(embed.shape)
print(embed)
#paddingのembeddingの値を0に設定しておく。

torch.Size([1, 10, 5])
tensor([[[ 0.2856, -0.5371, -1.1387,  0.8060, -0.5655],
         [ 0.2140, -0.9235,  1.7130, -0.1160, -0.8278],
         [ 0.6873, -0.5021, -0.0607,  1.9373, -0.7169],
         [ 1.0140, -0.9519, -0.5368, -0.7806, -0.7649],
         [ 1.2167,  0.8828, -0.4382, -0.8354, -0.9914],
         [ 0.6873, -0.5021, -0.0607,  1.9373, -0.7169],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]],
       grad_fn=<EmbeddingBackward>)


In [83]:
#maskの設定は同様
def src_mask(sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

In [84]:
#d_modelはembedding dimの次元数を設定
#d_model/nheadは割り切れる数で設定しないとerror(ex. d_model=5/nhead=2)だとerror
encoder_layer = nn.TransformerEncoderLayer(d_model=5, nhead=1)

In [73]:
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)

In [74]:
inputs=embed.transpose(0,1)
print(inputs.shape)

torch.Size([10, 1, 5])


In [75]:
#paddingのstoiを指定
mask_pad=0
#bool値でpadding_maskを作成
key_padding_mask=(sample_sentence==0).bool()
print(key_padding_mask)

tensor([[False, False, False, False, False, False,  True,  True,  True,  True]])


In [76]:
out=transformer_encoder(inputs,
                        mask=src_mask(10),
                        src_key_padding_mask=key_padding_mask)
print(out.shape)

torch.Size([10, 1, 5])


### 3. Transformerを用いたEncoder and Decoder

https://pytorch.org/tutorials/beginner/transformer_tutorial.html


#### なぜ math.sqrt（）でかけてあるのか。

回答1\
https://stackoverflow.com/questions/56930821/why-does-embedding-vector-multiplied-by-a-constant-in-transformer-model


回答2\
https://datascience.stackexchange.com/questions/87906/transformer-model-why-are-word-embeddings-scaled-before-adding-positional-encod

The reason we increase the embedding values before the addition is to make the positional encoding relatively smaller. This means the original meaning in the embedding vector won’t be lost when we add them together.

### 3.1 maskなし

In [80]:
class Transformer(nn.Module):
    
    def __init__(self,vocab_size,embed_dim,num_token):
        super().__init__()
        
        #embedding:
        self.embedding=nn.Embedding(vocab_size,embed_dim)
        self.embed_dim=embed_dim
        
        #encoder
        encoder_layer=nn.TransformerEncoderLayer(d_model=embed_dim,
                                                        nhead=1)

        self.transformer_encoder=nn.TransformerEncoder(encoder_layer,
                                                         num_layers=6)
        
        #decoder
        self.decoder=nn.Linear(embed_dim,num_token)
        
    def forward(self,x):
        #positional Encoderを使用していない場合も*math.sqrt()が必要か不明
        out=self.embedding(x)*math.sqrt(self.embed_dim)
        out=out.transpose(0,1)
        
        out=self.transformer_encoder(out)
        
        out=self.decoder(out)
        
        return out

In [81]:
sample_sentence=torch.LongTensor([1,2,3,5,4,3,0,0,0,0]).unsqueeze(0)

t=Transformer(vocab_size=10,embed_dim=5,num_token=41)
out=t(sample_sentence)
out.shape

torch.Size([10, 1, 41])

### 3.2 maskあり

In [92]:
class Transformer_(nn.Module):
    
    def __init__(self,vocab_size,embed_dim,num_token):
        super().__init__()
        
        #embedding:
        self.embedding=nn.Embedding(vocab_size,embed_dim)
        self.embed_dim=embed_dim
        self.vocab_size=vocab_size
        
        #encoder
        encoder_layer=nn.TransformerEncoderLayer(d_model=embed_dim,
                                                        nhead=1)
        
        self.transformer_encoder=nn.TransformerEncoder(encoder_layer,
                                                         num_layers=6)

        #decoder
        self.decoder=nn.Linear(embed_dim,num_token)
        
    #mask時に使用する。maskの設定は同様
    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
    
    def forward(self,x,mask):
        #positional Encoderを使用していない場合も*math.sqrt()が必要か不明
        out=self.embedding(x)*math.sqrt(self.embed_dim)
        
        out=out.transpose(0,1)
        
        if mask is not None:
            
            key_padding_mask = (x == 0).bool()
            out=self.transformer_encoder(out,
                                        mask=mask,
                                        src_key_padding_mask=key_padding_mask
                                        )
        #decoder
        out=self.decoder(out)
        
        return out

In [93]:
model=Transformer_(vocab_size=10,embed_dim=5,num_token=10)

In [95]:
#sampleのstoiされた文章データ
sample_sentence=torch.LongTensor([1,2,3,5,4,3,0,0,0,0]).unsqueeze(0)
print(sample_sentence.shape)

#maskを作成
src_mask=model.generate_square_subsequent_mask(10)

torch.Size([1, 10])


In [96]:
out=model(sample_sentence,src_mask)
print(out.shape)

torch.Size([10, 1, 10])


### 3.3 loss出力

In [105]:
#reshape Data
out=out.view(-1,10)
out.shape

torch.Size([10, 10])

In [104]:
criterion = nn.CrossEntropyLoss()

loss=criterion(out.unsqueeze(0),sample_sentence)
print(loss)

tensor(2.2568, grad_fn=<NllLoss2DBackward>)
