<a href="https://colab.research.google.com/github/tanakakao/test/blob/main/transformer_torch2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optimizers

import math

In [13]:
class AddPositionalEncoding(nn.Module):
    '''
    入力テンソルに対し、位置の情報を付与して返すレイヤー
    see: https://arxiv.org/pdf/1706.03762.pdf

    PE_{pos, 2i}   = sin(pos / 10000^{2i / d_model})
    PE_{pos, 2i+1} = cos(pos / 10000^{2i / d_model})
    '''
    def forward(self, inputs):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        fl_type = inputs.dtype
        batch_size, max_length, depth = inputs.shape
        
        depth_counter = torch.div(torch.arange(depth) ,2, rounding_mode='trunc')*2
 
        depth_matrix = torch.tile(torch.unsqueeze(depth_counter, 0), [max_length, 1])  # [max_length, depth]
        depth_matrix = torch.pow(10000.0, depth_matrix / depth)  # [max_length, depth]
        # cos(x) == sin(x + π/2)
        phase = torch.remainder(torch.arange(depth), 2) * math.pi / 2
        phase_matrix = torch.tile(torch.unsqueeze(phase, 0), [max_length, 1])  # [max_length, depth]

        pos_counter = torch.arange(max_length)
        pos_matrix = (torch.tile(torch.unsqueeze(pos_counter, 1), [1, depth]))  # [max_length, depth]

        positional_encoding = torch.sin(pos_matrix / depth_matrix + phase_matrix)
        # [batch_size, max_length, depth]
        positional_encoding = torch.tile(torch.unsqueeze(positional_encoding, 0), [batch_size, 1, 1])
        positional_encoding = positional_encoding.to(device)

        return inputs + positional_encoding

In [3]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim, pretrained_weight=None):
        # vocab_size: 単語の総数
        # embedding_dim: Embeddingの次数
        super().__init__()
        self.embedding_dim = embedding_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=1)
        self.embedding.weight.requires_grad = True
        
        if pretrained_weight is not None:
            self.embedding.weight.data.copy_(pretrained_weight)

    def forward(self, x):
        # inputのIDに対応したベクトルを持ってくる
        embedding = self.embedding(x)
        
        return embedding * (self.embedding_dim ** 0.5)

In [14]:
class Encoder(nn.Module):
    '''
    TransformerのEncoder
    '''
    def __init__(
            self,
            vocab_size, # 単語の総数
            hopping_num, # Multi-head Attentionの繰り返し数
            heads_num, # Multi-head Attentionのヘッド数
            hidden_dim, # Embeddingの次数
            token_num, # 系列長(文章中のトークン数)
            drop_rate, # ドロップアウトの確率
            pretrained_weight=None
    ):
        super().__init__()
        self.hopping_num = hopping_num
        
        # Embedding層
        self.token_embedding = TokenEmbedding(vocab_size, hidden_dim, pretrained_weight)
        # Position Embedding
        self.add_position_embedding = AddPositionalEncoding()
        self.input_dropout_layer = nn.Dropout(drop_rate)

        # Multi-head Attentionの繰り返し(hopping)のリスト
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=heads_num, dropout=drop_rate, batch_first=True)
        self.attention_block = nn.TransformerEncoder(encoder_layer, num_layers=hopping_num)
        
        self.output_normalization = nn.LayerNorm(hidden_dim)

    def forward(
            self,
            input,
            memory=None,
            attention_mask=None
    ):
        '''
        input: 入力 [batch_size, length]
        memory: 入力 [batch_size, length]
        attention_mask: attention weight に適用される mask
            [batch_size, 1, q_length, k_length] 
            pad 等無視する部分が 0 となるようなもの(Decoderで使用)
        出力 [batch_size, length, hidden_dim]
        '''
        # [batch_size, token_num] -> [batch_size, token_num, hidden_dim]
        embedded_input = self.token_embedding(input)
        # Positional Embedding
        embedded_input = self.add_position_embedding(embedded_input)
        query = self.input_dropout_layer(embedded_input)
        
        query = self.attention_block(query, src_key_padding_mask=attention_mask)

        # [batch_size, token_num, hidden_dim]
        return self.output_normalization(query)

In [15]:
class AttentionClassifier(nn.Module):
    def __init__(
            self,
            vocab_size, # 単語の総数
            hopping_num, # Multi-head Attentionの繰り返し数
            heads_num, # Multi-head Attentionのヘッド数
            hidden_dim, # Embeddingの次数
            token_num, # 系列長(文章中のトークン数)
            drop_rate, # ドロップアウトの確率
            NUMLABELS, # クラス数
            pretrained_weight=None,
            PAD_ID = 1
    ):
        super().__init__()
        self.PAD_ID = PAD_ID
        
        self.encoder = Encoder(vocab_size, hopping_num, heads_num, hidden_dim, token_num, drop_rate, pretrained_weight)
        self.dense1 = nn.Linear(hidden_dim, hidden_dim)
        self.act1 = nn.Tanh()
        self.dropout1 = nn.Dropout(drop_rate)   
        self.final_layer = nn.Linear(hidden_dim, NUMLABELS)
        
        nn.init.normal_(self.dense1.weight, std=0.02)
        nn.init.normal_(self.dense1.bias, std=0)
        nn.init.normal_(self.final_layer.weight, std=0.02)
        nn.init.normal_(self.final_layer.bias, std=0)

    def forward(self, x):
        self_attention_mask=self._create_enc_attention_mask(x)
        
        # [batch_size, token_num] -> [batch_size, token_num, hidden_dim]
        enc_output = self.encoder(x, self_attention_mask)
        
        # 文頭の重みを使用 [batch_size, 0, hidden_dim]
        # [batch_size, hidden_dim] -> [batch_size, hidden_dim]
        enc_output = self.dense1(enc_output[:, 0, :])
        enc_output = self.act1(enc_output)
        enc_output = self.dropout1(enc_output)
        
        # [batch_size, hidden_dim] -> [batch_size, NUMLABELS]
        final_output = self.final_layer(enc_output)

        return final_output
    
    def _create_enc_attention_mask(self, x):
        batch_size, length = x.shape
        # マスクする部分を1とする
        pad_array = torch.eq(x, self.PAD_ID).to(dtype=torch.int8)  # [batch_size, token_num]
        
        # shape broadcasting で [batch_size, head_num, token_num, token_num] になる
        return pad_array.view([batch_size, length])

In [6]:
!pip install janome
import re
from janome.tokenizer import Tokenizer
j_t = Tokenizer(wakati=True)

def tokenizer_janome(text):
    return [tok for tok in j_t.tokenize(text, wakati=True)]

def preprocessing_text(text):
    text = re.sub('\r', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('　', '', text)
    text = re.sub(' ', '', text)
    
    text = re.sub(r'[0-9 ０-９]', '0', text)
    return text

def tokenizer_with_preprocessing(text):
    text = preprocessing_text(text)
    ret = tokenizer_janome(text)
    return ret

Collecting janome
  Downloading Janome-0.4.2-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 1.2 MB/s 
[?25hInstalling collected packages: janome
Successfully installed janome-0.4.2


In [7]:
import torchtext
from torchtext import data, datasets
from torchtext.legacy import data

max_length = 64
TEXT = data.Field(sequential=True, tokenize=tokenizer_with_preprocessing,
                  use_vocab=True, lower=True, include_lengths=True,
                  batch_first=True, fix_length=max_length,init_token="<eos>",eos_token="<cls>")
LABEL = data.Field(sequential=False, use_vocab=False, preprocessing=None)

dataset = data.TabularDataset(
        path='reviews.csv', format='csv',
        skip_header=True,
        fields=[('Text', TEXT), ('Label', LABEL), ('Label2', LABEL)])

train_dataset, test_dataset = dataset.split(split_ratio=0.7)
train_dataset, val_dataset = train_dataset.split(split_ratio=0.7)

train_iter = data.Iterator(
    train_dataset, batch_size=32, 
    train=True  # train=Trueならシャッフルソートは有効
)
val_iter = data.Iterator(
    val_dataset, batch_size=32, 
    train=False, sort=False
)
test_iter = data.Iterator(
    test_dataset, batch_size=32, 
    train=False, sort=False
)

#train_iter, val_iter, test_iter = data.BucketIterator.splits((train_dataset, val_dataset, test_dataset), batch_size=32, repeat=False, shuffle=True)

In [16]:
TEXT.build_vocab(train_dataset)
vocab = TEXT.vocab
len(vocab)

batch = next(iter(train_iter))

net = AttentionClassifier(
            vocab_size = len(vocab), # 単語の総数
            hopping_num = 8, # Multi-head Attentionの繰り返し数
            heads_num = 6, # Multi-head Attentionのヘッド数
            hidden_dim = 300, # Embeddingの次数
            drop_rate = 0.1, # ドロップアウトの確率
            token_num = 64,
            
    pretrained_weight=None,
    NUMLABELS=2
    )

def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        nn.init.kaiming_normal_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.0)
net.train()

net.apply(weights_init)

AttentionClassifier(
  (encoder): Encoder(
    (token_embedding): TokenEmbedding(
      (embedding): Embedding(10495, 300, padding_idx=1)
    )
    (add_position_embedding): AddPositionalEncoding()
    (input_dropout_layer): Dropout(p=0.1, inplace=False)
    (attention_block): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
          )
          (linear1): Linear(in_features=300, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=300, bias=True)
          (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): Trans

In [19]:
criterion = nn.CrossEntropyLoss()

learning_rate = 2e-5
optimizer = optimizers.Adam(net.parameters(), lr=learning_rate, amsgrad=True, eps=1e-07)

In [10]:
def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("使用デバイス:", device)
    print('--------start--------')
    net.to(device)
    
    torch.backends.cudnn.benchmark = True
    
    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()
            else:
                net.eval()
            
            epoch_loss = 0.0
            epoch_corrects = 0
            
            for batch in (dataloaders_dict[phase]):
                inputs = batch.Text[0].to(device)
                labels = batch.Label2.to(device)
                
                optimizer.zero_grad()
                
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)
                    
                    _, preds = torch.max(outputs, 1)
                    
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                    
                    epoch_loss += loss.item() * inputs.size(0)
                    epoch_corrects += torch.sum(preds == labels.data)
            
            epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
            epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)
            
            print('Epoch {}/{} | {:.^5} | Loss: {:.4f} Acc: {:.4f}'.format(epoch+1,
                                                                           num_epochs,
                                                                           phase,
                                                                           epoch_loss,
                                                                           epoch_acc))
        
    return net

In [20]:
dataloaders_dict = {"train":train_iter, "val":val_iter}

In [21]:
num_epochs = 20

net_trained = train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)

使用デバイス: cuda:0
--------start--------
Epoch 1/20 | train | Loss: 0.5990 Acc: 0.7360
Epoch 1/20 | .val. | Loss: 0.5695 Acc: 0.7467
Epoch 2/20 | train | Loss: 0.6028 Acc: 0.7307
Epoch 2/20 | .val. | Loss: 0.5654 Acc: 0.7467
Epoch 3/20 | train | Loss: 0.5920 Acc: 0.7319
Epoch 3/20 | .val. | Loss: 0.5258 Acc: 0.7467
Epoch 4/20 | train | Loss: 0.5546 Acc: 0.7235
Epoch 4/20 | .val. | Loss: 0.4851 Acc: 0.7702
Epoch 5/20 | train | Loss: 0.5071 Acc: 0.7646
Epoch 5/20 | .val. | Loss: 0.5145 Acc: 0.7411
Epoch 6/20 | train | Loss: 0.4628 Acc: 0.7957
Epoch 6/20 | .val. | Loss: 0.4772 Acc: 0.8107
Epoch 7/20 | train | Loss: 0.4444 Acc: 0.7994
Epoch 7/20 | .val. | Loss: 0.4284 Acc: 0.8126
Epoch 8/20 | train | Loss: 0.4025 Acc: 0.8256
Epoch 8/20 | .val. | Loss: 0.5038 Acc: 0.8013
Epoch 9/20 | train | Loss: 0.3900 Acc: 0.8321
Epoch 9/20 | .val. | Loss: 0.4560 Acc: 0.8277
Epoch 10/20 | train | Loss: 0.3778 Acc: 0.8450
Epoch 10/20 | .val. | Loss: 0.4860 Acc: 0.7957
Epoch 11/20 | train | Loss: 0.3711 Acc: 0