<a href="https://colab.research.google.com/github/tanakakao/test/blob/main/transformer_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optimizers

import math

In [2]:
class MultiHeadAttention(nn.Module):
    '''
    Multi-head attention layer.

    Args:
        token_num: Sequence length. Not used by this Linear-based
            implementation; accepted so existing callers keep working.
        hidden_dim: Length of each embedded token vector. Must be divisible
            by ``heads_num``.
        heads_num: Number of attention heads.
        drop_rate: Dropout probability applied to the projected output.

    Raises:
        ValueError: If ``heads_num`` is not positive or ``hidden_dim`` is not
            divisible by ``heads_num``.

    Example:
        model = MultiHeadAttention(
            token_num = 64,
            hidden_dim = 512,
            heads_num = 8,
            drop_rate = 0.5
        )
    '''
    def __init__(self, token_num, hidden_dim, heads_num, drop_rate=0.5):
        super(MultiHeadAttention, self).__init__()
        # Fail early with a clear message instead of a cryptic view() error
        # on the first forward pass.
        if heads_num <= 0 or hidden_dim % heads_num != 0:
            raise ValueError(
                "hidden_dim ({}) must be divisible by a positive heads_num ({})".format(
                    hidden_dim, heads_num))

        # Linear maps producing query / key / value; each weight is
        # [hidden_dim, hidden_dim].
        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key   = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, hidden_dim)

        # Output projection applied after the heads are merged.
        self.projection = nn.Linear(hidden_dim, hidden_dim)

        # Dropout on the projected output.
        self.drop = nn.Dropout(drop_rate)

        self.nf = hidden_dim
        self.nh = heads_num

    def atten(self, query, key, value, attention_mask):
        """
        Scaled dot-product attention over already-split heads.

        Args:
            query: [batch_size, heads_num, q_length, hidden_dim // heads_num]
            key, value: [batch_size, heads_num, k_length, hidden_dim // heads_num]
            attention_mask: Optional tensor broadcastable to
                [batch_size, heads_num, q_length, k_length]; positions equal
                to 1 are excluded from attention.

        Returns:
            (context, atten_weight) where context is
            [batch_size, q_length, hidden_dim] (projected, with dropout) and
            atten_weight is [batch_size, heads_num, q_length, k_length].
        """
        batch_size, heads_num, token_num, head_dim = query.shape
        hidden_dim = heads_num * head_dim

        # Relevance of every query token to every key token:
        # [q_length, head_dim] @ [head_dim, k_length] -> [q_length, k_length]
        scores = torch.matmul(query, key.transpose(-2, -1))

        # NOTE(review): the scores are divided by sqrt(hidden_dim), whereas
        # "Attention Is All You Need" scales by sqrt(d_k) (the per-head
        # dimension). Left unchanged so existing trained weights behave
        # identically - confirm whether this is intentional.
        scores = scores / math.sqrt(hidden_dim)

        # Masked positions (mask == 1) get a large negative score so that
        # softmax assigns them ~0 weight. Used to ignore PAD tokens and,
        # in a decoder, to hide future positions.
        if attention_mask is not None:
            scores = scores.masked_fill(attention_mask == 1, -1e9)

        # Normalise per query token: how much to read from each key token.
        atten_weight = F.softmax(scores, dim = -1)

        # Weighted sum of the value vectors:
        # [q_length, k_length] @ [k_length, head_dim] -> [q_length, head_dim]
        context = torch.matmul(atten_weight, value)

        # Merge the heads back into a single hidden_dim axis.
        context = context.transpose(1, 2).contiguous()
        context = context.view(batch_size, token_num, hidden_dim)

        context = self.projection(context)

        return self.drop(context), atten_weight

    def _split(self, x):
        """
        Split the hidden axis into heads.

        Input shape : [batch_size, length, hidden_dim]
        Output shape: [batch_size, heads_num, length, hidden_dim // heads_num]
        """
        batch_size, token_num = x.shape[0], x.shape[1]
        head_dim = self.nf // self.nh

        # This is a reshape (adds a head axis), not a copy of the data.
        x = x.view(batch_size, token_num, self.nh, head_dim)

        # Move the head axis in front of the token axis.
        return x.transpose(1, 2)

    def forward(self, x, memory=None, attention_mask=None, return_attention_scores=False):
        """
        Run multi-head attention.

        Args:
            x: Query source, [batch_size, q_length, hidden_dim].
            memory: Key/value source, [batch_size, k_length, hidden_dim].
                When None, ``x`` is used (self-attention).
            attention_mask: Optional mask broadcastable to
                [batch_size, heads_num, q_length, k_length]; 1 marks positions
                to ignore (e.g. PAD).
            return_attention_scores: When True, also return the attention
                weights.

        Returns:
            context [batch_size, q_length, hidden_dim], or
            (context, atten_weight) when ``return_attention_scores`` is True.
        """
        # Self-attention when no separate memory is supplied.
        if memory is None:
            memory = x

        # x -> query, memory -> key / value; shapes are preserved.
        query = self.query(x)
        key = self.key(memory)
        value = self.value(memory)

        # [batch_size, length, hidden_dim] -> [batch_size, heads_num, length, head_dim]
        query = self._split(query)
        key = self._split(key)
        value = self._split(value)

        context, atten_weight = self.atten(query, key, value, attention_mask)

        if return_attention_scores:
            return context, atten_weight
        else:
            return context

In [3]:
class FeedForwardNetwork(nn.Module):
    '''
    Position-wise feed-forward sub-layer used inside a transformer block.

    Each token vector is widened to 4 * hidden_dim, passed through ReLU and
    dropout, then projected back to hidden_dim.
    '''
    def __init__(self, hidden_dim, drop_rate=0.1):
        super().__init__()
        expanded_dim = hidden_dim * 4

        # Expansion layer followed by its activation.
        self.filter_dense_layer = nn.Linear(hidden_dim, expanded_dim)
        self.relu1 = nn.ReLU()

        # Contraction layer restoring the original width.
        self.output_dense_layer = nn.Linear(expanded_dim, hidden_dim)
        self.drop = nn.Dropout(drop_rate)

    def forward(self, x):
        '''
        Shape is preserved: [batch_size, token_num, hidden_dim] in and out.
        '''
        # [batch_size, token_num, hidden_dim] -> [batch_size, token_num, 4*hidden_dim]
        widened = self.filter_dense_layer(x)
        activated = self.drop(self.relu1(widened))

        # [batch_size, token_num, 4*hidden_dim] -> [batch_size, token_num, hidden_dim]
        return self.output_dense_layer(activated)

In [4]:
class ResidualNormalizationWrapper(nn.Module):
    '''
    Pre-norm residual wrapper around a sub-layer.

    output = input + Dropout(SubLayer(LayerNorm(input)))
    '''
    def __init__(self, hidden_dim, layer, drop_rate=0.1):
        super().__init__()
        self.layer = layer  # wrapped sub-layer (attention or FFN in this file)
        self.layer_normalization = nn.LayerNorm(hidden_dim)
        self.drop = nn.Dropout(drop_rate)

    def forward(self, x, memory=None, attention_mask=None, return_attention_scores=False):
        """
        Apply the wrapped sub-layer with a residual connection.

        Optional arguments are forwarded to the sub-layer only when they are
        set, so sub-layers that do not accept them (e.g. the FFN) still work.
        Returns ``x + out`` or, when ``return_attention_scores`` is truthy,
        ``(x + out, attn_weights)``.
        """
        optional_args = (('memory', memory), ('attention_mask', attention_mask))
        extra = {key: value for key, value in optional_args if value is not None}
        if return_attention_scores:
            extra['return_attention_scores'] = return_attention_scores

        result = self.layer(self.layer_normalization(x), **extra)

        if not return_attention_scores:
            return x + self.drop(result)

        sublayer_out, attn_weights = result
        return x + self.drop(sublayer_out), attn_weights

In [18]:
class AddPositionalEncoding(nn.Module):
    '''
    Layer that adds sinusoidal position information to its input.
    see: https://arxiv.org/pdf/1706.03762.pdf

    PE_{pos, 2i}   = sin(pos / 10000^{2i / d_model})
    PE_{pos, 2i+1} = cos(pos / 10000^{2i / d_model})
    '''
    def forward(self, inputs):
        """
        Args:
            inputs: [batch_size, max_length, depth] tensor.

        Returns:
            ``inputs`` plus the positional encoding, same shape. The encoding
            is built on ``inputs.device`` and cast to ``inputs.dtype`` so the
            layer works wherever the model lives (previously the device was
            chosen from ``torch.cuda.is_available()``, which broke CPU models
            on machines that have a GPU).
        """
        _, max_length, depth = inputs.shape
        device = inputs.device

        channel = torch.arange(depth, device=device)

        # 2i for both channel 2i and channel 2i+1.
        even_channel = torch.div(channel, 2, rounding_mode='trunc') * 2
        wavelength = torch.pow(10000.0, even_channel / depth)  # [depth]

        # cos(x) == sin(x + pi/2): odd channels get a quarter-period shift.
        phase = torch.remainder(channel, 2) * math.pi / 2  # [depth]

        position = torch.arange(max_length, device=device).unsqueeze(1)  # [max_length, 1]

        # Broadcasts to [max_length, depth]; the batch axis is handled by
        # broadcasting in the final addition, so no tiling is needed.
        positional_encoding = torch.sin(position / wavelength + phase)

        return inputs + positional_encoding.to(dtype=inputs.dtype)

In [19]:
class TokenEmbedding(nn.Module):
    """
    Token-id to vector lookup, scaled by sqrt(embedding_dim).

    Args:
        vocab_size: Total number of tokens in the vocabulary.
        embedding_dim: Size of each embedding vector.
        pretrained_weight: Optional [vocab_size, embedding_dim] values copied
            into the embedding table.
        padding_idx: Id of the padding token; its row does not receive
            gradient updates. Defaults to 1, the value that was previously
            hard-coded.
    """
    def __init__(self, vocab_size, embedding_dim, pretrained_weight=None, padding_idx=1):
        super().__init__()
        self.embedding_dim = embedding_dim

        # The weight is trainable by default (requires_grad=True).
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)

        if pretrained_weight is not None:
            # In-place copy outside autograd; as_tensor also accepts arrays
            # and nested lists.
            with torch.no_grad():
                self.embedding.weight.copy_(torch.as_tensor(pretrained_weight))

    def forward(self, x):
        """Look up the vectors for the ids in ``x`` and scale them by sqrt(embedding_dim)."""
        embedding = self.embedding(x)

        return embedding * (self.embedding_dim ** 0.5)

In [20]:
class TransformerBlock(nn.Module):
    """
    One transformer block: before -> [attention -> feed-forward] -> next.
    Both sub-layers are wrapped with layer normalisation and a residual
    connection.
    """
    def __init__(self, token_num, hidden_dim, heads_num, drop_rate=0.1):
        """
        hidden_dim must be divisible by heads_num.
        """
        super().__init__()
        attention_layer = MultiHeadAttention(
            token_num=token_num,
            hidden_dim=hidden_dim,
            heads_num=heads_num,
            drop_rate=drop_rate,
        )
        self.atten = ResidualNormalizationWrapper(
            hidden_dim=hidden_dim, layer=attention_layer, drop_rate=drop_rate)

        feed_forward_layer = FeedForwardNetwork(hidden_dim=hidden_dim, drop_rate=drop_rate)
        self.ffn = ResidualNormalizationWrapper(
            hidden_dim=hidden_dim, layer=feed_forward_layer, drop_rate=drop_rate)

    def forward(self, input, memory=None, attention_mask=None, return_attention_scores=False):
        """
        Shape is preserved: [batch_size, token_num, hidden_dim] in and out.
        When return_attention_scores is truthy the attention weights of this
        block are returned as a second value.
        """
        attended = self.atten(input, memory, attention_mask, return_attention_scores)

        if not return_attention_scores:
            return self.ffn(attended)

        hidden, attn_weights = attended
        return self.ffn(hidden), attn_weights

In [21]:
class Encoder(nn.Module):
    '''
    Transformer encoder: token embedding + positional encoding, followed by
    a stack of transformer blocks and a final layer normalisation.
    '''
    def __init__(
            self,
            vocab_size, # total number of tokens in the vocabulary
            hopping_num, # number of stacked transformer blocks
            heads_num, # number of attention heads per block
            hidden_dim, # embedding size
            token_num, # sequence length (tokens per text)
            drop_rate, # dropout probability
            pretrained_weight=None
    ):
        super().__init__()
        self.hopping_num = hopping_num

        # Token embedding
        self.token_embedding = TokenEmbedding(vocab_size, hidden_dim, pretrained_weight)
        # Positional encoding
        self.add_position_embedding = AddPositionalEncoding()
        self.input_dropout_layer = nn.Dropout(drop_rate)

        # Stack of transformer blocks. drop_rate is forwarded explicitly;
        # previously the blocks silently fell back to their default of 0.1
        # regardless of the value given to the encoder.
        self.attention_block_list = nn.ModuleList([
            TransformerBlock(token_num, hidden_dim, heads_num, drop_rate)
            for _ in range(hopping_num)
        ])
        self.output_normalization = nn.LayerNorm(hidden_dim)

    def forward(
            self,
            input,
            memory=None,
            attention_mask=None,
            return_attention_scores=False
    ):
        '''
        input: token ids [batch_size, length]
        memory: optional key/value source handed unchanged to every block
            (presumably [batch_size, length, hidden_dim], since it is fed
            straight into the attention projections - confirm with callers)
        attention_mask: mask applied to the attention weights,
            broadcastable to [batch_size, heads_num, q_length, k_length];
            positions to ignore (e.g. PAD) are 1, matching
            MultiHeadAttention.atten
        Returns [batch_size, length, hidden_dim]; when
        return_attention_scores is truthy, returns a tuple whose second item
        is the attention weights of the last block (None if there are no
        blocks).
        '''
        # [batch_size, token_num] -> [batch_size, token_num, hidden_dim]
        embedded_input = self.token_embedding(input)
        embedded_input = self.add_position_embedding(embedded_input)
        query = self.input_dropout_layer(embedded_input)

        # Defined up front so an encoder with zero blocks does not raise
        # UnboundLocalError when attention scores are requested.
        atten_weights = None

        # Apply the blocks in order; only the last block's weights are kept.
        for block in self.attention_block_list:
            result = block(query, memory, attention_mask, return_attention_scores)
            if return_attention_scores:
                query, atten_weights = result
            else:
                query = result

        # [batch_size, token_num, hidden_dim]
        output = self.output_normalization(query)
        if return_attention_scores:
            return output, atten_weights
        return output

In [22]:
class AttentionClassifier(nn.Module):
    """
    Text classifier: transformer encoder followed by a small head that reads
    the encoder output at the first token position.
    """
    def __init__(
            self,
            vocab_size, # total number of tokens in the vocabulary
            hopping_num, # number of stacked transformer blocks
            heads_num, # number of attention heads per block
            hidden_dim, # embedding size
            token_num, # sequence length (tokens per text)
            drop_rate, # dropout probability
            NUMLABELS, # number of classes
            pretrained_weight=None,
            PAD_ID = 1
    ):
        super().__init__()
        self.PAD_ID = PAD_ID

        self.encoder = Encoder(vocab_size, hopping_num, heads_num, hidden_dim, token_num, drop_rate, pretrained_weight)
        self.dense1 = nn.Linear(hidden_dim, hidden_dim)
        self.act1 = nn.Tanh()
        self.dropout1 = nn.Dropout(drop_rate)
        self.final_layer = nn.Linear(hidden_dim, NUMLABELS)

        # Head initialisation: small normal weights; std=0 yields zero biases.
        for head_layer in (self.dense1, self.final_layer):
            nn.init.normal_(head_layer.weight, std=0.02)
            nn.init.normal_(head_layer.bias, std=0)

    def forward(self, x, return_attention_scores=False):
        """
        x: token ids [batch_size, token_num].
        Returns logits [batch_size, NUMLABELS], plus the encoder's attention
        weights when return_attention_scores is truthy.
        """
        pad_mask = self._create_enc_attention_mask(x)

        # [batch_size, token_num] -> [batch_size, token_num, hidden_dim]
        encoded = self.encoder(
            x, attention_mask=pad_mask, return_attention_scores=return_attention_scores)
        if return_attention_scores:
            encoded, atten_weights = encoded

        # Use the vector at the first token position: [batch_size, hidden_dim]
        first_token = encoded[:, 0, :]
        pooled = self.dropout1(self.act1(self.dense1(first_token)))

        # [batch_size, hidden_dim] -> [batch_size, NUMLABELS]
        logits = self.final_layer(pooled)

        if return_attention_scores:
            return logits, atten_weights
        return logits

    def _create_enc_attention_mask(self, x):
        """
        Build the padding mask for self-attention: 1 where ``x`` equals
        PAD_ID, 0 elsewhere, shaped [batch_size, 1, 1, length] so that it
        broadcasts over heads and query positions.
        """
        batch_size, length = x.shape
        is_pad = (x == self.PAD_ID).to(dtype=torch.int8)  # [batch_size, token_num]
        return is_pad.view(batch_size, 1, 1, length)

In [10]:
!pip install janome
import re
from janome.tokenizer import Tokenizer
# Module-level Janome tokenizer created with wakati=True; shared by tokenizer_janome below.
j_t = Tokenizer(wakati=True)

def tokenizer_janome(text):
    """Tokenize ``text`` with the shared Janome tokenizer and return the tokens as a list."""
    tokens = j_t.tokenize(text, wakati=True)
    return list(tokens)

def preprocessing_text(text):
    """
    Normalise raw text before tokenization.

    Removes carriage returns, newlines, full-width spaces and ASCII spaces,
    then replaces every half-width or full-width digit with '0'.
    """
    # Strip line breaks and both kinds of space, one pattern at a time.
    for unwanted in ('\r', '\n', '　', ' '):
        text = re.sub(unwanted, '', text)

    # Collapse all digits to a single symbol so numbers share one token form.
    return re.sub(r'[0-9 ０-９]', '0', text)

def tokenizer_with_preprocessing(text):
    """Clean ``text`` with preprocessing_text, then tokenize it with tokenizer_janome."""
    return tokenizer_janome(preprocessing_text(text))

Collecting janome
  Downloading Janome-0.4.2-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 1.2 MB/s 
[?25hInstalling collected packages: janome
Successfully installed janome-0.4.2


In [27]:
import torchtext
#from torchtext import data, datasets
from torchtext.legacy import data

# Every text is padded/truncated to this many tokens (passed as fix_length).
max_length = 64
# Text field: tokenized by tokenizer_with_preprocessing, batch-first, and
# returned together with sequence lengths (include_lengths=True).
# NOTE(review): init_token is "<eos>" and eos_token is "<cls>", which looks
# swapped. mk_html stops at "<cls>", i.e. at the token configured here as
# eos_token, so the two must be changed together if this is renamed.
TEXT = data.Field(sequential=True, tokenize=tokenizer_with_preprocessing,
                  use_vocab=True, lower=True, include_lengths=True,
                  batch_first=True, fix_length=max_length,init_token="<eos>",eos_token="<cls>")
# Label field: used as-is (no vocabulary, no preprocessing).
LABEL = data.Field(sequential=False, use_vocab=False, preprocessing=None)

# Load reviews.csv (header row skipped); its three columns are mapped, in
# order, to Text, Label and Label2.
dataset = data.TabularDataset(
        path='reviews.csv', format='csv',
        skip_header=True,
        fields=[('Text', TEXT), ('Label', LABEL), ('Label2', LABEL)])

# First split off the test set (ratio 0.7), then split the remainder again
# (ratio 0.7) into train and validation sets.
train_dataset, test_dataset = dataset.split(split_ratio=0.7)
train_dataset, val_dataset = train_dataset.split(split_ratio=0.7)



#train_iter, val_iter, test_iter = data.BucketIterator.splits((train_dataset, val_dataset, test_dataset), batch_size=32, repeat=False, shuffle=True)

AttributeError: ignored

In [28]:
# Batch iterators (batch size 32) over the three splits.
train_iter = data.Iterator(
    train_dataset, batch_size=32, 
    train=True  # with train=True, shuffling/sorting is enabled
)
# Validation and test iterators: train=False and sort=False.
val_iter = data.Iterator(
    val_dataset, batch_size=32, 
    train=False, sort=False
)
test_iter = data.Iterator(
    test_dataset, batch_size=32, 
    train=False, sort=False
)

In [29]:
# Build the vocabulary from the training split only.
TEXT.build_vocab(train_dataset)
vocab = TEXT.vocab
len(vocab)

# Pull one training batch (not used further in this cell).
batch = next(iter(train_iter))

# Classifier configuration; hidden_dim (300) is divisible by heads_num (6),
# and token_num matches max_length (64) used for the TEXT field.
net = AttentionClassifier(
            vocab_size = len(vocab), # total number of tokens
            hopping_num = 8, # number of stacked transformer blocks
            heads_num = 6, # number of attention heads
            hidden_dim = 300, # embedding size
            drop_rate = 0.1, # dropout probability
            token_num = 64,
            
    pretrained_weight=None,
    NUMLABELS=2
    )

def weights_init(m):
    """
    Initialiser for ``Module.apply``: modules whose class name contains
    'Linear' get Kaiming-normal weights and, if they have one, a zero bias.
    All other modules are left untouched.
    """
    if 'Linear' not in type(m).__name__:
        return

    nn.init.kaiming_normal_(m.weight)
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.0)
# Put the model in training mode.
net.train()

# Apply weights_init to the model's modules. NOTE(review): this also hits
# the classifier head's Linear layers, so it presumably replaces the
# normal_ initialisation done in AttentionClassifier.__init__ - confirm
# that is intended.
net.apply(weights_init)

AttentionClassifier(
  (encoder): Encoder(
    (token_embedding): TokenEmbedding(
      (embedding): Embedding(10582, 300, padding_idx=1)
    )
    (add_position_embedding): AddPositionalEncoding()
    (input_dropout_layer): Dropout(p=0.1, inplace=False)
    (attention_block_list): ModuleList(
      (0): TransformerBlock(
        (atten): ResidualNormalizationWrapper(
          (layer): MultiHeadAttention(
            (query): Linear(in_features=300, out_features=300, bias=True)
            (key): Linear(in_features=300, out_features=300, bias=True)
            (value): Linear(in_features=300, out_features=300, bias=True)
            (projection): Linear(in_features=300, out_features=300, bias=True)
            (drop): Dropout(p=0.1, inplace=False)
          )
          (layer_normalization): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
          (drop): Dropout(p=0.1, inplace=False)
        )
        (ffn): ResidualNormalizationWrapper(
          (layer): FeedForwardNetwork(


In [30]:
# Classification loss computed from the model's raw logits.
criterion = nn.CrossEntropyLoss()

# Adam (AMSGrad variant) over all model parameters.
learning_rate = 2e-4
optimizer = optimizers.Adam(net.parameters(), lr=learning_rate, amsgrad=True, eps=1e-07)

In [15]:
def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):
    """
    Train ``net`` and evaluate it once per epoch.

    Args:
        net: Model mapping a batch of inputs to class logits.
        dataloaders_dict: Mapping with 'train' and 'val' iterables. Each must
            expose ``.dataset`` (used for its length) and yield batches with
            ``.Text`` (whose first element is the input tensor) and
            ``.Label2`` (the target tensor).
        criterion: Loss taking (outputs, labels).
        optimizer: Optimizer over ``net``'s parameters.
        num_epochs: Number of passes over the data.

    Returns:
        The same ``net`` object, left in eval mode after the final
        validation phase.

    Side effects:
        Moves ``net`` to CUDA when available, enables
        ``torch.backends.cudnn.benchmark`` and prints per-phase metrics.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("使用デバイス:", device)
    print('--------start--------')
    net.to(device)

    torch.backends.cudnn.benchmark = True

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                net.train()
            else:
                net.eval()

            dataloader = dataloaders_dict[phase]
            epoch_loss = 0.0
            # Plain Python int so that a phase without any batch still
            # reports metrics instead of failing on ``int.double()``.
            epoch_corrects = 0

            for batch in dataloader:
                inputs = batch.Text[0].to(device)
                labels = batch.Label2.to(device)

                optimizer.zero_grad()

                # Gradients are tracked only while training.
                with torch.set_grad_enabled(is_train):
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)

                    _, preds = torch.max(outputs, 1)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # The loss is a batch mean, so weight it by batch size to
                # obtain a per-sample epoch average below.
                epoch_loss += loss.item() * inputs.size(0)
                epoch_corrects += int(torch.sum(preds == labels).item())

            # Guard against an empty dataset (avoids ZeroDivisionError).
            dataset_size = max(len(dataloader.dataset), 1)
            epoch_loss = epoch_loss / dataset_size
            epoch_acc = epoch_corrects / dataset_size

            print('Epoch {}/{} | {:.^5} | Loss: {:.4f} Acc: {:.4f}'.format(epoch+1,
                                                                           num_epochs,
                                                                           phase,
                                                                           epoch_loss,
                                                                           epoch_acc))

    return net

In [31]:
# Phase name -> iterator mapping, keyed the way train_model looks it up ('train' / 'val').
dataloaders_dict = {"train":train_iter, "val":val_iter}

In [32]:
# Number of passes over the training data.
num_epochs = 20

# train_model returns the same model object it was given, so net_trained and net refer to one model.
net_trained = train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)

使用デバイス: cuda:0
--------start--------
Epoch 1/20 | train | Loss: 0.5432 Acc: 0.7392
Epoch 1/20 | .val. | Loss: 0.4456 Acc: 0.7806
Epoch 2/20 | train | Loss: 0.4326 Acc: 0.8147
Epoch 2/20 | .val. | Loss: 0.4013 Acc: 0.8277
Epoch 3/20 | train | Loss: 0.3595 Acc: 0.8526
Epoch 3/20 | .val. | Loss: 0.3760 Acc: 0.8352
Epoch 4/20 | train | Loss: 0.3004 Acc: 0.8748
Epoch 4/20 | .val. | Loss: 0.3251 Acc: 0.8653
Epoch 5/20 | train | Loss: 0.2488 Acc: 0.8962
Epoch 5/20 | .val. | Loss: 0.3283 Acc: 0.8814
Epoch 6/20 | train | Loss: 0.2029 Acc: 0.9225
Epoch 6/20 | .val. | Loss: 0.3071 Acc: 0.8804
Epoch 7/20 | train | Loss: 0.1777 Acc: 0.9294
Epoch 7/20 | .val. | Loss: 0.3506 Acc: 0.8804
Epoch 8/20 | train | Loss: 0.1009 Acc: 0.9657
Epoch 8/20 | .val. | Loss: 0.5227 Acc: 0.8625
Epoch 9/20 | train | Loss: 0.0666 Acc: 0.9766
Epoch 9/20 | .val. | Loss: 0.4938 Acc: 0.8766
Epoch 10/20 | train | Loss: 0.0345 Acc: 0.9903
Epoch 10/20 | .val. | Loss: 0.6993 Acc: 0.8550
Epoch 11/20 | train | Loss: 0.0669 Acc: 0

In [73]:
from IPython.display import HTML
from IPython.display import HTML, display

def mk_html(index, batch, preds, normlized_weights, vocab, labels=["Negative", "Positive"]):
    """
    Build an HTML snippet visualising attention for one sample.

    Args:
        index: Position of the sample inside the batch.
        batch: Pair ``(token_ids, label_ids)``; ``token_ids[index]`` is the
            sequence of token ids and ``label_ids[index]`` the true label.
        preds: Predicted label id per sample.
        normlized_weights: Attention weights indexed as
            ``[sample, head, query_position, key_position]``.
        vocab: Vocabulary exposing ``itos`` (id -> token string). It is now
            actually used; previously the global ``TEXT.vocab`` was read and
            this argument was ignored.
        labels: Display names indexed by label id.

    Returns:
        HTML string with the true/predicted labels followed by one coloured
        ``<span>`` per token, up to (not including) the first "<cls>" token.
    """
    sentence = batch[0][index]  # token ids of the text
    label_str = labels[batch[1][index]]
    pred_str = labels[preds[index]]

    parts = ['正解ラベル：{}<br>推論ラベル：{}<br><br>'.format(label_str, pred_str)]

    # Sum the heads' attention from the first query position over all key
    # positions, then min-max scale to [0, 1].
    all_attens = np.sum(normlized_weights[index, :, 0, :], axis=0)
    lowest = all_attens.min()
    spread = all_attens.max() - lowest
    if spread > 0:
        all_attens = (all_attens - lowest) / spread
    else:
        # Uniform (or NaN) attention: 0/0 would give NaN and crash the
        # colour conversion, so render every token without highlight.
        all_attens = np.zeros(all_attens.shape, dtype=float)

    for word_id, attn in zip(sentence, all_attens):
        token = vocab.itos[word_id]
        # "<cls>" is the token configured as eos_token for the TEXT field,
        # so it marks the end of the real text.
        if token == "<cls>":
            break

        parts.append(highlight(token, attn))
    parts.append("<br><br>")

    return "".join(parts)

def highlight(word, attn):
    """
    Return ``word`` wrapped in a ``<span>`` whose background gets a deeper
    red as ``attn`` approaches 1 (0 gives white).

    ``attn`` is clamped to [0, 1] (NaN is treated as 0) and ``word`` is
    HTML-escaped, because tokens come from raw text and may contain
    characters such as '<' or '&'.
    """
    from html import escape  # local import: mk_html uses ``html``-like names for strings

    level = float(attn)
    if level != level:  # NaN check without extra imports
        level = 0.0
    level = min(max(level, 0.0), 1.0)

    fade = int(255 * (1 - level))
    html_color = '#%02X%02X%02X' % (255, fade, fade)
    return '<span style="background-color: {}"> {}</span>'.format(html_color, escape(str(word)))

In [65]:
def make_html(preds, batch, labels=["Negative", "Positive"]):
    """
    Render every sample of ``batch`` with mk_html and return the HTML
    strings as a list.

    ``preds`` holds per-class scores; the predicted label is their argmax
    along axis 1. The attention weights and vocabulary are read from the
    notebook-level names ``atten`` and ``vocab``.
    """
    predicted_labels = np.argmax(preds, axis=1)
    html_output = []
    for idx in range(len(preds)):
        html_output.append(
            mk_html(index=idx,
                    batch=batch,
                    preds=predicted_labels,
                    normlized_weights=atten,
                    vocab=vocab,
                    labels=labels))
    return html_output

In [66]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
batch = next(iter(test_iter))

# Inference only: make sure the model is on the same device as the inputs,
# switch off dropout (eval mode) and skip gradient tracking. Without eval()
# the attention visualised below would include random dropout noise whenever
# the model had been left in training mode.
net.to(device)
net.eval()
with torch.no_grad():
    preds, atten = net(batch.Text[0].to(device), return_attention_scores=True)

# Convert to NumPy arrays on the CPU for the HTML rendering helpers.
atten = atten.to('cpu').detach().numpy()
preds = preds.to('cpu').detach().numpy()

In [74]:
import numpy as np

# Render the whole test batch; token ids and Label2 values are converted to NumPy arrays first.
html_results = make_html(preds, batch=(np.array(batch.Text[0]), np.array(batch.Label2)), labels=["Negative", "Positive"])

In [75]:
# Display the rendered attention for the second sample (index 1) of the batch.
HTML(html_results[1])