# 5. Transformerの構成

## 5.1. 準備

In [1]:
import sys
sys.path.append("/content/drive/My Drive/Transformer")

from utils.dataloader import get_IMDb_DataLoaders_and_TEXT

train_dl, val_dl, test_dl, TEXT = get_IMDb_DataLoaders_and_TEXT(max_length=256, batch_size=32)
print(TEXT.vocab.vectors.shape)
batch = next(iter(train_dl))
print(batch.Text)
print(batch.Label)

  0%|          | 0/999994 [00:00<?, ?it/s]Skipping token b'999994' with 1-dimensional vector [b'300']; likely a header
100%|█████████▉| 999855/999994 [01:51<00:00, 9457.67it/s]

torch.Size([69959, 300])
(tensor([[   2,  105,   43,  ...,    1,    1,    1],
        [   2,   14,   87,  ...,    4,   93,    3],
        [   2,   12, 1926,  ...,    1,    1,    1],
        ...,
        [   2,   52,   50,  ...,    5,    5,    3],
        [   2,   19,  213,  ..., 1325,    4,    3],
        [   2,   14,  382,  ...,  106,   16,    3]]), tensor([131, 256, 167, 129, 147, 145, 237, 147, 256, 227, 208, 158, 186, 196,
        256, 236, 177, 221, 160, 215, 256, 159, 256, 219, 256, 256, 256, 129,
        256, 256, 256, 256]))
tensor([0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
        1, 1, 0, 1, 0, 0, 1, 0])


In [72]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import math

## 5.2. Embedder

In [33]:
# 単語IDから単語ベクトルへの変換
class Embedder(nn.Module):
    def __init__(self, text_embedding_vectors):
        super(Embedder, self).__init__()

        # 学習済みモデルを読込み更新されないようにする
        self.emb = nn.Embedding.from_pretrained(embeddings=text_embedding_vectors, freeze=True)

    def forward(self, x):
        return self.emb(x)


## 5.3. PositionEncoder

In [34]:
# 単語の位置を表すベクトルを付与
class PositionEncoder(nn.Module):
    def __init__(self, d_model=300, max_seq_len=256):
        super(PositionEncoder, self).__init__()

        self.d_model = d_model      # 単語ベクトルの次元数
        self.pe = torch.zeros(max_seq_len, d_model) # 位置情報ベクトル

        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                self.pe[pos, i    ] = math.sin(pos / (10000 ** ((2 * i) / d_model)))
                self.pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * i) / d_model)))
        self.pe = self.pe.unsqueeze(0)  # バッチの次元を付与
        self.pe.requires_grad = False   # 勾配を計算させない

    def forward(self, x):
        return (math.sqrt(self.d_model) * x) + self.pe


## 5.4. Transformer

### 5.4.1. Attention

In [77]:
# SingleHeadAttention
class Attention(nn.Module):
    def __init__(self, d_model=300):
        super(Attention, self).__init__()

        self.d_model = d_model
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, x, mask):
        q = self.q_linear(x)
        k = self.k_linear(x)
        v = self.v_linear(x)

        # Attentionを計算
        weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.d_model)
        weights = weights.masked_fill(mask.unsqueeze(1) == 0, -1e9)  # <pad>の重みが0になるようにする
        normalized_weights = F.softmax(weights, dim=-1)
        h = torch.matmul(normalized_weights, v)
        return self.out(h), normalized_weights


### 5.4.2. FeedForward

In [82]:
class FeedForward(nn.Module):
    def __init__(self, d_model=300, d_hidden=1024, drop_ratio=0.1):
        super(FeedForward, self).__init__()

        self.layers = nn.Sequential(
            nn.Linear(d_model, d_hidden),
            nn.ReLU(),
            nn.Dropout(drop_ratio),
            nn.Linear(d_hidden, d_model)
        )

    def forward(self, x):
        return self.layers(x)


### 5.4.3. TransformerBlock

In [84]:
class TransformerBlock(nn.Module):
    def __init__(self, d_model=300, d_hidden=10204, drop_ratio=0.1):
        super(TransformerBlock, self).__init__()

        # Attention
        self.norm1 = nn.LayerNorm(d_model)
        self.attn = Attention(d_model)
        self.dropout1 = nn.Dropout(drop_ratio)

        # FeedForward
        self.norm2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, d_hidden, drop_ratio)
        self.dropout2 = nn.Dropout(drop_ratio)

    def forward(self, x, mask):
        # Attention
        h, normalized_weights = self.attn(self.norm1(x), mask)
        h = x + self.dropout1(h)

        # FeedForward
        h = h + self.dropout2(self.ff(self.norm2(h)))

        return h, normalized_weights


## 5.5. ClassificationHead

In [109]:
class ClassificationHead(nn.Module):
    def __init__(self, d_model=300, d_out=2):
        super(ClassificationHead, self).__init__()

        self.layer = nn.Linear(d_model, d_out)
        nn.init.normal_(self.layer.weight, std=0.02)
        nn.init.normal_(self.layer.bias, 0)

    def forward(self, x):
        return F.softmax(self.layer(x[:, 0, :]), dim=1)   # 最初の単語(<cls>)のみ使用する


## 5.6. TransformerClassification

In [105]:
class TransformerClassification(nn.Module):
    def __init__(self, emb_vectors, d_model=300, max_seq_len=256, d_hidden=1024, d_out=2, drop_ratio=0.1):
        super(TransformerClassification, self).__init__()

        self.emb = Embedder(emb_vectors)
        self.pe = PositionEncoder(d_model, max_seq_len)
        self.trm1 = TransformerBlock(d_model, d_hidden, drop_ratio)
        self.trm2 = TransformerBlock(d_model, d_hidden, drop_ratio)
        self.head = ClassificationHead(d_model, d_out)

    def forward(self, x, mask):
        h = self.pe(self.emb(x))
        h, attn_w1 = self.trm1(h, mask)
        h, attn_w2 = self.trm2(h, mask)
        h = self.head(h)
        return h, attn_w1, attn_w2


### 動作確認

In [None]:
d_model = 300
max_seq_len = 256
d_hidden = 1024
drop_ratio = 0.1
d_out = 2

emb_in = batch.Text[0]
input_mask = (emb_in != TEXT.vocab.stoi["<pad>"])       # 文章でない箇所をマスクする

emb = Embedder(TEXT.vocab.vectors)
emb_out = emb(emb_in)

pe = PositionEncoder(d_model=d_model, max_seq_len=max_seq_len)
pe_out = pe(emb_out)

attention = Attention(d_model=d_model)
attention_out, normalized_weights = attention(pe_out, input_mask)

ff = FeedForward(d_model=d_model, d_hidden=d_hidden, drop_ratio=drop_ratio)
ff_out = ff(attention_out)

trm = TransformerBlock(d_model, d_hidden, drop_ratio)
trm_out, normalized_weights = trm(pe_out, input_mask)

head = ClassificationHead(d_model, d_out)
head_out = head(trm_out)

print("入力テンソルサイズ: {}".format(emb_in.shape))
print("出力テンソルサイズ(Embedder): {}".format(emb_out.shape))
print("出力テンソルサイズ(PositionEncoder): {}".format(pe_out.shape))
print("出力テンソルサイズ(Attention): {}".format(attention_out.shape))
print("出力テンソルサイズ(FeedForward): {}".format(ff_out.shape))
print("出力テンソルサイズ(TransformerBlock): {}".format(trm_out.shape))
print("出力テンソルサイズ(ClassificationHead): {}".format(head_out.shape))
print("アテンションサイズ: {}".format(normalized_weights.shape))


In [112]:
batch = next(iter(train_dl))
input_mask = (emb_in != TEXT.vocab.stoi["<pad>"])

net = TransformerClassification(TEXT.vocab.vectors, d_model, max_seq_len, d_hidden, d_out, drop_ratio)
out, attn_w1, attn_w2 = net(batch.Text[0], input_mask)
print(out.shape)
print(attn_w1.shape)
print(attn_w2.shape)
print(out)


torch.Size([32, 2])
torch.Size([32, 256, 256])
torch.Size([32, 256, 256])
tensor([[0.0184, 0.9816],
        [0.0190, 0.9810],
        [0.0186, 0.9814],
        [0.0173, 0.9827],
        [0.0188, 0.9812],
        [0.0183, 0.9817],
        [0.0203, 0.9797],
        [0.0186, 0.9814],
        [0.0204, 0.9796],
        [0.0169, 0.9831],
        [0.0184, 0.9816],
        [0.0191, 0.9809],
        [0.0224, 0.9776],
        [0.0153, 0.9847],
        [0.0202, 0.9798],
        [0.0181, 0.9819],
        [0.0179, 0.9821],
        [0.0199, 0.9801],
        [0.0168, 0.9832],
        [0.0190, 0.9810],
        [0.0205, 0.9795],
        [0.0182, 0.9818],
        [0.0201, 0.9799],
        [0.0175, 0.9825],
        [0.0181, 0.9819],
        [0.0187, 0.9813],
        [0.0183, 0.9817],
        [0.0174, 0.9826],
        [0.0177, 0.9823],
        [0.0181, 0.9819],
        [0.0147, 0.9853],
        [0.0198, 0.9802]], grad_fn=<SoftmaxBackward>)
