<a href="https://colab.research.google.com/github/watanabe-gk/study_gpt/blob/main/GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# モジュールのインストール

# ライブラリーのインポート

In [331]:
# @title import
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [332]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

位置の埋め込みレイヤー

In [333]:
#@title Position embedding
class PositionEmbedding(nn.Module):
  def __init__(self, context_size, d_model):
    super(PositionEmbedding, self).__init__()
    self.embedding = nn.Embedding(context_size, d_model)

  def forward(self, x):
    positions = torch.arange(0, x.size(1), device=x.device)
    return self.embedding(positions)

In [334]:
# Exercise
x = torch.LongTensor([[4545, 8410, 458, 3]])
position_embedding = PositionEmbedding(4, 256)
wpe = position_embedding(x)
wpe

tensor([[ 0.4800,  0.5192,  0.2423,  ..., -1.1535, -1.1035,  0.8625],
        [ 0.4029,  1.0680, -1.3726,  ...,  2.1120, -2.4860,  0.0073],
        [ 0.8543, -0.1828,  0.3025,  ...,  0.7086, -0.6651,  1.7564],
        [-1.2328,  0.6328, -0.6807,  ...,  1.5711,  2.5385, -0.4173]],
       grad_fn=<EmbeddingBackward0>)


Transformerの論文で提案された正弦波バージョンを使用しました

In [335]:
#@title Positional encoding
class PositionalEncoding(nn.Module):
    def __init__(self, context_size, d_model):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(context_size, d_model)

        for pos in range(context_size):
            for i in range(0, d_model, 2):
                pe[pos,i]   = math.sin(pos/(10000**((2*i)/d_model)))
                pe[pos,i+1] = math.cos(pos/(10000**((2*i)/d_model)))

        # 学習パラメーターの更新対象から外してクラス変数に確保(重要)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        # positional encodingを埋め込みベクトルへ追加します
        return self.pe[:, :x.size(1)].detach()

In [336]:
# Exercise
x = torch.LongTensor([[4545, 8410, 458, 3]])
position_embedding = PositionEmbedding(4, 256)
wpe = position_embedding(x)
wpe

tensor([[ 0.3726,  0.6929,  0.2467,  ..., -1.2644, -0.0486,  0.8893],
        [-0.3559,  1.5074,  0.2217,  ..., -0.7366, -1.2823, -0.0582],
        [-0.2904,  0.7499, -1.1993,  ...,  0.3225,  1.0186,  0.1519],
        [ 1.0053, -0.8694,  0.0742,  ...,  0.1555,  1.6620, -0.4843]],
       grad_fn=<EmbeddingBackward0>)

In [337]:
#@title Attention mask
def create_attention_mask(context_size):
    # 全ての要素が1となる正方行列を作成
    mask = torch.ones((context_size, context_size))

    # 対角線より下を0に変換
    mask = torch.triu(mask, diagonal=1)

    # True/False の型に変換(boolean)
    mask = mask == 0

    # 対角線より上の値を0、下の値を1に変換
    mask = mask*1

    return mask

In [338]:
# Exercise: シーケンスが10の場合のマスクの作成
create_attention_mask(10)

tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [339]:
# @title Scaled Dot-Product Attention
class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention '''

    def __init__(self, d_model, dropout_rate=0.1):
        super().__init__()
        self.sqrt_d_k = d_model ** 0.5 # sqrt(d_k)　と同じ
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, q, k, v, mask=None):
        score = torch.matmul(q, k.transpose(2, 3)) /  self.sqrt_d_k

        if mask is not None:
            score = score.masked_fill(mask == 0, float("-inf"))

        attn = F.softmax(score, dim=-1)
        attn = self.dropout(attn)
        output = torch.matmul(attn, v)

        return output, attn

In [340]:
# Exercise: Square root
print(math.sqrt(256))
print(256 ** 0.5)

16.0
16.0


In [341]:
# Exercise: Attention score
context_size = 3
dim = 4
q = torch.randn(context_size, dim)
k = torch.randn(context_size, dim)
v = torch.randn(context_size, dim)
print(q)
print(k)
print(v)

a = q@k.T
print(a)

tensor([[-0.1984,  0.5174, -1.8787, -1.7152],
        [-1.4715, -0.3552,  0.2060,  0.3720],
        [-0.3460,  1.6622,  0.7940, -0.8056]])
tensor([[-0.6117, -0.0803,  1.1335,  0.3045],
        [ 0.2897, -0.1205, -0.7841, -0.7923],
        [ 1.6311, -0.9104, -2.0586,  0.5871]])
tensor([[ 1.4550,  0.9173,  0.4871,  0.2666],
        [-1.2774, -0.9786, -0.0969,  0.0608],
        [ 0.4834, -1.8371,  0.8575,  1.5238]])
tensor([[-2.5718,  2.7122,  2.0658],
        [ 1.2754, -0.8398, -2.2824],
        [ 0.7329, -0.2849, -4.1852]])


In [342]:
# Exercise: Scaling
a = a / (dim ** 0.5)
print(a)

tensor([[-1.2859,  1.3561,  1.0329],
        [ 0.6377, -0.4199, -1.1412],
        [ 0.3665, -0.1424, -2.0926]])


In [343]:
# Exercise: Masking
mask = create_attention_mask(context_size)
print(a)
print(mask)
attn = a.masked_fill(mask == 0, float("-inf"))

tensor([[-1.2859,  1.3561,  1.0329],
        [ 0.6377, -0.4199, -1.1412],
        [ 0.3665, -0.1424, -2.0926]])
tensor([[1, 0, 0],
        [1, 1, 0],
        [1, 1, 1]])


In [344]:
# Exercise: probability
F.softmax(attn, dim=1)

tensor([[1.0000, 0.0000, 0.0000],
        [0.7422, 0.2578, 0.0000],
        [0.5929, 0.3564, 0.0507]])

Multi-Head Attention

$$
\text{MultiHead}(Q,K,V)=\text{Concat}(head_1,…,head_h)W^O \\
\text{where } ℎead_i = \text{Attention}(QW^{Q}_i, KW^{K}_i, VW^{V}_i).
$$

In [345]:
#@title  Multi-Head Attention
class MultiHeadAttention(nn.Module):
    ''' Multi-Head Attention module '''

    def __init__(self, n_head, d_model, dropout_rate=0.1):
        super().__init__()
        self.n_head = n_head
        self.d_model = d_model
        self.fc_q = nn.Linear(d_model, d_model)
        self.fc_k = nn.Linear(d_model, d_model)
        self.fc_v = nn.Linear(d_model, d_model)
        self.attn = ScaledDotProductAttention(d_model, dropout_rate)
        self.fc = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, q, k, v, mask=None):
        N = q.size(0) # バッチサイズ（Transformerの場合、QKVは同じサイズ）
        S = q.size(1) # ウィンドウサイズ（Transformerの場合、QKVは同じサイズ）
        H = self.n_head # マルチヘッドの数
        D = self.d_model // self.n_head # 潜在区間の次元（Cross Attentonの場合、個別に定義）


        # 線形変換
        q = self.fc_q(q)
        k = self.fc_k(k)
        v = self.fc_v(v)

        # 展開
        q = q.view(N, S, H, D)
        k = k.view(N, S, H, D)
        v = v.view(N, S, H, D)

        # Scaled dot-product attention
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
        x, attn = self.attn(q, k, v, mask=mask)

        # Concat
        # Transpose to move the head dimension back: b x lq x n x dv
        # Combine the last two dimensions to concatenate all the heads together: b x lq x (n*dv)
        # transposeは、見かけのテンソールを変形しているので、contiguous()で、メモリ内の形状を書き換えます。
        x = x.transpose(1, 2).contiguous().view(N, S, -1) # re-assemble all head outputs side by side

        # 線形変換
        x = self.fc(x)

        x = self.dropout(x)

        return x, attn

In [346]:
# Exercise: transposeの使い方
a = torch.randint(10,(128,15,8,256))
print(a.shape)
a.transpose(1,2).shape

torch.Size([128, 15, 8, 256])


torch.Size([128, 8, 15, 256])

In [347]:
# Exercise: Multi-head attention
n_head = 8
d_model = 16
attention = MultiHeadAttention(n_head, d_model)

In [348]:
batch_size = 1
context_size = 10
# Query: x
# Attentionブロックへの最初の入力となるクエリxを作成します
x = torch.randn(batch_size, context_size, d_model)
q, w = attention(x, x, x)

In [349]:
q.shape # 入力トークン数分の潜在変数

torch.Size([1, 10, 16])

In [350]:
w.shape  # 入力トークン数の正方行列

torch.Size([1, 8, 10, 10])

Position-wise feed forward Networks

論文 '$\text{Attention Is All You Need}$' では、FeedForwardは、ReLU活性化関数を介した2つの線形変換で構成されています。

$$
\text{FFN}(x) = \text{max}(0, xW_1 + b_1)W_2 + b_2  
\\
$$

入力と出力の次元は$d_{model} = 512$で、隠れ層の次元は$d_{ff} = 2048$です。

In [351]:
#@title Feed Forward
class FeedForward(nn.Module):
    def __init__(self, d_model, dropout_rate=0.1):
        super(FeedForward, self).__init__()
        # 潜在空間の４倍の隠れ層を持つMLPを作成
        self.fc1 = nn.Linear(d_model, d_model * 4)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(d_model * 4, d_model)

    def forward(self, x):
        h = self.fc1(x)
        h = F.gelu(h)
        h = self.fc2(h)
        h = self.dropout(h)
        return h

In [352]:
# Exercise: FeedForward
d_model = 2
ff = FeedForward(d_model)
print(ff)
x = torch.randn(d_model)
print(x)
ff(x)

FeedForward(
  (fc1): Linear(in_features=2, out_features=8, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=8, out_features=2, bias=True)
)
tensor([-1.5701, -0.4246])


tensor([-0.0000, -0.2646], grad_fn=<MulBackward0>)

In [353]:
#@title Transformer Block
class TransformerBlock(nn.Module):
    def __init__(self, d_model, n_head, dropout_rate=0.1):
        super(TransformerBlock, self).__init__()
        self.norm_1 = nn.LayerNorm(d_model)
        self.norm_2 = nn.LayerNorm(d_model)
        self.attn = MultiHeadAttention(n_head, d_model, dropout_rate)
        self.ff = FeedForward(d_model)

        nn.init.normal_(self.norm_1.weight, mean=0, std=0.02)
        nn.init.normal_(self.norm_2.weight, mean=0, std=0.02)

    # GPT-1
    def forward(self, x, mask=None):
        rx = x # 残差 (residual value)
        x, w = self.attn(x, x, x, mask)
        x = self.norm_1(x + rx)

        rx = x
        x = self.ff(x)
        x = self.norm_2(x + rx)

        return x, w

In [354]:
# Exercise: TransformerBlock
d_model = 2
n_head = 1
block = TransformerBlock(d_model, n_head)
batch_size = 1
context_size = 5
# token vectors
x = torch.randn(batch_size, context_size, d_model)
y, w = block(x)
print(x)
print(y.shape)
print(y)
print(w.shape)
print(w)

tensor([[[ 0.6843, -0.5601],
         [ 1.4043,  0.4406],
         [ 1.4172,  0.4066],
         [ 0.5804,  0.8283],
         [-1.0066, -0.1304]]])
torch.Size([1, 5, 2])
tensor([[[-0.0450, -0.0142],
         [-0.0450, -0.0142],
         [-0.0450, -0.0142],
         [-0.0451, -0.0142],
         [-0.0451, -0.0142]]], grad_fn=<NativeLayerNormBackward0>)
torch.Size([1, 1, 5, 5])
tensor([[[[0.1215, 0.1525, 0.1484, 0.2998, 0.3889],
          [0.1244, 0.1816, 0.1768, 0.3182, 0.3101],
          [0.0000, 0.1802, 0.0000, 0.3197, 0.3130],
          [0.1727, 0.2033, 0.2007, 0.2657, 0.2686],
          [0.2199, 0.1790, 0.1787, 0.2135, 0.3200]]]], grad_fn=<MulBackward0>)


In [355]:
#@title GPT
class GPT(nn.Module):
    def __init__(self, vocab_size, context_size, d_model, n_block, n_head, dropout_rate=0.1):
        super(GPT, self).__init__()
        self.vocab_size = vocab_size
        self.context_size = context_size
        self.d_model = d_model
        self.n_block = n_block
        self.n_head = n_head
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        # self.position_embedding = PositionEmbedding(context_size, d_model)
        self.positional_encoding = PositionalEncoding(context_size, d_model)
        self.dropout = nn.Dropout(dropout_rate)
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(d_model, n_head, dropout_rate) for _ in range(self.n_block)])
        self.norm = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model * context_size, vocab_size)

        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.normal_(self.token_embedding.weight, mean=0.0, std=0.02)
        # nn.init.normal_(self.position_embedding.embedding.weight, mean=0.0, std=0.02)

    def forward(self, x, mask=None):
        x = self.token_embedding(x) + self.positional_encoding(x)
        x = self.dropout(x)

        for block in self.transformer_blocks:
            x, w = block(x, mask)

        # GPT-2
        x = self.norm(x)

        x = x.view(-1, self.context_size * self.d_model)

        # 線形変換
        x = self.fc(x)

        # Softmax は損失関数に組み込まれています。

        return x, w

In [356]:
# モデルの動作を確認しましょう
context_size = 5 # @param{type:'integer'}
vocab_size = 10   # @param{type:'integer'}
d_model = 8 # @param{type:'integer'}
n_block = 6 # @param{type:'integer'}
n_head = 4 # @param{type:'integer'}

In [357]:
# モデルの動作を確認しましょう
model = GPT(vocab_size, context_size, d_model, n_block, n_head)
mask = create_attention_mask(context_size).to(device)
x = torch.LongTensor([[1,2,3,4,9]]) # 0～9までの数値を使って context_size の長さの配列を作成します。
y, w = model(x)

In [358]:
y.shape

torch.Size([1, 10])

In [359]:
y

tensor([[ 0.4637, -0.0513, -0.1753,  1.7514, -0.1457,  0.3194,  1.4937,  1.5145,
          1.7834, -1.4435]], grad_fn=<AddmmBackward0>)

In [361]:
y.argmax() # [2,3,4,9,8]になる

tensor(8)