In [2]:
import torch
import torch.nn as nn
import os
import requests
import tiktoken
import pandas as pd
import math

In [3]:
# Hyperparameters shared by every cell below.
token_length, d_model = 16, 64   # tokens per sampled sequence, embedding width
batch_size, num_heads = 4, 4     # sequences per batch, attention heads

In [4]:
# Read the whole corpus into memory as a single string. The encoding is given
# explicitly so the decoded text does not depend on the platform's default
# locale encoding (e.g. cp1252 on Windows), which would change the tokens.
with open('dataset.txt', 'r', encoding='utf-8') as f:
    text = f.read()



In [5]:
# Tokenize the corpus with the cl100k_base BPE vocabulary, then wrap the
# resulting ids in a 1-D int64 tensor.
encoding = tiktoken.get_encoding('cl100k_base')
token_id_list = encoding.encode(text)
tokenized_text = torch.tensor(token_id_list, dtype=torch.long)

In [6]:
# Keep the first 90% of the token stream for training and hold out the
# remaining tail for validation.
train_index = int(len(tokenized_text) * 0.9)
train_data, validation_data = tokenized_text[:train_index], tokenized_text[train_index:]

In [7]:
# Draw `batch_size` random windows of `token_length` tokens from the training data.
data = train_data
indexs = torch.randint(low=0, high=len(data) - token_length, size=(batch_size,))
# Row i of `window` holds the positions indexs[i] .. indexs[i] + token_length - 1.
window = indexs.unsqueeze(1) + torch.arange(token_length)
# Both batches are (batch_size, token_length); y_batch is x_batch shifted one
# token ahead, i.e. the next-token target for every input position.
x_batch = data[window]
y_batch = data[window + 1]

In [8]:
# Sanity check: turn the first sampled sequence back into readable text.
encoding.decode(x_batch[0].tolist())

'?"\nActive listening skills complement effective questioning techniques. By actively listening to your customers,'

In [9]:
# Largest token id that occurs anywhere in the tokenized corpus.
max_token_value = int(tokenized_text.max())
max_token_value

100069

In [10]:
# Embedding lookup table with one d_model-wide row per token id in
# [0, max_token_value], i.e. shape (max_token_value + 1, d_model).
vocab_rows = max_token_value + 1
input_embedding_table = nn.Embedding(num_embeddings=vocab_rows, embedding_dim=d_model)
input_embedding_table.weight.data

tensor([[ 0.8140, -0.4648,  1.0317,  ..., -0.3554,  0.8825, -1.4436],
        [-1.7957, -0.6973,  0.4765,  ..., -0.0712,  0.7564, -0.6118],
        [ 1.4702, -0.9546,  1.4693,  ..., -0.1258,  0.0822, -0.2688],
        ...,
        [ 0.4427,  0.1723, -0.6616,  ..., -0.7277,  0.0448, -0.4593],
        [ 1.2491, -1.9159, -0.4042,  ...,  1.6459,  0.8562, -0.3897],
        [-0.7713, -0.1298,  0.5199,  ...,  0.6477,  1.8090,  0.1579]])

In [11]:
# Look up embeddings for the inputs and for the shifted targets:
# (batch_size, token_length) -> (batch_size, token_length, d_model).
x_batch_embedding, y_batch_embedding = (
    input_embedding_table(token_ids) for token_ids in (x_batch, y_batch)
)
x_batch_embedding.size()

torch.Size([4, 16, 64])

In [12]:
# Sinusoidal positional encoding:
#   PE[pos, 2i]     = sin(pos / 10000^(2i / d_model))
#   PE[pos, 2i + 1] = cos(pos / 10000^(2i / d_model))
# Start from a (token_length, d_model) table of zeros.
postion_encoding_table = torch.zeros(token_length, d_model)
# Column vector of positions 0 .. token_length - 1, shape (token_length, 1).
postion = torch.arange(0, token_length, dtype=torch.float).unsqueeze(1)
# 1 / 10000^(2i / d_model) for every sin/cos column pair, computed via exp/log.
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
# Broadcasts to one angle per (position, column pair).
angles = postion * div_term
# [:, 0::2] selects every row and the even columns (start at 0, step 2).
postion_encoding_table[:, 0::2] = torch.sin(angles)
# Odd columns must use cosine; using sine again would make every odd column a
# copy of its even neighbour. The slice keeps the assignment valid when
# d_model is odd (one fewer odd column than even columns).
postion_encoding_table[:, 1::2] = torch.cos(angles)[:, : d_model // 2]
# Add a leading batch axis and broadcast the same table to every sequence.
postion_encoding_table = postion_encoding_table.unsqueeze(0).expand(batch_size, -1, -1)

postion_encoding_table.shape

torch.Size([4, 16, 64])

In [13]:
# Inject position information: element-wise sum of the token embeddings and
# the positional encoding table (both batch_size x token_length x d_model).
x = torch.add(x_batch_embedding, postion_encoding_table)
y = torch.add(y_batch_embedding, postion_encoding_table)
x.size()

torch.Size([4, 16, 64])

In [14]:
# Three independent d_model -> d_model projections for queries, keys and
# values, created in the order Wq, Wk, Wv.
Wq, Wk, Wv = (nn.Linear(d_model, d_model) for _ in range(3))
# nn.Linear is applied over the last dimension, so the leading
# (batch, token) dimensions of x are preserved.
Q, K, V = Wq(x), Wk(x), Wv(x)

Q.size()

torch.Size([4, 16, 64])

In [15]:
# Split the d_model axis into num_heads chunks, then move the head axis in
# front of the token axis: [4, 16, 64] -> [4, 16, 4, 16] -> [4, 4, 16, 16].
# Why reorder? So that each head can be computed separately and in parallel.
# Put plainly: the last two axes become (token_length, per-head dimension),
# which is the pair the matrix multiplication works on; num_heads just
# becomes another batch-like axis.
head_dim = d_model // num_heads
Q = Q.reshape(batch_size, token_length, num_heads, head_dim).transpose(1, 2)
K = K.reshape(batch_size, token_length, num_heads, head_dim).transpose(1, 2)
V = V.reshape(batch_size, token_length, num_heads, head_dim).transpose(1, 2)
Q.size()

torch.Size([4, 4, 16, 16])

In [19]:
# Scaled dot-product attention scores: Q K^T / sqrt(d_k), where d_k is the
# per-head dimension d_model // num_heads.
scale = math.sqrt(d_model // num_heads)
output = torch.matmul(Q, K.transpose(-2, -1)) / scale

In [24]:
# Causal mask: True strictly above the main diagonal (the diagonal itself is
# kept), and every masked score is replaced by -inf.
mask = torch.ones(token_length, token_length).triu(diagonal=1).to(torch.bool)
output = output.masked_fill(mask, -math.inf)

output

tensor([[[[ 0.2623,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
          [-0.3034,  0.1390,    -inf,  ...,    -inf,    -inf,    -inf],
          [ 0.2898,  0.1144,  0.3308,  ...,    -inf,    -inf,    -inf],
          ...,
          [ 0.0499, -0.3026, -0.1784,  ..., -0.5264,    -inf,    -inf],
          [-0.4222,  0.2738,  0.2808,  ...,  1.3248, -0.3370,    -inf],
          [-0.4129,  0.3758, -0.9122,  ...,  0.2867,  0.3695, -0.3973]],

         [[-0.6873,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
          [ 0.4880, -0.9297,    -inf,  ...,    -inf,    -inf,    -inf],
          [ 0.2570, -0.1666,  0.0134,  ...,    -inf,    -inf,    -inf],
          ...,
          [-0.0403, -0.5665,  0.2625,  ...,  0.0392,    -inf,    -inf],
          [-0.3586,  0.3388, -0.4874,  ...,  0.0802, -0.3259,    -inf],
          [-0.3843, -0.6643,  0.3547,  ..., -0.0416, -0.7000, -0.1762]],

         [[ 0.4576,    -inf,    -inf,  ...,    -inf,    -inf,    -inf],
          [-0.0694,  0.8065,  

In [26]:
# Softmax with dim=-1 normalises each row (the last axis) independently;
# the -inf entries from the mask become weight 0.
attention_score = output.softmax(dim=-1)
attention_score

tensor([[[[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.3912, 0.6088, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.3471, 0.2913, 0.3616,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0859, 0.0604, 0.0683,  ..., 0.0482, 0.0000, 0.0000],
          [0.0316, 0.0633, 0.0637,  ..., 0.1810, 0.0344, 0.0000],
          [0.0503, 0.1106, 0.0305,  ..., 0.1012, 0.1099, 0.0511]],

         [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.8050, 0.1950, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.4101, 0.2685, 0.3214,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0514, 0.0304, 0.0696,  ..., 0.0557, 0.0000, 0.0000],
          [0.0275, 0.0552, 0.0242,  ..., 0.0427, 0.0284, 0.0000],
          [0.0479, 0.0362, 0.1002,  ..., 0.0674, 0.0349, 0.0589]],

         [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.2940, 0.7060, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.3475, 0.1931, 0.4594,  ..., 0