In [29]:
import torch
import torch.nn as nn
import os
import requests
import tiktoken
import pandas as pd
import math

In [30]:
token_length= 16
d_model = 64
batch_size = 4
num_heads = 4

In [31]:
with open('dataset.txt', 'r') as f:
    text = f.read()



In [32]:
encoding = tiktoken.get_encoding('cl100k_base')
tokenized_text = encoding.encode(text)
tokenized_text = torch.tensor(tokenized_text, dtype=torch.long)

In [33]:
train_index = int(len(tokenized_text) * 0.9)
train_data = tokenized_text[:train_index]
validation_data = tokenized_text[train_index:]

In [34]:
data = train_data
indexs = torch.randint(low=0, high=len(data) - token_length, size=(batch_size,))
# 4 * 16 tensor
x_batch = torch.stack([data[idx: idx + token_length] for idx in indexs])
y_batch = torch.stack([data[idx + 1: idx + token_length + 1] for idx in indexs])

In [35]:
# pd.DataFrame(x_batch[0].numpy())
encoding.decode(x_batch[0].numpy())

' you can identify the underlying motivations and desires. Through careful analysis and evaluation, customization'

In [36]:
# get max index of token
max_token_value = tokenized_text.max().item()
max_token_value

100069

In [37]:
# construct a 100070 * 16 matrix, get input embedding
input_embedding_table = nn.Embedding(max_token_value + 1, d_model)
input_embedding_table.weight.data

tensor([[ 1.0218, -0.1466, -1.0452,  ...,  0.3268, -0.6308, -1.1450],
        [ 0.9685, -0.9843,  0.2110,  ..., -1.1781,  0.8842,  0.0057],
        [-0.4306, -0.4919,  0.1680,  ..., -0.4137, -1.5682, -0.6410],
        ...,
        [ 0.4012,  0.6788, -0.0704,  ...,  0.2108,  0.0893, -0.0154],
        [ 1.1024,  0.3350,  0.0199,  ...,  2.0194,  0.3359,  0.5032],
        [-1.7695, -0.7623, -0.2536,  ..., -0.5876,  0.9567, -0.1181]])

In [38]:
x_batch_embedding = input_embedding_table(x_batch)
y_batch_embedding = input_embedding_table(y_batch)
x_batch_embedding.shape

torch.Size([4, 16, 64])

In [39]:
# get postional encoding
# first 16 * 16 matrix
postion_encoding_table = torch.zeros(token_length, d_model)
# postion 16 * 1
postion = torch.arange(0, token_length, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
# [:, 0::2], ':' = selecting all lines, '0::2' = starting from 0, step by step 2
postion_encoding_table[:, 0::2] = torch.sin(postion * div_term)
postion_encoding_table[:, 1::2] = torch.sin(postion * div_term)
postion_encoding_table = postion_encoding_table.unsqueeze(0).expand(batch_size, -1, -1)

postion_encoding_table.shape

torch.Size([4, 16, 64])

In [40]:
# add postion embedding with input embedding
x = x_batch_embedding + postion_encoding_table
y = y_batch_embedding + postion_encoding_table
x.shape

torch.Size([4, 16, 64])

In [41]:
Wq = nn.Linear(d_model, d_model)
Wk = nn.Linear(d_model, d_model)
Wv = nn.Linear(d_model, d_model)
# linear last two dimension do the matrix mul
Q = Wq(x)
K = Wk(x)
V = Wv(x)

Q.shape

torch.Size([4, 16, 64])

In [42]:
# [4, 16, 4, 16] -> [4, 4, 16, 16]
# why permute? To enable each head to do separate computing which is parallel computing
# 说人话，就是考虑的是token_length与每个head维度之间的关系，而不是num_heads与别的关系；矩阵相乘是
Q = Q.reshape(batch_size, token_length, num_heads, d_model//num_heads).permute(0, 2, 1, 3)
K = K.reshape(batch_size, token_length, num_heads, d_model//num_heads).permute(0, 2, 1, 3)
V = V.reshape(batch_size, token_length, num_heads, d_model//num_heads).permute(0, 2, 1, 3)
Q.shape

torch.Size([4, 4, 16, 16])

In [43]:
# apply the attension fomular
output = Q @ K.transpose(-2, -1) / math.sqrt(d_model // num_heads)

In [44]:
# apply the mask，矩阵上三角（不包括对角线取-inf)
mask = torch.triu(torch.ones(token_length, token_length), diagonal=1).bool()
output = output.masked_fill(mask, float('-inf'))

output

tensor([[[[ 8.4941e-02,        -inf,        -inf,  ...,        -inf,
                  -inf,        -inf],
          [-2.3501e-01,  2.2545e-01,        -inf,  ...,        -inf,
                  -inf,        -inf],
          [-2.1473e-01,  3.0864e-01, -5.7618e-02,  ...,        -inf,
                  -inf,        -inf],
          ...,
          [-1.2848e-01, -4.2314e-01,  1.2258e-01,  ..., -8.9928e-02,
                  -inf,        -inf],
          [-3.9670e-02,  1.6023e-01,  3.0037e-01,  ..., -2.4559e-01,
            1.1694e-01,        -inf],
          [-6.1681e-01,  3.7470e-01,  4.5829e-01,  ..., -6.7069e-01,
           -1.0295e-01, -7.4017e-02]],

         [[ 6.6310e-01,        -inf,        -inf,  ...,        -inf,
                  -inf,        -inf],
          [ 3.9625e-01,  1.0919e-01,        -inf,  ...,        -inf,
                  -inf,        -inf],
          [ 7.7241e-01,  1.8229e-01, -3.6682e-01,  ...,        -inf,
                  -inf,        -inf],
          ...,
     

In [45]:
# softmax, dim = -1对每一行进行softmax
attention_score = torch.softmax(output, dim=-1)
attention_score

tensor([[[[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.3869, 0.6131, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.2592, 0.4375, 0.3033,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0614, 0.0457, 0.0789,  ..., 0.0638, 0.0000, 0.0000],
          [0.0487, 0.0595, 0.0684,  ..., 0.0396, 0.0569, 0.0000],
          [0.0282, 0.0759, 0.0825,  ..., 0.0267, 0.0471, 0.0484]],

         [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.5713, 0.4287, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.5335, 0.2957, 0.1708,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0520, 0.0511, 0.0500,  ..., 0.0664, 0.0000, 0.0000],
          [0.0810, 0.0983, 0.0618,  ..., 0.0827, 0.0725, 0.0000],
          [0.0543, 0.0635, 0.0346,  ..., 0.0826, 0.0552, 0.0705]],

         [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.4454, 0.5546, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.3290, 0.2731, 0.3979,  ..., 0

In [46]:
# apply attention @ V
attention_output = attention_score @ V
attention_output.shape


torch.Size([4, 4, 16, 16])

In [47]:
# concat
attention_output = attention_output.transpose(1, 2).reshape(batch_size, token_length, d_model)
attention_output.shape
Wo = nn.Linear(d_model, d_model)
output = Wo(attention_output)
output.shape

torch.Size([4, 16, 64])

In [48]:
# layer norm
layer_norm = nn.LayerNorm(d_model)
layer_norm_output = layer_norm(output)



In [49]:
# FFN
output = nn.Linear(d_model, d_model * 4)(layer_norm_output)
output = nn.ReLU()(output)
output = nn.Linear(d_model * 4, d_model)(output)
# residual
output = output + layer_norm_output
output.shape

torch.Size([4, 16, 64])

In [50]:
# final linear layer
output = nn.Linear(d_model, max_token_value + 1)(output)
output.shape

torch.Size([4, 16, 100070])

In [52]:
logits = torch.softmax(output, dim = -1)
logits.shape
encoding.decode([torch.argmax(logits[0, 0]).item()])

' ambitious'

In [56]:
device = 'cuda' if torch.cuda.is_available() else 'cpu' 
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available())  #输出为True，则安装无误

2.2.2+cpu
None
False
