In [80]:
import torch
import torch.nn as nn
import os
import requests
import tiktoken
import pandas as pd
import math

In [81]:
token_length= 16
d_model = 64
batch_size = 4
num_heads = 4

In [82]:
with open('dataset.txt', 'r') as f:
    text = f.read()



In [83]:
encoding = tiktoken.get_encoding('cl100k_base')
tokenized_text = encoding.encode(text)
tokenized_text = torch.tensor(tokenized_text, dtype=torch.long)

In [84]:
train_index = int(len(tokenized_text) * 0.9)
train_data = tokenized_text[:train_index]
validation_data = tokenized_text[train_index:]

In [85]:
data = train_data
indexs = torch.randint(low=0, high=len(data) - token_length, size=(batch_size,))
# 4 * 16 tensor
x_batch = torch.stack([data[idx: idx + token_length] for idx in indexs])
y_batch = torch.stack([data[idx + 1: idx + token_length + 1] for idx in indexs])

In [86]:
# pd.DataFrame(x_batch[0].numpy())
encoding.decode(x_batch[0].numpy())

" solutions and offerings. This involves matching the customer's needs with the appropriate products or"

In [87]:
# get max index of token
max_token_value = tokenized_text.max().item()
max_token_value

100069

In [88]:
# construct a 100070 * 16 matrix, get input embedding
input_embedding_table = nn.Embedding(max_token_value + 1, d_model)
input_embedding_table.weight.data

tensor([[-0.5123, -0.6028,  1.3043,  ..., -1.6790,  0.9367, -0.3207],
        [-0.2108, -0.7986,  1.2182,  ...,  0.7188, -0.9925, -0.7194],
        [ 0.2154, -0.2422,  0.2529,  ...,  0.6183,  0.9441,  0.2124],
        ...,
        [-0.4560, -1.7594,  1.4394,  ..., -0.5974,  1.4885,  1.3696],
        [ 0.5343,  2.5246, -0.1196,  ..., -0.8313, -0.1659, -1.3517],
        [ 1.9935, -1.8589, -0.6141,  ..., -0.7727,  0.1055,  1.2830]])

In [89]:
x_batch_embedding = input_embedding_table(x_batch)
y_batch_embedding = input_embedding_table(y_batch)
x_batch_embedding.shape

torch.Size([4, 16, 64])

In [90]:
# get postional encoding
# first 16 * 16 matrix
postion_encoding_table = torch.zeros(token_length, d_model)
# postion 16 * 1
postion = torch.arange(0, token_length, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
# [:, 0::2], ':' = selecting all lines, '0::2' = starting from 0, step by step 2
postion_encoding_table[:, 0::2] = torch.sin(postion * div_term)
postion_encoding_table[:, 1::2] = torch.sin(postion * div_term)
postion_encoding_table = postion_encoding_table.unsqueeze(0).expand(batch_size, -1, -1)

postion_encoding_table.shape

torch.Size([4, 16, 64])

In [91]:
# add postion embedding with input embedding
x = x_batch_embedding + postion_encoding_table
y = y_batch_embedding + postion_encoding_table
x.shape

torch.Size([4, 16, 64])

In [92]:
Wq = nn.Linear(d_model, d_model)
Wk = nn.Linear(d_model, d_model)
Wv = nn.Linear(d_model, d_model)
# linear last two dimension do the matrix mul
Q = Wq(x)
K = Wk(x)
V = Wv(x)

Q.shape

torch.Size([4, 16, 64])

In [93]:
# [4, 16, 4, 16] -> [4, 4, 16, 16]
# why permute? To enable each head to do separate computing which is parallel computing
# 说人话，就是考虑的是token_length与每个head维度之间的关系，而不是num_heads与别的关系；矩阵相乘是
Q = Q.reshape(batch_size, token_length, num_heads, d_model//num_heads).permute(0, 2, 1, 3)
K = K.reshape(batch_size, token_length, num_heads, d_model//num_heads).permute(0, 2, 1, 3)
V = V.reshape(batch_size, token_length, num_heads, d_model//num_heads).permute(0, 2, 1, 3)
Q.shape

torch.Size([4, 4, 16, 16])

In [94]:
# apply the attension fomular
output = Q @ K.transpose(-2, -1) / math.sqrt(d_model // num_heads)

In [95]:
# apply the mask，矩阵上三角（不包括对角线取-inf)
mask = torch.triu(torch.ones(token_length, token_length), diagonal=1).bool()
output = output.masked_fill(mask, float('-inf'))

output

tensor([[[[ 5.9216e-01,        -inf,        -inf,  ...,        -inf,
                  -inf,        -inf],
          [-1.5789e-01,  4.6748e-01,        -inf,  ...,        -inf,
                  -inf,        -inf],
          [-2.5910e-01, -3.7053e-01, -3.1831e-01,  ...,        -inf,
                  -inf,        -inf],
          ...,
          [-2.8675e-02, -5.0447e-01,  1.5097e-01,  ..., -1.0524e-01,
                  -inf,        -inf],
          [ 1.5891e-01, -2.8209e-01, -4.6384e-01,  ...,  1.6642e-01,
           -4.1548e-01,        -inf],
          [ 5.5566e-01, -8.8920e-02, -3.5326e-01,  ...,  2.4195e-01,
            2.4298e-02,  1.0459e-01]],

         [[-5.9204e-01,        -inf,        -inf,  ...,        -inf,
                  -inf,        -inf],
          [-1.6259e-01, -1.9450e-01,        -inf,  ...,        -inf,
                  -inf,        -inf],
          [ 5.8910e-02, -3.4836e-01, -2.8727e-01,  ...,        -inf,
                  -inf,        -inf],
          ...,
     

In [96]:
# softmax, dim = -1对每一行进行softmax
attention_score = torch.softmax(output, dim=-1)
attention_score

tensor([[[[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.3486, 0.6514, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.3525, 0.3153, 0.3322,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0586, 0.0364, 0.0701,  ..., 0.0543, 0.0000, 0.0000],
          [0.0693, 0.0446, 0.0372,  ..., 0.0698, 0.0390, 0.0000],
          [0.1146, 0.0601, 0.0462,  ..., 0.0837, 0.0673, 0.0730]],

         [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.5080, 0.4920, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.4214, 0.2804, 0.2981,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0752, 0.1122, 0.0614,  ..., 0.0288, 0.0000, 0.0000],
          [0.0345, 0.1209, 0.0772,  ..., 0.0607, 0.0665, 0.0000],
          [0.0629, 0.0430, 0.0468,  ..., 0.0953, 0.0445, 0.0327]],

         [[1.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.5350, 0.4650, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.2839, 0.3668, 0.3493,  ..., 0

In [97]:
# apply attention @ V
attention_output = attention_score @ V
attention_output.shape


torch.Size([4, 4, 16, 16])

In [100]:
# concat
attention_output = attention_output.transpose(1, 2).reshape(batch_size, token_length, d_model)
attention_output.shape
Wo = nn.Linear(d_model, d_model)
output = Wo(attention_output)
output.shape

torch.Size([4, 16, 64])

In [103]:
# layer norm
layer_norm = nn.LayerNorm(d_model)
layer_norm_output = layer_norm(output)



In [108]:
# FFN
output = nn.Linear(d_model, d_model * 4)(layer_norm_output)
output = nn.ReLU()(output)
output = nn.Linear(d_model * 4, d_model)(output)
# residual
output = output + layer_norm_output
output.shape

torch.Size([4, 16, 64])

In [None]:
# final linear layer
output = nn.Linear(d_model, max_token_value + 1)(output)
output.shape

torch.Size([4, 16, 100070])

In [1]:
logits = nn.softmax(output, dim = -1)
logits

NameError: name 'nn' is not defined