# 使用 gpt2 的分词器进行词元 id 编码的示例

In [3]:
import tiktoken
# Display the installed tiktoken version so the outputs in this notebook can be tied to a specific release.
tiktoken.__version__

'0.12.0'

In [None]:
# Loading the encoding requires network access to download it; for how the cache path
# is configured, see the source of the read_file_cached function in `load.py`.
tokenizer = tiktoken.get_encoding("gpt2")
# Total number of tokens in the vocabulary
tokenizer.n_vocab


50257

In [None]:
# Two sentences joined by the special end-of-text marker.
text = "Hello, world!<|endoftext|>It's a beautiful day."
# <|endoftext|> is the last entry of the gpt2 vocabulary (id 50256 out of 50257).
# Listing it in allowed_special lets encode() map it to that single special token id.
special_tokens = {"<|endoftext|>"}
integers = tokenizer.encode(text, allowed_special=special_tokens)
print(integers)

[15496, 11, 995, 0, 50256, 1026, 338, 257, 4950, 1110, 13]


In [22]:
# Decode the token ids back into text; the result round-trips to the original string.
strings = tokenizer.decode(integers)
print(strings)

Hello, world!<|endoftext|>It's a beautiful day.


# 嵌入层将词元 ID 转换成词元向量的示例

In [23]:
import torch

In [24]:
# Toy setup: a tiny vocabulary of only 6 entries, each embedded as a 3-dimensional vector.
vocab_size = 6
output_dims = 3
# Example input token IDs (each must be a valid index into the 6-entry vocabulary).
input_ids = torch.tensor([2, 3, 5, 1])

In [26]:
# Fix the RNG seed so the randomly initialised embedding weights are reproducible.
torch.manual_seed(123)
# Weight matrix has one row per vocabulary entry and one column per embedding dimension.
embedding_layer = torch.nn.Embedding(vocab_size, output_dims)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [None]:
# Apply the layer to a single token ID: the result is the row at index 3
# (zero-based, i.e. the fourth row) of the embedding weight matrix.
token_id = torch.tensor([3])
print(embedding_layer(token_id))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


# 嵌入向量添加学习式绝对位置编码的示例

In [None]:
import torch
from GPTDataSet import create_dataloader_v1

# Seed the RNG before creating the layer so its random initial weights are reproducible.
torch.manual_seed(123)

# NOTE: vocab_size / output_dims are rebound here, replacing the toy values (6 and 3)
# used in the previous section. 50257 equals the gpt2 tokenizer's n_vocab shown earlier.
vocab_size = 50257
# Every token ID is mapped to a 256-dimensional vector.
output_dims = 256
token_embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dims)

In [None]:
# Number of token IDs per sample (the first batch fetched from this loader has shape 2 x 4).
max_length = 4
raw_text = "Hello, world! It's a beautiful day. Today is a good day to learn about language models."
# The text must be long enough to produce at least one full batch; otherwise fetching
# the first batch with next() raises StopIteration.
# NOTE(review): stride == max_length presumably means consecutive windows do not overlap —
# confirm against create_dataloader_v1 in GPTDataSet.
dataloader = create_dataloader_v1(raw_text, batch_size=2, max_length=max_length, stride=max_length, shuffle=False)

In [None]:
# Pull the first batch from the loader; each batch unpacks into an (inputs, targets) pair.
data_iter = iter(dataloader)
first_batch = next(data_iter)
inputs, targets = first_batch
print("Token IDs:\n", inputs)
print("\nInput shapes:\n", inputs.shape)

Token IDs:
 tensor([[15496,    11,   995,     0],
        [  632,   338,   257,  4950]])

Input shapes:
 torch.Size([2, 4])


In [42]:
# Look up an embedding vector for every token ID in the batch:
# (2, 4) token IDs -> (2, 4, 256) embeddings.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([2, 4, 256])


In [46]:
# Learned absolute position embeddings: one trainable vector per position in the
# context window, with the same dimensionality as the token embeddings.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dims)
# Position indices 0 .. context_length - 1, used as lookup keys into the layer.
position_ids = torch.arange(context_length)
pos_embeddings = pos_embedding_layer(position_ids)
print(position_ids)
print(pos_embeddings.shape)

tensor([0, 1, 2, 3])
torch.Size([4, 256])


In [None]:
# Add the (4, 256) positional embeddings to the (4, 256) token-embedding matrix of
# every sample in the batch; broadcasting repeats pos_embeddings along the batch
# dimension, so the result keeps the (2, 4, 256) shape of token_embeddings.
input_embeddings = token_embeddings + pos_embeddings
# Backward-compatible alias for the original, misspelled variable name.
input_beddings = input_embeddings
print(input_embeddings.shape)

torch.Size([2, 4, 256])
