In [10]:
text_path = 'sources/the-verdict.txt'
with open(text_path, 'r', encoding='utf-8') as f:
    raw_text = f.read()
print('Total number of characters:', len(raw_text))
print(raw_text[:100])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [11]:
import re
# 匹配空格和各种符号
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# 去掉空格
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4690


关于上面的代码，在tokenization过程中需不需要去掉空格，取决于具体的应用。
对于普通文本，空格仅仅作为分割单词的符号，没有语义，则可以去掉以减少计算量。
对于像python代码的文本，空格具有明确的语义（如缩进），则不能去掉。

接下来需要将每个不重复的token组成单词表，并按顺序将每个token映射到一个token id。

In [12]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)
print(all_words[:20])

1130
['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be']


In [13]:
vocab = {token:id for id, token in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


将上述过程写成类

In [14]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str2id = vocab
        self.id2str = {id:string for string, id in vocab.items()}
    def encode(self, text):
        # 匹配空格和各种符号
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # 去掉空格
        preprocessed = [item.strip() for item in preprocessed if item.strip()]

        ids = [self.str2id[item] for item in preprocessed]
        return ids
    def decode(self, ids):
        text = ' '.join([self.id2str[i] for i in ids])

        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [15]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)
print(tokenizer.decode(ids))

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [16]:
text = "Hello, do you like tea?" 
try:
    print(tokenizer.encode(text))
except KeyError as e:
    print(f'出错！{e} 不存在')

出错！'Hello' 不存在


## 2.4 添加特殊文本token
上面的tokenizer仅能处理vocab中包含的单词，当遇到Hello时会报错，因为vocab中没有这个词。下面为其添加能够处理未知词的token标记

In [17]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str2id = vocab
        self.id2str = {id:string for string, id in vocab.items()}
    def encode(self, text):
        # 匹配空格和各种符号
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # 去掉空格
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        #相比v1添加了未知字符的替代token <|unk|>
        preprocessed = [item if item in self.str2id else '<|unk|>' for item in preprocessed]

        ids = [self.str2id[item] for item in preprocessed]
        return ids
    def decode(self, ids):
        text = ' '.join([self.id2str[i] for i in ids])

        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text
    
# 需要同时将新token添加到字典中
vocab['<|endoftext|>'] = len(vocab)
vocab['<|unk|>'] = len(vocab)

for item in list(vocab.items())[-5:]:
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [18]:
# 创建一段新的包含两种新token的文本
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace." 
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [19]:
tokenizerv2 = SimpleTokenizerV2(vocab)

print(tokenizerv2.encode(text))
print(tokenizerv2.decode(tokenizerv2.encode(text)))

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


## 2.5 字节对编码（BPE）
上面使用的是简单的统计文本中的单词生成的字典，而在大模型中使用的是相对复杂的字节对编码。本节仅介绍如何调包，不涉及原理。

首先需要安装tiktoken包
```bash
pip install tiktoken
```

In [20]:
from importlib.metadata import version
import tiktoken

print('tiktoken version:', version("tiktoken"))

tiktoken version: 0.11.0


In [21]:
tokenizer_bpe = tiktoken.get_encoding('gpt2')
text = ( "Hello, do you like tea? <|endoftext|> In the sunlit terraces" 
        "of someunknownPlace." 
        )
ids = tokenizer_bpe.encode(text, allowed_special={'<|endoftext|>'})
print(ids)
text = tokenizer_bpe.decode(ids)
print(text)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


## 2.6 用滑动窗口对数据采样
因为大模型实际上执行的是预测下一个词任务，所以采样的y（标签）就是将x（输入）右移一位。

In [22]:
enc_text = tokenizer_bpe.encode(raw_text)
print(len(enc_text))

5145


In [23]:
enc_sample = enc_text[50:]
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [24]:
# Implement above process in a Dataset Class
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, text, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        # 此处记得调encode函数
        text_enc = tokenizer.encode(text)
        for i in range(0, len(text_enc) - max_length, stride):
            input_chunk = text_enc[i:i+max_length]
            target_chunk = text_enc[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]

In [25]:
def create_dataloader_v1(text, batch_size=4,
                          max_length=256,
                            stride=128,
                            shuffle=True,
                            drop_last=True,
                            num_workers=0):
    tokenizer = tiktoken.get_encoding('gpt2')
    dataset = GPTDatasetV1(text, tokenizer, max_length, stride)
    # print(dataset[0])
    dataloader = DataLoader(dataset, 
                            batch_size=batch_size, 
                            shuffle=shuffle, 
                            num_workers=num_workers, 
                            drop_last=drop_last)
    
    return dataloader

dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)



[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


## 2.7 生成token嵌入
token embedding是指为每个token生成一个维度为embedding dim的向量来表示这个token，显然不同嵌入向量的数目等于词汇表的大小。

In [26]:
# a small vocab_size just for show
vocab_size = 6
embed_dim = 3
torch.manual_seed(42)
embedding_layer = torch.nn.Embedding(vocab_size, embed_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 1.9269,  1.4873, -0.4974],
        [ 0.4396, -0.7581,  1.0783],
        [ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [-0.2316,  0.0418, -0.2516],
        [ 0.8599, -0.3097, -0.3957]], requires_grad=True)


### 关于torch.nn.Embedding
其参数矩阵的shape为 embedding_num x embedding_dim。作用是将根据输入的数字返回相应序数的嵌入向量。也可以看作将输入数字转换为独热编码，然后与参数矩阵相乘。参见下面的代码

In [None]:
# 观察其输出就是上面的参数矩阵的第二行。
print(embedding_layer(torch.tensor([1])))
# 假设有一个token序列
input_ids = torch.tensor([2, 3, 5, 1])
# 等价于一个[input_len, vacab_size]的独热编码矩阵乘以embedding_layer的参数矩阵
print(embedding_layer(input_ids))

tensor([[ 0.4396, -0.7581,  1.0783]], grad_fn=<EmbeddingBackward0>)
tensor([[ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [ 0.8599, -0.3097, -0.3957],
        [ 0.4396, -0.7581,  1.0783]], grad_fn=<EmbeddingBackward0>)


## 2.8 编码单词位置
编码分为绝对位置编码和相对位置编码，本节仿照ChatGPT采用的可学习的绝对位置编码。

In [37]:
# 首先用上面写的dataloader构造方法构造一个数据集
context_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=context_length, stride=4, shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print('Token IDs:', inputs)
print('Data shape:', inputs.shape)


Token IDs: tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Data shape: torch.Size([8, 4])


In [38]:
# 接下来需要知道我们使用的bpe tokenizer的vocab_size
vocab_size = tokenizer_bpe.max_token_value + 1
embed_dim = 256
embedding_layer = torch.nn.Embedding(vocab_size, embed_dim)
# 将数据嵌入
input_embedding = embedding_layer(inputs)
print(input_embedding.shape) # [batch_size, context_length, embedding_dim]

torch.Size([8, 4, 256])


接下来构造位置编码。对于绝对位置编码，只需要对每个pos构造一个与上面相同维度的嵌入向量即可。

In [None]:
pos_embedding_layer = torch.nn.Embedding(context_length, embed_dim)
pos_embedding = pos_embedding_layer(torch.arange(context_length))
# 可以看到维度和上面的input一样
print(pos_embedding.shape)

torch.Size([4, 256])


In [None]:
# 将位置编码加入到输入中
input_embedding += pos_embedding