In [1]:
# Toy training corpus: four short sentences. The cells below pre-tokenize
# these sentences and count word frequencies from them.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [None]:
# Pre-tokenization
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Count how often each word occurs in the corpus while pre-tokenizing.

from collections import defaultdict
# defaultdict(int) calls int() for a missing key, so a word seen for the first
# time starts at 0 instead of raising KeyError; no explicit initialisation is
# needed before the "+= 1".

word_freqs = defaultdict(int)

pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
for sentence in corpus:
    # Each call handles a single sentence and yields (word, offsets) pairs;
    # only the word is needed for counting.
    for token, _offsets in pre_tokenizer.pre_tokenize_str(sentence):
        word_freqs[token] += 1

print(word_freqs)

defaultdict(<class 'int'>, {'This': 3, 'Ġis': 2, 'Ġthe': 1, 'ĠHugging': 1, 'ĠFace': 1, 'ĠCourse': 1, '.': 4, 'Ġchapter': 1, 'Ġabout': 1, 'Ġtokenization': 1, 'Ġsection': 1, 'Ġshows': 1, 'Ġseveral': 1, 'Ġtokenizer': 1, 'Ġalgorithms': 1, 'Hopefully': 1, ',': 1, 'Ġyou': 1, 'Ġwill': 1, 'Ġbe': 1, 'Ġable': 1, 'Ġto': 1, 'Ġunderstand': 1, 'Ġhow': 1, 'Ġthey': 1, 'Ġare': 1, 'Ġtrained': 1, 'Ġand': 1, 'Ġgenerate': 1, 'Ġtokens': 1})


In [None]:
# Build the base vocabulary: every distinct character that appears in the
# words of the corpus.

# The set comprehension removes duplicates; sorted() turns the set into an
# ordered list.
alphabet = sorted({letter for word in word_freqs for letter in word})

print(alphabet)

[',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']


In [None]:
# Put the special token in front of the base alphabet (a new list is built,
# so `alphabet` itself is left untouched).
vocab = ["<|endoftext|>", *alphabet]
print(vocab)

['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']


In [12]:
# Split every word into its individual characters: the starting point for
# training.

splits = {word: list(word) for word in word_freqs}
print(splits)

{'This': ['T', 'h', 'i', 's'], 'Ġis': ['Ġ', 'i', 's'], 'Ġthe': ['Ġ', 't', 'h', 'e'], 'ĠHugging': ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'], 'ĠFace': ['Ġ', 'F', 'a', 'c', 'e'], 'ĠCourse': ['Ġ', 'C', 'o', 'u', 'r', 's', 'e'], '.': ['.'], 'Ġchapter': ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'], 'Ġabout': ['Ġ', 'a', 'b', 'o', 'u', 't'], 'Ġtokenization': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n'], 'Ġsection': ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'], 'Ġshows': ['Ġ', 's', 'h', 'o', 'w', 's'], 'Ġseveral': ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'], 'Ġtokenizer': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'], 'Ġalgorithms': ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'], 'Hopefully': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'], ',': [','], 'Ġyou': ['Ġ', 'y', 'o', 'u'], 'Ġwill': ['Ġ', 'w', 'i', 'l', 'l'], 'Ġbe': ['Ġ', 'b', 'e'], 'Ġable': ['Ġ', 'a', 'b', 'l', 'e'], 'Ġto': ['Ġ', 't', 'o'], 'Ġunderstand': ['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd'], 'Ġh

In [None]:
# Count the frequency of every adjacent token pair

def compute_pair_freqs(splits, freqs=None):
    """Count how often each adjacent pair of tokens occurs across all words.

    Args:
        splits: dict mapping each word to the list of tokens it is currently
            split into.
        freqs: dict mapping each word to its frequency in the corpus. When
            omitted, the notebook-level ``word_freqs`` is used, which keeps
            existing ``compute_pair_freqs(splits)`` calls working.

    Returns:
        A ``defaultdict(int)`` mapping ``(left, right)`` token tuples to their
        total frequency, in first-seen order.

    Raises:
        KeyError: if a word in ``freqs`` has no entry in ``splits``.
    """
    if freqs is None:
        freqs = word_freqs
    pair_freqs = defaultdict(int)
    for word, freq in freqs.items():
        # Token list that currently makes up this word.
        tokens = splits[word]
        # zip() yields nothing for words of fewer than two tokens, so they
        # contribute no pairs.
        for left, right in zip(tokens, tokens[1:]):
            # The pair is kept as a tuple, not concatenated: concatenation is
            # the merge step, which has not happened yet.
            # Each occurrence is weighted by the frequency of the whole word.
            pair_freqs[(left, right)] += freq
    return pair_freqs
pair_freqs = compute_pair_freqs(splits)
# Preview only the first seven pairs, in insertion order.
for key in list(pair_freqs)[:7]:
    print(f"{key}:{pair_freqs[key]}")

('T', 'h'):3
('h', 'i'):3
('i', 's'):5
('Ġ', 'i'):2
('Ġ', 't'):7
('t', 'h'):3
('h', 'e'):2


In [17]:
# Find the most frequent pair. The comparison is strict, so on a tie the
# pair encountered first is kept.
best_pair = ""
max_freq = None

for pair, freq in pair_freqs.items():
    # `is None` is the identity test for the "nothing seen yet" sentinel.
    if max_freq is None or max_freq < freq:
        best_pair = pair
        max_freq = freq
print(best_pair, max_freq)

('Ġ', 't') 7


In [None]:
# Record the first merge rule (pair -> merged token) and add the merged
# token to the vocabulary.
merges = {('Ġ', 't'):"Ġt"}
vocab.append("Ġt")
# Apply one merge rule inside the splits dictionary
def merge_pair(a, b, splits):
    """Merge every adjacent occurrence of tokens ``a``, ``b`` into ``a + b``.

    Each word's token list is scanned left to right and matches are merged
    without overlapping (``a, a, a`` with ``a == b`` becomes ``aa, a``).

    Args:
        a: left token of the pair to merge.
        b: right token of the pair to merge.
        splits: dict mapping each word to its current list of tokens. It is
            updated in place.

    Returns:
        The same ``splits`` dict, so ``splits = merge_pair(a, b, splits)``
        keeps working.
    """
    merged = a + b
    # Iterate over `splits` itself rather than a notebook-level global, so the
    # function depends only on its arguments. Only values of existing keys are
    # reassigned, which is safe while iterating.
    for word, tokens in splits.items():
        if len(tokens) < 2:
            continue
        result = []
        i = 0
        while i < len(tokens):
            if i + 1 < len(tokens) and tokens[i] == a and tokens[i + 1] == b:
                result.append(merged)
                i += 2
            else:
                result.append(tokens[i])
                i += 1
        splits[word] = result
    return splits

# Apply the first merge rule to every word, then inspect one word that
# starts with the merged pair.
splits = merge_pair("Ġ", "t", splits)
print(splits["Ġtrained"])


['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']


In [20]:
# Target vocabulary size; merges are learned until the vocabulary reaches it.
vocab_size = 50
while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    if not pair_freqs:
        # Every word is already a single token, so there is nothing left to
        # merge; stop instead of failing on an empty best pair.
        break
    # max() returns the first maximal key, i.e. on ties the pair encountered
    # first wins, the same as a strict "<" scan.
    best_pair = max(pair_freqs, key=pair_freqs.get)
    max_freq = pair_freqs[best_pair]
    splits = merge_pair(*best_pair, splits)
    merged_token = best_pair[0] + best_pair[1]
    merges[best_pair] = merged_token
    vocab.append(merged_token)

print(merges)

{('Ġ', 't'): 'Ġt', ('i', 's'): 'is', ('e', 'r'): 'er', ('Ġ', 'a'): 'Ġa', ('Ġt', 'o'): 'Ġto', ('e', 'n'): 'en', ('T', 'h'): 'Th', ('Th', 'is'): 'This', ('o', 'u'): 'ou', ('s', 'e'): 'se', ('Ġto', 'k'): 'Ġtok', ('Ġtok', 'en'): 'Ġtoken', ('n', 'd'): 'nd', ('Ġ', 'is'): 'Ġis', ('Ġt', 'h'): 'Ġth', ('Ġth', 'e'): 'Ġthe', ('i', 'n'): 'in', ('Ġa', 'b'): 'Ġab', ('Ġtoken', 'i'): 'Ġtokeni'}


In [21]:
# Show the vocabulary after training: the special token, the base alphabet,
# then the merged tokens in the order they were appended.
print(vocab)

['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ', 'Ġt', 'is', 'er', 'Ġa', 'Ġto', 'en', 'Th', 'This', 'ou', 'se', 'Ġtok', 'Ġtoken', 'nd', 'Ġis', 'Ġth', 'Ġthe', 'in', 'Ġab', 'Ġtokeni']


In [None]:
def tokenize(text):
    """Tokenize ``text`` by applying the learned merge rules.

    The text is pre-tokenized into words, each word is split into characters,
    and every rule in the notebook-level ``merges`` dict is applied in
    insertion order (the order in which the rules were learned).

    Args:
        text: the string to tokenize.

    Returns:
        A flat list of tokens in their original left-to-right order.
    """
    # Use the public `backend_tokenizer` accessor, as the training cell does.
    pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    # One character list per pre-tokenized word; offsets are not needed.
    word_splits = [list(word) for word, _offsets in pre_tokenize_result]
    for pair, merge in merges.items():
        for idx, split in enumerate(word_splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2:]
                else:
                    i += 1
            word_splits[idx] = split
    # word_splits is a list of lists; flatten it in order, e.g.
    # [['Th', 'is'], ['Ġis'], ['.']] -> ['Th', 'is', 'Ġis', '.'].
    # This gives the same result as sum(word_splits, []) without re-copying
    # the accumulated list at every step.
    return [token for split in word_splits for token in split]

In [None]:
# The cells above train the tokenizer; this cell applies it.
# The output depends on the merges learned from `corpus` earlier.
tokenize("This is not a token.")
# "This", "is" and "token" are recognised as whole tokens; "not" was never
# seen during training, so it is split into smaller pieces.


['This', 'Ġis', 'Ġ', 'n', 'o', 't', 'Ġa', 'Ġtoken', '.']

In [None]:
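# Note: why does BPE keep `splits` in a dict during training but in a list at inference?
# Training favours efficiency (each distinct word is stored once); inference must preserve word order so the text can be reconstructed.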
