In [32]:
# Toy training corpus: four short English sentences from which the word
# frequencies (and, from those, the vocabulary) are derived.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [33]:
# Pre-split the corpus into words.
# The pretrained "bert-base-cased" tokenizer is loaded here; the cells in view
# only use its pre-tokenizer (to split raw text into words).

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


In [34]:
from collections import defaultdict

# Count how many times each pre-tokenized word occurs in the corpus.
word_freqs = defaultdict(int)

# Bind the pre-tokenization callable once instead of resolving the attribute
# chain for every sentence.
pre_tokenize = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str

for sentence in corpus:
    # Each item is a (word, offsets) pair; only the word text is needed here.
    for token, _offsets in pre_tokenize(sentence):
        word_freqs[token] += 1
word_freqs

defaultdict(int,
            {'This': 3,
             'is': 2,
             'the': 1,
             'Hugging': 1,
             'Face': 1,
             'Course': 1,
             '.': 4,
             'chapter': 1,
             'about': 1,
             'tokenization': 1,
             'section': 1,
             'shows': 1,
             'several': 1,
             'tokenizer': 1,
             'algorithms': 1,
             'Hopefully': 1,
             ',': 1,
             'you': 1,
             'will': 1,
             'be': 1,
             'able': 1,
             'to': 1,
             'understand': 1,
             'how': 1,
             'they': 1,
             'are': 1,
             'trained': 1,
             'and': 1,
             'generate': 1,
             'tokens': 1})

In [35]:
# Collect the initial alphabet: the first character of every word as-is, and
# every later character prefixed with "##" (marking "continues a word").
symbols = set()

for word in word_freqs:
    symbols.add(word[0])
    # From the second character through the last one.
    # f-string: equivalent to "##" + ch
    symbols.update(f"##{ch}" for ch in word[1:])

# A set removes duplicates; sorting gives the same ordered list as appending
# unique items to a list and sorting it afterwards.
alphabet = sorted(symbols)
alphabet

['##a',
 '##b',
 '##c',
 '##d',
 '##e',
 '##f',
 '##g',
 '##h',
 '##i',
 '##k',
 '##l',
 '##m',
 '##n',
 '##o',
 '##p',
 '##r',
 '##s',
 '##t',
 '##u',
 '##v',
 '##w',
 '##y',
 '##z',
 ',',
 '.',
 'C',
 'F',
 'H',
 'T',
 'a',
 'b',
 'c',
 'g',
 'h',
 'i',
 's',
 't',
 'u',
 'w',
 'y']

In [36]:
# Initial vocabulary: the special tokens first, then the alphabet.
# List concatenation already builds a new list, so `alphabet` is not aliased.
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet
# NOTE(review): `vacab` is a misspelling of `vocab`. It is kept bound to the
# SAME list object because a later cell appends through this name.
vacab = vocab

In [37]:
# Initial segmentation of each word into single characters: the first
# character stays bare, every following character gets the "##" prefix.
splits = {}
for word in word_freqs:
    # list(word[:1]) is [first_char] (or [] for an empty word).
    splits[word] = list(word[:1]) + [f"##{ch}" for ch in word[1:]]
splits

{'This': ['T', '##h', '##i', '##s'],
 'is': ['i', '##s'],
 'the': ['t', '##h', '##e'],
 'Hugging': ['H', '##u', '##g', '##g', '##i', '##n', '##g'],
 'Face': ['F', '##a', '##c', '##e'],
 'Course': ['C', '##o', '##u', '##r', '##s', '##e'],
 '.': ['.'],
 'chapter': ['c', '##h', '##a', '##p', '##t', '##e', '##r'],
 'about': ['a', '##b', '##o', '##u', '##t'],
 'tokenization': ['t',
  '##o',
  '##k',
  '##e',
  '##n',
  '##i',
  '##z',
  '##a',
  '##t',
  '##i',
  '##o',
  '##n'],
 'section': ['s', '##e', '##c', '##t', '##i', '##o', '##n'],
 'shows': ['s', '##h', '##o', '##w', '##s'],
 'several': ['s', '##e', '##v', '##e', '##r', '##a', '##l'],
 'tokenizer': ['t', '##o', '##k', '##e', '##n', '##i', '##z', '##e', '##r'],
 'algorithms': ['a',
  '##l',
  '##g',
  '##o',
  '##r',
  '##i',
  '##t',
  '##h',
  '##m',
  '##s'],
 'Hopefully': ['H', '##o', '##p', '##e', '##f', '##u', '##l', '##l', '##y'],
 ',': [','],
 'you': ['y', '##o', '##u'],
 'will': ['w', '##i', '##l', '##l'],
 'be': ['b', '##e

In [38]:
def compute_pair_scores(splits, freqs=None):
    """Score every adjacent token pair as a candidate for the next merge.

    The score of a pair (a, b) is::

        pair_count(a, b) / (token_count(a) * token_count(b))

    where every count is weighted by how often the containing word occurs
    (the WordPiece-style criterion: frequent pairs made of otherwise rare
    tokens score highest).

    Args:
        splits: mapping ``word -> list of tokens`` (current segmentation).
        freqs: mapping ``word -> occurrence count``. When omitted, the
            notebook-level ``word_freqs`` is used, so existing one-argument
            calls behave exactly as before.

    Returns:
        dict mapping ``(first_token, second_token) -> float`` score, ordered
        by first appearance of each pair.

    Raises:
        KeyError: if a word in ``freqs`` has no entry in ``splits``.
    """
    if freqs is None:
        # Backward-compatible default: fall back to the notebook global.
        freqs = word_freqs

    token_counts = defaultdict(int)
    pair_counts = defaultdict(int)
    for word, count in freqs.items():
        tokens = splits[word]
        # Every token of the word contributes, whether or not it is in a pair;
        # this also covers single-token words without a special case and
        # tolerates an empty split.
        for token in tokens:
            token_counts[token] += count
        # zip(tokens, tokens[1:]) yields each adjacent (left, right) tuple.
        for pair in zip(tokens, tokens[1:]):
            pair_counts[pair] += count

    return {
        (first, second): count / (token_counts[first] * token_counts[second])
        for (first, second), count in pair_counts.items()
    }

pair_scores = compute_pair_scores(splits)
# Print every candidate pair with its score, one "pair:score" line each.
for pair, score in pair_scores.items():
    print(f"{pair}:{score}")
     

('T', '##h'):0.125
('##h', '##i'):0.03409090909090909
('##i', '##s'):0.02727272727272727
('i', '##s'):0.1
('t', '##h'):0.03571428571428571
('##h', '##e'):0.011904761904761904
('H', '##u'):0.1
('##u', '##g'):0.05
('##g', '##g'):0.0625
('##g', '##i'):0.022727272727272728
('##i', '##n'):0.01652892561983471
('##n', '##g'):0.022727272727272728
('F', '##a'):0.14285714285714285
('##a', '##c'):0.07142857142857142
('##c', '##e'):0.023809523809523808
('C', '##o'):0.07692307692307693
('##o', '##u'):0.046153846153846156
('##u', '##r'):0.022222222222222223
('##r', '##s'):0.022222222222222223
('##s', '##e'):0.004761904761904762
('c', '##h'):0.125
('##h', '##a'):0.017857142857142856
('##a', '##p'):0.07142857142857142
('##p', '##t'):0.07142857142857142
('##t', '##e'):0.013605442176870748
('##e', '##r'):0.026455026455026454
('a', '##b'):0.2
('##b', '##o'):0.038461538461538464
('##u', '##t'):0.02857142857142857
('t', '##o'):0.04395604395604396
('##o', '##k'):0.07692307692307693
('##k', '##e'):0.04761904

In [39]:
# Pick the highest-scoring pair. max() keeps the first maximal entry, so ties
# resolve to the earliest pair; an empty score table yields ("", None).
best_pair, max_score = max(
    pair_scores.items(),
    key=lambda entry: entry[1],
    default=("", None),
)
print(best_pair, max_score)

('a', '##b') 0.2


In [40]:
# Register the first learned merge ("a" + "##b" -> "ab") in the vocabulary.
# Uses the canonical name `vocab`; the misspelled `vacab` is bound to the very
# same list object where the vocabulary is created, so the effect is unchanged.
vocab.append("ab")

def merge_pair(a, b, splits):
    """Merge every adjacent (a, b) token pair in each word's segmentation.

    Args:
        a: first token of the pair.
        b: second token of the pair. If it starts with "##", that prefix is
            dropped when it is glued onto ``a``.
        splits: mapping ``word -> list of tokens``. Updated in place.

    Returns:
        The same ``splits`` mapping that was passed in.
    """
    # The merged token depends only on (a, b), so build it once.
    merged = a + b[2:] if b.startswith("##") else a + b

    # Walk the mapping that was passed in rather than a notebook global, so
    # the function works on any splits dict. Re-assigning values of existing
    # keys while iterating is safe (the dict's size never changes).
    for word, split in splits.items():
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                # Build a new list instead of mutating the old one. `i` is not
                # advanced, so the merged token is examined again next round.
                split = split[:i] + [merged] + split[i + 2:]
            else:
                i += 1
        splits[word] = split
    return splits

In [41]:
# Apply the ("a", "##b") merge to every word. merge_pair updates the dict it
# is given and returns that same dict, so the re-assignment keeps the name bound.
splits = merge_pair("a","##b",splits)
# Inspect one affected word to confirm the merge took place.
splits["about"]

['ab', '##o', '##u', '##t']

In [42]:
vocab_size = 70

# Training loop: repeatedly merge the best-scoring pair and add the resulting
# token to the vocabulary until it reaches the target size.
while len(vocab) < vocab_size:
    scores = compute_pair_scores(splits)
    if not scores:
        # Every word is already a single token, so there is nothing left to
        # merge. Without this guard the code below would index into an empty
        # best pair and raise IndexError.
        break
    # max() returns the first key with the highest score, i.e. the same
    # tie-breaking as a manual scan with a strict "greater than" comparison.
    best_pair = max(scores, key=scores.get)
    max_score = scores[best_pair]
    first, second = best_pair
    splits = merge_pair(first, second, splits)
    # Drop the "##" continuation prefix of the second token when gluing.
    new_token = first + second[2:] if second.startswith("##") else first + second
    vocab.append(new_token)

In [43]:
# Display the current contents of the vocabulary list.
print(vocab)

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y', 'ab', '##fu', 'Fa', 'Fac', '##ct', '##ful', '##full', '##fully', 'Th', 'ch', '##hm', 'cha', 'chap', 'chapt', '##thm', 'Hu', 'Hug', 'Hugg', 'sh', 'th', 'is', '##thms', '##za', '##zat', '##ut']


In [44]:
def encode_word(word):
    """Split one word into vocabulary tokens, longest prefix first.

    Repeatedly takes the longest prefix of the remaining text that is present
    in the notebook-level ``vocab``; after the first token, the remainder is
    prefixed with "##" before matching.

    Args:
        word: the word to encode.

    Returns:
        List of tokens, or ``["[UNK]"]`` for the whole word as soon as some
        remainder has no prefix in ``vocab``. An empty word gives ``[]``.
    """
    pieces = []
    remaining = word
    while remaining:
        # Try prefixes from the full remainder down to a single character.
        for end in range(len(remaining), 0, -1):
            if remaining[:end] in vocab:
                break
        else:
            # No prefix matched: the entire word is reported as unknown.
            return ["[UNK]"]
        pieces.append(remaining[:end])
        remaining = remaining[end:]
        if remaining:
            # What is left continues the word, so mark it accordingly.
            remaining = f"##{remaining}"
    return pieces

In [45]:
# Encode a single word with the vocabulary built above.
print(encode_word("Hugging"))

['Hugg', '##i', '##n', '##g']


In [48]:
def tokenize(text):
    """Tokenize raw text: pre-tokenize into words, then encode each word.

    Args:
        text: the input string.

    Returns:
        Flat list of tokens, in reading order, produced by ``encode_word``
        for every pre-tokenized word.
    """
    # Use the public `backend_tokenizer` accessor (as the word-counting cell
    # does) instead of reaching into the private `_tokenizer` attribute.
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    # Flatten in a single pass; sum(list_of_lists, []) copies the accumulated
    # list on every addition.
    return [
        token
        for word, _offsets in words_with_offsets
        for token in encode_word(word)
    ]


In [49]:
# Run the full pipeline (pre-tokenize, then encode each word) on a sample sentence.
tokenize("This is the Hugging Face course!")

['Th',
 '##i',
 '##s',
 'is',
 'th',
 '##e',
 'Hugg',
 '##i',
 '##n',
 '##g',
 'Fac',
 '##e',
 'c',
 '##o',
 '##u',
 '##r',
 '##s',
 '##e',
 '[UNK]']