In [1]:
# Toy training corpus: four short English sentences that the cells below use as training text.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

In [2]:
from transformers import AutoTokenizer

# Load the pretrained "xlnet-base-cased" tokenizer; the cells below use its
# backend_tokenizer.pre_tokenizer to split raw text into words.
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")


In [3]:
from collections import defaultdict

# Count how many times each pre-tokenized word occurs across the whole corpus.
word_freqs = defaultdict(int)
for sentence in corpus:
    # pre_tokenize_str returns (word, offsets) pairs; only the word text is counted.
    pre_tokenized = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(sentence)
    for piece, _offsets in pre_tokenized:
        word_freqs[piece] += 1
word_freqs

defaultdict(int,
            {'▁This': 3,
             '▁is': 2,
             '▁the': 1,
             '▁Hugging': 1,
             '▁Face': 1,
             '▁Course.': 1,
             '▁chapter': 1,
             '▁about': 1,
             '▁tokenization.': 1,
             '▁section': 1,
             '▁shows': 1,
             '▁several': 1,
             '▁tokenizer': 1,
             '▁algorithms.': 1,
             '▁Hopefully,': 1,
             '▁you': 1,
             '▁will': 1,
             '▁be': 1,
             '▁able': 1,
             '▁to': 1,
             '▁understand': 1,
             '▁how': 1,
             '▁they': 1,
             '▁are': 1,
             '▁trained': 1,
             '▁and': 1,
             '▁generate': 1,
             '▁tokens.': 1})

In [4]:
# Tally every single character and every substring of length >= 2,
# each weighted by the frequency of the word it was found in.
char_freqs = defaultdict(int)
subwords_freqs = defaultdict(int)
for word, freq in word_freqs.items():
    word_len = len(word)
    for begin, char in enumerate(word):
        char_freqs[char] += freq
        # Inner loop enumerates the substrings starting at `begin` that span at least two characters.
        for stop in range(begin + 2, word_len + 1):
            subwords_freqs[word[begin:stop]] += freq
# .items() yields (substring, frequency) pairs; sorted() turns them into a list ordered by
# descending frequency. The sort is stable, so ties keep their insertion order.
sorted_subwords = sorted(
    subwords_freqs.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
sorted_subwords[:10]


[('▁t', 7),
 ('is', 5),
 ('er', 5),
 ('▁a', 5),
 ('▁to', 4),
 ('to', 4),
 ('en', 4),
 ('▁T', 3),
 ('▁Th', 3),
 ('▁Thi', 3)]

In [5]:
# Initial vocabulary: every single character, topped up with the most frequent
# substrings so that the vocabulary holds up to 300 entries in total.
token_freqs = dict(
    [*char_freqs.items(), *sorted_subwords[: 300 - len(char_freqs)]]
)
token_freqs

{'▁': 31,
 'T': 3,
 'h': 9,
 'i': 13,
 's': 13,
 't': 14,
 'e': 21,
 'H': 2,
 'u': 6,
 'g': 5,
 'n': 11,
 'F': 1,
 'a': 12,
 'c': 3,
 'C': 1,
 'o': 13,
 'r': 9,
 '.': 4,
 'p': 2,
 'b': 3,
 'k': 3,
 'z': 2,
 'w': 3,
 'v': 1,
 'l': 7,
 'm': 1,
 'f': 1,
 'y': 3,
 ',': 1,
 'd': 4,
 '▁t': 7,
 'is': 5,
 'er': 5,
 '▁a': 5,
 '▁to': 4,
 'to': 4,
 'en': 4,
 '▁T': 3,
 '▁Th': 3,
 '▁Thi': 3,
 '▁This': 3,
 'Th': 3,
 'Thi': 3,
 'This': 3,
 'hi': 3,
 'his': 3,
 'th': 3,
 'ou': 3,
 'se': 3,
 '▁tok': 3,
 '▁toke': 3,
 '▁token': 3,
 'tok': 3,
 'toke': 3,
 'token': 3,
 'ok': 3,
 'oke': 3,
 'oken': 3,
 'ke': 3,
 'ken': 3,
 '▁s': 3,
 'ra': 3,
 'nd': 3,
 '▁i': 2,
 '▁is': 2,
 '▁th': 2,
 '▁the': 2,
 'the': 2,
 'he': 2,
 '▁H': 2,
 'in': 2,
 'rs': 2,
 'te': 2,
 '▁ab': 2,
 'ab': 2,
 '▁tokeni': 2,
 '▁tokeniz': 2,
 'tokeni': 2,
 'tokeniz': 2,
 'okeni': 2,
 'okeniz': 2,
 'keni': 2,
 'keniz': 2,
 'eni': 2,
 'eniz': 2,
 'ni': 2,
 'niz': 2,
 'iz': 2,
 'at': 2,
 'ti': 2,
 'tio': 2,
 'tion': 2,
 'io': 2,
 'ion': 2,
 'on':

In [6]:
from math import log

# Turn raw counts into costs: each token maps to the negative log of its relative frequency.
total_sum = sum(token_freqs.values())
model = {token: -log(count / total_sum) for token, count in token_freqs.items()}

In [7]:
# Viterbi algorithm: find the lowest-cost split of a word into tokens of the model.

def encode_word(word, model):
    """Split ``word`` into the lowest-cost sequence of tokens found in ``model``.

    Args:
        word: The string to segment.
        model: Mapping from token string to its cost (lower is better).

    Returns:
        A ``(tokens, score)`` tuple. ``tokens`` is the list of pieces in order and
        ``score`` is the accumulated cost of that segmentation. When no sequence of
        model tokens covers the whole word, ``(["<unk>"], None)`` is returned.

    NOTE(review): the empty prefix is seeded with a score of 1, so every returned
    score includes that constant offset; it does not influence which segmentation wins.
    """
    n_chars = len(word)
    # best_segmentations[k] records the cheapest known way to cover word[:k]:
    # "start" is where its last token begins and "score" is the total cost so far.
    # Entries that have not been reached yet hold None in both fields.
    best_segmentations = [{"start": 0, "score": 1}]
    best_segmentations.extend({"start": None, "score": None} for _ in range(n_chars))

    for start_idx in range(n_chars):
        prefix_score = best_segmentations[start_idx]["score"]
        if prefix_score is None:
            # word[:start_idx] cannot be built from model tokens, so nothing can extend it.
            continue
        for end_idx in range(start_idx + 1, n_chars + 1):
            piece = word[start_idx:end_idx]
            if piece not in model:
                continue
            candidate = model[piece] + prefix_score
            incumbent = best_segmentations[end_idx]["score"]
            # Strict comparison: on a tie the segmentation found first is kept.
            if incumbent is None or incumbent > candidate:
                best_segmentations[end_idx] = {"start": start_idx, "score": candidate}

    final = best_segmentations[-1]
    if final["score"] is None:
        return ["<unk>"], None

    # Follow the "start" pointers back from the end of the word, collecting the
    # pieces last-to-first, then flip them into reading order.
    pieces = []
    end = n_chars
    start = final["start"]
    while start != 0:
        pieces.append(word[start:end])
        end, start = start, best_segmentations[start]["start"]
    pieces.append(word[start:end])
    pieces.reverse()
    return pieces, final["score"]


In [9]:
# Show the segmentation and score for one word that gets split and one that stays whole.
for sample in ("Hopefully", "This"):
    print(encode_word(sample, model))

(['H', 'o', 'p', 'e', 'f', 'u', 'll', 'y'], 41.5157494601402)
(['This'], 6.288267030694535)


In [10]:
# Compute the tokenization loss over the corpus.
def compute_loss(model):
    """Return the total segmentation cost of the corpus under ``model``.

    Each word in the module-level ``word_freqs`` is segmented with ``encode_word``
    and its score is weighted by the word's frequency; the weighted scores are summed.

    NOTE(review): ``encode_word`` returns a score of ``None`` for a word it cannot
    segment, in which case the multiplication below raises ``TypeError``.
    """
    return sum(
        freq * encode_word(word, model)[1]
        for word, freq in word_freqs.items()
    )

In [11]:
# Corpus loss of the initial model, before any token has been removed.
compute_loss(model)

413.10377642940875

In [12]:
# Score every token by how much the corpus loss changes when that token is removed from the model.
import copy

def compute_scores(model):
    """Score each multi-character token by the loss increase caused by removing it.

    Args:
        model: Mapping from token string to its cost, as consumed by ``compute_loss``.

    Returns:
        Dict mapping every token longer than one character to
        ``compute_loss(model without that token) - compute_loss(model)``.
        Single-character tokens get no entry.
    """
    scores = {}
    model_loss = compute_loss(model)
    for token in model:
        # Single-character tokens are always kept, so they are never scored.
        if len(token) == 1:
            continue
        # A shallow copy is enough here: only a top-level key is removed and the
        # values are never mutated, so the caller's model stays untouched.
        model_without_token = dict(model)
        del model_without_token[token]
        scores[token] = compute_loss(model_without_token) - model_loss
    return scores

In [13]:
# Score the whole vocabulary once, then inspect two example tokens.
scores = compute_scores(model)
for token in ("ll", "his"):
    print(scores[token])

6.376412403623874
0.0


In [None]:
# For efficiency, a fixed fraction (10%) of the current vocabulary is removed
# per round instead of one token at a time.
#
# sorted_scores is a list of (token, score) pairs in ascending score order, e.g.
# [
#     ("apple", 0.001),   # index 0
#     ("banana", 0.005),  # index 1
#     ("orange", 10.5),   # index 2
#     ...
# ]

percent_to_remove = 0.1
while len(model) > 100:
    scores = compute_scores(model)
    sorted_scores = sorted(scores.items(), key=lambda x: x[1])
    # The quota is based on the full vocabulary size, but compute_scores leaves out
    # single-character tokens, so there can be fewer candidates than the quota.
    n_to_remove = min(int(len(model) * percent_to_remove), len(sorted_scores))
    if n_to_remove == 0:
        # No removable token is left; stop instead of indexing past the candidates.
        break
    # Remove the lowest-scoring tokens, i.e. those whose removal raises the loss the least.
    for token, _ in sorted_scores[:n_to_remove]:
        del token_freqs[token]

    # Rebuild the costs from the remaining counts so they are normalised again.
    total_sum = sum([freq for token, freq in token_freqs.items()])
    model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}

In [17]:
def tokenize(text, model):
    """Tokenize ``text`` with the given unigram ``model``.

    The text is first split into words by the module-level tokenizer's
    pre-tokenizer; each word is then segmented with ``encode_word`` and the
    resulting pieces are concatenated into one flat list, in order.
    """
    pre_tokenized = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    return [
        piece
        for word, _offsets in pre_tokenized
        for piece in encode_word(word, model)[0]
    ]


# Try the tokenizer on a sample sentence; note "course." is lowercase here,
# whereas the training corpus spells it "Course.".
tokenize("This is the Hugging Face course.", model)

['▁This',
 '▁is',
 '▁the',
 '▁Hugging',
 '▁Face',
 '▁',
 'c',
 'ou',
 'r',
 's',
 'e',
 '.']