In [1]:
from datasets import load_dataset

# Load the "train" split of the "wikitext-2-raw-v1" configuration of wikitext.
dataset = load_dataset(
    path="wikitext",
    name="wikitext-2-raw-v1",
    split="train",
)
# Bare last expression so the notebook displays the dataset summary.
dataset

Dataset({
    features: ['text'],
    num_rows: 36718
})

In [2]:
def get_train_corpus(batch_size=1000):
    """Yield the training corpus as successive batches of raw text.

    Reads the notebook-level ``dataset`` and slices it ``batch_size`` rows at
    a time, so a trainer can consume the corpus batch by batch.

    Args:
        batch_size: Number of rows per yielded batch. Defaults to 1000, the
            value that used to be hard-coded.

    Yields:
        The ``"text"`` column of each slice ``dataset[i:i + batch_size]``.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size <= 0:
        # A negative step would silently yield nothing; fail loudly instead.
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")
    for start in range(0, len(dataset), batch_size):
        yield dataset[start:start + batch_size]["text"]

In [3]:
# Dump the corpus to a UTF-8 text file: each row's text followed by a newline.
with open("wikitext-2.txt","w",encoding="utf-8") as corpus_file:
    corpus_file.writelines(row["text"] + "\n" for row in dataset)

In [4]:
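# Build a tokenizer from scratch with the tokenizers library: first instantiate a Tokenizer object that wraps a model,
# then configure, in order: normalizer -> pre_tokenizer -> (train the model) -> post_processor -> decoder.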

from tokenizers import (
     decoders,
     models,
     normalizers,
     pre_tokenizers,
     processors,
     trainers,
     Tokenizer,
)
# Choose the model: WordPiece, with "[UNK]" as the unknown token.
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Normalization, option A: the ready-made BERT normalizer.
# (Immediately replaced by option B below; kept to show the alternative.)
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

# Normalization, option B: assemble the pipeline by hand.
# NFD() is the Unicode normalization that splits an accented character into
# its base character plus the accent mark, so StripAccents() can drop the mark.
manual_steps = [
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.StripAccents(),
]
tokenizer.normalizer = normalizers.Sequence(manual_steps)

# Quick check of the active normalizer on an accented sample.
print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

hello how are u?


In [5]:
# Pre-tokenization, option A: the ready-made BERT pre-tokenizer.
# (Immediately replaced by the Whitespace pre-tokenizer assigned below.)
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
# Whitespace splits on whitespace and on every character that is not a
# letter, a digit, or an underscore.
# WhitespaceSplit, by contrast, splits on whitespace only.
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
# Displayed as the cell output: (piece, (start, end)) pairs for the sample.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

In [6]:
# Compose a pre-tokenizer by hand: split on whitespace, then on punctuation.
# This is a standalone object; it is not assigned to tokenizer.pre_tokenizer.
pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)
# Displayed as the cell output for comparison with the Whitespace result.
pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

[('Let', (0, 3)),
 ("'", (3, 4)),
 ('s', (4, 5)),
 ('test', (6, 10)),
 ('my', (11, 13)),
 ('pre', (14, 17)),
 ('-', (17, 18)),
 ('tokenizer', (18, 27)),
 ('.', (27, 28))]

In [7]:
# The next step of the tokenization pipeline is running the inputs through
# the model. The model was chosen when the tokenizer was created, but it
# still has to be trained.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

# Method 1: streaming training from in-memory batches.
tokenizer.train_from_iterator(get_train_corpus(), trainer=trainer)
# Method 2: file-based training. Use it INSTEAD of method 1, not after it:
# running both trains the same model twice for no benefit. Reset the model
# first so training starts from a fresh WordPiece model.
# tokenizer.model = models.WordPiece(unk_token="[UNK]")
# tokenizer.train(["wikitext-2.txt"], trainer=trainer)

# Sanity check: encode a sample sentence with the trained model.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.']


In [8]:
# Look up the vocabulary IDs of the two special tokens used by the template.
cls_token_id, sep_token_id = (
    tokenizer.token_to_id(token) for token in ("[CLS]", "[SEP]")
)
print(cls_token_id, sep_token_id)

2 3


In [9]:
tokenizer.post_processor = processors.TemplateProcessing(
    # $A and $B are placeholders for the first and second input sentence.
    # The number after ":" is the token type ID: 0 for the first sentence,
    # 1 for the second.
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    # The templates name the special tokens as strings, but the encoding needs
    # their numeric IDs. This mapping tells the processor which vocabulary ID
    # to emit for "[CLS]" and "[SEP]" (2 and 3 in the vocabulary trained here).
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

In [10]:
# Re-encode the sample: the post-processor now adds [CLS] and [SEP].
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '.', '[SEP]']


In [11]:
# Encode a sentence pair; the type IDs tell the two sentences apart.
encoding = tokenizer.encode(
    "Let's test this tokenizer...",
    "on a pair of sentences.",
)
print(encoding.tokens, encoding.type_ids, sep="\n")

['[CLS]', 'let', "'", 's', 'test', 'this', 'tok', '##eni', '##zer', '...', '[SEP]', 'on', 'a', 'pair', 'of', 'sentences', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]


In [12]:
# Set the decoder: WordPiece continuation pieces carry the "##" prefix.
tokenizer.decoder = decoders.WordPiece(prefix="##")
# Displayed as the cell output: the pair encoded above, decoded back to text.
tokenizer.decode(encoding.ids)

"let ' s test this tokenizer... on a pair of sentences."

In [None]:
# Build a BPE tokenizer.
# Note: this rebinds `tokenizer` and `trainer`, replacing the WordPiece ones.
tokenizer = Tokenizer(models.BPE())

# Configure the byte-level pipeline pieces before training.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

trainer = trainers.BpeTrainer(
    vocab_size=25000,
    special_tokens=["<|endoftext|>"],
)

# Method 1: streaming training from in-memory batches.
tokenizer.train_from_iterator(get_train_corpus(), trainer=trainer)
# Method 2: file-based training (alternative to method 1; reset the model first).
# tokenizer.model = models.BPE()
# tokenizer.train(["wikitext-2.txt"], trainer=trainer)

In [18]:
# Encode a sample with the BPE tokenizer and inspect one token's offsets.
sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
print(encoding.tokens)
# Offsets of the token at index 4, used to slice the original sentence.
start, end = encoding.offsets[4]
# Displayed as the cell output: the source text covered by that token.
sentence[start:end]

['L', 'et', "'", 's', 'Ġtest', 'Ġthis', 'Ġto', 'ken', 'izer', '.']


' test'

In [None]:
# Set the byte-level decoder, then decode the IDs of the last encoding.
tokenizer.decoder = decoders.ByteLevel()
# Displayed as the cell output.
tokenizer.decode(encoding.ids)


"Let's test this tokenizer."

In [23]:
from tokenizers import Regex

# Start a new tokenizer backed by a Unigram model.
# Note: this rebinds `tokenizer`, replacing the BPE one.
tokenizer = Tokenizer(models.Unigram())

tokenizer.normalizer = normalizers.Sequence(
    [
        # Turn both `` and '' into a plain double quote.
        normalizers.Replace("``", '"'),
        normalizers.Replace("''", '"'),
        normalizers.NFKD(),
        normalizers.StripAccents(),
        # Collapse runs of two or more spaces into a single space.
        normalizers.Replace(Regex(" {2,}"), " "),
    ]
)

In [24]:
# Use the Metaspace pre-tokenizer.
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()
# Displayed as the cell output: the pieces produced for a sample sentence.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test the pre-tokenizer!")

[("▁Let's", (0, 5)),
 ('▁test', (5, 10)),
 ('▁the', (10, 14)),
 ('▁pre-tokenizer!', (14, 29))]

In [26]:
# Special tokens for the Unigram tokenizer; "<unk>" doubles as the unknown token.
# Note: this rebinds `special_tokens` and `trainer` from the earlier cells.
special_tokens = ["<cls>", "<sep>", "<unk>", "<pad>", "<mask>", "<s>", "</s>"]
trainer = trainers.UnigramTrainer(
     vocab_size=25000,special_tokens=special_tokens,unk_token="<unk>"
)
# Streaming training from the in-memory batches.
tokenizer.train_from_iterator(get_train_corpus(),trainer=trainer)

In [27]:
# Sanity check: encode a sample sentence with the trained Unigram tokenizer.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

['▁Let', "'", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.']


In [28]:
# Look up the vocabulary IDs of the two special tokens used by the template.
cls_token_id, sep_token_id = (
    tokenizer.token_to_id(token) for token in ("<cls>", "<sep>")
)
print(cls_token_id, sep_token_id)

0 1


In [30]:
# Post-processing template: each sentence is followed by <sep>, and <cls>
# goes at the very end with its own token type ID (2).
tokenizer.post_processor = processors.TemplateProcessing(
     single="$A:0 <sep>:0 <cls>:2",
     pair = "$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
     # Map the special-token strings used in the templates to vocabulary IDs.
     special_tokens=[("<sep>",sep_token_id),("<cls>",cls_token_id)]
)

In [31]:
# Encode a sentence pair; the type IDs tell the two sentences apart.
encoding = tokenizer.encode(
    "Let's test this tokenizer...",
    "on a pair of sentences!",
)
print(encoding.tokens, encoding.type_ids, sep="\n")

['▁Let', "'", 's', '▁test', '▁this', '▁to', 'ken', 'izer', '.', '.', '.', '<sep>', '▁', 'on', '▁', 'a', '▁pair', '▁of', '▁sentence', 's', '!', '<sep>', '<cls>']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]


In [32]:
# Set the Metaspace decoder, matching the Metaspace pre-tokenizer assigned earlier.
tokenizer.decoder = decoders.Metaspace()