# 测试tokenizer 和 vocab字典操作

In [36]:
from transformers import BertTokenizer

# Load the pretrained tokenizer for the "bert-base-chinese" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# Print the tokenizer's repr to inspect its settings (vocab size, max length,
# padding side, special tokens, ...).
print(tokenizer)

BertTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [37]:
# Sample sentences of different lengths, so that padding is visible in the output.
texts = [
    "你好，欢迎使用分词器测试Demo！",
    "批量编码可以一次处理多个句子。",
    "让我们看看效果如何。"
]

# Batch-encode all sentences in one call.
# The tokenizer is called directly: `batch_encode_plus` is documented as
# deprecated in favour of `__call__`, which accepts the same arguments here.
out = tokenizer(
    texts,
    padding=True,          # pad shorter sentences up to the longest one in the batch
    truncation=True,       # cut sentences that exceed the model's maximum length
    return_tensors='pt',   # return PyTorch tensors
    return_special_tokens_mask=True,
)
for k, v in out.items():
    print(k, v)

# input_ids: the vocabulary index of every token, e.g. 101 is [CLS],
#   102 is [SEP], 0 is [PAD] and 872 is "你".
# special_tokens_mask: 1 where the position holds a special token such as
#   [CLS], [SEP] or [PAD], 0 for ordinary tokens.
# attention_mask: 1 for real input positions, 0 for padding positions, so the
#   model can tell valid input apart from padding.
# token_type_ids: segment ids that tell the first sentence from the second in
#   sentence-pair inputs (e.g. question answering). Each input here is a
#   single sentence, so every value is 0; single-sentence classification
#   does not need them.

input_ids tensor([[ 101,  872, 1962, 8024, 3614, 6816,  886, 4500, 1146, 6404, 1690, 3844,
         6407,  100, 8013,  102,    0],
        [ 101, 2821, 7030, 5356, 4772, 1377,  809,  671, 3613, 1905, 4415, 1914,
          702, 1368, 2094,  511,  102],
        [ 101, 6375, 2769,  812, 4692, 4692, 3126, 3362, 1963,  862,  511,  102,
            0,    0,    0,    0,    0]])
token_type_ids tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
special_tokens_mask tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]])
attention_mask tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])


### 如何操作编码的字典？

In [38]:
# Fetch the tokenizer's vocabulary as a dict mapping token string -> integer id.
vocab = tokenizer.get_vocab()
# Print the whole mapping, then the number of entries it contains.
print(vocab)
print(len(vocab))
# Alternative kept for reference: print one "token id" pair per line.
# for k, v in vocab.items():
#     print(k, v)

{'[PAD]': 0, '[unused1]': 1, '[unused2]': 2, '[unused3]': 3, '[unused4]': 4, '[unused5]': 5, '[unused6]': 6, '[unused7]': 7, '[unused8]': 8, '[unused9]': 9, '[unused10]': 10, '[unused11]': 11, '[unused12]': 12, '[unused13]': 13, '[unused14]': 14, '[unused15]': 15, '[unused16]': 16, '[unused17]': 17, '[unused18]': 18, '[unused19]': 19, '[unused20]': 20, '[unused21]': 21, '[unused22]': 22, '[unused23]': 23, '[unused24]': 24, '[unused25]': 25, '[unused26]': 26, '[unused27]': 27, '[unused28]': 28, '[unused29]': 29, '[unused30]': 30, '[unused31]': 31, '[unused32]': 32, '[unused33]': 33, '[unused34]': 34, '[unused35]': 35, '[unused36]': 36, '[unused37]': 37, '[unused38]': 38, '[unused39]': 39, '[unused40]': 40, '[unused41]': 41, '[unused42]': 42, '[unused43]': 43, '[unused44]': 44, '[unused45]': 45, '[unused46]': 46, '[unused47]': 47, '[unused48]': 48, '[unused49]': 49, '[unused50]': 50, '[unused51]': 51, '[unused52]': 52, '[unused53]': 53, '[unused54]': 54, '[unused55]': 55, '[unused56]': 5

### 如何添加新的token或者新的special token？

In [39]:
# Check membership before adding anything: is the single character "阳" and
# the two-character word "阳光" present in the vocabulary dict?
print("阳" in vocab)
print("阳光" in vocab)

# Register "阳光" as a new ordinary (non-special) token.
tokenizer.add_tokens("阳光")
# Fetch the mapping again so that `vocab` reflects the token that was just added.
vocab = tokenizer.get_vocab()

# Verify that the new token is now present and show the new vocabulary size.
print("阳光" in vocab)
print(len(vocab))

True
False
True
21129


In [44]:
# Register "[EOS]" as the tokenizer's end-of-sequence special token.
# NOTE(review): no model is loaded in this notebook; if one is paired with this
# tokenizer, its embedding matrix presumably has to be resized to cover the
# added ids (e.g. via `resize_token_embeddings`) -- confirm before use.
tokenizer.add_special_tokens({"eos_token":"[EOS]"})
print(tokenizer)

# The printed repr now lists 'eos_token': '[EOS]' among the special tokens.

BertTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '[EOS]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	21128: AddedToken("阳光", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	2112

In [43]:
# Fetch the mapping again so it reflects the tokens added in earlier cells,
# then print every "token id" pair on its own line.
vocab = tokenizer.get_vocab()
for k, v in vocab.items():
    print(k, v)

[PAD] 0
[unused1] 1
[unused2] 2
[unused3] 3
[unused4] 4
[unused5] 5
[unused6] 6
[unused7] 7
[unused8] 8
[unused9] 9
[unused10] 10
[unused11] 11
[unused12] 12
[unused13] 13
[unused14] 14
[unused15] 15
[unused16] 16
[unused17] 17
[unused18] 18
[unused19] 19
[unused20] 20
[unused21] 21
[unused22] 22
[unused23] 23
[unused24] 24
[unused25] 25
[unused26] 26
[unused27] 27
[unused28] 28
[unused29] 29
[unused30] 30
[unused31] 31
[unused32] 32
[unused33] 33
[unused34] 34
[unused35] 35
[unused36] 36
[unused37] 37
[unused38] 38
[unused39] 39
[unused40] 40
[unused41] 41
[unused42] 42
[unused43] 43
[unused44] 44
[unused45] 45
[unused46] 46
[unused47] 47
[unused48] 48
[unused49] 49
[unused50] 50
[unused51] 51
[unused52] 52
[unused53] 53
[unused54] 54
[unused55] 55
[unused56] 56
[unused57] 57
[unused58] 58
[unused59] 59
[unused60] 60
[unused61] 61
[unused62] 62
[unused63] 63
[unused64] 64
[unused65] 65
[unused66] 66
[unused67] 67
[unused68] 68
[unused69] 69
[unused70] 70
[unused71] 71
[unused72] 72
[u

In [47]:
# Encode a sentence that contains both the added token "阳光" and the added
# special token "[EOS]"; the result is a plain list of token ids.
out = tokenizer.encode(text="阳光照在大地上[EOS]")
print(out)

# Decode the ids back into text to check the round trip.
print(tokenizer.decode(out))


[101, 21128, 4212, 1762, 1920, 1765, 677, 21129, 102]
[CLS] 阳光 照 在 大 地 上 [EOS] [SEP]
