# Tokenizer 的基本使用

In [1]:
# AutoTokenizer picks the tokenizer class to build from the argument it is given (e.g. a model name)
from transformers import AutoTokenizer

# Sample sentence used by the cells below
sen = "这是一段测试文本"

## Step1: 加载与保存

In [4]:
# Load from the Hugging Face Hub: passing the model name is enough to get that model's tokenizer
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [5]:
# Automatically downloaded models/tokenizers are cached under C:\Users\username\.cache\huggingface\hub
# (path as seen on the author's Windows machine; presumably ~/.cache/huggingface/hub on other platforms — verify)
# The tokenizer can also be saved on its own to a directory of your choice
tokenizer.save_pretrained("./roberta_tokenizer")

('./roberta_tokenizer\\tokenizer_config.json',
 './roberta_tokenizer\\special_tokens_map.json',
 './roberta_tokenizer\\vocab.txt',
 './roberta_tokenizer\\added_tokens.json',
 './roberta_tokenizer\\tokenizer.json')

In [4]:
# A tokenizer saved with save_pretrained can be loaded back from its local directory
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer")
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Step2: 句子分词

In [5]:
# Split the raw string into a list of tokens (strings, not ids)
tokens = tokenizer.tokenize(sen)
print(tokens)   # ['这', '是', '一', '段', '测', '试', '文', '本']

['这', '是', '一', '段', '测', '试', '文', '本']

In [10]:
# Passing a list of two strings returns ONE flat token list (the tokens of both strings back to back),
# not one list per sentence — see the printed result.
# NOTE(review): this looks like the list is handled as a sentence pair rather than as a batch — confirm.
tokens = tokenizer.tokenize([sen, sen])
print(tokens)   # ['这', '是', '一', '段', '测', '试', '文', '本', '这', '是', '一', '段', '测', '试', '文', '本']

['这', '是', '一', '段', '测', '试', '文', '本', '这', '是', '一', '段', '测', '试', '文', '本']


## Step3: 查看词表

In [8]:
# Vocabulary as a token -> id dict; entries starting with '##' also appear in it
tokenizer.vocab

{'##净': 14169,
 'ま': 567,
 '##copyright': 13291,
 '疡': 4550,
 '抢': 2843,
 '枇': 3355,
 '##尘': 15269,
 '贺': 6590,
 'ne': 10564,
 '庸': 2435,
 '##馬': 20736,
 '臾': 5640,
 '勖': 1241,
 '##粱': 18175,
 '##⒋': 13574,
 '褥': 6191,
 'doc': 9656,
 '釁': 7022,
 'alex': 10179,
 '##フト': 10868,
 '屹': 2256,
 'yumi': 11697,
 '##nne': 12866,
 '莫': 5811,
 '816': 10937,
 '秀': 4899,
 '##質': 19606,
 '3p': 12108,
 '019': 13146,
 'positioning': 11187,
 '##適': 19957,
 'qe': 11534,
 '##伏': 13883,
 '喷': 1613,
 '##愜': 15753,
 'burberry': 11143,
 '##柱': 16450,
 'z2': 12568,
 '翌': 5422,
 '##eh': 12742,
 '##當': 17591,
 '##绸': 18396,
 '##霄': 20503,
 'yougou': 10852,
 '楓': 3502,
 '346': 12380,
 '☞': 483,
 '譬': 6357,
 '##没': 16823,
 '遨': 6899,
 '含': 1419,
 '##姣': 15061,
 '旖': 3185,
 '录': 2497,
 '##懋': 15805,
 '##踮': 19737,
 '颦': 7589,
 '##ora': 11455,
 '##郎': 20004,
 '##爰': 17319,
 '##萌': 18903,
 '蚩': 6019,
 '臻': 5638,
 '##绞': 18376,
 '##绎': 18363,
 '##昱': 16279,
 '婶': 2050,
 '吐': 1402,
 '##驕': 20766,
 '榫': 3530,
 '許': 625

In [9]:
# Number of entries in the vocabulary
tokenizer.vocab_size

21128

## Step4: 索引转换

In [12]:
tokens = tokenizer.tokenize(sen)

# token sequence -> id sequence
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[6821, 3221, 671, 3667, 3844, 6407, 3152, 3315]


In [13]:
# id sequence -> token sequence
tokens = tokenizer.convert_ids_to_tokens(ids)
print(tokens)

['这', '是', '一', '段', '测', '试', '文', '本']


In [14]:
# token sequence -> string (the tokens come back joined with spaces, see the printed result)
str_sen = tokenizer.convert_tokens_to_string(tokens)
print(str_sen)

这 是 一 段 测 试 文 本


In [24]:
# The steps above went: raw string -> token sequence -> id sequence.
# Transformers offers a shortcut called "encoding": raw string -> id sequence,
# and the reverse, "decoding": id sequence -> string.
# Note that the decoded text has spaces between characters, so it is not identical to the input string.
ids = tokenizer.encode(sen, add_special_tokens=True)        # add_special_tokens adds the special tokens around the sequence
print(ids)                                                  # note the extra [CLS](101) at the start and [SEP](102) at the end
str_sen = tokenizer.decode(ids, skip_special_tokens=False)  # skip_special_tokens controls whether special tokens are dropped from the decoded text
print(str_sen)
str_sen = tokenizer.decode(ids, skip_special_tokens=True)
print(str_sen)


[101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102]
[CLS] 这 是 一 段 测 试 文 本 [SEP]
这 是 一 段 测 试 文 本


## Step5: 填充和截断

In [15]:
# Padding: pad the encoded sequence with 0 ([PAD]) on the right up to max_length
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
print(ids)  # [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102, 0, 0, 0, 0, 0]

[101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102, 0, 0, 0, 0, 0]


In [19]:
# Truncation: max_length only shortens the sequence when truncation is enabled.
# With truncation=False the full 10-id sequence is returned unchanged, so truncation=True
# is required to actually cut the encoding down to max_length ids.
ids = tokenizer.encode(sen, max_length=5, truncation=True)
print(ids)  # [101, 6821, 3221, 671, 102]  -> [CLS] + first 3 tokens + [SEP]

[101, 6821, 3221, 671, 102]


## Step6: 其他输入部分

In [29]:
ids = tokenizer.encode(sen, padding="max_length", max_length=15)

# Besides the token ids, Transformer-style models usually take some auxiliary inputs.
# Look the padding id up on the tokenizer instead of hard-coding 0, so the mask stays
# correct for tokenizers whose [PAD] id is not 0 (it is 0 for this one).
pad_id = tokenizer.pad_token_id
attention_mask = [int(idx != pad_id) for idx in ids]    # 1 for real tokens, 0 for the padded positions
token_type_ids = [0] * len(ids)                         # BERT's next-sentence task needs the sentence id of every token; one sentence -> all 0
ids, attention_mask, token_type_ids

([101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [32]:
# The auxiliary inputs do not have to be built by hand; the tokenizer already returns them
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
print(inputs)

# Alternative: call the tokenizer object directly (prints the same result here)
# NOTE(review): the transformers docs mark encode_plus as deprecated in favour of __call__ — confirm for the installed version
inputs = tokenizer(sen, padding="max_length", max_length=15)
print(inputs)

{'input_ids': [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}
{'input_ids': [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}


## Step7: 处理 batch 数据

In [34]:
# A list of strings is encoded as a batch: every field of the result holds one list per sentence.
# No padding is requested here, so the per-sentence lists have different lengths.
sens = [
    "AABBCCDDEEFF",
    "哈哈哈哈哈哈哈哈哈哈哈",
    "你好你好你好你好"
]
res = tokenizer(sens)
res

{'input_ids': [[101, 9563, 10214, 8860, 9879, 8854, 9049, 102], [101, 1506, 1506, 1506, 1506, 1506, 1506, 1506, 1506, 1506, 1506, 1506, 102], [101, 872, 1962, 872, 1962, 872, 1962, 872, 1962, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [36]:
%%time
# Encode the sentences one call at a time in a Python loop: slow
for i in range(1000):
    tokenizer(sen)

CPU times: total: 172 ms
Wall time: 242 ms


In [39]:
%%time
# Encode all 1000 sentences in a single batched call: fast
tokenizer([sen] * 1000)

CPU times: total: 78.1 ms
Wall time: 27.9 ms


{'input_ids': [[101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 3152, 3315, 102], [101, 6821, 3221, 671, 3667, 3844, 6407, 

# Fast/Slow Tokenizer
- Transformers 库提供了两种 tokenizer
   1. FastTokenizer: 基于 Rust 实现，速度快，可以提供更多附加信息
   2. SlowTokenizer: 基于 python 实现，速度慢

In [41]:
# New sample string for the fast/slow comparison; it mixes Chinese characters with an English word
sen = "快慢Tokenizer测试"

In [42]:
# Same sample string as the previous cell (re-assigned so this cell can run on its own)
sen = "快慢Tokenizer测试"
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer # the class name carries the "Fast" suffix

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [43]:
# Pass use_fast=False to build the slow tokenizer instead
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer # the class name has no "Fast" suffix

BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [46]:
%%time
# Fast tokenizer, one call per sentence, 10000 calls
for i in range(10000):
    fast_tokenizer(sen)

CPU times: total: 1.56 s
Wall time: 1.68 s


In [47]:
%%time
# Slow tokenizer, one call per sentence, 10000 calls
for i in range(10000):
    slow_tokenizer(sen)

CPU times: total: 3.56 s
Wall time: 3.71 s


In [50]:
%%time
# Fast tokenizer, all 10000 sentences in one batched call
fast_tokenizer([sen] * 10000)

CPU times: total: 1.02 s
Wall time: 349 ms


{'input_ids': [[101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407,

In [51]:
%%time
# Slow tokenizer, all 10000 sentences in one batched call
slow_tokenizer([sen] * 10000)

CPU times: total: 2.89 s
Wall time: 3.05 s


{'input_ids': [[101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407, 102], [101, 2571, 2714, 8228, 11285, 11789, 8180, 3844, 6407,

- 观察 FastTokenizer 返回的额外信息 `offset_mapping`

In [68]:
inputs = fast_tokenizer(sen, return_offsets_mapping=True) # return_offsets_mapping=True is only supported by fast tokenizers
print(sen)                       # the original string
print(inputs.word_ids())         # "word index" of each token in the original string (None at the [CLS]/[SEP] positions); the word "Tokenizer" is split into 4 tokens. Only available on fast tokenizers.
print(inputs['offset_mapping'])  # (start, end) character span of the original string covered by each token


快慢Tokenizer测试
[None, 0, 1, 2, 2, 2, 2, 3, 4, None]
[(0, 0), (0, 1), (1, 2), (2, 4), (4, 7), (7, 10), (10, 11), (11, 12), (12, 13), (0, 0)]


# 特殊 Tokenizer 的加载
- 有些模型的 tokenizer 是作者自己实现上传的，这时要用 `trust_remote_code=True` 来加载这些远程仓库的 tokenizer
- 首次下载后可以把这些 tokenizer 保存到本地，之后加载时也需设定 `trust_remote_code=True`

In [2]:
from transformers import AutoTokenizer
# Some models ship a tokenizer implemented and uploaded by their authors; trust_remote_code=True is needed to load it from the remote repo.
# NOTE(review): this runs Python code downloaded from the model repository, so only enable it for sources you trust.
tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-base", trust_remote_code=True)
tokenizer

You are using the legacy behaviour of the <class 'transformers_modules.Skywork.Skywork-13B-base.bc35915066fbbf15b77a1a4a74e9b574ab167816.tokenization_skywork.SkyworkTokenizer'>. This means that tokens that come after special tokens will not be properly handled. 


SkyworkTokenizer(name_or_path='Skywork/Skywork-13B-base', vocab_size=65519, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}