## Tokenizer基本使用

In [2]:
# AutoTokenizer picks the matching tokenizer class from the model name or path passed to from_pretrained
from transformers import AutoTokenizer

In [9]:
# Sample sentence reused by the tokenizer examples that follow.
sen = "弱小的我也有大梦想!"

### Step1 加载与保存

In [4]:
# Load the tokenizer from the Hugging Face server by model id; if a local copy
# already exists, it is loaded from there instead.
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [5]:
# The tokenizer files can also be saved to an explicitly chosen local directory.
tokenizer.save_pretrained("./roberta_tokenizer")

('./roberta_tokenizer\\tokenizer_config.json',
 './roberta_tokenizer\\special_tokens_map.json',
 './roberta_tokenizer\\vocab.txt',
 './roberta_tokenizer\\added_tokens.json',
 './roberta_tokenizer\\tokenizer.json')

In [7]:
# Once the tokenizer has been saved locally, from_pretrained can be pointed at
# that directory, so nothing is downloaded from the Hugging Face Hub again.
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer")
tokenizer 

BertTokenizerFast(name_or_path='./roberta_tokenizer', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

### step2 句子分词

In [10]:
# Split the sentence into a list of tokens.
tokens = tokenizer.tokenize(sen)
tokens  # the split is not fixed: it is simply how the currently loaded tokenizer segments this sentence

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

### step3 查看词典

In [12]:
# Token -> id dictionary of the tokenizer.
tokenizer.vocab  # entries prefixed with "##" are sub-word pieces; they keep the vocabulary small because many words can be composed from existing pieces

{'##饕': 20699,
 '饶': 7657,
 '##龕': 21043,
 '蒐': 5883,
 '姬': 2010,
 'r9': 12674,
 '鲈': 7827,
 '子': 2094,
 'hadoop': 10956,
 '##ge': 8441,
 '##囧': 14790,
 'dan': 11404,
 '##捨': 16000,
 '##菽': 18897,
 '蓝': 5905,
 '訝': 6252,
 '##瘓': 17656,
 '祈': 4857,
 '##群': 18465,
 '##nel': 12827,
 '亚': 762,
 '##圭': 14821,
 '##眦': 17755,
 '2756': 9397,
 'nas': 10157,
 '458': 12562,
 'bean': 13188,
 '企': 821,
 '##饍': 20696,
 '咧': 1485,
 'library': 11616,
 '耻': 5459,
 '3s': 13247,
 '##嬌': 15138,
 '##尘': 15269,
 '糜': 5134,
 '##泡': 16853,
 '綾': 5213,
 '罹': 5395,
 'ч': 255,
 '##谤': 19527,
 '睡': 4717,
 '粪': 5116,
 'nike': 8702,
 'ibm': 8699,
 '刹': 1172,
 '##雜': 20486,
 '惟': 2668,
 '实': 2141,
 '艙': 5676,
 '単': 1299,
 '##香': 20733,
 '##呈': 14496,
 '钿': 7186,
 '驸': 7726,
 '##logy': 11121,
 '鄞': 6969,
 '丨': 701,
 '##笔': 18068,
 '芊': 5691,
 '銭': 7074,
 '##蓄': 18955,
 '怯': 2598,
 'dnf': 11315,
 '##帚': 15424,
 '##霑': 20512,
 '℃': 360,
 '铿': 7217,
 'fgo': 11401,
 '##帐': 15419,
 '##惆': 15716,
 '轎': 6754,
 '繳': 5260,
 '

In [13]:
# Number of entries in the tokenizer's vocabulary.
tokenizer.vocab_size

21128

### step4 索引转换

In [14]:
# Map the token sequence to the corresponding vocabulary ids.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106]

In [15]:
# The reverse direction: map an id sequence back to tokens.
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

In [16]:
# Tokens can also be joined back into a single sentence string.
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

'弱 小 的 我 也 有 大 梦 想!'

更便捷的方式

In [17]:
# Convert the string straight to token ids, without tokenizing first and then mapping tokens to ids.
ids = tokenizer.encode(sen)  # this adds the start marker [CLS] at the beginning and the end marker [SEP] at the end
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102]

In [19]:
# To leave the special tokens out, pass add_special_tokens=False.
ids = tokenizer.encode(sen, add_special_tokens=False)
ids

[2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106]

In [20]:
# decode turns an id sequence back into a sentence string
# (the result shown is space-separated, so it is not byte-identical to `sen`).
str_sen = tokenizer.decode(ids)
str_sen

'弱 小 的 我 也 有 大 梦 想!'

In [21]:
# decode also takes a skip_special_tokens flag that controls whether special
# tokens such as [CLS] / [SEP] are dropped from the decoded text.
# `ids` was last produced with add_special_tokens=False, so it contains nothing
# to skip; encode with special tokens here so the flag actually has an effect.
ids_with_special = tokenizer.encode(sen)
str_sen = tokenizer.decode(ids_with_special, skip_special_tokens=True)
str_sen

'弱 小 的 我 也 有 大 梦 想!'

### step5 填充与截断

In [22]:
# All samples have to be padded or truncated to one common length.
# Padding: pad the sequence up to max_length.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]

In [23]:
# Truncation: cut the sequence down to max_length
# (the recorded output shows [CLS] and [SEP] counted inside that length).
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids

[101, 2483, 2207, 4638, 102]

### step6 其他输入部分

In [24]:
# When the data contains padding, the model has to be told which positions are
# padding and which are real input; that is what attention_mask is for.
ids = tokenizer.encode(sen, padding='max_length', max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]

In [25]:
# Build the extra model inputs by hand.
# attention_mask: 1 for real tokens, 0 for padding. Compare against the
# tokenizer's own pad id rather than a hard-coded 0, so the mask stays correct
# for tokenizers whose pad token id is not 0.
pad_id = tokenizer.pad_token_id
attention_mask = [0 if idx == pad_id else 1 for idx in ids]
# token_type_ids: which sentence each token belongs to; a single sentence is all zeros.
token_type_ids = [0] * len(ids)
ids, attention_mask, token_type_ids

([101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### step7 快速的调用方式

In [26]:
# encode_plus handles tokenizing, id conversion, padding (truncation),
# attention_mask and token_type_ids in a single call.
inputs = tokenizer.encode_plus(sen, padding='max_length', max_length=15)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

In [27]:
# Calling the tokenizer object directly; the recorded result has the same
# fields and values as the encode_plus call.
inputs = tokenizer(sen, padding='max_length', max_length=15)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

### step8 批处理一批数据

In [29]:
# Process several sentences in one call; this way of processing is faster.
# NOTE(review): "新" in the third sentence may be a typo for "心"; left unchanged
# because the recorded ids depend on the exact text.
sens = ["弱小的我也有大梦想!", "有梦想谁都了不起", "追逐梦想的新，比梦想本身，更可贵"]
res = tokenizer(sens)
res

{'input_ids': [[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102], [101, 3300, 3457, 2682, 6443, 6963, 749, 679, 6629, 102], [101, 6841, 6852, 3457, 2682, 4638, 3173, 8024, 3683, 3457, 2682, 3315, 6716, 8024, 3291, 1377, 6586, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [32]:
%%time
# Baseline: tokenize the same sentence 1000 times, one call per sentence.
for i in range(1000):
    tokenizer(sen)

CPU times: total: 172 ms
Wall time: 166 ms


In [36]:
%%time
# Batch processing: the same 1000 sentences in a single call; this is the faster way.
res = tokenizer([sen] * 1000)

CPU times: total: 172 ms
Wall time: 49.9 ms


In [37]:
# Display the tokenizer currently bound to `tokenizer`.
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Fast / Slow  Tokenizer

In [38]:
# New sample sentence mixing Chinese and English; this rebinds `sen` from the earlier section.
sen = "弱小的我也有大Dreaming!"

In [40]:
# The fast tokenizer is what is loaded by default; prefer it whenever the model supports one.
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [42]:
# The slow tokenizer (use_fast=False); its implementation runs more slowly.
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer

BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [49]:
%%time
# Fast tokenizer, one sentence per call, 1000 calls.
for i in range(1000):
    fast_tokenizer(sen)

CPU times: total: 484 ms
Wall time: 515 ms


In [44]:
%%time
# Slow tokenizer, one sentence per call, 1000 calls.
for i in range(1000):
    slow_tokenizer(sen)

CPU times: total: 891 ms
Wall time: 1.24 s


In [51]:
%%time
# Fast tokenizer, batch processing: all 1000 sentences in a single call.
res = fast_tokenizer([sen] * 1000)

CPU times: total: 141 ms
Wall time: 58 ms


In [52]:
%%time
# Slow tokenizer, batch processing: all 1000 sentences in a single call.
res = slow_tokenizer([sen] * 1000)

CPU times: total: 750 ms
Wall time: 850 ms


In [53]:
# return_offsets_mapping=True adds an 'offset_mapping' entry holding one
# (start, end) character span into `sen` per token; special tokens show (0, 0).
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 12), (12, 15), (15, 16), (0, 0)]}

In [54]:
inputs.word_ids()  # index of the word each token came from; None for special tokens, and pieces of the same word share one index

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

## 特殊的Tokenizer的加载

In [55]:
from transformers import AutoTokenizer
# Some tokenizers are not implemented by Hugging Face itself but by other people
# who uploaded them to the Hub; loading those requires trust_remote_code=True,
# which downloads and uses the repository's own tokenizer code
# (tokenization_chatglm.py in this case).
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
tokenizer
# The error raised by this cell appears to be because this model does not
# support newer versions of transformers.
# Reference: https://github.com/THUDM/VisualGLM-6B/issues/333

tokenizer_config.json: 100%|██████████| 441/441 [00:00<00:00, 147kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
tokenization_chatglm.py: 100%|██████████| 17.0k/17.0k [00:00<00:00, 5.70MB/s]
A new version of the following files was downloaded from https://huggingface.co/THUDM/chatglm-6b:
- tokenization_chatglm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
ice_text.model: 100%|██████████| 2.71M/2.71M [00:00<00:00, 4.37MB/s]


AttributeError: 'ChatGLMTokenizer' object has no attribute 'sp_tokenizer'

In [3]:
from transformers import AutoTokenizer
# Load the tokenizer for the Langboat/bloom-389m-zh model from the Hub.
tokenizer = AutoTokenizer.from_pretrained("Langboat/bloom-389m-zh")

tokenizer_config.json:   0%|          | 0.00/268 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/2.47M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

In [4]:
# Save the tokenizer files into a local directory that mirrors the model id.
tokenizer.save_pretrained("./Langboat/bloom-389m-zh")

('./Langboat/bloom-389m-zh\\tokenizer_config.json',
 './Langboat/bloom-389m-zh\\special_tokens_map.json',
 './Langboat/bloom-389m-zh\\tokenizer.json')

In [5]:
# Reload the tokenizer from the local directory saved in the previous cell.
tokenizer = AutoTokenizer.from_pretrained("./Langboat/bloom-389m-zh")
tokenizer

BloomTokenizerFast(name_or_path='./Langboat/bloom-389m-zh', vocab_size=42437, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}