# 基础组件之Tokenizer

## Tokenizer基本使用方法

In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sample sentence used as tokenizer input throughout the basic-usage cells
sen = "弱小的我也有大梦想!"

### 加载与保存: `from_pretrained` / `save_pretrained`

In [3]:
# Load from the Hugging Face Hub: passing a model name loads the corresponding tokenizer
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [4]:
# Save the tokenizer files to a local directory
tokenizer.save_pretrained("./roberta_tokenizer")

('./roberta_tokenizer/tokenizer_config.json',
 './roberta_tokenizer/special_tokens_map.json',
 './roberta_tokenizer/vocab.txt',
 './roberta_tokenizer/added_tokens.json',
 './roberta_tokenizer/tokenizer.json')

In [5]:
# Load the tokenizer back from the local directory
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/")
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
# Show the padding token's id together with its string form
tokenizer.pad_token_id, tokenizer.pad_token

(0, '[PAD]')

### 句子分词: `tokenize`

In [7]:
# Split the sentence into a list of tokens
tokens = tokenizer.tokenize(sen)
tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

### 查看词典: `vocab`

In [8]:
# NOTE(review): this displays the whole vocabulary (vocab_size is 21128);
# consider showing only a small slice to keep the notebook output short.
tokenizer.vocab
# Entries prefixed with "##" are sub-word pieces: a complete word is split into several sub-words,
# which keeps the vocabulary small; many words are composed of a few sub-words

{'##吸': 14486,
 'pizza': 10315,
 '##倶': 14021,
 '##徴': 15603,
 'ng': 8885,
 '##懑': 15806,
 '##蓉': 18957,
 '##奧': 15010,
 '##氏': 16751,
 '##薦': 19013,
 '##et': 8418,
 '滙': 4002,
 '純': 5155,
 '∙': 379,
 '##疡': 17607,
 '##芝': 18755,
 '##ory': 9428,
 '许': 6387,
 '瀏': 4104,
 'になる': 9322,
 '##\u2028': 13502,
 '##湧': 17020,
 'de': 8363,
 '##宕': 15190,
 '##荞': 18836,
 '##骇': 20794,
 '##卻': 14377,
 '##擲': 16153,
 '##盧': 17735,
 '##退': 19899,
 '265': 8689,
 '##钾': 20242,
 '##偿': 14042,
 '菅': 5822,
 '鄺': 6976,
 '##簇': 18134,
 '哐': 1513,
 '唷': 1550,
 '蜈': 6048,
 '娓': 2022,
 '每': 3680,
 '900': 8567,
 '藓': 5968,
 '脂': 5544,
 '瓶': 4486,
 '##脚': 18615,
 '驭': 7717,
 '##『': 13656,
 '##智': 16312,
 '缎': 5352,
 '悼': 2656,
 '郵': 6960,
 '##殃': 16707,
 'cafe': 8377,
 '##豚': 19552,
 '蛇': 6026,
 'valley': 11994,
 '蕻': 5944,
 '増': 1868,
 '辮': 6799,
 '裔': 6167,
 '##觐': 19290,
 '##猴': 17404,
 '[unused86]': 86,
 '弁': 2459,
 '##茂': 18801,
 '訕': 6247,
 '3a': 10667,
 '##棱': 16539,
 '揆': 2986,
 '##玲': 17443,
 '州': 2336

In [9]:
# Size of the tokenizer vocabulary
tokenizer.vocab_size

21128

### 索引转换: `convert_tokens_to_ids`/`convert_ids_to_tokens`

In [10]:
# Convert the token sequence into a sequence of ids
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106]

In [11]:
# Convert the id sequence back into a token sequence
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

In [12]:
# Join the token sequence into a single string
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

'弱 小 的 我 也 有 大 梦 想!'

###  字符串和id序列的转换: `encode`/`decode`

In [13]:
# Convert a string directly into an id sequence, also known as encoding.
# With add_special_tokens=True the result is wrapped in [CLS] (101) ... [SEP] (102).
ids = tokenizer.encode(sen, add_special_tokens=True)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102]

In [14]:
# Same encoding, but without adding the special tokens
tokenizer.encode(sen, add_special_tokens=False)

[2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106]

In [15]:
# Convert the id sequence back into a string, also known as decoding
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

'[CLS] 弱 小 的 我 也 有 大 梦 想! [SEP]'

In [16]:
# Decode again, this time dropping the special tokens from the output string
tokenizer.decode(ids, skip_special_tokens=True)

'弱 小 的 我 也 有 大 梦 想!'

### 填充与截断: `padding`/`truncation`

In [17]:
# Padding: pad the encoded sequence up to max_length
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]

In [18]:
# Truncation: cut the encoded sequence down to max_length
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids

[101, 2483, 2207, 4638, 102]

In [19]:
# Plain encoding without padding or truncation, for comparison
tokenizer.encode(sen)

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102]

In [20]:
# Sentence pair: with truncation=True both sentences are truncated to fit max_length.
# The pair is passed through the documented `text_pair` argument rather than as a
# two-element list: per the `encode` documentation a list given as `text` means an
# already-tokenized sequence, so the list form is not a reliable way to express a pair.
tokenizer.encode(sen, sen, max_length=12, truncation=True)

[101, 2483, 2207, 4638, 2769, 102, 2483, 2207, 4638, 2769, 738, 102]

In [21]:
# Sentence pair: truncation="only_first" truncates only the first sentence.
# The pair is passed through the documented `text_pair` argument rather than as a
# two-element list: per the `encode` documentation a list given as `text` means an
# already-tokenized sequence, so the list form is not a reliable way to express a pair.
tokenizer.encode(sen, sen, max_length=20, truncation="only_first")

[101,
 2483,
 2207,
 4638,
 2769,
 738,
 3300,
 1920,
 102,
 2483,
 2207,
 4638,
 2769,
 738,
 3300,
 1920,
 3457,
 2682,
 106,
 102]

### 其他输入部分: `attention_mask`/`token_type_ids`

* `attention_mask`：标记哪些部分的token是有意义的，哪些部分是padding的

* `token_type_ids`：标记哪些部分的token属于第一个句子，哪些部分的token属于第二个句子

In [22]:
# Padded encoding from which the masks are built by hand in the next cell
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]

In [23]:
# attention_mask: 1 marks a real (non-padding) token, 0 marks padding.
# token_type_ids: 0 marks the first sentence, 1 marks the second sentence.
#   - single-sentence tasks: token_type_ids is all 0
#   - sentence-pair tasks: the first part is 0 and the second part is 1
attention_mask = [int(token_id != tokenizer.pad_token_id) for token_id in ids]
token_type_ids = [0 for _ in ids]
ids, attention_mask, token_type_ids

([101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### 快速调用方式

In [24]:
# encode_plus returns input_ids, token_type_ids and attention_mask in a single call.
# NOTE(review): the transformers docs mark encode_plus as deprecated in favour of
# calling the tokenizer directly — confirm against the installed version.
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

In [25]:
# Calling the tokenizer object directly produces input_ids, token_type_ids and attention_mask
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

### 处理batch数据

In [26]:
# Passing a list of sentences encodes them as a batch:
# every field of the result holds one list per input sentence
sens = ["弱小的我也有大梦想", "有梦想谁都了不起", "追逐梦想的心，比梦想本身，更可贵"]
res = tokenizer(sens)
res

{'input_ids': [[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102], [101, 3300, 3457, 2682, 6443, 6963, 749, 679, 6629, 102], [101, 6841, 6852, 3457, 2682, 4638, 2552, 8024, 3683, 3457, 2682, 3315, 6716, 8024, 3291, 1377, 6586, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [27]:
%%time
# Tokenize the same sentence 1000 times, one call per sentence
for i in range(1000):
    tokenizer(sen)

CPU times: user 30.9 ms, sys: 0 ns, total: 30.9 ms
Wall time: 30.7 ms


In [28]:
%%time
# Tokenize 1000 copies of the sentence in a single batched call
res = tokenizer([sen] * 1000)

CPU times: user 36.1 ms, sys: 0 ns, total: 36.1 ms
Wall time: 6.1 ms


In [29]:
# Display the tokenizer currently in use
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Fast / Slow Tokenizer
Fast Tokenizer：

* 基于Rust实现，速度快

* 额外支持 `offset_mapping`、`word_ids` 等功能

Slow Tokenizer：

* 基于python实现，速度慢

In [30]:
# Sample sentence mixing Chinese characters with an English word, used for the fast/slow comparison
sen = "弱小的我也有大Dreaming!"

In [31]:
# Default load: for this model AutoTokenizer returns the fast implementation
# (the repr reports BertTokenizerFast with is_fast=True)
fast_tokenizer = AutoTokenizer.from_pretrained(
    "uer/roberta-base-finetuned-dianping-chinese"
)
fast_tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [32]:
# use_fast=False requests the slow implementation
# (the repr reports BertTokenizer with is_fast=False)
slow_tokenizer = AutoTokenizer.from_pretrained(
    "uer/roberta-base-finetuned-dianping-chinese", use_fast=False
)
slow_tokenizer

BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [33]:
%%time
# Fast tokenizer: 10000 calls, one sentence per call
for i in range(10000):
    fast_tokenizer(sen)

CPU times: user 309 ms, sys: 2.74 ms, total: 312 ms
Wall time: 312 ms


In [34]:
%%time
# Slow tokenizer: 10000 calls, one sentence per call
for i in range(10000):
    slow_tokenizer(sen)

CPU times: user 819 ms, sys: 1.92 ms, total: 821 ms
Wall time: 823 ms


In [35]:
%%time
# Fast tokenizer: one batched call over 10000 sentences
res = fast_tokenizer([sen] * 10000)

CPU times: user 323 ms, sys: 15.1 ms, total: 338 ms
Wall time: 88.1 ms


In [36]:
%%time
# Slow tokenizer: one batched call over 10000 sentences
res = slow_tokenizer([sen] * 10000)

CPU times: user 672 ms, sys: 0 ns, total: 672 ms
Wall time: 672 ms


In [37]:
# Reminder: sen = "弱小的我也有大Dreaming!"
# return_offsets_mapping: "dreaming" is split into the two tokens "dream" and "ing"
# word_ids tells which word of the original sentence each token belongs to:
#   "dream" maps to word index 7, and so does "ing"
# offset_mapping gives the character span of each token in the original sentence:
#   (7, 12) is "dream", (12, 15) is "ing", (15, 16) is "!"
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 12), (12, 15), (15, 16), (0, 0)]}

In [38]:
# Word index for every token; the special tokens at both ends map to None
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [39]:
# The slow (Python) tokenizer does not support offset mappings, so this call raises
# NotImplementedError. The error itself is the demonstration; it is caught and printed
# so that "Restart Kernel -> Run All" can continue past this cell.
try:
    inputs = slow_tokenizer(sen, return_offsets_mapping=True)
except NotImplementedError as err:
    print(f"{type(err).__name__}: {err}")

NotImplementedError: return_offset_mapping is not available when using Python tokenizers. To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast. More information on available tokenizers at https://github.com/huggingface/transformers/pull/2674

## 特殊Tokenizer的加载

不同模型对应的分词器的效果也不同，有的会在结尾加上`</s>`，有的会在开头加`<CLS>`、在结尾加`<EOS>`

In [40]:
from transformers import AutoTokenizer

In [41]:
# Newer transformers versions (>4.34) raise an error when loading THUDM/chatglm,
# so the Skywork model is used here instead.
# NOTE(review): trust_remote_code=True runs tokenizer code shipped in the model
# repository on the local machine — only use it with repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(
    "Skywork/Skywork-13B-base", trust_remote_code=True
)
tokenizer

You are using the legacy behaviour of the <class 'transformers_modules.Skywork.Skywork-13B-base.bc35915066fbbf15b77a1a4a74e9b574ab167816.tokenization_skywork.SkyworkTokenizer'>. This means that tokens that come after special tokens will not be properly handled. 


SkyworkTokenizer(name_or_path='Skywork/Skywork-13B-base', vocab_size=65519, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [42]:
# Save the Skywork tokenizer files to a local directory
tokenizer.save_pretrained("skywork_tokenizer")

('skywork_tokenizer/tokenizer_config.json',
 'skywork_tokenizer/special_tokens_map.json',
 'skywork_tokenizer/tokenizer.model',
 'skywork_tokenizer/added_tokens.json')

In [43]:
# Reload the Skywork tokenizer from the local directory
tokenizer = AutoTokenizer.from_pretrained("skywork_tokenizer", trust_remote_code=True)

In [44]:
# Encode then decode the sentence; the decoded text shows the <s> token this tokenizer prepends
tokenizer.decode(tokenizer.encode(sen))

'<s>弱小的我也有大Dreaming!'