# Tokenizer基本使用

In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sen ="吃葡萄不吐葡萄皮"

## step1 加载与保存

In [3]:
# 从huggingface加载、输入模型名称，即可加载对应的分词器
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-chinanews-chinese")
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-chinanews-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [4]:
#tokenizer保存到本地
#"./"表示当前工作目录
tokenizer.save_pretrained("./roberta_tokenizer")

('./roberta_tokenizer\\tokenizer_config.json',
 './roberta_tokenizer\\special_tokens_map.json',
 './roberta_tokenizer\\vocab.txt',
 './roberta_tokenizer\\added_tokens.json',
 './roberta_tokenizer\\tokenizer.json')

In [5]:
#从本地加载tokenizer
tokenizer=AutoTokenizer.from_pretrained("./roberta_tokenizer")
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

## step2 句子分词

In [6]:
tokens=tokenizer.tokenize(sen)
tokens

['吃', '葡', '萄', '不', '吐', '葡', '萄', '皮']

## step3 查看词典


In [7]:
# tokenizer.vocab：字典类型，存储 “词元（token）- 唯一 ID” 的映射，用于文本编码（转 ID）和解码（ID 转文本），也可查看词汇表具体内容
# tokenizer.vocab_size：整数，代表词汇表中词元的总数，用于预估模型资源需求、判断分词器复杂度，或在自定义逻辑（如构建词嵌入层）中设置参数
tokenizer.vocab
tokenizer.vocab_size

21128

## step4 索引转换

In [8]:
# 将词序列转换为id序列
ids=tokenizer.convert_tokens_to_ids(tokens)
ids

[1391, 5868, 5843, 679, 1402, 5868, 5843, 4649]

In [9]:
# 将id序列转换为token序列
tokens=tokenizer.convert_ids_to_tokens(ids)
tokens

['吃', '葡', '萄', '不', '吐', '葡', '萄', '皮']

In [10]:
# 将token序列转换为string
str_sen=tokenizer.convert_tokens_to_string
str_sen

<bound method PreTrainedTokenizerFast.convert_tokens_to_string of BertTokenizerFast(name_or_path='./roberta_tokenizer', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)>

# 更便捷的实现方式

In [11]:
# 编码：将字符串转换为为id序列
ids=tokenizer.encode(sen,add_special_tokens=True)
ids

[101, 1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 102]

In [12]:
# 解码:将id序列转换为字符串
str_sen=tokenizer.decode(ids,skip_special_tokens=False)
str_sen

'[CLS] 吃 葡 萄 不 吐 葡 萄 皮 [SEP]'

## step5 填充与截断

In [13]:
# 填充
# 在自然语言处理中，不同文本的长度往往不一致，但模型通常需要固定长度的输入。
# 填充的作用是将较短的文本补充到指定的最大长度，使得所有文本在输入模型时长度统一。
ids=tokenizer.encode(sen,padding="max_length",max_length=15)
ids

[101, 1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 102, 0, 0, 0, 0, 0]

In [14]:
# 截断
# 当文本编码后的 ID 序列长度超过模型能处理的最大长度时，就需要进行截断。
# 截断的作用是将过长的文本截取到指定的最大长度，保证输入模型的文本长度在模型可处理范围内。
ids=tokenizer.encode(sen,max_length=5,truncation=True)
ids

[101, 1391, 5868, 5843, 102]

## step6 输入其他部分

In [15]:
# 输入id序列：将文本sen编码为模型可理解的整数ID序列
ids=tokenizer.encode(sen,padding="max_length",max_length=15)
ids

[101, 1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 102, 0, 0, 0, 0, 0]

In [16]:
# 注意力掩码（这里忽略上一步padding填充的无效值）
attention_mask=[1 if idx != 0 else 0 for idx in ids]
# 分段id
token_type_ids=[0]*len(ids)
ids,attention_mask,token_type_ids

([101, 1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 102, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## step8 处理batch数据

In [17]:
sens = ["弱小的我也有大梦想",
        "有梦想谁都了不起",
        "追逐梦想的心，比梦想本身，更可贵"]
res=tokenizer(sens)
res

{'input_ids': [[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102], [101, 3300, 3457, 2682, 6443, 6963, 749, 679, 6629, 102], [101, 6841, 6852, 3457, 2682, 4638, 2552, 8024, 3683, 3457, 2682, 3315, 6716, 8024, 3291, 1377, 6586, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [18]:
%%time
# 单条循环处理
for i in range(1000):
    tokenizer(sen)

CPU times: total: 281 ms
Wall time: 277 ms


In [19]:
%%time
# 处理batch数据
res=tokenizer([sen]*1000)

CPU times: total: 250 ms
Wall time: 51.4 ms


In [20]:
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

## fast/slow tokenizer

In [21]:
sen="吃葡萄不吐葡萄皮"

In [22]:
fast_tokenizer=AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [23]:
slow_tokenizer=AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese",use_fast=False)
slow_tokenizer

BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [24]:
%%time
# 单条循环
for i in range(10000):
    fast_tokenizer(sen)

CPU times: total: 891 ms
Wall time: 930 ms


In [25]:
%%time
# 单条循环
for i in range(10000):
    slow_tokenizer

CPU times: total: 0 ns
Wall time: 6.83 ms


In [26]:
%%time
#处理batch数据
res=fast_tokenizer([sen]*10000)

CPU times: total: 2.11 s
Wall time: 548 ms


In [27]:
%%time
#处理batch数据
res=slow_tokenizer([sen]*10000)

CPU times: total: 2.89 s
Wall time: 2.99 s


In [28]:
inputs=fast_tokenizer(sen,return_offsets_mapping=True)
inputs

{'input_ids': [101, 1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (0, 0)]}

In [29]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, None]

In [30]:
# ？？？？？？？
inputs=slow_tokenizer(sen,return_offsets_mapping=True)

NotImplementedError: return_offset_mapping is not available when using Python tokenizers. To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast. More information on available tokenizers at https://github.com/huggingface/transformers/pull/2674

## 特殊tokenizer加载

In [None]:
from transformers import AutoTokenizer