# TOKENIZER基本使用

In [41]:
from transformers import AutoTokenizer

In [42]:
# Sample sentence reused by the tokenizer calls in the following cells.
sen = "弱小的我也有大梦想!"

## Step1 加载与保存

In [43]:
# Load from Hugging Face: passing the model name is enough to load the corresponding tokenizer.
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")

tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [44]:
# Save the tokenizer to a local directory; the call returns a tuple of file paths (shown in the output).

tokenizer.save_pretrained("./roberta_tokenizer")

('./roberta_tokenizer\\tokenizer_config.json',
 './roberta_tokenizer\\special_tokens_map.json',
 './roberta_tokenizer\\vocab.txt',
 './roberta_tokenizer\\added_tokens.json',
 './roberta_tokenizer\\tokenizer.json')

In [45]:
# Load the tokenizer from the local directory instead of a model name.
# The repr below reports name_or_path='./roberta_tokenizer/' to confirm the source.
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/")

tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

## Step2 句子分词

In [46]:
# Split the sentence into tokens; for this input every character ends up as its own token.
tokens = tokenizer.tokenize(sen)

tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

## Step3 查看词典

In [47]:
# Token -> id mapping of the vocabulary.
# NOTE(review): this displays the whole mapping; slice it if only a sample is wanted.
tokenizer.vocab

{'丧': 700,
 '骸': 7760,
 '佰': 880,
 '娘': 2023,
 'ｼ': 8090,
 'kevin': 9583,
 '稷': 4938,
 '燃': 4234,
 '悟': 2640,
 '璨': 4471,
 '盐': 4663,
 '100': 8135,
 '##橐': 16636,
 '##癜': 17677,
 '哋': 1508,
 'sg': 12954,
 '##啻': 14638,
 '##漆': 17081,
 '##軟': 19784,
 '##xon': 12854,
 '业': 689,
 'ゝ': 587,
 '綾': 5213,
 '衬': 6137,
 '##孟': 15163,
 '##瑁': 17496,
 '##card': 11644,
 '売': 1899,
 '+': 116,
 '##1': 8148,
 '蘊': 5980,
 '##應': 15803,
 '##矶': 17825,
 '##扁': 15850,
 '牧': 4288,
 'qa': 10818,
 '##54': 9488,
 '##ize': 11789,
 'camp': 12275,
 '##奘': 15005,
 'pp': 8570,
 'ecfa': 12496,
 '##疡': 17607,
 '屠': 2248,
 '##key': 9938,
 '浣': 3854,
 '##邺': 19999,
 '12345': 9700,
 '8g': 10019,
 '茧': 5753,
 '##data': 11792,
 '##瘢': 17662,
 '凸': 1137,
 '耶': 5456,
 'thai': 12967,
 '##悄': 15689,
 '##昧': 16275,
 '醛': 7010,
 '##鈎': 20101,
 '413': 12561,
 'pre': 11685,
 '詣': 6274,
 '堂': 1828,
 '広': 2410,
 '钒': 7156,
 'this': 8554,
 '##鏤': 20187,
 '釋': 7026,
 '瀆': 4101,
 'may': 8480,
 '##晷': 16311,
 '##與': 18702,
 '蒋': 5882

In [48]:
# Vocabulary size reported by the tokenizer (same value as vocab_size in its repr).
tokenizer.vocab_size

21128

## Step4 索引转换

In [49]:
# Convert the token sequence into the corresponding id sequence (one id per token).
ids = tokenizer.convert_tokens_to_ids(tokens)

ids

[2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106]

In [50]:
# Convert the id sequence back into a token sequence (overwrites `tokens` with the round-tripped list).
tokens = tokenizer.convert_ids_to_tokens(ids)

tokens

['弱', '小', '的', '我', '也', '有', '大', '梦', '想', '!']

In [51]:
# Convert the token sequence into a single string.
# As the output shows, the tokens come back separated by spaces, so the result is not identical to `sen`.
str_sen = tokenizer.convert_tokens_to_string(tokens)

str_sen

'弱 小 的 我 也 有 大 梦 想!'

###  更便捷的实现方式

In [52]:
# Encoding: convert the string straight into an id sequence in one call.
# With add_special_tokens=True the ids are wrapped by 101 ... 102, which decode to [CLS] / [SEP].
ids = tokenizer.encode(sen, add_special_tokens=True)

ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102]

In [53]:
# Decoding: convert the id sequence back into a string.
# skip_special_tokens=False keeps [CLS] and [SEP] in the decoded text.
str_sen = tokenizer.decode(ids, skip_special_tokens=False)

str_sen

'[CLS] 弱 小 的 我 也 有 大 梦 想! [SEP]'

## Step5 填充与截断

In [54]:
# Padding: extend the encoded sentence to max_length=15; the output shows 0s appended on the right.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]

In [55]:
# Truncation: cut the encoded sentence to max_length=5.
# The result still starts with 101 and ends with 102, so only three content ids remain.
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids

[101, 2483, 2207, 4638, 102]

## Step6 其他输入部分

In [56]:
# Re-create the padded ids (length 15); the next cell builds the attention mask and token type ids from them.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0]

In [57]:
# Build the two auxiliary model inputs by hand from the padded `ids`.
# Attention mask: 1 for real tokens, 0 for padding. Compare against the tokenizer's
# own pad id rather than a literal 0 so the mask does not depend on [PAD] sitting at index 0.
attention_mask = [1 if idx != tokenizer.pad_token_id else 0 for idx in ids]

# Token type ids: a single-sentence input, so every position is segment 0.
token_type_ids = [0] * len(ids)

ids, attention_mask, token_type_ids

([101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Step7 快速调用方式

In [58]:
# encode_plus returns input_ids, token_type_ids and attention_mask together,
# matching the values that were assembled by hand in the previous step.
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)

inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

In [59]:
# Calling the tokenizer object directly; the recorded output has the same three fields and values as encode_plus.
inputs = tokenizer(sen, padding="max_length", max_length=15)

inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 106, 102, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]}

## Step8 处理BATCH数据

In [60]:
# A small batch of sentences of different lengths.
sens = [
    "弱小的我也有大梦想",
    "有梦想谁都了不起",
    "追逐梦想的心，比梦想本身，更可贵"
]

# Passing a list tokenizes the whole batch in one call. No padding is requested,
# so each entry keeps its own length (11, 10 and 18 ids in the output).
res = tokenizer(sens)

res

{'input_ids': [[101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 3457, 2682, 102], [101, 3300, 3457, 2682, 6443, 6963, 749, 679, 6629, 102], [101, 6841, 6852, 3457, 2682, 4638, 2552, 8024, 3683, 3457, 2682, 3315, 6716, 8024, 3291, 1377, 6586, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [61]:
%%time

# Tokenize the same sentence 1000 times, one call per iteration (baseline for the batched call that follows).
for i in range(1000):
    
    tokenizer(sen)

CPU times: total: 78.1 ms
Wall time: 82.8 ms


In [62]:
%%time

# Tokenize the same 1000 sentences in a single batched call.
# In the recorded run its wall time (16 ms) is lower than the one-by-one loop's (82.8 ms).

res = tokenizer([sen] * 1000)

CPU times: total: 172 ms
Wall time: 16 ms


In [63]:
# Display the tokenizer again; its repr reports BertTokenizerFast with is_fast=True.
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

# Fast / Slow Tokenizer

In [64]:
# New sample mixing Chinese characters with an English word, used for the fast/slow comparison.
sen = "弱小的我也有大Dreaming!"

In [65]:
# Default load: the repr shows the fast implementation (BertTokenizerFast, is_fast=True).
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")

fast_tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [66]:
# use_fast=False requests the slow implementation (repr shows BertTokenizer, is_fast=False).
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)

slow_tokenizer

BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [67]:
%%time

# Fast tokenizer, one sentence per call, 10000 iterations.
for i in range(10000):

    fast_tokenizer(sen)

CPU times: total: 703 ms
Wall time: 704 ms


In [68]:
%%time

# Slow tokenizer, one sentence per call, 10000 iterations (compare with the fast loop's timing).
for i in range(10000):

    slow_tokenizer(sen)

CPU times: total: 1.81 s
Wall time: 1.81 s


In [69]:
%%time

# Fast tokenizer, batched: 10000 copies of `sen` passed as one list.
res = fast_tokenizer([sen] * 10000)

CPU times: total: 922 ms
Wall time: 296 ms


In [70]:
%%time

# Slow tokenizer, batched: same 10000-sentence list.
# In the recorded run this takes about as long as the slow one-by-one loop (1.73 s vs 1.81 s).
res = slow_tokenizer([sen] * 10000)

CPU times: total: 1.73 s
Wall time: 1.73 s


In [78]:
# return_offsets_mapping=True adds an 'offset_mapping' entry: one (start, end) character span
# of `sen` per token. In the output the first and last (special-token) positions are (0, 0),
# and "Dreaming" is covered by two spans, (7, 12) and (12, 15).
inputs = fast_tokenizer(sen, return_offsets_mapping=True)

inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 12), (12, 15), (15, 16), (0, 0)]}

In [79]:
# word_ids() returns, for each token position, the index of the word that token belongs to.
# In the output the two special-token positions are None, and the two pieces of
# "Dreaming" (ids 10252 and 8221) both map to word 7.
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [80]:
# The slow tokenizer does not implement return_offsets_mapping:
# slow_tokenizer(sen, return_offsets_mapping=True) raises an error, so it is called without it here.
inputs = slow_tokenizer(sen)

inputs

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# 特殊TOKENIZER的加载

In [81]:
from transformers import AutoTokenizer

In [82]:
# This model uses its own tokenizer class (the repr below shows ChatGLMTokenizer).
# NOTE(review): trust_remote_code=True presumably lets that class be loaded from code shipped in
# the model repository, so only enable it for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

tokenizer

ChatGLMTokenizer(name_or_path='THUDM/chatglm-6b', vocab_size=130344, model_max_length=2048, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<sop>', 'eos_token': '<eop>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [83]:
# Save the ChatGLM tokenizer locally; the returned paths include its ice_text.model file.
tokenizer.save_pretrained("chatglm_tokenizer")

('chatglm_tokenizer\\tokenizer_config.json',
 'chatglm_tokenizer\\special_tokens_map.json',
 'chatglm_tokenizer\\ice_text.model',
 'chatglm_tokenizer\\added_tokens.json')

In [84]:
# Reload from the local directory; trust_remote_code=True is passed again for the local copy.
tokenizer = AutoTokenizer.from_pretrained("chatglm_tokenizer", trust_remote_code=True)

In [85]:
# Round trip: encoding then decoding returns the original sentence unchanged (see output).
tokenizer.decode(tokenizer.encode(sen))

'弱小的我也有大Dreaming!'