# Tokenizer 基本使用

In [1]:
## Academic resource acceleration (proxy setup)
import subprocess
import os


def parse_env_lines(env_output):
    """Turn ``NAME=value`` lines, as printed by ``env``, into a dict.

    Each line is split on its first '=' only, so values may themselves
    contain '='. Lines without '=' or with an empty name are skipped, since
    they cannot be set as environment variables.
    """
    parsed = {}
    for line in env_output.splitlines():
        name, sep, value = line.partition('=')
        if sep and name:
            parsed[name] = value
    return parsed


# Source the proxy script in bash and capture the lines of its environment
# that mention "proxy". bash is invoked directly (argument list, no extra
# shell layer), so the command string needs no nested quoting.
try:
    result = subprocess.run(
        ["bash", "-c", "source /etc/network_turbo && env | grep proxy"],
        capture_output=True,
        text=True,
    )
    output = result.stdout
except OSError:
    # bash could not be started: best effort, leave the environment untouched.
    output = ""

# Copy the captured variables into this process so later downloads use them.
os.environ.update(parse_env_lines(output))

In [2]:
from transformers import AutoTokenizer

# Sentence to encode.
text = "Transformers are the core of modern NLP tasks."

# Fetch the pretrained tokenizer, then encode the sentence; return_tensors='pt'
# asks for tensor output instead of plain Python lists.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoded_input = tokenizer(text, return_tensors='pt')

# Pull both fields out of the encoding with a single unpacking.
input_ids, attention_mask = (
    encoded_input[key] for key in ('input_ids', 'attention_mask')
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [2]:
from transformers import AutoTokenizer

In [3]:
# Example sentence used by the tokenizer cells that follow.
sen = "吃葡萄不吐葡萄皮!"

## Step1 加载与保存

In [4]:
# Load from HuggingFace: pass the model name to load the corresponding tokenizer.
tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [5]:
# Save the tokenizer files to a local directory.
tokenizer.save_pretrained("./roberta_tokenizer")

('./roberta_tokenizer/tokenizer_config.json',
 './roberta_tokenizer/special_tokens_map.json',
 './roberta_tokenizer/vocab.txt',
 './roberta_tokenizer/added_tokens.json',
 './roberta_tokenizer/tokenizer.json')

In [6]:
# Load the tokenizer back from the local directory it was saved to.
tokenizer = AutoTokenizer.from_pretrained("./roberta_tokenizer/")
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Step2 句子分词

In [7]:
# Split the sentence into a list of token strings.
tokens = tokenizer.tokenize(sen)
tokens

['吃', '葡', '萄', '不', '吐', '葡', '萄', '皮', '!']

## Step3 查看词典

In [8]:
# Token -> id mapping of the tokenizer.
# NOTE(review): this displays the entire dictionary; consider showing only a
# small sample to keep the notebook output readable.
tokenizer.vocab

{'##叉': 14406,
 '##蘆': 19035,
 '吟': 1412,
 '##煌': 17259,
 '手': 2797,
 'に': 558,
 '##肋': 18547,
 '話': 6282,
 '##爆': 17312,
 'by': 8120,
 '##eep': 12293,
 'm1': 10211,
 '霜': 7458,
 'time': 8759,
 '輾': 6747,
 '##怅': 15637,
 '##篡': 18127,
 'neo': 12169,
 '##莽': 18875,
 'ieee': 12272,
 '贲': 6584,
 '##噢': 14745,
 '##轩': 19816,
 '##嗖': 14683,
 '倍': 945,
 '##shi': 9655,
 '##media': 10970,
 'gtx': 11069,
 '##沂': 16809,
 '拢': 2879,
 '脘': 5557,
 '侄': 888,
 '##扉': 15853,
 '刃': 1145,
 '##ᄆ': 13459,
 '＾': 8048,
 '##擘': 16143,
 '##魅': 20848,
 '##晝': 16299,
 '⌒': 404,
 '##bo': 8820,
 '##邨': 19988,
 '##雅': 20471,
 'notes': 13165,
 '踟': 6676,
 '##艺': 18743,
 '##捻': 16008,
 '##绩': 18384,
 '亞': 765,
 '543': 11895,
 'eq': 11601,
 'pure': 13179,
 '掺': 2982,
 '##聖': 18526,
 '##爺': 17327,
 '##．': 21082,
 '←': 368,
 '蜱': 6061,
 '肄': 5485,
 '留': 4522,
 'member': 11120,
 '##men': 11839,
 '##值': 14023,
 '##椰': 16553,
 '肴': 5510,
 '綏': 5193,
 '痈': 4569,
 '偈': 970,
 '##偵': 14037,
 '0t': 10691,
 '##絵': 18246,
 '籤': 

In [9]:
# Vocabulary size reported by the tokenizer.
tokenizer.vocab_size

21128

## Step4 索引转换

In [10]:
# Convert the token sequence into an id sequence.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 106]

In [11]:
# Convert the id sequence back into a token sequence.
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['吃', '葡', '萄', '不', '吐', '葡', '萄', '皮', '!']

In [12]:
# Join the token sequence back into a single string.
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen

'吃 葡 萄 不 吐 葡 萄 皮!'

###  更便捷的实现方式

In [13]:
# Convert a string straight into an id sequence (encoding).
# add_special_tokens=True adds the special-token ids around the sentence ids.
ids = tokenizer.encode(sen, add_special_tokens=True)
ids

[101, 1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 106, 102]

In [14]:
# Convert an id sequence back into a string (decoding).
# skip_special_tokens=False keeps the special tokens in the decoded text.
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

'[CLS] 吃 葡 萄 不 吐 葡 萄 皮! [SEP]'

## Step5 填充与截断

In [15]:
# Padding: extend the id sequence up to max_length with the padding id.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 106, 102, 0, 0, 0, 0]

In [16]:
# Truncation: cut the id sequence down to max_length.
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids

[101, 1391, 5868, 5843, 102]

## Step6 其他输入部分

In [17]:
# Encode again with padding to length 15; these ids are the basis for building
# the other model inputs by hand.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 106, 102, 0, 0, 0, 0]

In [18]:
# Build the remaining model inputs by hand from the padded id sequence.
# Look the padding id up on the tokenizer instead of hard-coding 0, so the
# mask stays correct for tokenizers whose padding id is not 0.
pad_id = tokenizer.pad_token_id
# 1 marks a real token, 0 marks a padding position.
attention_mask = [int(idx != pad_id) for idx in ids]
# token_type_ids are all zeros for this single-sentence input.
token_type_ids = [0] * len(ids)
ids, attention_mask, token_type_ids

([101, 1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 106, 102, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Step7 快速调用方式

In [19]:
# encode_plus returns input_ids, token_type_ids and attention_mask in one call,
# replacing the manual construction of the mask and type ids.
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 106, 102, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]}

In [20]:
# Calling the tokenizer object directly with the same arguments as encode_plus.
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 106, 102, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]}

## Step8 处理batch数据

In [21]:
# Pass a list of sentences to encode a whole batch in one call; no padding is
# requested, so each sentence keeps its own length.
sens = ["吃葡萄不吐葡萄皮",
        "不吃葡萄到吐葡萄皮",
        "顺势而为"]
res = tokenizer(sens)
res

{'input_ids': [[101, 1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 102], [101, 679, 1391, 5868, 5843, 1168, 1402, 5868, 5843, 4649, 102], [101, 7556, 1232, 5445, 711, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [22]:
%%time
# Process the sentence one call at a time in a loop (1000 separate calls).
for i in range(1000):
    tokenizer(sen)

CPU times: user 45.3 ms, sys: 0 ns, total: 45.3 ms
Wall time: 44.6 ms


In [23]:
%%time
# Process the same sentence repeated 1000 times as a single batch call.
res = tokenizer([sen] * 1000)

CPU times: user 27.7 ms, sys: 15.6 ms, total: 43.2 ms
Wall time: 7.68 ms


In [23]:
# Display the tokenizer currently bound to `tokenizer`.
tokenizer

BertTokenizerFast(name_or_path='./roberta_tokenizer/', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

# Fast / Slow Tokenizer

In [24]:
# Example sentence for the fast/slow tokenizer comparison.
sen = "吃葡萄不吐葡萄皮!"

In [25]:
# Load with the default settings; the displayed repr reports is_fast=True.
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [26]:
# use_fast=False requests the slow tokenizer; the repr reports is_fast=False.
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer

BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [27]:
%%time
# Fast tokenizer, one sentence per call (10000 separate calls).
for i in range(10000):
    fast_tokenizer(sen)

CPU times: user 443 ms, sys: 0 ns, total: 443 ms
Wall time: 442 ms


In [28]:
%%time
# Slow tokenizer, one sentence per call (10000 separate calls).
for i in range(10000):
    slow_tokenizer(sen)

CPU times: user 1.39 s, sys: 0 ns, total: 1.39 s
Wall time: 1.39 s


In [29]:
%%time
# Fast tokenizer, the sentence repeated 10000 times as a single batch call.
res = fast_tokenizer([sen] * 10000)

CPU times: user 323 ms, sys: 146 ms, total: 468 ms
Wall time: 172 ms


In [30]:
%%time
# Slow tokenizer, the sentence repeated 10000 times as a single batch call.
res = slow_tokenizer([sen] * 10000)

CPU times: user 1.1 s, sys: 15.8 ms, total: 1.12 s
Wall time: 1.12 s


In [31]:
# With return_offsets_mapping=True the fast tokenizer adds an 'offset_mapping'
# entry holding one (start, end) pair per token.
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
inputs

{'input_ids': [101, 1391, 5868, 5843, 679, 1402, 5868, 5843, 4649, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (0, 0)]}

In [32]:
# One entry per token; the first and last positions (special tokens) are None.
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]

In [33]:
# Python ("slow") tokenizers do not support offset mappings, so this call is
# expected to fail. Catch the error and display it, so the failure is still
# demonstrated but the notebook keeps running top to bottom.
try:
    inputs = slow_tokenizer(sen, return_offsets_mapping=True)
except NotImplementedError as err:
    print(f"{type(err).__name__}: {err}")

NotImplementedError: return_offset_mapping is not available when using Python tokenizers. To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast. More information on available tokenizers at https://github.com/huggingface/transformers/pull/2674

# 特殊Tokenizer的加载

In [45]:
from transformers import AutoTokenizer

In [46]:
# Load the ChatGLM tokenizer. Its tokenizer code is downloaded from the model
# repo, which is why trust_remote_code=True is passed.
# NOTE(review): the library warns that explicitly passing a `revision` is
# encouraged when loading custom code — consider pinning one.
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Downloading (…)enization_chatglm.py:   0%|          | 0.00/17.0k [00:00<?, ?B/s]

Downloading ice_text.model:   0%|          | 0.00/2.71M [00:00<?, ?B/s]

ChatGLMTokenizer(name_or_path='THUDM/chatglm-6b', vocab_size=130344, model_max_length=2048, is_fast=False, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<sop>', 'eos_token': '<eop>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [47]:
# Save the ChatGLM tokenizer files to a local directory.
tokenizer.save_pretrained("chatglm_tokenizer")

('chatglm_tokenizer\\tokenizer_config.json',
 'chatglm_tokenizer\\special_tokens_map.json',
 'chatglm_tokenizer\\ice_text.model',
 'chatglm_tokenizer\\added_tokens.json')

In [49]:
# Reload the tokenizer from the local directory; trust_remote_code=True is
# still passed for the local copy.
tokenizer = AutoTokenizer.from_pretrained("chatglm_tokenizer", trust_remote_code=True)

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [50]:
# Round-trip a sentence through encode and decode.
# The recorded output of this cell does not match the `sen` defined earlier in
# the notebook, so it was presumably produced with the sentence below left over
# in the kernel. Define it here so the cell reproduces on a fresh run.
sen = "弱小的我也有大Dreaming!"
tokenizer.decode(tokenizer.encode(sen))

'弱小的我也有大Dreaming!'