In [1]:
from transformers import AutoTokenizer

## 1. 加载分词器

In [2]:
# Sample sentence reused by the cells below.
sen = "弱小的我也有梦想"
# Load from the Hugging Face Hub: passing a model name loads the tokenizer that corresponds to that model.
tokenizer = AutoTokenizer.from_pretrained("tabularisai/multilingual-sentiment-analysis")
# Bare last expression so the notebook displays the tokenizer's repr.
tokenizer

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


DistilBertTokenizerFast(name_or_path='tabularisai/multilingual-sentiment-analysis', vocab_size=119547, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

## 2. 句子分词

In [6]:
# Split the sentence into a list of token strings (no id lookup yet).
token = tokenizer.tokenize(sen)
token

['弱', '小', '的', '我', '也', '有', '梦', '想']

## 3. 索引转换

In [8]:
# Convert the token sequence into the corresponding sequence of vocabulary ids.
ids = tokenizer.convert_tokens_to_ids(token)
ids

[3727, 3459, 5718, 3976, 2135, 4461, 4614, 3898]

In [9]:
# Convert the id sequence back into a token sequence (inverse of the previous step).
tokens = tokenizer.convert_ids_to_tokens(ids)
tokens

['弱', '小', '的', '我', '也', '有', '梦', '想']

In [10]:
# Join the token sequence back into a single string.
# With this tokenizer the tokens come back separated by spaces (see the output below).
str_sen = tokenizer.convert_tokens_to_string(tokens)
str_sen


'弱 小 的 我 也 有 梦 想'

### 更便捷的方式

In [None]:
# Encoding: convert the string directly into an id sequence in one call.
ids = tokenizer.encode(sen, add_special_tokens=True) # pass add_special_tokens=False to leave out the special tokens ([CLS]/[SEP])
ids

[101, 3727, 3459, 5718, 3976, 2135, 4461, 4614, 3898, 102]

In [12]:
# Decoding: convert the id sequence back into a string.
# skip_special_tokens=False keeps [CLS] and [SEP] in the decoded text.
str_sen = tokenizer.decode(ids, skip_special_tokens=False)
str_sen

'[CLS] 弱 小 的 我 也 有 梦 想 [SEP]'

## 4. 填充与截断

In [13]:
# Padding: extend the encoded sequence to exactly max_length ids.
# This tokenizer pads on the right with id 0 ([PAD]), as the output shows.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 3727, 3459, 5718, 3976, 2135, 4461, 4614, 3898, 102, 0, 0, 0, 0, 0]

In [14]:
# Truncation: cut the encoded sequence down to at most max_length ids.
# The limit includes the special tokens, so only three content tokens survive here.
ids = tokenizer.encode(sen, max_length=5, truncation=True)
ids

[101, 3727, 3459, 5718, 102]

## 5. 其他输入部分

In [15]:
# Re-create the padded id sequence; the next cell derives the other model inputs from it by hand.
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
ids

[101, 3727, 3459, 5718, 3976, 2135, 4461, 4614, 3898, 102, 0, 0, 0, 0, 0]

In [None]:
# Build the remaining model inputs by hand from the padded `ids`.
# attention_mask: 1 for real tokens, 0 for padding positions. Compare against the
# tokenizer's own pad id instead of a hard-coded 0, so the cell stays correct for
# tokenizers whose [PAD] id is not 0.
attention_mask = [int(idx != tokenizer.pad_token_id) for idx in ids]
# token_type_ids: tells the model which tokens belong to the "first sentence" and which
# to the "second sentence"; a single-sentence input is all zeros.
token_type_ids = [0] * len(ids)
ids, attention_mask, token_type_ids

([101, 3727, 3459, 5718, 3976, 2135, 4461, 4614, 3898, 102, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## 6. 快速调用方式

In [17]:
# encode_plus returns a dict-like result holding input_ids and attention_mask in one call.
# NOTE(review): the transformers documentation marks encode_plus as deprecated in favour of
# calling the tokenizer directly (next cell) — confirm against the installed version.
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 3727, 3459, 5718, 3976, 2135, 4461, 4614, 3898, 102, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}

In [None]:
# Calling the tokenizer object directly performs all of the previous steps in one go.
inputs = tokenizer(sen, padding="max_length", max_length=15)
inputs

{'input_ids': [101, 3727, 3459, 5718, 3976, 2135, 4461, 4614, 3898, 102, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}

## 7. 处理batch数据

In [20]:
# Batch input: pass a list of sentences and every field of the result holds one list per sentence.
# No padding is requested, so each row keeps its own length.
sens = ["弱小的我也有大梦想",
        "有梦想谁都了不起",
        "追逐梦想的心，比梦想本身，更可贵"]
# return_token_type_ids=True asks for token_type_ids, which this tokenizer omits by default.
res = tokenizer(sens, return_token_type_ids=True)
res

{'input_ids': [[101, 3727, 3459, 5718, 3976, 2135, 4461, 3197, 4614, 3898, 102], [101, 4461, 4614, 3898, 7363, 7838, 2146, 2080, 7533, 102], [101, 7717, 7728, 4614, 3898, 5718, 3792, 10064, 4839, 4614, 3898, 4476, 7590, 10064, 4449, 2756, 7495, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

## Fast/Slow Tokenizer

In [1]:
from transformers import AutoTokenizer
# Mixed Chinese/English sample sentence used for the fast vs. slow tokenizer comparison below.
sen = "弱小的我也有大Dreaming!"

In [2]:
# With default arguments AutoTokenizer returns the fast implementation here
# (BertTokenizerFast, is_fast=True in the repr below).
fast_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
fast_tokenizer

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/295 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

BertTokenizerFast(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [3]:
# use_fast=False requests the slow implementation of the same tokenizer
# (BertTokenizer, is_fast=False in the repr below).
slow_tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese", use_fast=False)
slow_tokenizer

BertTokenizer(name_or_path='uer/roberta-base-finetuned-dianping-chinese', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [4]:
%%time
# Fast tokenizer, one sentence per call: tokenize the same sentence 10000 times in a loop.
for i in range(10000):
    fast_tokenizer(sen)

CPU times: user 313 ms, sys: 3.28 ms, total: 316 ms
Wall time: 313 ms


In [5]:
%%time
# Slow tokenizer, one sentence per call: same loop as above, for comparison.
for i in range(10000):
    slow_tokenizer(sen)

CPU times: user 859 ms, sys: 0 ns, total: 859 ms
Wall time: 859 ms


In [6]:
%%time
# Fast tokenizer, batched: tokenize a list of 10000 copies of the sentence in a single call.
res = fast_tokenizer([sen] * 10000)

CPU times: user 511 ms, sys: 272 ms, total: 783 ms
Wall time: 262 ms


In [7]:
%%time
# Slow tokenizer, batched: same single call on 10000 copies, for comparison.
res = slow_tokenizer([sen] * 10000)

CPU times: user 806 ms, sys: 2.33 ms, total: 808 ms
Wall time: 808 ms


In [None]:
inputs = fast_tokenizer(sen, return_offsets_mapping=True)
# offset_mapping: the character-level (start, end) span of each token in the original text.
print(inputs)
print("offset_mapping",inputs.offset_mapping) # "Dreaming" is split into two sub-word tokens, spans (7, 12) and (12, 15) — presumably "dream" and "##ing", where the "##" prefix marks a sub-word continuation

{'input_ids': [101, 2483, 2207, 4638, 2769, 738, 3300, 1920, 10252, 8221, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 12), (12, 15), (15, 16), (0, 0)]}
offset_mapping [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 12), (12, 15), (15, 16), (0, 0)]


In [None]:
inputs.word_ids() # index of the original word each token belongs to; None for the special tokens at both ends

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [None]:
# return_offsets_mapping is only supported by fast tokenizers; the slow (pure-Python)
# tokenizer raises NotImplementedError. The error is the point of this cell, so catch it
# and print the message; an uncaught exception would stop "Restart & Run All" here.
try:
    inputs = slow_tokenizer(sen, return_offsets_mapping=True)
except NotImplementedError as err:
    print(f"NotImplementedError: {err}")

NotImplementedError: return_offset_mapping is not available when using Python tokenizers. To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast. More information on available tokenizers at https://github.com/huggingface/transformers/pull/2674

## 特殊的Tokenizer加载
不仅可以使用 transformers 官方内置的 Tokenizer，模型作者也可以自行实现 Tokenizer 并上传到 Hub。下面这个模型使用的分词器就是由模型作者自定义实现的，加载时需要执行模型仓库中的代码，所以需要加上 trust_remote_code=True

In [15]:
from transformers import AutoTokenizer

In [16]:
# Newer transformers releases (>4.34) raise an error when loading THUDM/chatglm,
# so the Skywork model is used here instead.
# NOTE(review): trust_remote_code=True downloads and runs tokenizer code from the model repo;
# the warning printed below suggests pinning a revision — consider doing so.
tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-base", trust_remote_code=True)
tokenizer

tokenizer_config.json:   0%|          | 0.00/857 [00:00<?, ?B/s]

tokenization_skywork.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Skywork/Skywork-13B-base:
- tokenization_skywork.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer.model:   0%|          | 0.00/994k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]



SkyworkTokenizer(name_or_path='Skywork/Skywork-13B-base', vocab_size=65519, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)

In [None]:
# Save the tokenizer to a local directory.
tokenizer.save_pretrained("skywork_tokenizer")
# Reload it from that local directory; remember to pass trust_remote_code=True again.
tokenizer = AutoTokenizer.from_pretrained("skywork_tokenizer", trust_remote_code=True)
tokenizer
