### 准备工作
[参考资料](https://colab.research.google.com/github/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb)

In [None]:
# Mount Google Drive at /content/gdrive; later cells copy their input files
# from /content/gdrive/MyDrive/... into the working directory.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Copy the text file used as tokenizer training input from Drive into the
# current working directory. The training cell reads it from /content, so the
# working directory is presumably /content (the Colab default) - confirm.
!cp /content/gdrive/MyDrive/big_models_learn/data/create_tokenizer_test_data/test_pretrain_data.txt .

In [None]:
# Disabled cell: when uncommented it copies the test_tokenizer.* files from
# Drive into the working directory (presumably to reuse an already trained
# tokenizer instead of retraining - TODO confirm or delete this cell).
#!cp /content/gdrive/MyDrive/big_models_learn/data/create_tokenizer_test_data/test_tokenizer.* .

### 安装sentencepiece


In [None]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


### 导入sentencepiece库并训练tokenizer

In [None]:
import sentencepiece as spm

In [None]:
# Train a BPE tokenizer on the copied corpus. Later cells load
# test_tokenizer.model and grep test_tokenizer.vocab, i.e. the files named
# after `model_prefix`.
trainer_options = dict(
    input='/content/test_pretrain_data.txt',
    model_prefix='test_tokenizer',
    # The loaded model reports exactly this many pieces.
    vocab_size=8000,
    # See the SentencePiece training options for the meaning of this ratio.
    character_coverage=0.9995,
    model_type="bpe",
    # Characters missing from the vocabulary are encoded as <0xNN> byte
    # pieces, which the out-of-vocabulary cells further down demonstrate.
    byte_fallback=True,
    # 'foo' and 'bar' show up as ids 3 and 4 in the vocabulary dump.
    user_defined_symbols=['foo', 'bar'],
)
spm.SentencePieceTrainer.train(**trainer_options)


### 加载训练产生的tokenizer模型

In [None]:
# Load the tokenizer model written by the training cell.
sp = spm.SentencePieceProcessor(model_file='test_tokenizer.model')
# Get the vocabulary size (number of pieces in the loaded model)
vocab_size = sp.GetPieceSize()
print(vocab_size)

8000


### 编码中文

In [None]:
# Encode one Chinese sentence twice: first as subword pieces, then as ids.
# In the recorded output the full-width comma of the input comes back as ','
# and a leading '▁' piece is emitted.
zh_text = '生成大模型发展迅速，人工智能时代到来'
for encode in (sp.encode_as_pieces, sp.encode_as_ids):
    print(encode(zh_text))

['▁', '生', '成', '大', '模', '型', '发展', '迅', '速', ',', '人工', '智能', '时代', '到', '来']
[1412, 1376, 1391, 1367, 1781, 1637, 276, 2758, 1749, 1352, 987, 430, 774, 1383, 1400]


### 解码中文

In [None]:
# Decode back to text from both representations: pieces => text, ids => text.
# The two literals are the outputs recorded by the encoding cell.
zh_pieces = ['▁', '生', '成', '大', '模', '型', '发展', '迅', '速', ',', '人工', '智能', '时代', '到', '来']
zh_ids = [1412, 1376, 1391, 1367, 1781, 1637, 276, 2758, 1749, 1352, 987, 430, 774, 1383, 1400]
print(sp.decode_pieces(zh_pieces))
print(sp.decode_ids(zh_ids))

生成大模型发展迅速,人工智能时代到来
生成大模型发展迅速,人工智能时代到来


### 针对词表中不存在的字符进行编码和解码

In [None]:
# Search the generated vocab file for the character '嗀'. The recorded run
# printed nothing, i.e. grep found no line containing it.
!grep "嗀" /content/test_tokenizer.vocab

In [None]:
# Encode a character that is absent from the vocab file. The recorded output
# shows it split into three <0x..> byte pieces after the leading '▁'.
oov_char = '嗀'
for encode in (sp.encode_as_pieces, sp.encode_as_ids):
    print(encode(oov_char))

['▁', '<0xE5>', '<0x97>', '<0x80>']
[1412, 234, 156, 133]


In [None]:
# Decode the byte-fallback pieces and ids back to text; both forms print the
# original character in the recorded output.
oov_pieces = ['▁', '<0xE5>', '<0x97>', '<0x80>']
oov_ids = [1412, 234, 156, 133]
print(sp.decode_pieces(oov_pieces))
print(sp.decode_ids(oov_ids))

嗀
嗀


### 增加新的token

In [None]:
# Copy special_tokens.json from Drive into the working directory; a later cell
# reads its "additional_special_tokens" list.
!cp /content/gdrive/MyDrive/big_models_learn/data/create_tokenizer_test_data/special_tokens.json .

In [None]:
# Load the trained model into an editable ModelProto so that extra pieces can
# be appended to it in a later cell.
import sentencepiece.sentencepiece_model_pb2 as model

# Read through a context manager so the file handle is closed right away
# instead of being left for the garbage collector.
with open("test_tokenizer.model", "rb") as model_file:
    serialized_model = model_file.read()

m = model.ModelProto()
# Kept as the last expression so the cell still displays the return value of
# ParseFromString (337665 in the recorded run, presumably the bytes parsed).
m.ParseFromString(serialized_model)

337665

In [None]:
import json

# Read the list stored under "additional_special_tokens".
# - The encoding is explicit so the result does not depend on the platform
#   default.
# - `special_tokens` is bound once, directly to the list, so it never holds the
#   intermediate dict and re-running the cell is safe.
with open("special_tokens.json", encoding="utf-8") as fn:
    special_tokens = json.load(fn)["additional_special_tokens"]
special_tokens

['<extra_id_0>',
 '<extra_id_1>',
 '<extra_id_2>',
 '<extra_id_3>',
 '<extra_id_4>',
 '<extra_id_5>',
 '<extra_id_6>',
 '<extra_id_7>',
 '<extra_id_8>',
 '<extra_id_9>',
 '<extra_id_10>',
 '<extra_id_11>',
 '<extra_id_12>',
 '<extra_id_13>',
 '<extra_id_14>',
 '<extra_id_15>',
 '<extra_id_16>',
 '<extra_id_17>',
 '<extra_id_18>',
 '<extra_id_19>',
 '<extra_id_20>',
 '<extra_id_21>',
 '<extra_id_22>',
 '<extra_id_23>',
 '<extra_id_24>',
 '<extra_id_25>',
 '<extra_id_26>',
 '<extra_id_27>',
 '<extra_id_28>',
 '<extra_id_29>',
 '<extra_id_30>',
 '<extra_id_31>',
 '<extra_id_32>',
 '<extra_id_33>',
 '<extra_id_34>',
 '<extra_id_35>',
 '<extra_id_36>',
 '<extra_id_37>',
 '<extra_id_38>',
 '<extra_id_39>',
 '<extra_id_40>',
 '<extra_id_41>',
 '<extra_id_42>',
 '<extra_id_43>',
 '<extra_id_44>',
 '<extra_id_45>',
 '<extra_id_46>',
 '<extra_id_47>',
 '<extra_id_48>',
 '<extra_id_49>',
 '<extra_id_50>',
 '<extra_id_51>',
 '<extra_id_52>',
 '<extra_id_53>',
 '<extra_id_54>',
 '<extra_id_55>',
 '

In [None]:
# Append every special token to the model proto as a new piece with score 0.
# Pieces already present in the proto are skipped, so running this cell more
# than once does not add duplicate entries.
existing_pieces = {piece.piece for piece in m.pieces}
for token in special_tokens:
    if token in existing_pieces:
        continue
    # SentencePiece is a nested message class of ModelProto; it is reached via
    # the class, so no throwaway ModelProto instance is built per token.
    new_token = model.ModelProto.SentencePiece()
    new_token.piece = token
    new_token.score = 0
    m.pieces.append(new_token)
    existing_pieces.add(token)
# NOTE(review): the piece type is left at its default. Confirm whether these
# tokens should instead be marked USER_DEFINED so they are never split.

In [None]:
# Serialize the extended proto and save it under a new file name, leaving the
# originally trained test_tokenizer.model untouched.
extended_model_bytes = m.SerializeToString()
with open('new_test_tokenizer.model', 'wb') as f:
    f.write(extended_model_bytes)

In [None]:
# Load the extended model. NOTE: this rebinds `sp`, so from this cell on `sp`
# refers to new_test_tokenizer.model, not the originally trained model;
# re-running earlier cells afterwards would use the extended tokenizer.
sp = spm.SentencePieceProcessor()
# Last expression: the cell displays load()'s return value (True in the recorded run).
sp.load("new_test_tokenizer.model")

True

In [None]:
# Number of pieces in the currently loaded (extended) tokenizer
vocab_size = sp.GetPieceSize()
print(vocab_size)

# Build a piece -> id mapping by walking every id in order
vocab_dict = {}
for piece_id in range(vocab_size):
    vocab_dict[sp.IdToPiece(piece_id)] = piece_id
print(vocab_dict)

# Entries with id >= 8000: 8000 is the vocab_size used for training, so these
# are the pieces appended afterwards.
appended_entries = {
    piece: piece_id
    for piece, piece_id in vocab_dict.items()
    if piece_id >= 8000
}
print(appended_entries)

8100
{'<unk>': 0, '<s>': 1, '</s>': 2, 'foo': 3, 'bar': 4, '<0x00>': 5, '<0x01>': 6, '<0x02>': 7, '<0x03>': 8, '<0x04>': 9, '<0x05>': 10, '<0x06>': 11, '<0x07>': 12, '<0x08>': 13, '<0x09>': 14, '<0x0A>': 15, '<0x0B>': 16, '<0x0C>': 17, '<0x0D>': 18, '<0x0E>': 19, '<0x0F>': 20, '<0x10>': 21, '<0x11>': 22, '<0x12>': 23, '<0x13>': 24, '<0x14>': 25, '<0x15>': 26, '<0x16>': 27, '<0x17>': 28, '<0x18>': 29, '<0x19>': 30, '<0x1A>': 31, '<0x1B>': 32, '<0x1C>': 33, '<0x1D>': 34, '<0x1E>': 35, '<0x1F>': 36, '<0x20>': 37, '<0x21>': 38, '<0x22>': 39, '<0x23>': 40, '<0x24>': 41, '<0x25>': 42, '<0x26>': 43, '<0x27>': 44, '<0x28>': 45, '<0x29>': 46, '<0x2A>': 47, '<0x2B>': 48, '<0x2C>': 49, '<0x2D>': 50, '<0x2E>': 51, '<0x2F>': 52, '<0x30>': 53, '<0x31>': 54, '<0x32>': 55, '<0x33>': 56, '<0x34>': 57, '<0x35>': 58, '<0x36>': 59, '<0x37>': 60, '<0x38>': 61, '<0x39>': 62, '<0x3A>': 63, '<0x3B>': 64, '<0x3C>': 65, '<0x3D>': 66, '<0x3E>': 67, '<0x3F>': 68, '<0x40>': 69, '<0x41>': 70, '<0x42>': 71, '<0x43>'