In [1]:
from tokenizers import decoders, models, pre_tokenizers, trainers, Tokenizer
import os 
import json

In [17]:
# Read the training data
def read_data(path):
    """Lazily yield the ``text`` field of every record in a JSON Lines file.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON object per line;
            each object must contain a ``text`` key.

    Yields:
        The value stored under ``text`` for each record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``text`` key.
    """
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank or whitespace-only lines (e.g. an extra empty line at the
            # end of the file) are not records; skip them instead of letting
            # json.loads raise on them.
            if not line.strip():
                continue
            data = json.loads(line)
            yield data['text']
        

In [18]:
# BPE tokenizer: a BPE model combined with a ByteLevel pre-tokenizer
# (created with add_prefix_space=False).
byte_level_pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = byte_level_pre_tokenizer

In [19]:
# Special tokens handed to the trainer; the list order is kept exactly as written
special_tokens = [
    '<pad>',
    '<unk>',
    '<s>',
    '</s>',
]

In [20]:
# Set up the BPE trainer: target vocabulary size, the special tokens defined
# above, and the ByteLevel alphabet as the initial alphabet.
byte_level_alphabet = pre_tokenizers.ByteLevel.alphabet()
trainer = trainers.BpeTrainer(
    vocab_size=6400,
    special_tokens=special_tokens,
    show_progress=True,
    initial_alphabet=byte_level_alphabet,
)

In [None]:
# read_data is a generator function, so nothing is read from disk here yet.
# A generator can only be consumed once: re-run this cell before re-running
# the training cell, otherwise training would see an empty stream.
texts = read_data(path='./dataset/tokenizer_train.jsonl')

In [22]:
# Train the tokenizer on the streamed texts with the trainer configured above
tokenizer.train_from_iterator(texts, trainer=trainer)






In [23]:
# Use a ByteLevel decoder, matching the ByteLevel pre-tokenizer set on this tokenizer
byte_level_decoder = decoders.ByteLevel()
tokenizer.decoder = byte_level_decoder

In [24]:
# Save the trained tokenizer as tokenizer.json inside the output directory,
# creating the directory first if it does not exist yet.
tokenizer_dir = "./tokenizer"
tokenizer_json_path = os.path.join(tokenizer_dir, "tokenizer.json")
os.makedirs(tokenizer_dir, exist_ok=True)
tokenizer.save(tokenizer_json_path)

In [25]:
# Save the underlying model's files into tokenizer_dir and display the
# list of written paths as the cell output.
saved_files = tokenizer.model.save(tokenizer_dir)
saved_files

['./tokenizer/vocab.json', './tokenizer/merges.txt']

In [26]:
# Contents of tokenizer_config.json, read by transformers when the tokenizer
# directory is loaded.
#
# Kept consistent with the tokenizer trained in this notebook:
# - added_tokens_decoder is derived from special_tokens, so every entry's id
#   matches the id the token has in the trained vocabulary (the special tokens
#   occupy ids 0..len(special_tokens)-1 in list order, as the encode check
#   further down shows).
# - add_prefix_space is False to agree with the
#   ByteLevel(add_prefix_space=False) pre-tokenizer used for training.
# - pad_token is the '<pad>' token that was trained as a special token.
config = {
    "add_bos_token": False,
    "add_eos_token": False,
    "add_prefix_space": False,
    "added_tokens_decoder": {
        str(token_id): {
            "content": token,
            "lstrip": False,
            "normalized": False,
            "rstrip": False,
            "single_word": False,
            "special": True,
        }
        for token_id, token in enumerate(special_tokens)
    },
    "additional_special_tokens": [],
    "bos_token": "<s>",
    "clean_up_tokenization_spaces": False,
    "eos_token": "</s>",
    "legacy": True,
    "model_max_length": 100000,
    "pad_token": "<pad>",
    "sp_model_kwargs": {},
    "spaces_between_special_tokens": False,
    "tokenizer_class": "PreTrainedTokenizerFast",
    "unk_token": "<unk>",
    "use_default_system_prompt": False,
    "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<s>user\\n' + content + '</s>\\n<s>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' + '\\n' }}{% endif %}{% endfor %}"
}

# Save the configuration file next to tokenizer.json
with open(os.path.join(tokenizer_dir, "tokenizer_config.json"), "w", encoding="utf-8") as config_file:
    json.dump(config, config_file, ensure_ascii=False, indent=4)

In [27]:
# Smoke test: reload the saved tokenizer through transformers and encode a sample
from transformers import AutoTokenizer

# Load from tokenizer_dir (the directory the files were saved to) rather than
# repeating the path literal, so the test always reads what was just written.
# NOTE: this rebinds `tokenizer` from the tokenizers.Tokenizer trained above to
# the object returned by AutoTokenizer; the cells below rely on that binding, so
# re-run the notebook from the top before re-running the training/saving cells.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
tokenizer.encode("您好")

[804, 588]

In [29]:
# Encode each special token on its own to inspect the ids it maps to
tuple(tokenizer.encode(token) for token in ("<pad>", "<unk>", "<s>", "</s>"))

([0], [1], [2], [3])

In [31]:
# Decode a single token id (passed as a one-element list) back to text
tokenizer.decode([588])

'好'

In [32]:
# Vocabulary size reported by the reloaded tokenizer; the trainer above was
# configured with vocab_size=6400.
tokenizer.vocab_size

6400