#### tokenizer 的训练需要保存如下四个文件
- tokenizer.json
- vocab.json
- merges.txt
- tokenizer_config.json

In [1]:
from tokenizers import decoders, models, pre_tokenizers, trainers, Tokenizer
import os 
import json 

In [2]:
# Read the corpus lazily: tokenizer training consumes an iterator of strings.
def read_data(path):
    """Yield the cleaned ``text`` field of each record in a JSON Lines file.

    Args:
        path: Path to a UTF-8 encoded ``.jsonl`` file in which every
            non-blank line is a JSON object containing a ``"text"`` key.

    Yields:
        The record's text with every ``<|endoftext|>`` marker removed.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"text"`` key.
    """
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            # Blank / whitespace-only lines are not JSON documents; skip them
            # instead of letting json.loads abort the whole stream.
            if not line.strip():
                continue
            record = json.loads(line)
            yield record['text'].replace('<|endoftext|>', '')

In [3]:
# BPE tokenizer with a byte-level pre-tokenizer (no prefix space is added).
byte_level_pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = byte_level_pre_tokenizer

In [4]:
# Special tokens handed to the trainer, in this order.
special_tokens = [
    "<pad>",
    "<unk>",
    "<s>",
    "</s>",
]

In [5]:
# Set up the BPE trainer: target vocabulary of 6400 entries, seeded with the
# byte-level alphabet and the special tokens defined above.
byte_level_alphabet = pre_tokenizers.ByteLevel.alphabet()
trainer = trainers.BpeTrainer(
    special_tokens=special_tokens,
    initial_alphabet=byte_level_alphabet,
    vocab_size=6400,
    show_progress=True,
)

In [8]:
# read_data is a generator function, so `texts` is a one-shot iterator:
# re-run this cell before training again.
corpus_path = '/code/zhaoxudong03/RL/verl_base_zxd/01_llm_releate/data/tokenize.jsonl'
texts = read_data(corpus_path)

In [11]:
# Train on the streamed corpus, then attach the byte-level decoder.
tokenizer.train_from_iterator(texts, trainer)
tokenizer.decoder = decoders.ByteLevel()

# Write tokenizer.json into the output directory (created if missing).
tokenizer_dir = '/code/zhaoxudong03/RL/verl_base_zxd/01_llm_releate/train_llm/tokenizer'
os.makedirs(tokenizer_dir, exist_ok=True)
tokenizer_json_path = os.path.join(tokenizer_dir, 'tokenizer.json')
tokenizer.save(tokenizer_json_path)






In [None]:
# Save the vocab.json and merges.txt files into tokenizer_dir.
tokenizer.model.save(tokenizer_dir)

In [None]:
# Build tokenizer_config.json (tokenizer_class is PreTrainedTokenizerFast).
#
# The special-token ids are read back from the trained tokenizer rather than
# hard-coded: special_tokens lists '<pad>' first, so a fixed 0/1/2 mapping for
# <unk>/<s>/</s> would be off by one and would omit <pad> entirely.
added_tokens_decoder = {
    str(tokenizer.token_to_id(token)): {
        "content": token,
        "lstrip": False,
        "normalized": False,
        "rstrip": False,
        "single_word": False,
        "special": True,
    }
    for token in special_tokens
}

config = {
    "add_bos_token": False,
    "add_eos_token": False,
    # Must agree with pre_tokenizers.ByteLevel(add_prefix_space=False) used
    # when the tokenizer was built, so loading does not change tokenization.
    "add_prefix_space": False,
    "added_tokens_decoder": added_tokens_decoder,
    "additional_special_tokens": [],
    "bos_token": "<s>",
    "clean_up_tokenization_spaces": False,
    "eos_token": "</s>",
    "legacy": True,
    "model_max_length": 100000,
    # '<pad>' is trained as a special token, so expose it as the pad token.
    "pad_token": "<pad>",
    "sp_model_kwargs": {},
    "spaces_between_special_tokens": False,
    "tokenizer_class": "PreTrainedTokenizerFast",
    "unk_token": "<unk>",
    "use_default_system_prompt": False,
    "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<s>user\\n' + content + '</s>\\n<s>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' + '\\n' }}{% endif %}{% endfor %}",
}

# Write the config file next to tokenizer.json.
with open(os.path.join(tokenizer_dir, "tokenizer_config.json"), "w", encoding="utf-8") as config_file:
    json.dump(config, config_file, ensure_ascii=False, indent=4)

In [None]:
## Sanity checks on the trained tokenizer
# Print the ids each special token encodes to (Encoding objects themselves
# have an uninformative repr, so show `.ids`).
print(*(tokenizer.encode(token).ids for token in ("<pad>", "<unk>", "<s>", "</s>")))
# Tokenizer.decode takes a list of ids, not a bare int.
print(tokenizer.decode([508]))
# tokenizers.Tokenizer has no `vocab_size` attribute; the size comes from
# get_vocab_size().
print(tokenizer.get_vocab_size())