In [1]:
import pandas as pd
from collections import defaultdict
from tokenizers import BertWordPieceTokenizer, Tokenizer
from tokenizers.trainers import WordPieceTrainer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace, BertPreTokenizer
import os
import json
from transformers import (
    BertTokenizer,
    BertForPreTraining,
    BertForMaskedLM,
    BertTokenizerFast,
    BertConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset, Dataset


In [2]:
# Tokenizer variant built from the raw `Tokenizer` + `WordPieceTrainer` API.
# This approach produces more vocabulary entries than the BertWordPieceTokenizer variant.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
vocab_size = 2000
files = ["ifeng_data/train.txt", "ifeng_data/test.txt"]
max_length = 64
# Read by the dataset-encoding cell further down. Defined here as well so that
# cell does not hit a NameError when the alternative tokenizer cell is skipped.
truncate_longer_samples = True

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = BertPreTokenizer()
# Named `wordpiece_trainer` so it is not confused with the `transformers.Trainer`
# instance that a later cell binds to `trainer`.
wordpiece_trainer = WordPieceTrainer(vocab_size=vocab_size, special_tokens=special_tokens)
tokenizer.train(files, wordpiece_trainer)
tokenizer.enable_truncation(max_length=max_length)

model_path = "pretrained_bert2"
os.makedirs(model_path, exist_ok=True)
# TODO: still unclear how best to save this tokenizer; BertTokenizerFast could
# not load the output of the commented-out call below.
# tokenizer.save("pretrained_bert2/tokenizer.json")
# Only the WordPiece model files are written; no tokenizer config file is saved
# for this variant.
tokenizer.model.save(model_path)

In [None]:
# Alternative tokenizer variant based on `BertWordPieceTokenizer`.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
files = ["ifeng_data/train.txt", "ifeng_data/test.txt"]
vocab_size = 2000
max_length = 64
truncate_longer_samples = True

# Build and train the tokenizer. A tokenizer built this way may be incomplete
# and can miss some individual Chinese characters.
tokenizer = BertWordPieceTokenizer()
train_options = dict(
    files=files,
    vocab_size=vocab_size,
    special_tokens=special_tokens,
)
tokenizer.train(**train_options)
tokenizer.enable_truncation(max_length=max_length)

In [None]:
# Directory that receives the tokenizer files (created if missing).
model_path = "pretrained_bert"
os.makedirs(model_path, exist_ok=True)

# Persist the trained tokenizer model into that directory.
tokenizer.save_model(model_path)

# Tokenizer settings stored next to the saved tokenizer files.
# NOTE(review): transformers' `from_pretrained` looks for `tokenizer_config.json`;
# confirm whether anything actually reads `vocab_config.json`.
tokenizer_config = {
    "do_lower_case": False,
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
    "model_max_length": max_length,
    "max_len": max_length,
}
config_file = os.path.join(model_path, "vocab_config.json")
with open(config_file, "w", encoding="utf-8") as f:
    json.dump(tokenizer_config, f)


In [3]:
# 重新加载分词器, 使用快速版分词器会快很多
# Reload the tokenizer; the fast implementation is much quicker.
# The vocabulary in "pretrained_bert2" was trained without any lower-casing
# normalizer, so disable BertTokenizerFast's default lower-casing to keep
# inference-time normalisation consistent with how the vocabulary was built.
tokenizer = BertTokenizerFast.from_pretrained("pretrained_bert2", do_lower_case=False)

In [5]:
# Take the vocabulary size from the reloaded tokenizer; it is passed to
# BertConfig when the model is built further down.
vocab_size = tokenizer.vocab_size
vocab_size

5964

In [7]:
# Sanity check: encode a sample sentence that contains a [MASK] token.
tokenizer("杨采钰明艳复古风[MASK]片")

{'input_ids': [2, 1456, 2765, 2782, 1379, 2326, 727, 519, 2939, 4, 1806, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
def encode_with_truncation(examples):
    """Tokenize ``examples["text"]`` into fixed-length sequences.

    Every sequence is truncated and padded to the module-level ``max_length``,
    and the special-tokens mask is included in the returned encoding.
    """
    tokenizer_options = dict(
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_special_tokens_mask=True,
    )
    texts = examples["text"]
    return tokenizer(texts, **tokenizer_options)


def encode_without_truncation(examples):
    """Tokenize ``examples["text"]`` without truncation or padding.

    The special-tokens mask is included in the returned encoding.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, return_special_tokens_mask=True)
    return encoded

# Load the train/test splits; each file is read into a single "text" column.
# NOTE(review): the files go through the CSV reader with default quoting, so a
# line containing an ASCII double quote may be mis-parsed — confirm against the data.
data_files = {
    "train": "ifeng_data/train.txt",
    "test": "ifeng_data/test.txt",
}
d = load_dataset("csv", data_files=data_files, sep="\t", names=["text"])
d

In [None]:
# Choose the encoder and the tensor columns that match the truncation setting.
if truncate_longer_samples:
    encode = encode_with_truncation
    torch_columns = ["input_ids", "attention_mask"]
else:
    encode = encode_without_truncation
    torch_columns = ["input_ids", "attention_mask", "special_tokens_mask"]

# Tokenize the complete datasets (no subsampling).
train_dataset = d["train"].map(encode, batched=True)
test_dataset = d["test"].map(encode, batched=True)

# Expose the selected columns as torch tensors on both splits.
for split_dataset in (train_dataset, test_dataset):
    split_dataset.set_format(type="torch", columns=torch_columns)


In [None]:
# Build the masked-LM model from a fresh config (no pretrained weights are
# loaded here); the position-embedding table is sized to max_length.
model_config = BertConfig(
    vocab_size=vocab_size,
    max_position_embeddings=max_length,
)
model = BertForMaskedLM(model_config)

In [None]:
# Collator for masked-language-model training: MLM is enabled with a masking
# probability of 0.15, using the tokenizer loaded above.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
# Training arguments
training_args = TrainingArguments(
    # Output location; an existing directory may be overwritten.
    output_dir=model_path,
    overwrite_output_dir=True,
    # Optimisation: one sample per device step, gradients accumulated over 8 steps.
    num_train_epochs=10,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Evaluation runs on a step schedule.
    evaluation_strategy="steps",
    per_device_eval_batch_size=64,
    # Logging and checkpointing every 500 steps, keeping at most 3 checkpoints.
    logging_steps=500,
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
)

In [None]:
# Wire the model, data and arguments into a Trainer.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)
trainer = Trainer(**trainer_options)

In [None]:
# Run training with the Trainer configured above.
trainer.train()

In [None]:
# Save the model; no path is passed, so the Trainer's own output location is used.
trainer.save_model()