#### 数据预处理

In [1]:
from datasets import load_dataset
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, Trainer

In [2]:
EPOCHS = 4  # 模型学习训练集的轮次
BATCH_SIZE = 4  # 模型并行学习的样本数量
PRETRAINED_MODEL_NAME_OR_PATH = "../models/chinese-lert-base"  # 预训练模型路径
CSV_PATH = "../datasets/语言建模数据集.csv"  # 只有文本的无监督数据集
OUPUT_CHECKPOINT_PATH = "../models/fill_mask_checkpoint"  # 训练过程中产生的模型文件
OUTPUT_MODEL_PATH = "../models/fill_mask_model"  # 微调后的文本分类模型路径
VAL_SIZE = 0.1  # 验证集的占比
MAX_LENGTH = 256  # 输入文本的token长度，lert模型最大只能是512

In [3]:
dataset = load_dataset("csv", data_files=CSV_PATH)
eli5 = dataset["train"].train_test_split(test_size=VAL_SIZE)

tokenizer = AutoTokenizer.from_pretrained(
    PRETRAINED_MODEL_NAME_OR_PATH,
    max_length=MAX_LENGTH,
    truncation=True,
    padding="max_length",
)


def preprocess_function(examples):
    return tokenizer(
        examples["text"], max_length=MAX_LENGTH, truncation=True, padding="max_length"
    )


lm_dataset = eli5.map(preprocess_function, batched=True)

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

#### 模型微调

In [6]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer
)

model = AutoModelForMaskedLM.from_pretrained(PRETRAINED_MODEL_NAME_OR_PATH)

# 使得模型的tensor连续，避免报错
for param in model.parameters():
    param.data = param.data.contiguous()

training_args = TrainingArguments(
    output_dir=OUPUT_CHECKPOINT_PATH,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    save_total_limit=1,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()
model.half()
model.save_pretrained(OUTPUT_MODEL_PATH)
tokenizer.save_pretrained(OUTPUT_MODEL_PATH)

Some weights of the model checkpoint at ../models/chinese-lert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss
1,No log,1.45313
2,No log,1.413619
3,No log,1.544467
4,No log,1.427079


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


('../models/fill_mask_model\\tokenizer_config.json',
 '../models/fill_mask_model\\special_tokens_map.json',
 '../models/fill_mask_model\\vocab.txt',
 '../models/fill_mask_model\\added_tokens.json',
 '../models/fill_mask_model\\tokenizer.json')

#### 模型推理

In [9]:
from transformers import pipeline

mask_filler = pipeline(
    task="fill-mask",
    model=OUTPUT_MODEL_PATH
)

text = "一般般，盖子不[MASK]。"
mask_result = mask_filler(inputs=text, top_k=3)
print(mask_result)

Device set to use cpu


[{'score': 0.1729099601507187, 'token': 1916, 'token_str': '够', 'sequence': '一 般 般 ， 盖 子 不 够 。'}, {'score': 0.12599192559719086, 'token': 1962, 'token_str': '好', 'sequence': '一 般 般 ， 盖 子 不 好 。'}, {'score': 0.10024310648441315, 'token': 1920, 'token_str': '大', 'sequence': '一 般 般 ， 盖 子 不 大 。'}]
