In [11]:
import pandas as pd
from collections import defaultdict
from tokenizers import BertWordPieceTokenizer
import os
import json
from transformers import (
    BertTokenizer,
    BertForPreTraining,
    BertForMaskedLM,
    BertTokenizerFast,
    BertConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset, Dataset


In [2]:
# Data sources:
# https://tianchi-competition.oss-cn-hangzhou.aliyuncs.com/531810/train_set.csv.zip
# https://tianchi-competition.oss-cn-hangzhou.aliyuncs.com/531810/test_a.csv.zip

df_train = pd.read_csv("data/tianchi_news/train_set.csv", sep="\t")
df_test = pd.read_csv("data/tianchi_news/test_a.csv", sep="\t")

# Count how often each space-separated token occurs in the "text" column,
# walking the training split first and the test split second.
vocab = defaultdict(int)
for frame in (df_train, df_test):
    for document in frame["text"].to_list():
        for token in document.strip().split(" "):
            vocab[token] += 1


In [3]:
# Number of distinct tokens found in the corpus.
print(len(vocab))

# Special tokens are placed first, followed by the corpus tokens in sorted order.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"]
all_tokens = [*special_tokens, *sorted(vocab)]

# Size of the final token list (special tokens included).
print(len(all_tokens))


6977
6984


In [4]:
# Write only the "text" column of each split to a plain text file,
# with no header row and no index column.
for frame, out_path in ((df_train, "train.txt"), (df_test, "test.txt")):
    frame["text"].to_csv(out_path, header=None, index=None)


In [5]:
# Sequence-length settings: maximum number of tokens per sample, and whether
# longer samples are truncated (True) or kept whole for later regrouping (False).
max_length = 1024
truncate_longer_samples = True

# Corpus text files and the vocabulary size taken from the assembled token list.
files = ["train.txt", "test.txt"]
vocab_size = len(all_tokens)

# Build and train the tokenizer (disabled; kept for reference)
# tokenizer = BertWordPieceTokenizer()
# tokenizer.train(
#     files=files,
#     vocab_size=vocab_size,
#     special_tokens=special_tokens,
# )
# tokenizer.enable_truncation(max_length=max_length)


In [6]:
model_path = "pretrained_bert"
os.makedirs(model_path, exist_ok=True)

# Write the hand-built vocabulary directly, one token per line.
with open(os.path.join(model_path, "vocab.txt"), "w", encoding="utf-8") as f:
    f.write("\n".join(all_tokens))

# The trained WordPiece tokenizer is deliberately not saved/used:
# tokenizer.save_model(model_path)

# Tokenizer settings to be picked up when the tokenizer is reloaded from `model_path`.
# NOTE(review): "max_len" looks like the legacy spelling of "model_max_length";
# it is kept unchanged here — confirm whether it is still needed.
tokenizer_config = {
    "do_lower_case": True,
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
    "model_max_length": max_length,
    "max_len": max_length,
}

# `from_pretrained` reads tokenizer settings from "tokenizer_config.json".
# The file used to be written as "vocab_config.json", a name the loader does
# not look for, so these settings were silently ignored on reload.
with open(os.path.join(model_path, "tokenizer_config.json"), "w", encoding="utf-8") as f:
    json.dump(tokenizer_config, f)


In [7]:
# Reload the tokenizer from the files saved in `model_path`;
# the fast tokenizer implementation is much faster.
tokenizer = BertTokenizerFast.from_pretrained(model_path)

In [8]:
def encode_with_truncation(examples):
    """Tokenize a batch of texts into fixed-length sequences.

    Args:
        examples: Batch mapping whose ``"text"`` entry is handed to the
            notebook-level ``tokenizer``.

    Returns:
        The tokenizer output, requested with truncation enabled, padding to
        ``max_length`` and the special-tokens mask included.
    """
    tokenizer_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
        "return_special_tokens_mask": True,
    }
    return tokenizer(examples["text"], **tokenizer_kwargs)


def encode_without_truncation(examples):
    """Tokenize a batch of texts without truncating or padding them.

    Args:
        examples: Batch mapping whose ``"text"`` entry is handed to the
            notebook-level ``tokenizer``.

    Returns:
        The tokenizer output with the special-tokens mask included.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, return_special_tokens_mask=True)
    return encoded

In [9]:
# Load the text files as a dataset with "train" and "test" splits;
# every line becomes one row of a single "text" column.
data_files = {"train": "train.txt", "test": "test.txt"}
d = load_dataset("csv", data_files=data_files, sep="\t", names=["text"])
d

Using custom data configuration default-d8fd779be73f89cd


Downloading and preparing dataset csv/default to C:\Users\tzh\.cache\huggingface\datasets\csv\default-d8fd779be73f89cd\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


100%|██████████| 2/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:00<00:00, 978.26it/s]


Dataset csv downloaded and prepared to C:\Users\tzh\.cache\huggingface\datasets\csv\default-d8fd779be73f89cd\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 18.02it/s]


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 200000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 50000
    })
})

In [14]:
# Pick the encoder that matches the truncation setting.
encode = encode_with_truncation if truncate_longer_samples else encode_without_truncation
# Use the full dataset
# train_dataset = d["train"].map(encode, batched=True)
# test_dataset = d["test"].map(encode, batched=True)

# Start with a small subset of the data (first 10000 rows of each split)
train_dataset = Dataset.from_dict(d["train"][:10000]).map(encode, batched=True)
test_dataset = Dataset.from_dict(d["test"][:10000]).map(encode, batched=True)

# Expose the model inputs as torch tensors; the special-tokens mask is only
# kept when samples are not truncated.
# NOTE(review): in the non-truncating case this torch format is set before
# group_texts runs, so group_texts presumably receives tensors rather than
# Python lists — confirm before enabling that path.
torch_columns = ["input_ids", "attention_mask"]
if not truncate_longer_samples:
    torch_columns.append("special_tokens_mask")
for split_dataset in (train_dataset, test_dataset):
    split_dataset.set_format(type="torch", columns=list(torch_columns))


100%|██████████| 10/10 [00:11<00:00,  1.11s/ba]
100%|██████████| 10/10 [00:11<00:00,  1.18s/ba]


In [15]:
def group_texts(examples, block_size=None):
    """Concatenate every column of a tokenized batch and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-sample lists
            (for example the ``input_ids`` of each sample).
        block_size: Length of each output chunk. When omitted, the
            notebook-level ``max_length`` is used.

    Returns:
        A dict with the same keys as ``examples``; each value is a list of
        chunks of ``block_size`` items. When the concatenated length is at
        least ``block_size`` the trailing remainder is dropped; when it is
        shorter, a single shorter chunk is returned. An empty batch yields ``{}``.
    """
    from itertools import chain

    if block_size is None:
        block_size = max_length

    column_names = list(examples.keys())
    if not column_names:
        # Nothing to group; previously this raised IndexError.
        return {}

    # Flatten each column into one long list. chain.from_iterable is linear,
    # whereas sum(lists, []) re-copies the accumulated list at every step.
    concatenated_examples = {name: list(chain.from_iterable(examples[name])) for name in column_names}

    # The length of the first column decides how many chunks are produced.
    total_length = len(concatenated_examples[column_names[0]])
    if total_length >= block_size:
        # Drop the tail that does not fill a whole chunk.
        total_length = (total_length // block_size) * block_size

    # Slice every flattened column into consecutive chunks of block_size items.
    chunk_starts = range(0, total_length, block_size)
    return {
        name: [tokens[start : start + block_size] for start in chunk_starts]
        for name, tokens in concatenated_examples.items()
    }


# Only when samples were not truncated: regroup the tokenized samples of both
# splits into fixed-size chunks with group_texts.
if not truncate_longer_samples:
    train_dataset = train_dataset.map(group_texts, batched=True)
    test_dataset = test_dataset.map(group_texts, batched=True)


In [16]:
# Create the model: a BertForMaskedLM built from a fresh BertConfig (no
# pretrained checkpoint is loaded here). The vocabulary size and the maximum
# number of positions follow the tokenizer settings defined earlier.
model_config = BertConfig(vocab_size=vocab_size, max_position_embeddings=max_length)
model = BertForMaskedLM(config=model_config)

In [17]:
# Collator for masked-language-modelling batches, configured with the
# notebook's tokenizer and a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [18]:
# Training arguments
training_args = TrainingArguments(
    output_dir=model_path,  # same directory that holds the tokenizer files
    evaluation_strategy="steps",  # NOTE(review): no eval_steps is given; presumably it falls back to the logging interval — confirm
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # 1 sample x 8 accumulation steps = total train batch size of 8
    per_device_eval_batch_size=64,
    logging_steps=500,
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
)

In [19]:
# Wire the model, training arguments, MLM collator and datasets together.
# The encoded test subset is used as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [20]:
# Start training. The recorded output below comes from a run that was stopped
# by hand (KeyboardInterrupt) after 8 of 12500 optimization steps.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask, text. If special_tokens_mask, text are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10000
  Num Epochs = 10
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 12500
  0%|          | 8/12500 [00:21<8:21:54,  2.41s/it] 

KeyboardInterrupt: 