In [1]:
# train_softmax.py
import random
import numpy as np
import torch
from transformers import BertConfig, Trainer, TrainingArguments, EarlyStoppingCallback, EvalPrediction
from seqeval.metrics import classification_report
from transformers.trainer_utils import EvalLoopOutput
from transformers import AutoTokenizer, get_scheduler
from bert_softmax_data_processing import prepare_datasets, generate_label_map
from bert_softmax_model import BERT_Softmax  # 修改模型导入
import matplotlib.pyplot as plt
import os
from sklearn.metrics import confusion_matrix
import pandas as pd
import time  # 添加 time 模块用于计时

# 生成标签映射
label_map = generate_label_map()
label_map_inv = {v: k for k, v in label_map.items()}

class Config:
    # 设置随机数种子
    seed_val = 42
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    max_length = 512
    batch_size = 32
    num_labels = 13 
    model_name = "../models/bert_pretrained"
    data_path = "../datasets/train/trainset_B.txt"
    output_dir = "../../../hy-tmp/models/Ablation_bert_softmax_lr5e-5_linear"  # 修改输出目录

    num_epochs = 30
    weight_decay = 0.01
    bert_lr = 5e-5
    learning_rate = 5e-5  # BERT-Softmax 通常使用一个学习率就足够了
    warmup_ratio = 0.1

    lr_scheduler_type = "linear"  # linear 似乎比cosine更好
    early_stopping_patience = 5
    max_grad_norm = 1.0

class NERTrainer(Trainer):
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        with torch.no_grad():
            outputs = model(**inputs)
            loss = outputs.get("loss")
            # Softmax 直接从 logits 获取预测结果
            logits = outputs["logits"]
            predictions = torch.argmax(logits, dim=-1).cpu().numpy()
        return (loss, predictions, inputs["labels"].cpu().numpy())  # 修改返回值为 predictions

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs = model(**inputs)
        loss = outputs["loss"]
        return (loss, outputs) if return_outputs else loss

    def evaluation_loop(self, dataloader, description, prediction_loss_only=False, ignore_keys=None, metric_key_prefix="eval"):
        model = self.model.eval()
        total_loss = 0.0
        all_preds = []
        all_labels = []
        all_attention_masks = []  # 存储 attention_mask

        for batch in dataloader:
            inputs = self._prepare_inputs(batch)
            with torch.no_grad():
                loss, predictions, labels = self.prediction_step(model, inputs, prediction_loss_only)  # prediction_step 直接返回 predictions
                total_loss += loss.item()
                all_preds.extend(predictions)  # predictions 是 numpy 数组，直接 extend
                all_labels.extend(labels)
                all_attention_masks.extend(inputs["attention_mask"].cpu().numpy())  # 存储 attention_mask

        # 过滤填充标签并计算指标
        true_labels = []
        pred_labels = []
        for pred_seq, true_seq, mask in zip(all_preds, all_labels, all_attention_masks):  # 使用存储的 attention_mask
            temp_true_labels = []
            temp_pred_labels = []
            for l, p, m in zip(true_seq, pred_seq, mask):
                if m == 1 and l != -100:  # 过滤 padding mask and ignore_index
                    temp_true_labels.append(label_map_inv[l])
                    temp_pred_labels.append(label_map_inv[p])
            true_labels.append(temp_true_labels)
            pred_labels.append(temp_pred_labels)

        report = classification_report(true_labels, pred_labels, output_dict=True)
        # 输出指标查看
        print(classification_report(true_labels, pred_labels))

        metrics = {
            f"{metric_key_prefix}_loss": total_loss / len(dataloader),
            f"{metric_key_prefix}_precision": report["micro avg"]["precision"],
            f"{metric_key_prefix}_recall": report["micro avg"]["recall"],
            f"{metric_key_prefix}_f1": report["micro avg"]["f1-score"],
        }

        # 生成混淆矩阵 (绘制图像并保存), 代码与 BERT+CRF 的 train.py 相同，无需修改
        true_entities = []
        pred_entities = []
        for tl_seq, pl_seq in zip(true_labels, pred_labels):
            for tl, pl in zip(tl_seq, pl_seq):
                if tl == 'O':
                    true_ent = 'O'
                else:
                    parts = tl.split('-', 1)
                    true_ent = parts[1] if len(parts) > 1 else 'O'
                if pl == 'O':
                    pred_ent = 'O'
                else:
                    parts = pl.split('-', 1)
                    pred_ent = parts[1] if len(parts) > 1 else 'O'
                true_entities.append(true_ent)
                pred_entities.append(pred_ent)

        entity_types = sorted(list(set(true_entities + pred_entities)))
        cm = confusion_matrix(true_entities, pred_entities, labels=entity_types)
        cm_df = pd.DataFrame(cm, index=entity_types, columns=entity_types)

        # ---  绘制混淆矩阵热图 ---  代码与 BERT+CRF 的 train.py 相同，无需修改
        fig = plt.figure(figsize=(12, 10))  # 可调整图像大小
        ax = fig.add_subplot(111)
        cax = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)  # 使用蓝色的 colormap
        fig.colorbar(cax)  # 添加颜色栏
        tick_marks = np.arange(len(entity_types))
        ax.set_xticks(tick_marks)
        ax.set_yticks(tick_marks)
        ax.set_xticklabels(entity_types, rotation=45, ha="right")  # 旋转 x 轴标签，使其不重叠
        ax.set_yticklabels(entity_types)

        # 在混淆矩阵上显示数值
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, cm[i, j], ha="center", va="center", color='black' if cm[i,j] < cm.max()/2 else 'white')  # 根据数值大小调整颜色

        plt.tight_layout()  # 自动调整子图参数，提供图像边缘周围的填充
        plt.ylabel('True Entity')
        plt.xlabel('Predicted Entity')
        plt.title(f'Confusion Matrix - Epoch {int(self.state.epoch)}')  # 添加标题

        # 获取当前epoch数
        epoch_num = int(self.state.epoch)
        save_dir = os.path.join(self.args.output_dir, 'confusion_matrices')
        os.makedirs(save_dir, exist_ok=True)
        filename = f"confusion_matrix_epoch_{epoch_num}.png"  # 保存为 PNG 图片
        filepath = os.path.join(save_dir, filename)
        plt.savefig(filepath)  # 保存图像
        plt.close(fig)  # 关闭图像，释放内存
        print(f"\nConfusion matrix image for epoch {epoch_num} saved to {filepath}")

        return EvalLoopOutput(
            predictions=all_preds,
            label_ids=all_labels,
            metrics=metrics,
            num_samples=len(all_labels),
        )

# compute_metrics 函数基本相同，只需要根据 prediction_step 的输出进行调整
def compute_metrics(eval_preds: EvalPrediction):
    pred_tags = eval_preds.predictions  # 已经是预测标签，不再是 logits
    true_labels = eval_preds.label_ids
    # attention_mask = eval_preds.inputs.get("attention_mask")  #  移除获取 attention_mask 的代码

    true_labels_list = []
    pred_labels_list = []
    for true_seq, pred_seq in zip(true_labels, pred_tags):  #  移除 mask 参数
        # true_seq_filtered = [label_map_inv[l] for l, m in zip(true_seq, mask) if l != -100 and m == 1]  # 移除 mask 相关条件
        # pred_seq_filtered = [label_map_inv[p] for p, m in zip(pred_seq, mask) if m == 1]  # 移除 mask 相关条件
        true_seq_filtered = [label_map_inv[l] for l in true_seq if l != -100]  # 只过滤 -100 标签
        pred_seq_filtered = [label_map_inv[p] for p in pred_seq]  # pred_seq 不再需要 mask 过滤，evaluation_loop 已处理
        true_labels_list.append(true_seq_filtered)
        pred_labels_list.append(pred_seq_filtered)

    report_dict = classification_report(true_labels_list, pred_labels_list, output_dict=True)
    report_string = classification_report(true_labels_list, pred_labels_list)

    metrics = {
        "eval_precision": report_dict["micro avg"]["precision"],
        "eval_recall": report_dict["micro avg"]["recall"],
        "eval_f1": report_dict["micro avg"]["f1-score"],
        "eval_report_string": report_string,
    }
    return metrics

def collate_fn(batch):
    return {
        "input_ids": torch.stack([x["input_ids"] for x in batch]),
        "attention_mask": torch.stack([x["attention_mask"] for x in batch]),
        "labels": torch.stack([x["labels"] for x in batch]),
    }

def main():
    cfg = Config()
    random.seed(cfg.seed_val)
    np.random.seed(cfg.seed_val)
    torch.manual_seed(cfg.seed_val)
    torch.cuda.manual_seed_all(cfg.seed_val)

    # 初始化模型
    pre_config = BertConfig.from_pretrained(cfg.model_name, num_labels=cfg.num_labels)
    model = BERT_Softmax.from_pretrained(cfg.model_name, config=pre_config).to(cfg.device)  # 修改模型类

    # 加载Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    train_ds, val_ds = prepare_datasets(cfg.data_path, tokenizer)

    optimizer_grouped_parameters = [
        {
            "name": "bert_base",
            "params": [p for n, p in model.bert.named_parameters() if p.requires_grad],
            "lr": cfg.bert_lr,
        },
        {
            "params": [p for n, p in model.classifier.named_parameters() if p.requires_grad],  # 假设分类器层名为 classifier
            "lr": cfg.learning_rate,  # 使用 learning_rate
        }
    ]
    # 创建优化器，使用 AdamW，并将分组参数传递进去
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, weight_decay=cfg.weight_decay)

    num_training_steps = len(train_ds) // cfg.batch_size * cfg.num_epochs  # 粗略估计，根据实际情况调整
    num_warmup_steps = int(cfg.warmup_ratio * num_training_steps)

    lr_scheduler = get_scheduler(
        name=cfg.lr_scheduler_type,  # 可以尝试 "cosine"
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )

    # 训练参数
    training_args = TrainingArguments(
        output_dir=cfg.output_dir,
        per_device_train_batch_size=cfg.batch_size,
        per_device_eval_batch_size=cfg.batch_size * 2,
        num_train_epochs=cfg.num_epochs,
        weight_decay=cfg.weight_decay,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="eval_f1",
        remove_unused_columns=False,
        fp16=False,
        max_grad_norm=cfg.max_grad_norm,
        logging_steps=20,
        seed=cfg.seed_val,
        data_seed=cfg.seed_val,
    )

    trainer = NERTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics,
        data_collator=collate_fn,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=cfg.early_stopping_patience)],  # 确保早停回调被正确添加
        optimizers=(optimizer, lr_scheduler),
    )

    # 计算训练时间
    print("Starting training...")
    start_time = time.time()  # 记录开始时间

    # 存储训练过程中的 loss 和 metrics，代码与 BERT+CRF 的 train.py 相同，无需修改
    train_history = trainer.train()

    end_time = time.time()  # 记录结束时间
    training_time = end_time - start_time  # 计算总训练时间（秒）
    
    # 转换为小时、分钟、秒格式
    hours = int(training_time // 3600)
    minutes = int((training_time % 3600) // 60)
    seconds = training_time % 60
    print(f"Training completed in: {hours} hours, {minutes} minutes, {seconds:.2f} seconds (Total: {training_time:.2f} seconds)")

    # 从 trainer 的 state 中获取日志信息，代码与 BERT+CRF 的 train.py 相同，无需修改
    log_history = trainer.state.log_history

    # 提取训练 loss 和 验证 metrics，代码与 BERT+CRF 的 train.py 相同，无需修改
    train_losses = []
    eval_losses = []
    eval_f1s = []
    eval_precisions = []
    eval_recalls = []
    epochs = []
    logged_epochs = set()

    for log_data in log_history:
        if 'loss' in log_data:
            train_losses.append(log_data['loss'])
            epoch_num = log_data['epoch']
            if epoch_num not in logged_epochs:
                epochs.append(epoch_num)
                logged_epochs.add(epoch_num)
        if 'eval_loss' in log_data:
            eval_losses.append(log_data['eval_loss'])
            epoch_num = log_data['epoch']
            if epoch_num not in logged_epochs:
                epochs.append(epoch_num)
                logged_epochs.add(epoch_num)
        if 'eval_f1' in log_data:
            eval_f1s.append(log_data['eval_f1'])
        if 'eval_precision' in log_data:
            eval_precisions.append(log_data['eval_precision'])
        if 'eval_recall' in log_data:
            eval_recalls.append(log_data['eval_recall'])

    epochs = sorted(list(set(epochs)))

    model.save_pretrained(cfg.output_dir)
    tokenizer.save_pretrained(cfg.output_dir)

    final_eval_metrics = trainer.state.log_history[-1]
    final_report_string = final_eval_metrics.get("eval_report_string")
    if final_report_string:
        print("\nFinal Evaluation Report:\n", final_report_string)

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BERT_Softmax were not initialized from the model checkpoint at ../models/bert_pretrained and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,1.948865,0.008387,0.019115,0.011658
2,No log,0.988615,0.0,0.0,0.0
3,No log,0.639144,0.034188,0.004024,0.007201
4,1.348900,0.356229,0.512974,0.646378,0.572001
5,1.348900,0.249605,0.721794,0.801308,0.759476
6,1.348900,0.194757,0.76561,0.801811,0.783292
7,1.348900,0.168288,0.812561,0.839537,0.825829
8,0.220600,0.151896,0.838011,0.856137,0.846977
9,0.220600,0.146886,0.850374,0.857646,0.853994
10,0.220600,0.144881,0.848889,0.864688,0.856716


              precision    recall  f1-score   support

          NR       0.00      0.00      0.00       794
          NS       0.01      0.05      0.02       685
           T       0.00      0.00      0.00       509

   micro avg       0.01      0.02      0.01      1988
   macro avg       0.01      0.02      0.01      1988
weighted avg       0.01      0.02      0.01      1988


Confusion matrix image for epoch 1 saved to ../../../hy-tmp/models/Ablation_bert_softmax_lr5e-5_linear/confusion_matrices/confusion_matrix_epoch_1.png


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          NR       0.00      0.00      0.00       794
          NS       0.00      0.00      0.00       685
           T       0.00      0.00      0.00       509

   micro avg       0.00      0.00      0.00      1988
   macro avg       0.00      0.00      0.00      1988
weighted avg       0.00      0.00      0.00      1988


Confusion matrix image for epoch 2 saved to ../../../hy-tmp/models/Ablation_bert_softmax_lr5e-5_linear/confusion_matrices/confusion_matrix_epoch_2.png
              precision    recall  f1-score   support

          NR       0.01      0.00      0.00       794
          NS       0.00      0.00      0.00       685
           T       0.06      0.01      0.02       509

   micro avg       0.03      0.00      0.01      1988
   macro avg       0.02      0.01      0.01      1988
weighted avg       0.02      0.00      0.01      1988


Confusion matrix image for epoch 3 saved to ../../../hy-tmp/models/Ablation_bert_soft