# 中文酒店评论情感分析 - 模型训练

本笔记本将训练和比较基础BERT模型和改进的BERT+BiLSTM+Attention模型。

In [1]:
# 导入必要的库
import sys
import os
sys.path.append('..')

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import json
import warnings
warnings.filterwarnings('ignore')

# 导入自定义模块
from src.data_preprocessing import DataPreprocessor
from models.base_model import create_base_model
from models.improved_model import create_improved_model, ContrastiveLoss
from src.model_training import ModelTrainer
from src.utils import plot_training_history, set_seed

# Call the project's seeding helper with a fixed seed value
set_seed(42)

# Pick the GPU when PyTorch reports one as available, otherwise use the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"使用设备: {device}")

# Font settings for the plots below, whose titles and legends are in Chinese:
# try SimHei first, then DejaVu Sans; also disable the Unicode minus glyph
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})

使用设备: cpu


## 1. 数据准备

In [2]:
# Build the preprocessor around the raw hotel-review CSV
preprocessor = DataPreprocessor('../ChnSentiCorp_htl_all.csv')

# Reuse the cached splits only when ALL three files are present. Checking
# train.csv alone would crash in read_csv when val.csv or test.csv is missing;
# with a partial cache we fall back to running the preprocessing again.
processed_paths = {
    split: f'../data/processed/{split}.csv'
    for split in ('train', 'val', 'test')
}
if all(os.path.exists(path) for path in processed_paths.values()):
    print("发现已处理的数据，直接加载...")
    train_df = pd.read_csv(processed_paths['train'])
    val_df = pd.read_csv(processed_paths['val'])
    test_df = pd.read_csv(processed_paths['test'])
else:
    print("开始数据预处理...")
    train_df, val_df, test_df = preprocessor.preprocess_data()
    print("数据预处理完成！")

print(f"训练集大小: {len(train_df)}")
print(f"验证集大小: {len(val_df)}")
print(f"测试集大小: {len(test_df)}")

# Create the data loaders; batch_size is reused by later cells
# (training config and per-sample timing)
batch_size = 16
train_loader, val_loader, test_loader = preprocessor.create_dataloaders(batch_size=batch_size)

print(f"\n数据加载器创建完成:")
print(f"训练批次数: {len(train_loader)}")
print(f"验证批次数: {len(val_loader)}")
print(f"测试批次数: {len(test_loader)}")

当前工作目录: d:\github\sentiment analysis\notebooks
切换后工作目录: d:\github\sentiment analysis


当前工作目录: d:\github\sentiment analysis\notebooks
切换后工作目录: d:\github\sentiment analysis


INFO:src.data_preprocessing:创建DataLoader...


当前工作目录: d:\github\sentiment analysis\notebooks
切换后工作目录: d:\github\sentiment analysis


INFO:src.data_preprocessing:创建DataLoader...


检查文件路径: d:\github\sentiment analysis\data\processed\train.csv
文件是否存在: True
发现已处理的数据，直接加载...
训练集大小: 5435
验证集大小: 776
测试集大小: 1554

数据加载器创建完成:
训练批次数: 340
验证批次数: 49
测试批次数: 98


## 2. 模型架构对比

In [None]:
# Build the baseline BERT model and move it to the selected device.
# NOTE(review): this instance is only used in this cell (parameter count and
# forward-pass smoke test); training below goes through
# ModelTrainer(model_type='base'), while this one is created with
# model_type='enhanced' — confirm both refer to the same architecture.
base_model = create_base_model(model_type='enhanced')
base_model.to(device)

# Build the improved model with the contrastive-loss option switched on,
# then move it to the selected device
improved_model = create_improved_model(use_contrastive_loss=True)
improved_model.to(device)

# Count trainable parameters
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is set contribute to the
    total; frozen parameters are skipped.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Trainable-parameter totals for the two architectures (baseline first)
base_params, improved_params = (
    count_parameters(net) for net in (base_model, improved_model)
)

print("模型参数对比:")
print(f"基础BERT模型参数量: {base_params:,}")
print(f"改进模型参数量: {improved_params:,}")
print(f"参数增长: {improved_params - base_params:,} ({(improved_params/base_params - 1)*100:.1f}%)")

# Forward-pass smoke test on the first two samples of one training batch
sample_batch = next(iter(train_loader))
input_ids, attention_mask = (
    sample_batch[field][:2].to(device)
    for field in ('input_ids', 'attention_mask')
)

print("\n测试前向传播:")
# Gradients are not needed for a shape check
with torch.no_grad():
    base_output = base_model(input_ids, attention_mask)
    improved_output = improved_model(input_ids, attention_mask)

print(f"基础模型输出形状: {base_output['logits'].shape}")
print(f"改进模型输出形状: {improved_output['logits'].shape}")

# These two entries are reported only when the improved model returns them
if 'attention_weights' in improved_output:
    print(f"注意力权重形状: {improved_output['attention_weights'].shape}")

if 'projection' in improved_output:
    print(f"对比学习投影形状: {improved_output['projection'].shape}")

## 3. 训练基础BERT模型

In [None]:
# Training hyper-parameters shared by both trainers in this notebook; the same
# dict is also written into the training summary at the end.
# NOTE(review): both trainers receive the same save_dir — confirm ModelTrainer
# keeps the two models' checkpoints apart (e.g. by model_type).
training_config = {
    'learning_rate': 2e-5,
    'batch_size': batch_size,
    'max_epochs': 5,  # kept small for demonstration purposes
    'patience': 3,
    'save_dir': '../models/saved_models'
}

print("开始训练基础BERT模型...")

# Create the trainer for the baseline model
base_trainer = ModelTrainer(
    model_type='base',
    device=device,
    **training_config
)

# Train the model; the returned history is indexed by 'train_loss',
# 'val_loss', 'train_acc' and 'val_acc' in the comparison cells below
base_history = base_trainer.train(train_loader, val_loader)

print("基础BERT模型训练完成！")

In [None]:
# Visualize the baseline model's training history via the project helper,
# passing the PNG path under ../results as save_path
plot_training_history(base_history, save_path='../results/base_model_training_history.png')

## 4. 训练改进模型

In [None]:
print("开始训练改进模型...")

# Create the trainer for the improved model: same shared training_config as
# the baseline, plus the contrastive-loss option and its weight
improved_trainer = ModelTrainer(
    model_type='improved',
    device=device,
    use_contrastive_loss=True,
    contrastive_weight=0.1,
    **training_config
)

# Train the model on the same loaders used for the baseline run
improved_history = improved_trainer.train(train_loader, val_loader)

print("改进模型训练完成！")

In [None]:
# Visualize the improved model's training history via the project helper,
# passing the PNG path under ../results as save_path
plot_training_history(improved_history, save_path='../results/improved_model_training_history.png')

## 5. 训练结果对比

In [None]:
# 对比两个模型的训练结果
def compare_training_results(base_hist, improved_hist,
                             save_path='../results/model_comparison_training.png'):
    """Plot the two training histories side by side on a 2x2 grid.

    Panels, in row-major order: training loss, validation loss, training
    accuracy, validation accuracy. Each panel overlays the baseline curve
    (circle markers) and the improved-model curve (square markers).

    Args:
        base_hist: Mapping with per-epoch sequences under the keys
            'train_loss', 'val_loss', 'train_acc' and 'val_acc'.
        improved_hist: Same structure, for the improved model.
        save_path: Where to write the figure. The parent directory is created
            when missing. Pass None or '' to skip saving.
    """
    # (history key, panel title, y-axis label) in row-major grid order
    panels = (
        ('train_loss', '训练损失对比', 'Loss'),
        ('val_loss', '验证损失对比', 'Loss'),
        ('train_acc', '训练准确率对比', 'Accuracy'),
        ('val_acc', '验证准确率对比', 'Accuracy'),
    )

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # axes.flat walks the grid row by row, matching the order of `panels`
    for ax, (key, title, ylabel) in zip(axes.flat, panels):
        ax.plot(base_hist[key], label='基础BERT', marker='o')
        ax.plot(improved_hist[key], label='改进模型', marker='s')
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()

    if save_path:
        # savefig does not create missing directories, so make sure the
        # target folder exists before writing
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

# Draw the side-by-side curves for the two training runs recorded above
compare_training_results(base_history, improved_history)

In [None]:
# 数值对比
def print_final_metrics(base_hist, improved_hist):
    """Print a table comparing the last-epoch metrics of the two models.

    For each of 'train_loss', 'val_loss', 'train_acc' and 'val_acc' the table
    shows the last recorded value of both runs and the relative improvement in
    percent, measured against the baseline. For losses a decrease counts as an
    improvement; for accuracies an increase does. When the baseline value is
    zero the relative change is undefined and "N/A" is printed instead.

    Args:
        base_hist: Mapping from metric key to a per-epoch sequence (baseline).
        improved_hist: Same structure, for the improved model.
    """
    metrics_names = {
        'train_loss': '训练损失',
        'val_loss': '验证损失',
        'train_acc': '训练准确率',
        'val_acc': '验证准确率'
    }

    print("=== 最终训练指标对比 ===")
    print(f"{'指标':<15} {'基础BERT':<12} {'改进模型':<12} {'提升':<10}")
    print("-" * 55)

    # Read every last-epoch value up front so a malformed history fails
    # before any table row is printed
    finals = [
        (key, base_hist[key][-1], improved_hist[key][-1])
        for key in metrics_names
    ]

    for key, base_val, improved_val in finals:
        if base_val == 0:
            # Relative change against a zero baseline is undefined
            improvement_str = "N/A"
        else:
            # Sign convention: positive always means "improved model is better"
            if 'loss' in key:
                improvement = (base_val - improved_val) / base_val * 100
            else:
                improvement = (improved_val - base_val) / base_val * 100
            improvement_str = f"{improvement:+.1f}%"

        print(f"{metrics_names[key]:<15} {base_val:<12.4f} {improved_val:<12.4f} {improvement_str:<10}")

# Print the last-epoch metric table for the two training runs recorded above
print_final_metrics(base_history, improved_history)

## 6. 模型复杂度分析

In [None]:
# 分析模型的计算复杂度
import time

def measure_inference_time(model, data_loader, device, num_batches=10):
    """Measure the wall-clock time of forward passes over a few batches.

    The model is switched to eval mode and run under ``torch.no_grad()``.
    Only the forward call is timed; moving the batch to ``device`` happens
    before the clock starts.

    Args:
        model: Callable module invoked as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of batches, each indexable by 'input_ids' and
            'attention_mask' and yielding tensors.
        device: Object with a ``type`` attribute (e.g. ``torch.device``);
            batches are moved to it before the forward pass.
        num_batches: Maximum number of batches to time.

    Returns:
        Tuple ``(mean, std)`` of the per-batch forward time in seconds.

    Raises:
        ValueError: If no batch could be timed (empty loader or
            ``num_batches`` of 0); the statistics would otherwise be NaN.
    """
    from itertools import islice

    def _wait_for_device():
        # CUDA work is queued asynchronously; block until it has finished so
        # the timestamps bracket exactly one forward pass
        if device.type == 'cuda':
            torch.cuda.synchronize()

    model.eval()
    times = []

    with torch.no_grad():
        # islice stops after num_batches without fetching an extra batch
        for batch in islice(data_loader, num_batches):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Drain pending work (such as the copies above) before starting
            # the clock, otherwise it is charged to the forward pass
            _wait_for_device()
            start_time = time.perf_counter()
            model(input_ids, attention_mask)
            _wait_for_device()
            times.append(time.perf_counter() - start_time)

    if not times:
        raise ValueError("no batches were timed: data_loader is empty or num_batches is 0")

    return np.mean(times), np.std(times)

# Time both trained models on the validation loader
print("测量模型推理时间...")

# Baseline model
base_trainer.model.eval()
base_mean_time, base_std_time = measure_inference_time(base_trainer.model, val_loader, device)

# Improved model
improved_trainer.model.eval()
improved_mean_time, improved_std_time = measure_inference_time(improved_trainer.model, val_loader, device)

# Per-sample figures: per-batch mean divided by the configured batch size
base_per_sample, improved_per_sample = (
    base_mean_time / batch_size,
    improved_mean_time / batch_size,
)

print(f"\n推理时间对比（每批次）:")
print(f"基础BERT模型: {base_mean_time:.4f} ± {base_std_time:.4f} 秒")
print(f"改进模型: {improved_mean_time:.4f} ± {improved_std_time:.4f} 秒")
print(f"时间增长: {(improved_mean_time/base_mean_time - 1)*100:.1f}%")

print(f"\n每个样本推理时间:")
print(f"基础BERT模型: {base_per_sample*1000:.2f} 毫秒")
print(f"改进模型: {improved_per_sample*1000:.2f} 毫秒")

## 7. 保存训练结果

In [None]:
# Assemble the experiment summary: config, per-model figures, relative changes.
# The "final" metrics are the last recorded entry of each history list.
training_summary = {
    'experiment_config': training_config,
    'model_comparison': {
        'base_model': {
            'parameters': int(base_params),
            'final_metrics': {
                key: float(base_history[key][-1])
                for key in ('train_loss', 'val_loss', 'train_acc', 'val_acc')
            },
            'inference_time_per_batch': float(base_mean_time),
            'inference_time_per_sample': float(base_per_sample),
        },
        'improved_model': {
            'parameters': int(improved_params),
            'final_metrics': {
                key: float(improved_history[key][-1])
                for key in ('train_loss', 'val_loss', 'train_acc', 'val_acc')
            },
            'inference_time_per_batch': float(improved_mean_time),
            'inference_time_per_sample': float(improved_per_sample),
        },
    },
    # All values are percentages relative to the baseline model
    'improvements': {
        'parameter_increase': float((improved_params / base_params - 1) * 100),
        'val_acc_improvement': float(
            (improved_history['val_acc'][-1] - base_history['val_acc'][-1])
            / base_history['val_acc'][-1] * 100
        ),
        'val_loss_improvement': float(
            (base_history['val_loss'][-1] - improved_history['val_loss'][-1])
            / base_history['val_loss'][-1] * 100
        ),
        'inference_time_increase': float((improved_mean_time / base_mean_time - 1) * 100),
    },
}

# Write the summary as UTF-8 JSON, creating the results folder when needed
os.makedirs('../results', exist_ok=True)
with open('../results/training_summary.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(training_summary, ensure_ascii=False, indent=2))

print("训练总结已保存到 ../results/training_summary.json")

# Headline numbers from the 'improvements' section
print("\n=== 模型改进总结 ===")
print(f"验证准确率提升: {training_summary['improvements']['val_acc_improvement']:.2f}%")
print(f"验证损失降低: {training_summary['improvements']['val_loss_improvement']:.2f}%")
print(f"参数量增加: {training_summary['improvements']['parameter_increase']:.1f}%")
print(f"推理时间增加: {training_summary['improvements']['inference_time_increase']:.1f}%")

## 8. 下一步

1. **模型评估**: 使用 `03_evaluation_analysis.ipynb` 进行详细的模型评估
2. **消融实验**: 使用 `04_ablation_study.ipynb` 分析各组件的贡献
3. **超参数调优**: 尝试不同的学习率、批次大小等参数
4. **模型部署**: 将最佳模型部署到生产环境

### 训练建议

1. **增加训练轮数**: 当前为了演示使用了较少的epoch，实际应用中可以增加到15-20轮
2. **学习率调度**: 可以尝试不同的学习率调度策略
3. **数据增强**: 可以考虑添加数据增强技术
4. **模型集成**: 可以训练多个模型进行集成