# MazeruHAR 动态训练引擎

欢迎使用 MazeruHAR 的配置驱动训练引擎。此 Notebook 旨在提供一个灵活、可重现且易于使用的训练流程。

**核心理念:** **代码只写一次，实验配置万千。**

您只需要执行以下两个简单步骤即可开始训练：

1.  **配置实验**: 修改位于 `config/` 目录下的 `.yaml` 配置文件。您可以复制 `config/default_configs/shl_config.yaml` 并根据您的需求进行调整，比如更换数据集、模型架构或超参数。
2.  **运行 Notebook**: 在下面的 **“实验配置”** 单元格中设置好配置文件的路径，然后从头到尾运行此 Notebook 即可。

---

## 步骤 1: 环境设置与库导入

此单元格负责导入所有必需的库并设置初始环境。它整合了项目所需的所有依赖项。

In [2]:
# 标准库导入
import os
import sys
import time
import yaml
import random
import logging
import json
from pathlib import Path
from typing import Dict, Any, Optional, List, Tuple

# 第三方库导入
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR, StepLR, ReduceLROnPlateau
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# --- 项目内部模块导入 ---
# 为了确保 Notebook 的可移植性，我们将关键的模块代码直接包含进来，
# 同时保留从文件导入的逻辑作为备用方案。
try:
    # Prefer the real project modules when they are importable.
    from config.config_loader import ConfigLoader
    from model_layer.dynamic_har_model import create_dynamic_har_model
    import utils_torch as utils
    print("成功从项目文件导入模块。")
except ImportError as e:
    print(f"从文件导入模块失败: {e}。将使用 Notebook 内联定义。")

    # ========================== Fallback Logic: Start ==========================
    # The project modules could not be imported, so define self-contained
    # stand-ins that let the notebook run on synthetic data.

    # 1. Fallback for 'HARDataset'
    class HARDataset(Dataset):
        """Map-style dataset that pairs each sample with its label by index."""

        def __init__(self, data: torch.Tensor, labels: torch.Tensor):
            self.data = data
            self.labels = labels

        def __len__(self) -> int:
            return len(self.data)

        def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
            return self.data[idx], self.labels[idx]

    # 2. Fallback for 'create_dynamic_har_model'
    def create_dynamic_har_model(config: Dict[str, Any]) -> nn.Module:
        """
        Build a small placeholder 1-D CNN classifier.

        Stands in for the project's dynamic model factory. Only the
        ``dataset`` section of ``config`` is read:

        - ``n_features``  (default 9):   input channels per time step.
        - ``num_classes`` (default 8):   size of the output layer.
        - ``window_size`` (default 128): sequence length, used to size the
          first fully connected layer.

        Returns:
            A module whose ``forward`` takes a dict with an ``'imu'`` tensor
            of shape (batch, window_size, n_features) and returns logits of
            shape (batch, num_classes).
        """
        dataset_params = config.get('dataset', {})
        n_features = dataset_params.get('n_features', 9)
        num_classes = dataset_params.get('num_classes', 8)
        seq_len = dataset_params.get('window_size', 128)

        class SimpleHAR_CNN(nn.Module):
            def __init__(self, in_channels: int, num_classes: int, sequence_length: int):
                super().__init__()
                # Convolutions use padding=2 with kernel_size=5, so only the
                # pooling layers change the sequence length.
                self.conv1 = nn.Conv1d(in_channels, 32, kernel_size=5, stride=1, padding=2)
                self.relu1 = nn.ReLU()
                self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)

                self.conv2 = nn.Conv1d(32, 64, kernel_size=5, stride=1, padding=2)
                self.relu2 = nn.ReLU()
                self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)

                # Two stride-2 poolings halve the length twice (floor division).
                flattened_size = 64 * (sequence_length // 4)
                self.flatten = nn.Flatten()
                self.fc1 = nn.Linear(flattened_size, 128)
                self.relu3 = nn.ReLU()
                self.fc2 = nn.Linear(128, num_classes)

            def forward(self, data_dict: Dict[str, torch.Tensor]) -> torch.Tensor:
                # All input is expected under the 'imu' key as (batch, length, features).
                x = data_dict['imu']
                # Conv1d wants channels first: (batch, features, length).
                x = x.permute(0, 2, 1)

                x = self.pool1(self.relu1(self.conv1(x)))
                x = self.pool2(self.relu2(self.conv2(x)))
                x = self.flatten(x)
                x = self.relu3(self.fc1(x))
                x = self.fc2(x)
                return x

        print(f"创建了一个简单的CNN占位模型 (输入特征: {n_features}, 类别数: {num_classes})")
        return SimpleHAR_CNN(n_features, num_classes, seq_len)

    # 3. Fallback for 'utils.load_dataset_pytorch'
    class MockDatasetLoader:
        """
        Synthetic stand-in for the project's dataset loader.

        Exposes ``central_train_data`` / ``central_train_label`` (2000 samples)
        and ``central_test_data`` / ``central_test_label`` (500 samples) filled
        with random values. ``config`` is the dataset section of the experiment
        config; ``n_features``, ``window_size`` and ``num_classes`` are read
        from it with defaults 9, 128 and 8.

        The data is drawn from a private generator seeded with ``seed``, so the
        same seed always yields the same tensors regardless of global RNG
        state. Passing ``seed=None`` falls back to the global torch RNG.
        """

        def __init__(self, config: Dict[str, Any], seed: int):
            print("正在生成模拟HAR数据...")
            n_features = config.get('n_features', 9)
            seq_len = config.get('window_size', 128)
            num_classes = config.get('num_classes', 8)

            n_train_samples = 2000
            n_test_samples = 500

            # A dedicated generator makes the seed argument effective and
            # leaves the global RNG stream untouched.
            generator = None if seed is None else torch.Generator().manual_seed(int(seed))

            self.central_train_data = torch.randn(
                n_train_samples, seq_len, n_features, generator=generator)
            self.central_train_label = torch.randint(
                0, num_classes, (n_train_samples,), generator=generator)
            self.central_test_data = torch.randn(
                n_test_samples, seq_len, n_features, generator=generator)
            self.central_test_label = torch.randint(
                0, num_classes, (n_test_samples,), generator=generator)
            print("模拟数据生成完毕。")

    def load_dataset_pytorch(dataset_name: str, batch_size: int, type: str, seed: int, data_path: str, config: Dict[str, Any]):
        """
        Fallback loader that returns a ``MockDatasetLoader`` with synthetic data.

        The parameter list mirrors how the trainer calls the project loader;
        only ``seed`` and the ``dataset`` section of ``config`` are used here.
        """
        return MockDatasetLoader(config.get('dataset', {}), seed)

    # Bundle the fallbacks under the name 'utils' so later cells can use
    # utils.HARDataset / utils.load_dataset_pytorch on either import path.
    utils = type('utils', (), {
        'HARDataset': HARDataset,
        'load_dataset_pytorch': load_dataset_pytorch
    })

    # =========================== Fallback Logic: End ===========================

print("所有模块已准备就绪。")

成功从项目文件导入模块。
所有模块已准备就绪。


## 步骤 2: 实验配置

**这是您唯一需要修改的单元格。**

请将 `CONFIG_PATH` 变量设置为您想要使用的配置文件的路径。所有实验参数都将从此文件加载。

In [3]:
# ================== Core configuration ==================
# Point this at a different YAML file to start a new experiment.
CONFIG_PATH = 'config/default_configs/shl_config.yaml'  # SHL config under config/default_configs/
# =========================================================

# Report up front whether the selected configuration file can be found.
if os.path.exists(CONFIG_PATH):
    print(f"✓ 将使用配置文件: '{CONFIG_PATH}'")
else:
    print(f"❌ 错误: 配置文件 '{CONFIG_PATH}' 未找到!")
    print("请确保路径正确，或创建一个新的配置文件。")

✓ 将使用配置文件: 'config/default_configs/shl_config.yaml'


## 步骤 3: 核心训练器类

下面的 `ConfigurableTrainer` 类是整个训练流程的核心。它封装了从配置加载、环境设置、数据处理、模型创建、训练、评估到结果可视化的所有逻辑。您无需修改此类中的任何代码。

In [4]:
class ConfigurableTrainer:
    """
    Configuration-driven training pipeline.

    Loads a YAML experiment config, flattens it into ``self.params`` and then
    (via ``run``) sets up the environment, loads data, builds the model, trains
    with early stopping on validation F1, evaluates on the test split and
    saves plots into the configured output directory.
    """

    # Attribute name used to mark log handlers installed by this class, so a
    # later call can replace them instead of stacking duplicates.
    _HANDLER_TAG = '_mazeru_trainer_handler'

    def __init__(self, config_path: str):
        self.config_path = config_path
        self.config = self._load_config(config_path)
        self.params = self._extract_parameters()
        self.device = None
        self.output_dir = None
        self.logger = None
        self.model = None
        self.optimizer = None
        self.scheduler = None
        self.criterion = None
        # Path of the checkpoint written by the current training run, if any.
        self.best_model_path = None
        self.history = {'train_loss': [], 'train_accuracy': [], 'val_loss': [], 'val_accuracy': [], 'val_f1': []}

    def _load_config(self, path: str) -> Dict[str, Any]:
        """Read the YAML file at ``path``; an empty file yields an empty dict."""
        with open(path, 'r', encoding='utf-8') as f:
            loaded = yaml.safe_load(f)
        # safe_load returns None for an empty document.
        return {} if loaded is None else loaded

    def _extract_parameters(self) -> Dict[str, Any]:
        """
        Flatten one level of the config into a single dict.

        Top-level scalars are copied as-is; the entries of each top-level
        section (dict) are merged in. When two sections share a key, the
        section that appears later in the config wins.
        """
        params = {}
        for key, value in (self.config or {}).items():
            if isinstance(value, dict):
                params.update(value)
            else:
                params[key] = value
        return params

    @staticmethod
    def _as_numpy(values) -> np.ndarray:
        """Return ``values`` as a NumPy array, accepting tensors or array-likes."""
        if isinstance(values, torch.Tensor):
            return values.detach().cpu().numpy()
        return np.asarray(values)

    def setup_environment(self):
        """Seed the RNGs, pick the device, create the output dir and set up logging."""
        seed = self.params.get('seed', 42)
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)

        device_pref = self.params.get('device', 'auto')
        if device_pref == 'auto':
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = torch.device(device_pref)

        self.output_dir = Path(self.params.get('output_dir', './results/default_experiment'))
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # logging.basicConfig does nothing once the root logger has handlers,
        # which is the normal state when a notebook cell is re-run. Install
        # the handlers explicitly and replace the ones from an earlier call so
        # every run logs to its own training.log exactly once.
        root_logger = logging.getLogger()
        for stale in list(root_logger.handlers):
            if getattr(stale, self._HANDLER_TAG, False):
                root_logger.removeHandler(stale)
                stale.close()

        formatter = logging.Formatter('%(asctime)s [%(levelname)s] - %(message)s')
        fresh_handlers = [
            # Explicit UTF-8 so the non-ASCII log messages are written
            # correctly regardless of the platform default encoding.
            logging.FileHandler(self.output_dir / 'training.log', encoding='utf-8'),
            logging.StreamHandler(sys.stdout),
        ]
        for handler in fresh_handlers:
            handler.setFormatter(formatter)
            setattr(handler, self._HANDLER_TAG, True)
            root_logger.addHandler(handler)
        root_logger.setLevel(logging.INFO)

        self.logger = root_logger
        self.logger.info(f"环境设置完成。设备: {self.device}, 输出目录: {self.output_dir}")

    def load_data(self) -> Tuple[DataLoader, DataLoader, DataLoader, np.ndarray]:
        """
        Load the dataset named in the config and build the three data loaders.

        15% of the training data is held out (stratified by label) as the
        validation split.

        Returns:
            (train_loader, dev_loader, test_loader, train_labels) where
            ``train_labels`` is the NumPy array of labels left in the training
            split after the validation hold-out.
        """
        self.logger.info(f"加载数据集: {self.params['name']}")

        # Same default as setup_environment, so a config without 'seed' works.
        seed = self.params.get('seed', 42)

        # `utils` is either the project module or the inline fallback defined
        # in the import cell.
        dataset_loader = utils.load_dataset_pytorch(
            self.params['name'],
            self.params['batch_size'],
            'BALANCED',  # assumed loader type
            seed,
            './datasets/',
            self.config  # the fallback loader reads its settings from the full config
        )
        train_data = self._as_numpy(dataset_loader.central_train_data)
        train_labels = self._as_numpy(dataset_loader.central_train_label)
        test_data = self._as_numpy(dataset_loader.central_test_data)
        test_labels = self._as_numpy(dataset_loader.central_test_label)

        # Hold out a stratified validation split.
        train_data, dev_data, train_labels, dev_labels = train_test_split(
            train_data, train_labels,
            test_size=0.15, random_state=seed, stratify=train_labels
        )

        def make_dataset(samples, labels):
            return utils.HARDataset(
                torch.as_tensor(samples, dtype=torch.float32),
                torch.as_tensor(labels, dtype=torch.long),
            )

        train_dataset = make_dataset(train_data, train_labels)
        dev_dataset = make_dataset(dev_data, dev_labels)
        test_dataset = make_dataset(test_data, test_labels)

        # Pinned memory only helps host-to-GPU copies; on CPU it just triggers
        # a warning.
        common_params = {
            'batch_size': self.params['batch_size'],
            'num_workers': 0,
            'pin_memory': self.device is not None and self.device.type == 'cuda',
        }
        train_loader = DataLoader(train_dataset, shuffle=True, **common_params)
        dev_loader = DataLoader(dev_dataset, shuffle=False, **common_params)
        test_loader = DataLoader(test_dataset, shuffle=False, **common_params)

        self.logger.info("数据加载和分割完成。")
        return train_loader, dev_loader, test_loader, train_labels

    def build_model(self):
        """Create the model from the full config and move it to the device."""
        self.logger.info("构建模型...")
        # `create_dynamic_har_model` is the project factory or the inline
        # fallback defined in the import cell.
        self.model = create_dynamic_har_model(self.config).to(self.device)
        self.logger.info(f"模型构建完成。总参数量: {sum(p.numel() for p in self.model.parameters() if p.requires_grad):,}")

    def setup_training_components(self, train_labels: np.ndarray):
        """
        Create the loss, optimizer and learning-rate scheduler.

        The loss is cross-entropy with 'balanced' class weights computed from
        ``train_labels``.

        Raises:
            ValueError: if the configured optimizer is not one of
                'adam', 'adamw' or 'sgd'.
        """
        # NOTE(review): weights are computed only for classes present in
        # train_labels; if a class is missing from the training split the
        # weight vector will be shorter than the model output.
        weights = class_weight.compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
        class_weights = torch.as_tensor(weights, dtype=torch.float32).to(self.device)
        self.criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=self.params.get('label_smoothing', 0.0))

        opt_name = self.params.get('optimizer', 'adamw').lower()
        opt_map = {'adam': optim.Adam, 'adamw': optim.AdamW, 'sgd': optim.SGD}
        if opt_name not in opt_map:
            raise ValueError(f"不支持的优化器 '{opt_name}'，可选项: {sorted(opt_map)}")
        self.optimizer = opt_map[opt_name](self.model.parameters(), lr=self.params['learning_rate'], weight_decay=self.params.get('weight_decay', 1e-4))

        scheduler_name = self.params.get('scheduler', 'cosine')
        if scheduler_name == 'cosine':
            self.scheduler = CosineAnnealingLR(self.optimizer, T_max=self.params['epochs'])
        elif scheduler_name == 'step':
            self.scheduler = StepLR(self.optimizer, step_size=30, gamma=0.1)
        else:
            self.scheduler = None
            # Make a typo in the config visible instead of silently training
            # with a constant learning rate.
            if scheduler_name is not None and str(scheduler_name).lower() != 'none':
                self.logger.warning(f"未知的调度器 '{scheduler_name}'，将不使用学习率调度器。")
        self.logger.info("训练组件设置完成。")

    def train(self, train_loader, dev_loader):
        """
        Run the training loop with early stopping on validation F1.

        Per-epoch metrics are appended to ``self.history``. Whenever the
        validation F1 improves and ``save_checkpoints`` is enabled, the model
        weights are written to ``best_model.pth`` in the output directory and
        ``self.best_model_path`` is set to that file.
        """
        self.logger.info("--- 开始训练 ---")
        best_val_f1 = 0.0
        patience_counter = 0
        patience = self.params.get('early_stopping_patience', 10)
        num_epochs = self.params['epochs']
        # `or 0` also covers an explicit null in the YAML file.
        clip_norm = self.params.get('gradient_clip_norm', 0) or 0
        save_checkpoints = self.params.get('save_checkpoints', True)
        checkpoint_path = self.output_dir / 'best_model.pth'
        # Reset so that run() never picks up a checkpoint from an earlier run.
        self.best_model_path = None

        for epoch in range(num_epochs):
            self.model.train()
            train_loss, train_correct, train_total = 0, 0, 0
            for data, targets in train_loader:
                # The model takes a dict of modalities; everything is treated
                # as the 'imu' modality here.
                data_dict = {'imu': data}
                data_dict = {k: v.to(self.device) for k, v in data_dict.items()}
                targets = targets.to(self.device)

                self.optimizer.zero_grad()
                outputs = self.model(data_dict)
                loss = self.criterion(outputs, targets)
                loss.backward()
                if clip_norm > 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip_norm)
                self.optimizer.step()

                train_loss += loss.item()
                predicted = outputs.argmax(dim=1)
                train_total += targets.size(0)
                train_correct += (predicted == targets).sum().item()

            if self.scheduler:
                self.scheduler.step()

            # Validation pass.
            val_loss, val_f1, val_acc = self.evaluate(dev_loader, is_test=False)
            # max(..., 1) guards against an empty training loader.
            avg_train_loss = train_loss / max(len(train_loader), 1)
            train_acc = train_correct / max(train_total, 1)
            self.logger.info(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")

            self.history['train_loss'].append(avg_train_loss)
            self.history['train_accuracy'].append(train_acc)
            self.history['val_loss'].append(val_loss)
            self.history['val_accuracy'].append(val_acc)
            self.history['val_f1'].append(val_f1)

            # Early stopping and checkpointing.
            if val_f1 > best_val_f1:
                best_val_f1 = val_f1
                patience_counter = 0
                if save_checkpoints:
                    torch.save(self.model.state_dict(), checkpoint_path)
                    self.best_model_path = checkpoint_path
                    self.logger.info(f"新最佳模型已保存，验证F1分数: {best_val_f1:.4f}")
            else:
                patience_counter += 1

            if patience_counter >= patience:
                self.logger.info(f"早停触发于 epoch {epoch+1}")
                break

    def evaluate(self, data_loader, is_test=True):
        """
        Evaluate the model on ``data_loader``.

        When ``is_test`` is true, the metrics and a classification report are
        logged and, unless ``plot_confusion_matrix`` is disabled in the config,
        a confusion matrix image is saved.

        Returns:
            (average batch loss, weighted F1, accuracy)
        """
        self.model.eval()
        total_loss, total_correct, total_samples = 0, 0, 0
        all_preds, all_targets = [], []
        with torch.no_grad():
            for data, targets in data_loader:
                data_dict = {'imu': data}
                data_dict = {k: v.to(self.device) for k, v in data_dict.items()}
                targets = targets.to(self.device)

                outputs = self.model(data_dict)
                loss = self.criterion(outputs, targets)
                total_loss += loss.item()
                predicted = outputs.argmax(dim=1)
                total_samples += targets.size(0)
                total_correct += (predicted == targets).sum().item()
                all_preds.extend(predicted.cpu().tolist())
                all_targets.extend(targets.cpu().tolist())

        # max(..., 1) guards against an empty loader.
        avg_loss = total_loss / max(len(data_loader), 1)
        accuracy = total_correct / max(total_samples, 1)
        f1 = f1_score(all_targets, all_preds, average='weighted')

        if is_test:
            self.logger.info(f"--- 测试结果 ---")
            self.logger.info(f"测试准确率: {accuracy:.4f}")
            self.logger.info(f"测试F1分数 (加权): {f1:.4f}")
            label_names = self.params.get('activity_labels')
            # Class indices are 0..C-1 (CrossEntropyLoss targets). Passing them
            # explicitly keeps names aligned with classes even when a class
            # does not occur in this split.
            label_ids = list(range(len(label_names))) if label_names else None
            report = classification_report(all_targets, all_preds, labels=label_ids,
                                           target_names=label_names, zero_division=0)
            self.logger.info(f"分类报告:\n{report}")
            if self.params.get('plot_confusion_matrix', True):
                self.plot_confusion_matrix(all_targets, all_preds, label_names)

        return avg_loss, f1, accuracy

    def plot_learning_curves(self):
        """Plot loss and accuracy histories, save them as learning_curves.png and show them."""
        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        plt.plot(self.history['train_loss'], label='Train Loss')
        plt.plot(self.history['val_loss'], label='Validation Loss')
        plt.title('Loss vs. Epochs')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(self.history['train_accuracy'], label='Train Accuracy')
        plt.plot(self.history['val_accuracy'], label='Validation Accuracy')
        plt.title('Accuracy vs. Epochs')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()

        plt.tight_layout()
        save_path = self.output_dir / 'learning_curves.png'
        plt.savefig(save_path)
        self.logger.info(f"学习曲线已保存至 {save_path}")
        plt.show()
        # Release the figure so repeated runs do not accumulate open figures.
        plt.close()

    def plot_confusion_matrix(self, y_true, y_pred, labels):
        """
        Save a confusion-matrix heatmap as confusion_matrix.png.

        ``labels`` are the class names in class-index order; when given, the
        matrix always has one row/column per name so the tick labels line up.
        With no names the matrix covers the classes present in the data and
        default tick labels are used.
        """
        if labels:
            cm = confusion_matrix(y_true, y_pred, labels=list(range(len(labels))))
            tick_labels = labels
        else:
            cm = confusion_matrix(y_true, y_pred)
            tick_labels = 'auto'
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
        plt.title('Confusion Matrix')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        save_path = self.output_dir / 'confusion_matrix.png'
        plt.savefig(save_path)
        self.logger.info(f"混淆矩阵已保存至 {save_path}")
        plt.close()  # close so the notebook does not render the figure a second time

    def run(self):
        """Execute the full pipeline: setup, data, model, training, test evaluation, plots."""
        self.setup_environment()
        train_loader, dev_loader, test_loader, train_labels = self.load_data()
        self.build_model()
        self.setup_training_components(train_labels)
        self.train(train_loader, dev_loader)

        # Only load a checkpoint written by this run; a best_model.pth left
        # over from a previous experiment in the same directory is ignored.
        if self.best_model_path is not None and os.path.exists(self.best_model_path):
            self.logger.info("加载最佳模型进行最终评估...")
            self.model.load_state_dict(torch.load(self.best_model_path, map_location=self.device))
        else:
            self.logger.warning("未找到保存的最佳模型，将使用当前模型进行评估。")
        self.evaluate(test_loader)

        if self.params.get('plot_learning_curves', True):
            self.plot_learning_curves()
        self.logger.info("训练流程完成！")

## 步骤 4: 执行训练流程

最后，我们实例化 `ConfigurableTrainer` 类并调用其 `run` 方法来启动整个训练和评估流程。所有操作都将由之前加载的配置驱动。

In [5]:
# Run only as the main module and only when '__file__' is absent from the
# namespace (intended to restrict execution to a Jupyter session).
if '__file__' not in locals() and __name__ == '__main__':
    try:
        trainer = ConfigurableTrainer(config_path=CONFIG_PATH)
        trainer.run()
    except KeyboardInterrupt:
        print("\n用户中断了训练流程。")
    except Exception:
        # logging.exception records the active traceback along with the message.
        logging.exception("训练流程中发生未捕获的异常")
        print("\n❌ 训练流程因严重错误而终止。请查看上面的日志和错误信息。")

2025-07-25 18:28:36,460 [INFO] - 环境设置完成。设备: cpu, 输出目录: results/shl_multimodal_run
2025-07-25 18:28:36,460 [INFO] - 加载数据集: shl
2025-07-25 18:28:36,576 [ERROR] - 训练流程中发生未捕获的异常
Traceback (most recent call last):
  File "/Users/zilongzeng/Research/MazeruHAR/utils_torch.py", line 431, in load_dataset_pytorch
    client_data.append(pickle.load(open(main_dir + 'datasetStandardized/' + dataset_name + '/UserData' + str(i) + '.pkl', 'rb')))
FileNotFoundError: [Errno 2] No such file or directory: './datasets/datasetStandardized/shl/UserData0.pkl'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/var/folders/fr/14dg9d1n4vx9sqmyh_9pqm3h0000gn/T/ipykernel_71802/2785309581.py", line 4, in <module>
    trainer.run()
  File "/var/folders/fr/14dg9d1n4vx9sqmyh_9pqm3h0000gn/T/ipykernel_71802/723311530.py", line 245, in run
    train_loader, dev_loader, test_loader, train_labels = self.load_data()
  File "/var/folders/fr/14dg9d1n4vx9sqmyh_9pqm