# 数据预处理 - MNIST数字识别

## 概述
本notebook专门用于MNIST数据集的加载、预处理和探索性数据分析。为神经网络训练准备高质量的数据。

In [None]:
# 导入必要的库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

print("库导入完成")

## 1. 数据加载和基础信息

In [None]:
def load_mnist_data():
    """Fetch the MNIST digits from OpenML and print a short summary.

    Returns:
        tuple: ``(X, y)`` -- the ``data`` attribute of the fetched dataset
        (requested with ``as_frame=False``) and its ``target`` cast to ``int``.
    """
    print("正在加载MNIST数据集...")

    # Pull the 784-feature MNIST dataset (version 1) from OpenML.
    dataset = fetch_openml('mnist_784', version=1, as_frame=False)
    features = dataset.data
    labels = dataset.target.astype(int)

    summary_lines = (
        f"数据集基本信息:",
        f"  - 特征数量: {features.shape[1]}",
        f"  - 样本数量: {features.shape[0]}",
        f"  - 数据类型: {features.dtype}",
        f"  - 标签类型: {type(labels[0])}",
    )
    for line in summary_lines:
        print(line)

    return features, labels

# Load the raw MNIST features and labels.
X_raw, y_raw = load_mnist_data()

## 2. 数据质量检查

In [None]:
def check_data_quality(X, y):
    """Print a data-quality report for a feature array and its labels.

    The report lists the NaN count, global pixel statistics, the number of
    samples per label and two consistency checks. The inputs are not modified.

    Args:
        X: Feature array; ``X.shape[0]`` is compared with ``len(y)``.
        y: Label array; checked against the range 0-9.

    Returns:
        tuple: The same ``(X, y)`` objects that were passed in.
    """
    print("\n=== 数据质量检查 ===")

    # Missing values
    nan_total = np.isnan(X).sum()
    print(f"缺失值数量: {nan_total}")

    # Value range of the pixels
    print(f"\n像素值统计:")
    pixel_report = (
        f"  - 最小值: {X.min()}",
        f"  - 最大值: {X.max()}",
        f"  - 平均值: {X.mean():.2f}",
        f"  - 标准差: {X.std():.2f}",
    )
    for line in pixel_report:
        print(line)

    # Samples per label, with their share of the whole label array
    print(f"\n标签分布:")
    n_labels = len(y)
    for digit, n_samples in zip(*np.unique(y, return_counts=True)):
        share = n_samples / n_labels * 100
        print(f"  - 数字 {digit}: {n_samples} 个样本 ({share:.1f}%)")

    # Consistency between X and y
    print(f"\n数据一致性检查:")
    sizes_agree = X.shape[0] == len(y)
    labels_in_range = np.all((y >= 0) & (y <= 9))
    print(f"  - 特征数和样本数匹配: {sizes_agree}")
    print(f"  - 所有标签都在0-9范围内: {labels_in_range}")

    return X, y

# Run the data-quality report on the raw data.
X_raw, y_raw = check_data_quality(X_raw, y_raw)

## 3. 数据可视化分析

In [None]:
def _tile_images(images, n_cols=5):
    """Join equally sized 2-D images into one grid image, ``n_cols`` per row."""
    rows = [np.hstack(images[start:start + n_cols])
            for start in range(0, len(images), n_cols)]
    return np.vstack(rows)


def visualize_data_distribution(X, y):
    """Draw a 2x3 overview figure of the dataset.

    Panels (row-major): label histogram, pixel-value histogram, mean image of
    every digit, heat map of the first sample, per-digit pixel mean/std, and
    the first sample of every digit.

    The two "one image per digit" panels are rendered as a single 2x5 montage
    inside their own axes. Creating extra ``plt.subplot(2, 5, ...)`` axes on
    this figure would lay a second grid over the 2x3 layout and hide the other
    panels.

    Args:
        X: 2-D array of flattened 28x28 images (one row per sample).
        y: 1-D array of integer labels, compared against the digits 0-9.
    """
    image_shape = (28, 28)
    digits = range(10)
    masks = [y == digit for digit in digits]
    # Placeholder tile for a digit that has no samples at all.
    blank = np.zeros(image_shape)

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # 1. Label distribution (bar chart)
    ax1 = axes[0, 0]
    unique_labels, counts = np.unique(y, return_counts=True)
    ax1.bar(unique_labels, counts, color='skyblue', alpha=0.7)
    ax1.set_title('标签分布')
    ax1.set_xlabel('数字')
    ax1.set_ylabel('样本数量')
    ax1.set_xticks(range(10))
    ax1.grid(True, alpha=0.3)

    # 2. Pixel-value distribution; ravel() avoids copying a contiguous array.
    ax2 = axes[0, 1]
    ax2.hist(X.ravel(), bins=50, color='lightgreen', alpha=0.7, density=True)
    ax2.set_title('像素值分布')
    ax2.set_xlabel('像素值')
    ax2.set_ylabel('密度')
    ax2.grid(True, alpha=0.3)

    # 3. Mean image of every digit, tiled 0-4 on top and 5-9 below
    ax3 = axes[0, 2]
    avg_images = [
        X[mask].mean(axis=0).reshape(image_shape) if mask.any() else blank
        for mask in masks
    ]
    ax3.imshow(_tile_images(avg_images), cmap='gray')
    ax3.set_title('每个数字的平均图像')
    ax3.axis('off')

    # 4. Heat map of the first sample
    ax4 = axes[1, 0]
    first_image = X[0].reshape(image_shape)
    im = ax4.imshow(first_image, cmap='hot', interpolation='nearest')
    ax4.set_title(f'第一个样本 (标签: {y[0]})')
    ax4.axis('off')
    plt.colorbar(im, ax=ax4, fraction=0.046, pad=0.04)

    # 5. Pixel mean / standard deviation per digit (0 when a digit is absent)
    ax5 = axes[1, 1]
    digit_means = [X[mask].mean() if mask.any() else 0.0 for mask in masks]
    digit_stds = [X[mask].std() if mask.any() else 0.0 for mask in masks]

    x_pos = np.arange(10)
    width = 0.35
    ax5.bar(x_pos - width/2, digit_means, width, label='平均值', alpha=0.7)
    ax5.bar(x_pos + width/2, digit_stds, width, label='标准差', alpha=0.7)
    ax5.set_title('不同数字的像素统计')
    ax5.set_xlabel('数字')
    ax5.set_ylabel('像素值')
    ax5.set_xticks(x_pos)
    ax5.legend()
    ax5.grid(True, alpha=0.3)

    # 6. First sample of every digit, tiled like panel 3.
    #    np.argmax on a boolean mask yields the index of its first True.
    ax6 = axes[1, 2]
    first_samples = [
        X[np.argmax(mask)].reshape(image_shape) if mask.any() else blank
        for mask in masks
    ]
    ax6.imshow(_tile_images(first_samples), cmap='gray')
    ax6.set_title('样本展示')
    ax6.axis('off')

    plt.tight_layout()
    plt.show()

# Plot the overview figure for the raw data.
visualize_data_distribution(X_raw, y_raw)

## 4. 数据预处理方法对比

In [None]:
def compare_preprocessing_methods(X, sample_size=1000, random_state=None):
    """Compare five scalings of a random sample of ``X``.

    Prints mean/std/min/max for each variant and plots the first sampled
    image under every variant, plus an overlaid histogram of the first three.

    Args:
        X: 2-D array of flattened 28x28 images.
        sample_size: Maximum number of rows to sample (without replacement).
        random_state: Optional seed for the sampling. ``None`` (the default)
            draws from NumPy's global random state, as before.

    Returns:
        dict: Display name -> transformed sample array, in insertion order
        (raw, divided by 255, z-scored, min-max scaled, mean-centred).
    """
    print("\n=== 数据预处理方法对比 ===")

    # Work on a random subset to keep the scalers and plots fast.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = rng.choice(len(X), min(sample_size, len(X)), replace=False)
    X_sample = X[indices]

    methods = {
        '原始数据': X_sample.copy(),
        '归一化[0,1]': X_sample / 255.0,
        '标准化(Z-score)': StandardScaler().fit_transform(X_sample),
        'Min-Max缩放': MinMaxScaler().fit_transform(X_sample),
        '中心化': X_sample - X_sample.mean(axis=0),
    }

    # Summary statistics per method
    print("\n预处理方法统计对比:")
    print("方法\t\t平均值\t标准差\t最小值\t最大值")
    print("-" * 60)

    for method_name, data in methods.items():
        print(f"{method_name:<15}\t{data.mean():.3f}\t{data.std():.3f}\t{data.min():.3f}\t{data.max():.3f}")

    # Visual comparison: one panel per method, last panel for the histogram
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.flatten()

    # zip stops after the five methods, leaving axes[5] free.
    for ax, (method_name, data) in zip(axes, methods.items()):
        ax.imshow(data[0].reshape(28, 28), cmap='gray')
        ax.set_title(f'{method_name}\n(值域: [{data.min():.2f}, {data.max():.2f}])')
        ax.axis('off')

    hist_ax = axes[5]
    hist_ax.set_title('像素值分布对比')
    # Only the first three methods are overlaid to keep the plot readable.
    for method_name, data in list(methods.items())[:3]:
        hist_ax.hist(data.ravel(), bins=50, alpha=0.5, label=method_name, density=True)
    hist_ax.set_xlabel('像素值')
    hist_ax.set_ylabel('密度')
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return methods

# Compare the preprocessing methods on a sample of the raw data.
preprocessing_methods = compare_preprocessing_methods(X_raw)

## 5. 数据分割策略

In [None]:
def split_data_strategically(X, y, test_size=0.2, val_size=0.1, random_state=42):
    """Split the data into stratified train / validation / test parts.

    The test part is carved off first; the validation part is then taken from
    the remainder. Both splits pass the labels as ``stratify`` and use the same
    ``random_state``. Sizes and per-digit label counts of every part are
    printed.

    Args:
        X: Feature array.
        y: Label array aligned with ``X``.
        test_size: Fraction of the full dataset used for testing.
        val_size: Fraction of the full dataset used for validation.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    print(f"\n=== 数据分割策略 ===")
    print(f"原始数据集大小: {X.shape}")
    print(f"测试集比例: {test_size}")
    print(f"验证集比例: {val_size}")

    # Step 1: hold out the test set.
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    # Step 2: val_size is a fraction of the full data, so rescale it to a
    # fraction of what is left after removing the test set.
    val_fraction_of_rest = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest,
        test_size=val_fraction_of_rest,
        random_state=random_state,
        stratify=y_rest,
    )

    total = len(X)
    print(f"\n分割结果:")
    for part_name, part in (("训练集", X_train), ("验证集", X_val), ("测试集", X_test)):
        print(f"{part_name}: {part.shape} ({len(part)/total*100:.1f}%)")

    # Per-digit label counts of each part
    print(f"\n各数据集标签分布:")
    for part_name, part_labels in (("训练集", y_train), ("验证集", y_val), ("测试集", y_test)):
        digits, counts = np.unique(part_labels, return_counts=True)
        shares = counts / len(part_labels) * 100
        print(f"\n{part_name}:")
        for digit, count, pct in zip(digits, counts, shares):
            print(f"  数字 {digit}: {count:4d} ({pct:5.1f}%)")

    return X_train, X_val, X_test, y_train, y_val, y_test

# Split the raw data into train / validation / test sets.
X_train_raw, X_val_raw, X_test_raw, y_train_raw, y_val_raw, y_test_raw = split_data_strategically(X_raw, y_raw)

## 6. 特征工程和降维分析

In [None]:
def analyze_feature_importance_and_pca(X, y, n_components=50):
    """Standardize ``X``, fit a PCA and report / plot its explained variance.

    Prints the explained-variance ratio of the leading components and how
    many components are needed to keep 95% of the variance. If the fitted
    components do not reach 95%, that is reported explicitly instead of a
    misleading "1 component".

    Args:
        X: 2-D array of flattened 28x28 images.
        y: Labels aligned with ``X``; only used to colour the 2-D scatter.
        n_components: Passed straight to ``PCA(n_components=...)``.

    Returns:
        tuple: ``(pca, X_pca, scaler)`` -- the fitted PCA, the projected
        data and the fitted ``StandardScaler``.
    """
    print(f"\n=== 特征分析和PCA降维 ===")

    # Standardize before PCA
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    print(f"\nPCA分析 (前{n_components}个主成分):")
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)

    explained_variance_ratio = pca.explained_variance_ratio_
    cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
    # Use the number of components actually fitted; n_components itself may
    # be None or a float, which PCA accepts but range() does not.
    n_fitted = len(explained_variance_ratio)

    print(f"前10个主成分解释方差比例:")
    for i in range(min(10, n_fitted)):
        print(f"  PC{i+1}: {explained_variance_ratio[i]:.4f} (累计: {cumulative_variance_ratio[i]:.4f})")

    # The cumulative ratio is non-decreasing, so searchsorted finds the first
    # index reaching 0.95, or returns n_fitted when it is never reached.
    # (np.argmax on an all-False mask would return 0 and wrongly claim that a
    # single component is enough.)
    n_components_95 = int(np.searchsorted(cumulative_variance_ratio, 0.95)) + 1
    if n_components_95 <= n_fitted:
        print(f"\n保留95%方差需要 {n_components_95} 个主成分")
        print(f"原始特征数: {X.shape[1]}")
        print(f"降维比例: {(1 - n_components_95/X.shape[1])*100:.1f}%")
    else:
        print(f"\n前{n_fitted}个主成分仅解释 {cumulative_variance_ratio[-1]*100:.1f}% 方差，"
              f"未达到95%，请增大 n_components")
        print(f"原始特征数: {X.shape[1]}")

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Cumulative explained variance
    ax1 = axes[0, 0]
    ax1.plot(range(1, n_fitted + 1), cumulative_variance_ratio, 'bo-')
    ax1.axhline(y=0.95, color='r', linestyle='--', alpha=0.7, label='95%方差')
    ax1.axhline(y=0.90, color='orange', linestyle='--', alpha=0.7, label='90%方差')
    ax1.set_xlabel('主成分数量')
    ax1.set_ylabel('累计解释方差比例')
    ax1.set_title('PCA累计解释方差')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Contribution of the leading components
    ax2 = axes[0, 1]
    n_show = min(20, n_fitted)
    ax2.bar(range(1, n_show + 1), explained_variance_ratio[:n_show])
    ax2.set_xlabel('主成分')
    ax2.set_ylabel('解释方差比例')
    ax2.set_title(f'前{n_show}个主成分的贡献度')
    ax2.grid(True, alpha=0.3)

    # 3. Scatter of the first two components (needs at least two of them)
    ax3 = axes[1, 0]
    ax3.set_title('前两个主成分的2D可视化')
    if n_fitted >= 2:
        scatter = ax3.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='tab10', alpha=0.6, s=1)
        ax3.set_xlabel(f'PC1 (解释方差: {explained_variance_ratio[0]:.3f})')
        ax3.set_ylabel(f'PC2 (解释方差: {explained_variance_ratio[1]:.3f})')
        plt.colorbar(scatter, ax=ax3)
    else:
        ax3.text(0.5, 0.5, '主成分不足2个', ha='center', va='center',
                 transform=ax3.transAxes)
        ax3.axis('off')

    # 4. Per-pixel variance of the raw input. After standardization every
    #    non-constant pixel has unit variance, so X_scaled would give a map
    #    that only distinguishes constant from non-constant pixels.
    ax4 = axes[1, 1]
    pixel_variance = np.var(X, axis=0)
    variance_image = pixel_variance.reshape(28, 28)
    im = ax4.imshow(variance_image, cmap='hot')
    ax4.set_title('像素方差分布 (特征重要性)')
    ax4.axis('off')
    plt.colorbar(im, ax=ax4, fraction=0.046, pad=0.04)

    plt.tight_layout()
    plt.show()

    return pca, X_pca, scaler

# Run the PCA analysis on the training split.
pca_model, X_pca_analysis, scaler_analysis = analyze_feature_importance_and_pca(X_train_raw, y_train_raw)

## 7. 最终数据预处理流程

In [None]:
def final_preprocessing_pipeline(X_train, X_val, X_test, y_train, y_val, y_test,
                                normalization_method='minmax',
                                apply_pca=False,
                                n_components=None):
    """Scale the features, optionally reduce them with PCA, one-hot the labels.

    Every fitted transformer (scaler, PCA, encoder) is fitted on the training
    split only and then applied to the validation and test splits.

    Args:
        X_train, X_val, X_test: Feature arrays of the three splits.
        y_train, y_val, y_test: Label sequences of the three splits (NumPy
            arrays, lists or pandas Series).
        normalization_method: ``'minmax'`` (MinMaxScaler), ``'standard'``
            (StandardScaler) or ``'simple'`` (divide by 255, no scaler object).
        apply_pca: Whether to project the scaled features with PCA.
        n_components: Number of PCA components. ``None`` selects the number
            that keeps 95% of the training variance.

    Returns:
        dict: Keys ``X_train``, ``X_val``, ``X_test`` (processed features),
        ``y_train``, ``y_val``, ``y_test`` (one-hot labels), and the fitted
        ``scaler``, ``pca`` and ``encoder`` (``scaler`` / ``pca`` are ``None``
        when not used).

    Raises:
        ValueError: If ``normalization_method`` is not one of the three
            supported names.
    """
    print(f"\n=== 最终数据预处理流水线 ===")
    print(f"归一化方法: {normalization_method}")
    print(f"PCA降维: {apply_pca}")
    if apply_pca:
        print(f"PCA组件数: {n_components}")

    # 1. Choose the scaler
    if normalization_method == 'minmax':
        scaler = MinMaxScaler()
        print("使用MinMaxScaler归一化到[0,1]")
    elif normalization_method == 'standard':
        scaler = StandardScaler()
        print("使用StandardScaler标准化")
    elif normalization_method == 'simple':
        scaler = None
        print("使用简单归一化 (除以255)")
    else:
        raise ValueError("不支持的归一化方法")

    # Fit on the training split only, then reuse for validation / test.
    if scaler is not None:
        X_train_processed = scaler.fit_transform(X_train)
        X_val_processed = scaler.transform(X_val)
        X_test_processed = scaler.transform(X_test)
    else:
        X_train_processed = X_train / 255.0
        X_val_processed = X_val / 255.0
        X_test_processed = X_test / 255.0

    print(f"归一化后统计:")
    for split_name, split in (("训练集", X_train_processed),
                              ("验证集", X_val_processed),
                              ("测试集", X_test_processed)):
        print(f"  {split_name}: 均值={split.mean():.4f}, 标准差={split.std():.4f}")

    # 2. Optional PCA
    if apply_pca:
        if n_components is None:
            # A float n_components with the full SVD solver lets PCA pick the
            # component count for the requested variance in a single fit.
            pca = PCA(n_components=0.95, svd_solver='full')
            X_train_processed = pca.fit_transform(X_train_processed)
            print(f"自动选择PCA组件数: {pca.n_components_} (保留95%方差)")
        else:
            pca = PCA(n_components=n_components)
            X_train_processed = pca.fit_transform(X_train_processed)
        X_val_processed = pca.transform(X_val_processed)
        X_test_processed = pca.transform(X_test_processed)

        print(f"PCA降维后特征数: {X_train_processed.shape[1]}")
        print(f"解释方差比例: {pca.explained_variance_ratio_.sum():.4f}")
    else:
        pca = None
        print(f"保持原始特征数: {X_train_processed.shape[1]}")

    # 3. One-hot encode the labels. np.asarray makes lists and pandas Series
    #    work too, and gives positional indexing for the example printed below.
    y_train_col = np.asarray(y_train).reshape(-1, 1)
    y_val_col = np.asarray(y_val).reshape(-1, 1)
    y_test_col = np.asarray(y_test).reshape(-1, 1)

    encoder = OneHotEncoder(sparse_output=False)
    y_train_encoded = encoder.fit_transform(y_train_col)
    y_val_encoded = encoder.transform(y_val_col)
    y_test_encoded = encoder.transform(y_test_col)

    print(f"\n标签处理完成:")
    print(f"  One-hot编码维度: {y_train_encoded.shape[1]}")
    print(f"  编码示例 (数字{y_train_col[0, 0]}): {y_train_encoded[0]}")

    # 4. Final shapes
    print(f"\n最终数据形状:")
    print(f"  X_train: {X_train_processed.shape}")
    print(f"  X_val:   {X_val_processed.shape}")
    print(f"  X_test:  {X_test_processed.shape}")
    print(f"  y_train: {y_train_encoded.shape}")
    print(f"  y_val:   {y_val_encoded.shape}")
    print(f"  y_test:  {y_test_encoded.shape}")

    return {
        'X_train': X_train_processed,
        'X_val': X_val_processed,
        'X_test': X_test_processed,
        'y_train': y_train_encoded,
        'y_val': y_val_encoded,
        'y_test': y_test_encoded,
        'scaler': scaler,
        'pca': pca,
        'encoder': encoder
    }

# Run the final preprocessing pipeline on the three splits.
processed_data = final_preprocessing_pipeline(
    X_train_raw, X_val_raw, X_test_raw,
    y_train_raw, y_val_raw, y_test_raw,
    normalization_method='simple',  # plain scaling: divide by 255
    apply_pca=False  # no PCA, keep every feature
)

## 8. 数据质量验证和保存

In [None]:
def validate_and_save_preprocessed_data(processed_data, save_path='processed_data/'):
    """Sanity-check the preprocessed data, plot one sample per digit, save it.

    Writes six ``.npy`` arrays plus ``preprocessors.pkl`` (scaler, PCA,
    encoder) and ``data_info.pkl`` (shapes and metadata) into ``save_path``,
    creating the directory if needed.

    Args:
        processed_data: Dict with the keys ``X_train``, ``X_val``, ``X_test``,
            ``y_train``, ``y_val``, ``y_test``, ``scaler``, ``pca`` and
            ``encoder``; the ``y_*`` arrays are one-hot encoded.
        save_path: Target directory, with or without a trailing separator.

    Returns:
        dict: ``processed_data``, unchanged.
    """
    import os
    import pickle

    print(f"\n=== 数据验证和保存 ===")

    # 1. Integrity: missing values
    print("数据完整性检查:")
    for name in ['X_train', 'X_val', 'X_test']:
        missing = np.isnan(processed_data[name]).sum()
        print(f"  {name} 缺失值: {missing}")

    # Value range
    X_train = processed_data['X_train']
    print(f"\n数据范围检查:")
    print(f"  训练集范围: [{X_train.min():.4f}, {X_train.max():.4f}]")
    print(f"  数值类型: {X_train.dtype}")

    # 2. Labels: every one-hot row must sum to exactly 1
    y_train = processed_data['y_train']
    print(f"\n标签验证:")
    print(f"  One-hot编码正确性: {np.all(y_train.sum(axis=1) == 1)}")
    print(f"  标签维度: {y_train.shape[1]}")

    # 3. Label distribution
    print(f"\n数据分布验证:")
    y_train_labels = np.argmax(y_train, axis=1)
    unique, counts = np.unique(y_train_labels, return_counts=True)
    print(f"  训练集标签分布: {dict(zip(unique, counts))}")

    # 4. One preprocessed sample per digit
    pca = processed_data['pca']
    scaler = processed_data['scaler']

    fig, axes = plt.subplots(2, 5, figsize=(15, 6))
    axes = axes.flatten()

    for digit in range(10):
        ax = axes[digit]
        matches = np.flatnonzero(y_train_labels == digit)

        if matches.size == 0:
            # No training sample carries this label; nothing to draw.
            ax.text(0.5, 0.5, f'无样本\n数字{digit}',
                    ha='center', va='center', transform=ax.transAxes)
        elif pca is None:
            # Features are still pixels: show the image directly.
            ax.imshow(X_train[matches[0]].reshape(28, 28), cmap='gray')
        elif pca.n_components_ >= 50:
            # Map the component scores back to pixel space to get an
            # approximate reconstruction (in the scaled value range).
            idx = matches[0]
            approx = pca.inverse_transform(X_train[idx:idx + 1])[0]
            ax.imshow(approx.reshape(28, 28), cmap='gray')
        else:
            # Too few components for a meaningful reconstruction.
            ax.text(0.5, 0.5, f'PC数据\n数字{digit}',
                    ha='center', va='center', transform=ax.transAxes)

        ax.set_title(f'预处理后\n数字 {digit}')
        ax.axis('off')

    plt.tight_layout()
    plt.show()

    # 5. Save. os.path.join keeps paths valid without a trailing separator.
    os.makedirs(save_path, exist_ok=True)

    for name in ('X_train', 'X_val', 'X_test', 'y_train', 'y_val', 'y_test'):
        np.save(os.path.join(save_path, f'{name}.npy'), processed_data[name])

    with open(os.path.join(save_path, 'preprocessors.pkl'), 'wb') as f:
        pickle.dump({
            'scaler': scaler,
            'pca': pca,
            'encoder': processed_data['encoder']
        }, f)

    print(f"\n数据已保存到: {save_path}")

    # Derive the recorded method from the scaler actually used, so the saved
    # metadata stays correct when the pipeline ran with another method.
    if scaler is None:
        normalization_method = 'simple_divide_255'
    else:
        normalization_method = type(scaler).__name__

    data_info = {
        'X_train_shape': processed_data['X_train'].shape,
        'X_val_shape': processed_data['X_val'].shape,
        'X_test_shape': processed_data['X_test'].shape,
        'y_train_shape': processed_data['y_train'].shape,
        'normalization_method': normalization_method,
        'pca_applied': pca is not None,
        'feature_count': processed_data['X_train'].shape[1]
    }

    with open(os.path.join(save_path, 'data_info.pkl'), 'wb') as f:
        pickle.dump(data_info, f)

    print(f"数据信息: {data_info}")

    return processed_data

# Validate the preprocessed data and write it to disk.
final_processed_data = validate_and_save_preprocessed_data(processed_data)

## 9. 预处理总结和建议

In [None]:
def preprocessing_summary():
    """Print the fixed preprocessing summary report.

    The report is static text: a banner, seven titled sections (dataset
    overview, steps, method comparison, final choice, optimisation tips,
    training tips, expected performance) and a closing banner.
    """
    rule = "=" * 60

    sections = (
        ("【数据集概况】", (
            "  - 原始数据: MNIST手写数字数据集",
            "  - 图像尺寸: 28×28像素 (784个特征)",
            "  - 样本总数: 70,000",
            "  - 数字类别: 0-9 (10个类别)",
            "  - 数据质量: 无缺失值，标签平衡",
        )),
        ("【预处理步骤】", (
            "  1. 数据分割: 训练集(70%) / 验证集(10%) / 测试集(20%)",
            "  2. 归一化: 简单归一化 (像素值÷255)",
            "  3. 标签编码: One-hot编码",
            "  4. 数据验证: 完整性、范围、分布检查",
        )),
        ("【预处理方法对比】", (
            "  方法              优点                    缺点",
            "  " + "-"*55,
            "  简单归一化        保持数据分布，计算简单    可能不是最优分布",
            "  标准化(Z-score)   零均值单位方差          可能改变原始分布",
            "  Min-Max缩放       固定范围[0,1]          对异常值敏感",
            "  PCA降维           减少计算复杂度          可能丢失信息",
        )),
        ("【最终选择】", (
            "  - 归一化: 简单归一化 (除以255)",
            "  - 降维: 不使用PCA (保留完整特征)",
            "  - 理由: 保持原始信息，适合神经网络学习",
        )),
        ("【性能优化建议】", (
            "  1. 数据增强: 旋转、平移、缩放可提升泛化能力",
            "  2. 批量处理: 使用适当批量大小提升训练效率",
            "  3. 特征选择: 基于方差或相关性选择重要特征",
            "  4. 正则化: 防止过拟合的技术",
        )),
        ("【神经网络训练建议】", (
            "  1. 学习率: 从0.001开始，根据训练情况调整",
            "  2. 批量大小: 32-128之间，根据内存情况选择",
            "  3. 网络结构: 从简单开始，逐步增加复杂度",
            "  4. 早停策略: 监控验证集损失，防止过拟合",
        )),
        ("【预期性能】", (
            "  - 基准准确率: >95% (简单神经网络)",
            "  - 优化准确率: >98% (深度网络+正则化)",
            "  - 训练时间: 10-60分钟 (取决于硬件配置)",
        )),
    )

    # Opening banner
    report = ["\n" + rule, "           数据预处理总结报告", rule]

    # Each section: a heading preceded by a blank line, then its body lines
    for heading, body in sections:
        report.append("\n" + heading)
        report.extend(body)

    # Closing banner
    report.extend(["\n" + rule, "数据预处理完成，准备进行神经网络训练！", rule])

    print("\n".join(report))

# Print the preprocessing summary report.
preprocessing_summary()

## 10. 数据加载函数（供其他notebook使用）

In [None]:
def load_preprocessed_data(data_path='processed_data/'):
    """Load the arrays and preprocessors previously saved to ``data_path``.

    Reads ``X_train/X_val/X_test/y_train/y_val/y_test.npy``,
    ``preprocessors.pkl`` and ``data_info.pkl`` from the directory and prints
    the stored data info.

    Args:
        data_path (str): Directory holding the saved files, with or without a
            trailing path separator.

    Returns:
        dict: The six arrays under their names, merged with whatever keys
        ``preprocessors.pkl`` contains.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
    """
    import os
    import pickle

    if not os.path.exists(data_path):
        raise FileNotFoundError(f"预处理数据路径不存在: {data_path}")

    # os.path.join keeps the paths valid when data_path lacks a trailing
    # separator (plain concatenation would yield e.g. "processed_dataX_train.npy").
    array_names = ('X_train', 'X_val', 'X_test', 'y_train', 'y_val', 'y_test')
    data = {
        name: np.load(os.path.join(data_path, f'{name}.npy'))
        for name in array_names
    }

    # NOTE: pickle.load can execute arbitrary code; only point this function
    # at directories whose contents you trust.
    with open(os.path.join(data_path, 'preprocessors.pkl'), 'rb') as f:
        preprocessors = pickle.load(f)

    data.update(preprocessors)

    with open(os.path.join(data_path, 'data_info.pkl'), 'rb') as f:
        data_info = pickle.load(f)

    print(f"成功加载预处理数据:")
    for key, value in data_info.items():
        print(f"  {key}: {value}")

    return data

# Example usage (left commented out so the cell does not actually run it)
# loaded_data = load_preprocessed_data()
# print(f"加载的训练数据形状: {loaded_data['X_train'].shape}")

# Closing summary of what the notebook produced.
# NOTE(review): the "约150" component count printed below is a hard-coded
# figure, not taken from the PCA analysis run in this notebook -- confirm it
# matches the printed analysis output.
print("\n数据预处理notebook已完成！")
print("主要成果:")
print("1. 完整的MNIST数据加载和探索")
print("2. 多种预处理方法对比分析")
print("3. 战略性数据分割保持标签分布")
print("4. PCA降维分析（保留95%方差需约150个主成分）")
print("5. 最终预处理流水线和数据验证")
print("6. 数据保存和加载功能")
print("\n数据已准备就绪，可以开始神经网络训练！")