# 模型评估和可视化工具

## 概述
本notebook提供了全面的神经网络模型评估和可视化工具，用于深入分析模型性能，包括：
- 详细的性能指标分析
- 多维度可视化展示
- 错误案例深入分析
- 模型对比和基准测试
- 交互式可视化界面

In [None]:
# 导入必要的库
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, 
    precision_recall_fscore_support, roc_auc_score, roc_curve
)
from sklearn.preprocessing import label_binarize
import time
import warnings
warnings.filterwarnings('ignore')

# Apply the plotting styles first. Both seaborn's set_style() and the
# 'seaborn-v0_8' style sheet assign font.sans-serif, so a font configured
# before these calls is overwritten and the Chinese labels fail to render.
sns.set_style("whitegrid")
plt.style.use('seaborn-v0_8')

# Configure the CJK-capable font last so it survives the style setup.
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
# Render the minus sign with the ASCII hyphen so it is available in the font.
plt.rcParams['axes.unicode_minus'] = False

print("库导入完成")

## 1. 评估工具类定义

In [None]:
class ModelEvaluator:
    """Compute, report and plot evaluation metrics for a classifier.

    Call :meth:`evaluate` first; the reporting and plotting methods read the
    metrics it caches in ``self.evaluation_results``.

    Per-class metrics are addressed by position (0, 1, 2, ...). For integer
    labels ``0..k-1`` (such as the digit labels used in this notebook) the
    position is expected to equal the label.
    """

    def __init__(self, model_name="Unknown Model"):
        """Create an evaluator; ``model_name`` is used in report titles."""
        self.model_name = model_name
        self.evaluation_results = {}
        self.predictions = None
        self.probabilities = None
        self.y_true = None
        self.y_pred = None

    def evaluate(self, y_true, y_pred, y_pred_proba=None, X_test=None):
        """Compute the metrics and cache them in ``self.evaluation_results``.

        Args:
            y_true: True labels (array-like).
            y_pred: Predicted labels (array-like, same length as ``y_true``).
            y_pred_proba: Optional 2-D array of class probabilities. Column
                ``i`` is assumed to hold the probability of label ``i``.
            X_test: Accepted for interface compatibility; not used here.

        Returns:
            The dictionary of metrics (also stored on the instance).
        """
        # Store arrays so later positional indexing works for any array-like.
        self.y_true = np.asarray(y_true)
        self.y_pred = np.asarray(y_pred)
        self.probabilities = None if y_pred_proba is None else np.asarray(y_pred_proba)
        y_true, y_pred = self.y_true, self.y_pred

        # Per-class metrics
        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1, support = precision_recall_fscore_support(
            y_true, y_pred, average=None, zero_division=0
        )

        # Macro and weighted averages
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
            y_true, y_pred, average='macro', zero_division=0
        )
        precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
            y_true, y_pred, average='weighted', zero_division=0
        )

        cm = confusion_matrix(y_true, y_pred)

        # Per-class accuracy = diagonal / row total. A class that never occurs
        # in y_true has an empty row; report 0 for it instead of dividing by
        # zero (this matches the zero_division=0 convention used above).
        row_totals = cm.sum(axis=1)
        class_accuracies = np.divide(
            cm.diagonal(), row_totals,
            out=np.zeros(cm.shape[0], dtype=float),
            where=row_totals > 0,
        )

        self.evaluation_results = {
            'accuracy': accuracy,
            'precision_per_class': precision,
            'recall_per_class': recall,
            'f1_per_class': f1,
            'support_per_class': support,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'f1_macro': f1_macro,
            'precision_weighted': precision_weighted,
            'recall_weighted': recall_weighted,
            'f1_weighted': f1_weighted,
            'confusion_matrix': cm,
            'class_accuracies': class_accuracies,
            'classification_report': classification_report(
                y_true, y_pred, digits=4, zero_division=0
            )
        }

        # One-vs-rest AUC per probability column, when probabilities are given.
        if self.probabilities is not None:
            try:
                proba = self.probabilities
                auc_scores = {}

                # The class count comes from the probability matrix rather
                # than a fixed 10, so other label sets work as well.
                for i in range(proba.shape[1]):
                    is_class = (y_true == i).astype(int)
                    # AUC needs both positive and negative samples.
                    if len(np.unique(is_class)) > 1:
                        auc_scores[f'class_{i}'] = roc_auc_score(is_class, proba[:, i])

                if auc_scores:
                    auc_scores['macro_auc'] = np.mean(list(auc_scores.values()))

                self.evaluation_results['auc_scores'] = auc_scores
            except Exception as e:
                # Best effort: AUC is optional, so report and carry on.
                print(f"AUC计算失败: {e}")

        return self.evaluation_results

    def print_detailed_report(self):
        """Print the cached metrics as a text report.

        Prints a hint and returns if :meth:`evaluate` has not been run.
        """
        results = self.evaluation_results
        if not results:
            print("请先运行evaluate方法")
            return

        print(f"\n{'='*80}")
        print(f"              {self.model_name} - 详细评估报告")
        print(f"{'='*80}")

        print(f"\n【总体性能指标】")
        print(f"  准确率 (Accuracy): {results['accuracy']:.4f}")
        print(f"  宏平均精确率 (Macro Precision): {results['precision_macro']:.4f}")
        print(f"  宏平均召回率 (Macro Recall): {results['recall_macro']:.4f}")
        print(f"  宏平均F1分数 (Macro F1): {results['f1_macro']:.4f}")
        print(f"  加权平均精确率 (Weighted Precision): {results['precision_weighted']:.4f}")
        print(f"  加权平均召回率 (Weighted Recall): {results['recall_weighted']:.4f}")
        print(f"  加权平均F1分数 (Weighted F1): {results['f1_weighted']:.4f}")

        # AUC scores (only present when probabilities were supplied)
        if 'auc_scores' in results:
            print(f"\n【AUC分数】")
            for key, value in results['auc_scores'].items():
                if key == 'macro_auc':
                    print(f"  宏平均AUC: {value:.4f}")
                else:
                    class_num = key.split('_')[1]
                    print(f"  类别 {class_num}: {value:.4f}")

        print(f"\n【各类别详细指标】")
        print(f"{'类别':<6} {'精确率':<10} {'召回率':<10} {'F1分数':<10} {'准确率':<10} {'支持度':<8}")
        print("-" * 70)

        # Iterate over the classes actually present instead of a fixed 10.
        n_classes = len(results['precision_per_class'])
        for i in range(n_classes):
            precision = results['precision_per_class'][i]
            recall = results['recall_per_class'][i]
            f1 = results['f1_per_class'][i]
            accuracy = results['class_accuracies'][i]
            support = results['support_per_class'][i]

            print(f"{i:<6} {precision:<10.4f} {recall:<10.4f} {f1:<10.4f} {accuracy:<10.4f} {support:<8}")

        print(f"\n【混淆矩阵分析】")
        cm = results['confusion_matrix']

        # Off-diagonal cells with at least one sample, most frequent first.
        confusion_pairs = [
            (i, j, cm[i, j])
            for i in range(cm.shape[0])
            for j in range(cm.shape[1])
            if i != j and cm[i, j] > 0
        ]
        confusion_pairs.sort(key=lambda pair: pair[2], reverse=True)

        print(f"  最容易混淆的数字对:")
        for true_digit, pred_digit, count in confusion_pairs[:5]:
            print(f"    {true_digit} 被误认为 {pred_digit}: {count} 次")

        # Easiest / hardest class by per-class accuracy
        class_accuracies = results['class_accuracies']
        easiest_class = np.argmax(class_accuracies)
        hardest_class = np.argmin(class_accuracies)

        print(f"\n  类别难度分析:")
        print(f"    最容易识别: 数字 {easiest_class} (准确率: {class_accuracies[easiest_class]:.4f})")
        print(f"    最难识别: 数字 {hardest_class} (准确率: {class_accuracies[hardest_class]:.4f})")

        print(f"\n【详细分类报告】")
        print(results['classification_report'])

    def create_comprehensive_visualization(self, figsize=(20, 15)):
        """Draw an 8-panel matplotlib overview of the cached metrics.

        Prints a hint and returns if :meth:`evaluate` has not been run.

        Args:
            figsize: Figure size passed to ``plt.figure``.
        """
        results = self.evaluation_results
        if not results:
            print("请先运行evaluate方法")
            return

        cm = results['confusion_matrix']
        # Class count follows the data instead of a fixed 10.
        n_classes = cm.shape[0]
        class_ids = range(n_classes)

        fig = plt.figure(figsize=figsize)
        gs = fig.add_gridspec(3, 4, hspace=0.3, wspace=0.3)

        # 1. Confusion-matrix heatmap
        ax1 = fig.add_subplot(gs[0, 0])
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
                   xticklabels=class_ids, yticklabels=class_ids)
        ax1.set_title('混淆矩阵')
        ax1.set_xlabel('预测标签')
        ax1.set_ylabel('真实标签')

        # 2. Grouped bars: per-class precision / recall / F1 / accuracy
        ax2 = fig.add_subplot(gs[0, 1])
        metric_labels = ['精确率', '召回率', 'F1分数', '准确率']
        metric_keys = ['precision_per_class', 'recall_per_class',
                       'f1_per_class', 'class_accuracies']
        x = np.arange(n_classes)
        width = 0.2

        for i, (key, label) in enumerate(zip(metric_keys, metric_labels)):
            ax2.bar(x + i*width, results[key], width, label=label, alpha=0.8)

        ax2.set_xlabel('数字类别')
        ax2.set_ylabel('分数')
        ax2.set_title('各类别性能对比')
        ax2.set_xticks(x + width * 1.5)
        ax2.set_xticklabels(class_ids)
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        # 3. Support (sample count) per class
        ax3 = fig.add_subplot(gs[0, 2])
        support = results['support_per_class']
        ax3.bar(class_ids, support, color='skyblue', alpha=0.7)
        ax3.set_xlabel('数字类别')
        ax3.set_ylabel('样本数量')
        ax3.set_title('各类别样本分布')
        ax3.set_xticks(class_ids)
        ax3.grid(True, alpha=0.3)

        # 4. Radar chart of the overall metrics
        ax4 = fig.add_subplot(gs[0, 3], projection='polar')
        categories = ['准确率', '宏精确率', '宏召回率', '宏F1', '加权精确率', '加权召回率', '加权F1']
        values = [
            results['accuracy'],
            results['precision_macro'],
            results['recall_macro'],
            results['f1_macro'],
            results['precision_weighted'],
            results['recall_weighted'],
            results['f1_weighted']
        ]

        angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
        # Repeat the first point so the polygon closes.
        values += values[:1]
        angles += angles[:1]

        ax4.plot(angles, values, 'o-', linewidth=2)
        ax4.fill(angles, values, alpha=0.25)
        ax4.set_xticks(angles[:-1])
        ax4.set_xticklabels(categories)
        ax4.set_ylim(0, 1)
        ax4.set_title('总体性能雷达图')

        # 5. Error-only heatmap: zero the diagonal and mask empty cells
        ax5 = fig.add_subplot(gs[1, 0])
        error_matrix = cm.copy()
        np.fill_diagonal(error_matrix, 0)

        mask = error_matrix == 0
        sns.heatmap(error_matrix, annot=True, fmt='d', cmap='Reds', ax=ax5,
                   xticklabels=class_ids, yticklabels=class_ids, mask=mask,
                   cbar_kws={'label': '错误次数'})
        ax5.set_title('错误分类热力图')
        ax5.set_xlabel('预测标签')
        ax5.set_ylabel('真实标签')

        # 6. Classes ranked by accuracy (best first)
        ax6 = fig.add_subplot(gs[1, 1])
        class_accuracies = results['class_accuracies']
        sorted_indices = np.argsort(class_accuracies)[::-1]
        sorted_accuracies = class_accuracies[sorted_indices]
        sorted_labels = [f'数字 {i}' for i in sorted_indices]

        bars = ax6.barh(range(len(sorted_labels)), sorted_accuracies, color='lightgreen', alpha=0.7)
        ax6.set_yticks(range(len(sorted_labels)))
        ax6.set_yticklabels(sorted_labels)
        ax6.set_xlabel('准确率')
        ax6.set_title('各类别准确率排序')
        ax6.grid(True, alpha=0.3)

        # Value label at the end of each bar
        for bar, acc in zip(bars, sorted_accuracies):
            ax6.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height()/2,
                    f'{acc:.3f}', ha='left', va='center')

        # 7. Precision vs. recall scatter, one point per class
        ax7 = fig.add_subplot(gs[1, 2])
        precision = results['precision_per_class']
        recall = results['recall_per_class']

        ax7.scatter(precision, recall, c=class_ids, cmap='viridis', s=100, alpha=0.7)
        ax7.set_xlabel('精确率')
        ax7.set_ylabel('召回率')
        ax7.set_title('精确率-召回率分布')
        ax7.grid(True, alpha=0.3)

        for i in class_ids:
            ax7.annotate(str(i), (precision[i], recall[i]),
                        xytext=(5, 5), textcoords='offset points')

        # Mean lines for reference
        ax7.axhline(y=recall.mean(), color='red', linestyle='--', alpha=0.5, label='平均召回率')
        ax7.axvline(x=precision.mean(), color='blue', linestyle='--', alpha=0.5, label='平均精确率')
        ax7.legend()

        # 8. Histogram of per-class F1 scores
        ax8 = fig.add_subplot(gs[1, 3])
        f1_scores = results['f1_per_class']
        ax8.hist(f1_scores, bins=10, alpha=0.7, color='orange', edgecolor='black')
        ax8.set_xlabel('F1分数')
        ax8.set_ylabel('类别数量')
        ax8.set_title('F1分数分布')
        ax8.grid(True, alpha=0.3)

        ax8.axvline(x=f1_scores.mean(), color='red', linestyle='--',
                   label=f'平均值: {f1_scores.mean():.3f}')
        ax8.legend()

        # 9-12. Third row is reserved for future charts; keep it hidden.
        for col in range(4):
            fig.add_subplot(gs[2, col]).set_visible(False)

        plt.suptitle(f'{self.model_name} - 综合评估报告', fontsize=16, fontweight='bold')
        plt.tight_layout()
        plt.show()

    def analyze_errors(self, X_test, top_n=20):
        """Summarise misclassified samples and optionally plot some of them.

        Args:
            X_test: Test samples used for plotting, or ``None`` to skip plots.
            top_n: Maximum number of misclassified samples to plot.

        Returns:
            A dict mapping ``(true_label, predicted_label)`` to its count, or
            ``None`` when :meth:`evaluate` has not been run or there are no
            errors.
        """
        if self.y_true is None or self.y_pred is None:
            print("请先运行evaluate方法")
            return

        y_true = np.asarray(self.y_true)
        y_pred = np.asarray(self.y_pred)

        # Positions of the misclassified samples
        errors = np.where(y_true != y_pred)[0]
        # Guard the division so an empty evaluation set does not crash.
        error_rate = len(errors) / len(y_true) if len(y_true) else 0.0

        print(f"\n【错误分析报告】")
        print(f"总错误样本数: {len(errors)}")
        print(f"错误率: {error_rate:.4f}")

        if len(errors) == 0:
            print("没有错误分类的样本！")
            return

        # Count each (true, predicted) combination
        error_analysis = {}
        for true_label, pred_label in zip(y_true[errors], y_pred[errors]):
            pair = (true_label, pred_label)
            error_analysis[pair] = error_analysis.get(pair, 0) + 1

        sorted_errors = sorted(error_analysis.items(), key=lambda item: item[1], reverse=True)

        print(f"\n最常见的错误类型 (前10种):")
        for (true_label, pred_label), count in sorted_errors[:10]:
            print(f"  {true_label} → {pred_label}: {count} 次")

        if X_test is not None:
            self.visualize_error_samples(X_test, errors, top_n)

        return error_analysis

    def visualize_error_samples(self, X_test, error_indices, top_n=20):
        """Plot a random selection of misclassified samples, five per row.

        Each sample is reshaped to a square image whose side is derived from
        its element count (28x28 for 784-pixel samples).

        Args:
            X_test: Indexable collection of samples.
            error_indices: Positions of the misclassified samples.
            top_n: Maximum number of samples to draw.
        """
        n_samples = min(top_n, len(error_indices))
        if n_samples <= 0:
            # Nothing to draw; a zero-row subplot grid would raise.
            return

        selected_errors = np.random.choice(error_indices, n_samples, replace=False)

        rows = (n_samples + 4) // 5  # five images per row
        # squeeze=False keeps `axes` two-dimensional even for a single row.
        fig, axes = plt.subplots(rows, 5, figsize=(15, 3*rows), squeeze=False)

        # Switch every cell off first; unused cells then stay blank.
        for ax in axes.flat:
            ax.axis('off')

        for ax, idx in zip(axes.flat, selected_errors):
            sample = np.asarray(X_test[idx])
            side = int(round(np.sqrt(sample.size)))
            ax.imshow(sample.reshape(side, side), cmap='gray')

            true_label = self.y_true[idx]
            pred_label = self.y_pred[idx]

            # Append the predicted-class probability when available.
            title = f'真实: {true_label}, 预测: {pred_label}'
            if self.probabilities is not None:
                confidence = self.probabilities[idx, pred_label]
                title += f'\n置信度: {confidence:.3f}'

            ax.set_title(title, fontsize=10)

        plt.suptitle('错误分类样本展示', fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.show()

# Printed once the ModelEvaluator class definition has executed.
print("ModelEvaluator类定义完成")

## 2. 可视化工具类定义

In [None]:
class AdvancedVisualizer:
    """Interactive Plotly charts built from evaluation results.

    Every ``create_*`` method returns a Plotly figure; call ``.show()`` on it
    to display the chart.
    """

    def __init__(self):
        """Create the visualizer with a 10-colour "husl" palette."""
        self.color_palette = sns.color_palette("husl", 10)

    @staticmethod
    def _accuracy_at_thresholds(confidences, is_correct, thresholds):
        """Return the accuracy of samples with confidence >= each threshold.

        A threshold that keeps no sample yields 0.0.
        """
        accuracies = []
        for threshold in thresholds:
            kept = confidences >= threshold
            accuracies.append(float(is_correct[kept].mean()) if kept.any() else 0.0)
        return accuracies

    def create_interactive_confusion_matrix(self, cm, class_names=None):
        """Build an annotated heatmap of a confusion matrix.

        Args:
            cm: Square confusion matrix (rows = true, columns = predicted).
            class_names: Optional axis labels; defaults to ``类别 i``.

        Returns:
            A ``plotly.graph_objects.Figure``.
        """
        if class_names is None:
            class_names = [f'类别 {i}' for i in range(len(cm))]

        fig = go.Figure(data=go.Heatmap(
            z=cm,
            x=class_names,
            y=class_names,
            colorscale='Blues',
            text=cm,
            texttemplate="%{text}",
            textfont={"size": 12},
            hoverongaps=False
        ))

        fig.update_layout(
            title='交互式混淆矩阵',
            xaxis_title='预测标签',
            yaxis_title='真实标签',
            width=600,
            height=600
        )
        # A heatmap's y axis grows upwards, which would put row 0 at the
        # bottom. Reverse it so the layout matches the usual confusion-matrix
        # orientation (row 0 on top), as in the seaborn heatmap of this file.
        fig.update_yaxes(autorange='reversed')

        return fig

    def create_performance_comparison_chart(self, evaluators, evaluator_names):
        """Plot accuracy and macro precision/recall/F1 for several models.

        Args:
            evaluators: Objects exposing an ``evaluation_results`` dict.
            evaluator_names: Legend names, paired with ``evaluators``.

        Returns:
            A ``plotly.graph_objects.Figure`` with one line per model.
        """
        metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
        metric_names = ['准确率', '宏精确率', '宏召回率', '宏F1']

        series = [
            (name, [evaluator.evaluation_results[metric] for metric in metrics])
            for evaluator, name in zip(evaluators, evaluator_names)
        ]

        fig = go.Figure()
        for name, values in series:
            fig.add_trace(go.Scatter(
                x=metric_names,
                y=values,
                mode='lines+markers',
                name=name,
                line=dict(width=3),
                marker=dict(size=8)
            ))

        # Keep the 0.8 floor for strong models, but lower it (to the next
        # tenth) when a score would otherwise fall outside the visible range.
        lowest = min((min(values) for _, values in series), default=1.0)
        y_floor = min(0.8, float(np.floor(lowest * 10) / 10))

        fig.update_layout(
            title='多模型性能对比',
            xaxis_title='评估指标',
            yaxis_title='分数',
            yaxis=dict(range=[y_floor, 1.0]),
            width=800,
            height=500
        )

        return fig

    def create_class_performance_radar(self, evaluator):
        """Draw one small radar (precision / recall / F1) per class.

        The grid has five columns and as many rows as the class count needs.

        Args:
            evaluator: Object exposing an ``evaluation_results`` dict.

        Returns:
            A Plotly figure with one polar subplot per class.
        """
        precision = evaluator.evaluation_results['precision_per_class']
        recall = evaluator.evaluation_results['recall_per_class']
        f1 = evaluator.evaluation_results['f1_per_class']

        n_classes = len(precision)
        cols = 5
        rows = max(1, -(-n_classes // cols))  # ceiling division

        fig = make_subplots(
            rows=rows, cols=cols,
            # Fresh dict per cell rather than repeated references to one dict.
            specs=[[{'type': 'polar'} for _ in range(cols)] for _ in range(rows)],
            subplot_titles=[f'数字 {i}' for i in range(n_classes)]
        )

        for i in range(n_classes):
            fig.add_trace(
                go.Scatterpolar(
                    # First point repeated so the triangle closes.
                    r=[precision[i], recall[i], f1[i], precision[i]],
                    theta=['精确率', '召回率', 'F1分数', '精确率'],
                    fill='toself',
                    name=f'数字 {i}'
                ),
                row=(i // cols) + 1, col=(i % cols) + 1
            )

        fig.update_layout(
            title='各类别性能雷达图',
            height=300 * rows,
            showlegend=False
        )

        return fig

    def create_learning_curve_animation(self, histories, model_names):
        """Plot train/validation accuracy curves with a button per model.

        Only the first model's curves are visible initially; each button
        shows the two curves of one model and hides the others.

        Args:
            histories: Dicts with ``train_accuracies`` and ``val_accuracies``.
            model_names: Display names, paired with ``histories``.

        Returns:
            A ``plotly.graph_objects.Figure``.
        """
        fig = go.Figure()

        # Two traces per model: training (solid) and validation (dashed).
        for i, (history, name) in enumerate(zip(histories, model_names)):
            epochs = list(range(len(history['train_accuracies'])))
            is_first = i == 0

            fig.add_trace(go.Scatter(
                x=epochs,
                y=history['train_accuracies'],
                mode='lines',
                name=f'{name} - 训练',
                line=dict(width=2),
                visible=is_first
            ))

            fig.add_trace(go.Scatter(
                x=epochs,
                y=history['val_accuracies'],
                mode='lines',
                name=f'{name} - 验证',
                line=dict(width=2, dash='dash'),
                visible=is_first
            ))

        # One toggle button per model; traces 2*i and 2*i+1 belong to model i.
        n_traces = len(model_names) * 2
        buttons = [
            dict(
                method="update",
                args=[{"visible": [j // 2 == i for j in range(n_traces)]},
                      {"title": f"学习曲线 - {name}"}],
                label=name
            )
            for i, name in enumerate(model_names)
        ]

        fig.update_layout(
            updatemenus=[dict(
                type="buttons",
                direction="right",
                x=0.1,
                y=1.02,
                showactive=True,
                buttons=buttons
            )],
            title="学习曲线对比",
            xaxis_title="Epoch",
            yaxis_title="准确率",
            width=800,
            height=500
        )

        return fig

    def create_prediction_confidence_analysis(self, y_true, y_pred_proba, y_pred):
        """Build a 2x2 dashboard relating prediction confidence to accuracy.

        Confidence is the largest probability in each row of
        ``y_pred_proba``.

        Args:
            y_true: True labels (array-like).
            y_pred_proba: 2-D array of class probabilities; column ``i`` is
                assumed to belong to label ``i``.
            y_pred: Predicted labels (array-like).

        Returns:
            A Plotly figure with four subplots.
        """
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        y_pred_proba = np.asarray(y_pred_proba)

        confidences = np.max(y_pred_proba, axis=1)
        is_correct = y_true == y_pred

        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=['置信度分布', '置信度 vs 准确性', '各类别置信度', '低置信度样本']
        )

        # 1. Confidence histograms for correct and incorrect predictions
        fig.add_trace(
            go.Histogram(
                x=confidences[is_correct],
                name='正确预测',
                opacity=0.7,
                marker_color='green',
                nbinsx=20
            ),
            row=1, col=1
        )

        fig.add_trace(
            go.Histogram(
                x=confidences[~is_correct],
                name='错误预测',
                opacity=0.7,
                marker_color='red',
                nbinsx=20
            ),
            row=1, col=1
        )

        # 2. Accuracy of the samples at or above each confidence threshold
        accuracy_threshold = np.linspace(0, 1, 20)
        accuracy_by_confidence = self._accuracy_at_thresholds(
            confidences, is_correct, accuracy_threshold
        )

        fig.add_trace(
            go.Scatter(
                x=accuracy_threshold,
                y=accuracy_by_confidence,
                mode='lines+markers',
                name='准确率',
                line=dict(width=3)
            ),
            row=1, col=2
        )

        # 3. One box per class. A single Box trace expects one flat sample
        #    array, so a list of per-class arrays must be split into traces.
        for class_id in range(y_pred_proba.shape[1]):
            fig.add_trace(
                go.Box(
                    y=confidences[y_true == class_id],
                    name=str(class_id),
                    boxpoints='outliers',
                    showlegend=False
                ),
                row=2, col=1
            )

        # 4. Overall accuracy vs. accuracy of low-confidence samples
        low_confidence_threshold = 0.5
        low_confidence_mask = confidences < low_confidence_threshold
        # Avoid the NaN mean of an empty selection.
        if low_confidence_mask.any():
            low_confidence_accuracy = float(is_correct[low_confidence_mask].mean())
        else:
            low_confidence_accuracy = 0.0

        fig.add_trace(
            go.Bar(
                x=['所有样本', f'低置信度(<{low_confidence_threshold})'],
                y=[np.mean(is_correct), low_confidence_accuracy],
                name='准确率',
                marker_color=['blue', 'orange']
            ),
            row=2, col=2
        )

        fig.update_layout(
            title='预测置信度分析',
            height=800,
            showlegend=True
        )

        # Axis titles
        fig.update_xaxes(title_text="置信度", row=1, col=1)
        fig.update_yaxes(title_text="频次", row=1, col=1)
        fig.update_xaxes(title_text="置信度阈值", row=1, col=2)
        fig.update_yaxes(title_text="准确率", row=1, col=2)
        fig.update_xaxes(title_text="数字类别", row=2, col=1)
        fig.update_yaxes(title_text="置信度", row=2, col=1)
        fig.update_yaxes(title_text="准确率", row=2, col=2)

        return fig

# Printed once the AdvancedVisualizer class definition has executed.
print("AdvancedVisualizer类定义完成")

## 3. 示例数据加载和模型评估演示

In [None]:
def load_sample_data():
    """Fetch MNIST and return a 1000-sample slice of a held-out split.

    The data is downloaded through ``fetch_openml``, pixel values are scaled
    by 1/255, and a stratified 80/20 split (``random_state=42``) is made; the
    first 1000 samples of the 20% part are returned.

    Returns:
        Tuple ``(X_test_demo, y_test_demo)``.
    """
    print("正在加载示例数据...")

    # Imported here so scikit-learn's dataset tools load only when needed.
    from sklearn.datasets import fetch_openml
    from sklearn.model_selection import train_test_split

    dataset = fetch_openml('mnist_784', version=1, as_frame=False)
    labels = dataset.target.astype(int)
    pixels = dataset.data.astype('float32') / 255.0

    # Only the held-out part is used; the training part is discarded.
    _, X_holdout, _, y_holdout = train_test_split(
        pixels, labels, test_size=0.2, random_state=42, stratify=labels
    )

    # Keep the demo small.
    demo_size = 1000
    X_test_demo = X_holdout[:demo_size]
    y_test_demo = y_holdout[:demo_size]

    print(f"演示数据集大小: {X_test_demo.shape}")
    print(f"标签分布: {np.bincount(y_test_demo)}")

    return X_test_demo, y_test_demo

def create_mock_predictions(X_test, y_test):
    """Fabricate predictions and class probabilities for the demo.

    Starting from a copy of ``y_test``, 10% of the entries are replaced by a
    different random class out of 0-9. Probabilities are Dirichlet samples
    adjusted so the predicted class has the largest value in each row. The
    global NumPy seed is fixed to 42, so results are reproducible.

    Args:
        X_test: Not used by this function.
        y_test: NumPy array of integer labels in 0-9.

    Returns:
        Tuple ``(y_pred, y_pred_proba)``.
    """
    np.random.seed(42)

    sample_count = len(y_test)
    y_pred = y_test.copy()

    # Corrupt 10% of the labels with some other class.
    wrong_count = int(sample_count * 0.1)
    for pos in np.random.choice(sample_count, wrong_count, replace=False):
        candidates = [label for label in range(10) if label != y_test[pos]]
        y_pred[pos] = np.random.choice(candidates)

    y_pred_proba = np.random.dirichlet(np.ones(10), size=sample_count)

    # Boost the predicted class above the row maximum, then renormalise.
    # Each `row` is a view, so the edits land in y_pred_proba itself.
    for row, label in zip(y_pred_proba, y_pred):
        row[label] = row.max() * 1.5
        row /= row.sum()

    print(f"模拟预测准确率: {np.mean(y_pred == y_test):.4f}")

    return y_pred, y_pred_proba

# Load the demo data and fabricate predictions for it
X_demo, y_demo = load_sample_data()
y_pred_demo, y_pred_proba_demo = create_mock_predictions(X_demo, y_demo)

print("\n示例数据准备完成！")

## 4. 评估工具演示

In [None]:
# Create the evaluator
evaluator = ModelEvaluator("演示模型")

# Evaluate the mock predictions
results = evaluator.evaluate(y_demo, y_pred_demo, y_pred_proba_demo, X_demo)

# Print the detailed report
evaluator.print_detailed_report()

## 5. 综合可视化演示

In [None]:
# Draw the combined matplotlib overview for the evaluated model
evaluator.create_comprehensive_visualization()

## 6. 错误分析演示

In [None]:
# Summarise the misclassifications and plot up to 15 of them
error_analysis = evaluator.analyze_errors(X_demo, top_n=15)

## 7. 高级可视化演示

In [None]:
# Create the advanced visualizer
visualizer = AdvancedVisualizer()

# 1. Interactive confusion matrix
interactive_cm = visualizer.create_interactive_confusion_matrix(
    evaluator.evaluation_results['confusion_matrix']
)
interactive_cm.show()

# 2. Per-class performance radar charts
performance_radar = visualizer.create_class_performance_radar(evaluator)
performance_radar.show()

# 3. Prediction-confidence analysis
confidence_analysis = visualizer.create_prediction_confidence_analysis(
    y_demo, y_pred_proba_demo, y_pred_demo
)
confidence_analysis.show()

## 8. 多模型对比演示

In [None]:
def create_multiple_mock_models(X_test, y_test, n_models=3,
                                accuracy_levels=(0.85, 0.90, 0.95)):
    """Build several mock models with different accuracies for comparison.

    For model ``i`` the NumPy seed is set to ``42 + i``, a share of
    ``1 - target_accuracy`` of the labels is replaced by another class out of
    0-9, matching probabilities are generated, and the result is evaluated
    with a fresh ``ModelEvaluator``.

    Args:
        X_test: Not used by this function.
        y_test: NumPy array of integer labels in 0-9.
        n_models: Number of mock models to create.
        accuracy_levels: Target accuracies; reused cyclically when
            ``n_models`` exceeds their number.

    Returns:
        Tuple ``(evaluators, model_names)``.
    """
    evaluators = []
    model_names = []
    n_samples = len(y_test)

    for i in range(n_models):
        np.random.seed(42 + i)  # a different seed per model

        # Cycle through the levels so any n_models is accepted.
        target_accuracy = accuracy_levels[i % len(accuracy_levels)]

        # Mock predictions: corrupt the required share of labels
        y_pred = y_test.copy()
        n_errors = int(n_samples * (1 - target_accuracy))
        error_indices = np.random.choice(n_samples, n_errors, replace=False)

        for idx in error_indices:
            available_classes = [j for j in range(10) if j != y_test[idx]]
            y_pred[idx] = np.random.choice(available_classes)

        # Mock probabilities with the predicted class on top
        y_pred_proba = np.random.dirichlet(np.ones(10), size=n_samples)
        for j in range(n_samples):
            y_pred_proba[j, y_pred[j]] = np.max(y_pred_proba[j, :]) * 1.5
            y_pred_proba[j] = y_pred_proba[j] / y_pred_proba[j].sum()

        evaluator = ModelEvaluator(f"模型 {i+1} (准确率: {target_accuracy:.2f})")
        # Use this model's own probabilities, not a global from another cell.
        evaluator.evaluate(y_test, y_pred, y_pred_proba)

        evaluators.append(evaluator)
        model_names.append(f"模型 {i+1}")

        print(f"{model_names[-1]}: 实际准确率 = {np.mean(y_pred == y_test):.4f}")

    return evaluators, model_names

# Build the mock models
evaluators, model_names = create_multiple_mock_models(X_demo, y_demo)

# Performance comparison chart
comparison_chart = visualizer.create_performance_comparison_chart(evaluators, model_names)
comparison_chart.show()

# Comparison report
print("\n" + "="*80)
print("                    多模型性能对比报告")
print("="*80)

print(f"{'模型':<20} {'准确率':<10} {'宏F1':<10} {'加权F1':<10}")
print("-" * 60)

# The loop variable is deliberately not called `evaluator`: at module level
# that would rebind the demo evaluator created earlier, and later cells would
# then report on the last mock model instead.
for model_evaluator, name in zip(evaluators, model_names):
    accuracy = model_evaluator.evaluation_results['accuracy']
    f1_macro = model_evaluator.evaluation_results['f1_macro']
    f1_weighted = model_evaluator.evaluation_results['f1_weighted']

    print(f"{name:<20} {accuracy:<10.4f} {f1_macro:<10.4f} {f1_weighted:<10.4f}")

# Pick the model with the highest accuracy
best_idx = np.argmax([candidate.evaluation_results['accuracy'] for candidate in evaluators])
print(f"\n最佳模型: {model_names[best_idx]}")
print(f"最高准确率: {evaluators[best_idx].evaluation_results['accuracy']:.4f}")

## 9. 实用工具函数

In [None]:
def generate_evaluation_report(evaluator, save_path=None):
    """Build a Markdown evaluation report and optionally save it.

    Args:
        evaluator: Object exposing ``model_name`` and an
            ``evaluation_results`` dict as filled by ``evaluate``.
        save_path: Optional file path; when given, the report is written
            there as UTF-8.

    Returns:
        The report as a single string.
    """
    results = evaluator.evaluation_results
    class_accuracies = results['class_accuracies']

    report = [
        "# 神经网络模型评估报告",
        f"\n## 模型名称: {evaluator.model_name}",
        f"\n生成时间: {time.strftime('%Y-%m-%d %H:%M:%S')}",
        # Overall metrics
        "\n## 总体性能指标",
        f"- **准确率**: {results['accuracy']:.4f}",
        f"- **宏平均精确率**: {results['precision_macro']:.4f}",
        f"- **宏平均召回率**: {results['recall_macro']:.4f}",
        f"- **宏平均F1分数**: {results['f1_macro']:.4f}",
        f"- **加权平均精确率**: {results['precision_weighted']:.4f}",
        f"- **加权平均召回率**: {results['recall_weighted']:.4f}",
        f"- **加权平均F1分数**: {results['f1_weighted']:.4f}",
        # Per-class table header
        "\n## 各类别详细指标",
        "| 类别 | 精确率 | 召回率 | F1分数 | 准确率 | 支持度 |",
        "|------|--------|--------|--------|--------|--------|",
    ]

    # One row per class actually present (not a fixed 10).
    per_class = zip(
        results['precision_per_class'],
        results['recall_per_class'],
        results['f1_per_class'],
        class_accuracies,
        results['support_per_class'],
    )
    for i, (precision, recall, f1, accuracy, support) in enumerate(per_class):
        report.append(f"| {i} | {precision:.4f} | {recall:.4f} | {f1:.4f} | {accuracy:.4f} | {support} |")

    # Analysis
    report.append("\n## 分析和建议")

    easiest_class = np.argmax(class_accuracies)
    hardest_class = np.argmin(class_accuracies)

    report.append(f"\n### 性能分析")
    report.append(f"- **最容易识别的数字**: {easiest_class} (准确率: {class_accuracies[easiest_class]:.4f})")
    report.append(f"- **最难识别的数字**: {hardest_class} (准确率: {class_accuracies[hardest_class]:.4f})")
    report.append(f"- **性能差异**: {class_accuracies[easiest_class] - class_accuracies[hardest_class]:.4f}")

    # Suggestions, triggered by simple thresholds
    report.append("\n### 改进建议")

    if results['accuracy'] < 0.95:
        report.append("- **准确率偏低**: 建议增加网络深度或宽度")
        report.append("- **数据增强**: 考虑使用旋转、平移等数据增强技术")

    if np.std(class_accuracies) > 0.05:
        report.append("- **类别不平衡**: 某些类别性能较差，需要针对性优化")
        report.append("- **错误分析**: 重点分析难识别类别的错误模式")

    report.append("- **超参数调优**: 尝试不同的学习率和批次大小")
    report.append("- **正则化**: 考虑添加Dropout或L2正则化防止过拟合")

    report_text = "\n".join(report)

    if save_path:
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(report_text)
        print(f"评估报告已保存到: {save_path}")

    return report_text

def compare_models_detailed(evaluators, model_names):
    """Print a detailed side-by-side comparison of several models.

    Sections: overall metrics table, best model per class, a ranking per
    metric, and a combined ranking by average rank.

    Args:
        evaluators: Objects exposing an ``evaluation_results`` dict.
        model_names: Display names, paired with ``evaluators``.
    """
    print("\n" + "="*100)
    print("                         详细模型对比分析")
    print("="*100)

    metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    metric_names = ['准确率', '宏精确率', '宏召回率', '宏F1']

    # Overall metrics table
    print(f"{'模型':<20} {'总体准确率':<12} {'宏精确率':<12} {'宏召回率':<12} {'宏F1':<12}")
    print("-" * 80)

    for evaluator, name in zip(evaluators, model_names):
        values = [evaluator.evaluation_results[metric] for metric in metrics]
        print(f"{name:<20} {values[0]:<12.4f} {values[1]:<12.4f} {values[2]:<12.4f} {values[3]:<12.4f}")

    # Best model per class
    print(f"\n【各类别最佳模型】")
    print(f"{'类别':<6}", end="")
    for name in model_names:
        print(f"{name:<15}", end="")
    print("最佳模型")
    print("-" * (6 + 15 * len(model_names) + 10))

    # Compare only the classes every model reports, instead of a fixed 10.
    n_classes = min(
        (len(evaluator.evaluation_results['class_accuracies']) for evaluator in evaluators),
        default=0,
    )

    for digit in range(n_classes):
        print(f"{digit:<6}", end="")
        # Start below any real accuracy so a best model is named even when
        # every model scores 0 on this class (ties go to the first model).
        best_accuracy = float('-inf')
        best_model = ""

        for evaluator, name in zip(evaluators, model_names):
            accuracy = evaluator.evaluation_results['class_accuracies'][digit]
            print(f"{accuracy:<15.4f}", end="")

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = name

        print(f"{best_model}")

    # Ranking per metric (descending score)
    print(f"\n【性能排名】")
    rank_of_model = {}

    for metric, metric_name in zip(metrics, metric_names):
        scores = [evaluator.evaluation_results[metric] for evaluator in evaluators]
        sorted_indices = np.argsort(scores)[::-1]

        print(f"\n{metric_name}排名:")
        for rank, idx in enumerate(sorted_indices, 1):
            print(f"  {rank}. {model_names[idx]}: {scores[idx]:.4f}")

        # Map model index -> 1-based rank for the combined ranking below.
        rank_of_model[metric] = {int(idx): rank for rank, idx in enumerate(sorted_indices, 1)}

    # Combined ranking: mean of the per-metric ranks, lowest first
    print(f"\n【综合排名】(基于4个指标的平均排名)")
    avg_ranks = [
        np.mean([ranks[i] for ranks in rank_of_model.values()])
        for i in range(len(evaluators))
    ]

    final_ranking = np.argsort(avg_ranks)

    for rank, idx in enumerate(final_ranking, 1):
        print(f"  {rank}. {model_names[idx]}: 平均排名 {avg_ranks[idx]:.2f}")

# Generate the Markdown report for `evaluator` and print it
report = generate_evaluation_report(evaluator)
print(report)

# With more than one model available, run the detailed comparison
if len(evaluators) > 1:
    compare_models_detailed(evaluators, model_names)

## 10. 总结和使用指南

In [None]:
def print_usage_guide():
    """Print the usage guide: a banner, five bulleted sections, a footer."""
    rule = "=" * 80

    # (heading, entries) pairs, printed in this order.
    sections = (
        ("【基本使用流程】", (
            "1. 创建评估器: evaluator = ModelEvaluator('模型名称')",
            "2. 评估模型: results = evaluator.evaluate(y_true, y_pred, y_pred_proba, X_test)",
            "3. 查看报告: evaluator.print_detailed_report()",
            "4. 可视化: evaluator.create_comprehensive_visualization()",
            "5. 错误分析: evaluator.analyze_errors(X_test)",
        )),
        ("【高级可视化】", (
            "• 交互式混淆矩阵: visualizer.create_interactive_confusion_matrix()",
            "• 多模型性能对比: visualizer.create_performance_comparison_chart()",
            "• 类别性能雷达图: visualizer.create_class_performance_radar()",
            "• 学习曲线动画: visualizer.create_learning_curve_animation()",
            "• 预测置信度分析: visualizer.create_prediction_confidence_analysis()",
        )),
        ("【实用工具】", (
            "• 生成报告: generate_evaluation_report(evaluator, '报告.md')",
            "• 模型对比: compare_models_detailed(evaluators, model_names)",
            "• 批量评估: 支持同时评估多个模型",
            "• 结果导出: 支持保存可视化结果和评估报告",
        )),
        ("【输入数据要求】", (
            "• y_true: 真实标签 (numpy数组)",
            "• y_pred: 预测标签 (numpy数组)",
            "• y_pred_proba: 预测概率 (numpy数组, 可选)",
            "• X_test: 测试数据 (用于错误分析, 可选)",
        )),
        ("【扩展功能】", (
            "• 支持多分类和二分类问题",
            "• 自定义可视化样式和颜色",
            "• 集成到Jupyter Notebook或Web应用",
            "• 支持大规模数据集的高效处理",
            "• 可扩展的评估指标体系",
        )),
    )

    # Banner
    print("\n" + rule)
    print("                模型评估和可视化工具 - 使用指南")
    print(rule)

    # Body: blank line + heading, then the indented entries
    for heading, entries in sections:
        print("\n" + heading)
        for entry in entries:
            print(f"  {entry}")

    # Footer
    print("\n" + rule)
    print("感谢使用模型评估和可视化工具！")
    print("这些工具可以帮助您全面分析和展示神经网络模型的性能。")
    print(rule)

# Show the usage guide
print_usage_guide()

# Closing summary of the notebook's main features
print("\n模型评估和可视化工具演示完成！")
print("主要功能:")
print("1. 全面的性能指标评估")
print("2. 丰富的可视化图表")
print("3. 深入的错误分析")
print("4. 多模型对比分析")
print("5. 交互式可视化界面")
print("6. 自动报告生成")
print("\n这些工具可以应用于任何分类模型的评估，不限于数字识别任务。")