# CyberPuppy 評估報告

此 notebook 提供完整的離線評估報告，包含：
- 宏 F1 分數
- AUCPR (Average Precision)
- 會話級指標
- 線上收斂監控視覺化

## 1. 環境設定與資料載入

In [None]:
# 匯入必要套件
import sys
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from pathlib import Path

# Plot styling: seaborn dark-grid theme with the "husl" colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Figure defaults plus a font fallback chain intended for CJK text.
# unicode_minus is switched off, which is commonly needed because CJK fonts
# may lack the Unicode minus glyph.
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 12,
    'font.sans-serif': ['Microsoft YaHei', 'SimHei', 'DejaVu Sans'],
    'axes.unicode_minus': False,
})

# Put the parent directory on the import path so `src.cyberpuppy` resolves
# (presumably the notebook runs from a sub-directory of the project root).
sys.path.append('..')

# CyberPuppy evaluation helpers used throughout the notebook
from src.cyberpuppy.eval.metrics import (
    MetricsCalculator,
    SessionContext,
    OnlineMonitor,
    PrometheusExporter,
    CSVExporter,
    EvaluationReport
)

print("環境設定完成！")

In [None]:
# Load or synthesise the evaluation data
def generate_test_data(n_samples=1000, random_state=42):
    """Build a synthetic, class-imbalanced toxicity evaluation set.

    Seeds NumPy's global RNG with ``random_state``, samples ground-truth
    toxicity and emotion labels, then derives a prediction (kept equal to the
    truth with probability 0.8, otherwise drawn uniformly from all classes)
    and a 3-way probability vector for every sample.

    Args:
        n_samples: Number of samples to generate.
        random_state: Seed passed to ``np.random.seed``.

    Returns:
        dict with keys ``y_true_toxicity`` (ndarray of labels),
        ``y_pred_toxicity`` (list of labels), ``y_prob_toxicity`` (ndarray
        whose columns are ordered none, toxic, severe) and
        ``y_true_emotion`` (ndarray of labels).
    """
    np.random.seed(random_state)

    toxicity_labels = ['none', 'toxic', 'severe']
    emotion_labels = ['pos', 'neu', 'neg']

    # Ground truth with an imbalanced toxicity distribution
    y_true_toxicity = np.random.choice(
        toxicity_labels, n_samples, p=[0.7, 0.25, 0.05]
    )
    y_true_emotion = np.random.choice(
        emotion_labels, n_samples, p=[0.3, 0.4, 0.3]
    )

    # Probability column boosted for a given prediction; any label other
    # than none/toxic falls through to column 2.
    boosted_column = {'none': 0, 'toxic': 1}

    y_pred_toxicity = []
    prob_rows = []
    for actual in y_true_toxicity:
        # The uniform draw comes first; a replacement label is sampled only
        # for the ~20% of cases where the truth is not kept.
        keep_truth = np.random.random() < 0.8
        predicted = actual if keep_truth else np.random.choice(toxicity_labels)
        y_pred_toxicity.append(predicted)

        # Random simplex point, tilted towards the predicted class and
        # renormalised so the row sums to 1.
        row = np.random.dirichlet([1, 1, 1])
        row[boosted_column.get(predicted, 2)] += 0.5
        prob_rows.append(row / row.sum())

    return {
        'y_true_toxicity': y_true_toxicity,
        'y_pred_toxicity': y_pred_toxicity,
        'y_prob_toxicity': np.array(prob_rows),
        'y_true_emotion': y_true_emotion
    }

# Generate the default test set
test_data = generate_test_data()
print(f"生成 {len(test_data['y_true_toxicity'])} 筆測試資料")

## 2. 分類指標評估

In [None]:
# Compute classification metrics
calculator = MetricsCalculator()

# Macro-averaged metrics for the toxicity task
toxicity_metrics = calculator.calculate_classification_metrics(
    test_data['y_true_toxicity'],
    test_data['y_pred_toxicity'],
    task_name='toxicity',
    average='macro',
)

# Print every scalar metric; the confusion matrix is shown separately below
separator = "=" * 50
print(separator)
print("毒性偵測 - 分類指標")
print(separator)

for name, result in toxicity_metrics.items():
    if name == 'toxicity_confusion_matrix':
        continue
    print(f"{name:30s}: {result.value:.4f}")

# Show the confusion matrix as a labelled table.
# NOTE(review): the row/column captions assume the matrix is ordered
# none, toxic, severe — confirm against MetricsCalculator.
cm = toxicity_metrics['toxicity_confusion_matrix'].metadata['matrix']
class_names = ['none', 'toxic', 'severe']
print("\n混淆矩陣:")
print(pd.DataFrame(
    cm,
    index=[f'真實:{c}' for c in class_names],
    columns=[f'預測:{c}' for c in class_names],
))

In [None]:
# Visualise the confusion matrix
fig, ax = plt.subplots(1, 1, figsize=(8, 6))

# Row-normalise so each row shows the distribution of predictions for one
# true class. `cm` is coerced to a float array first (it may arrive as a
# nested list), and rows with no true samples are left as zeros instead of
# dividing by zero, which would produce NaN cells.
cm_counts = np.asarray(cm, dtype=float)
row_totals = cm_counts.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm_counts,
    row_totals,
    out=np.zeros_like(cm_counts),
    where=row_totals != 0,
)

# Draw the heat map.
# NOTE(review): tick labels assume the matrix is ordered none, toxic,
# severe — confirm against MetricsCalculator.
sns.heatmap(
    cm_normalized,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    xticklabels=['none', 'toxic', 'severe'],
    yticklabels=['none', 'toxic', 'severe'],
    ax=ax
)

ax.set_title('毒性偵測 - 正規化混淆矩陣')
ax.set_xlabel('預測標籤')
ax.set_ylabel('真實標籤')

plt.tight_layout()
plt.show()

## 3. 機率指標評估 (AUCPR)

In [None]:
# Compute probability-based metrics

# Column order of test_data['y_prob_toxicity'] as produced by
# generate_test_data: column 0 = none, 1 = toxic, 2 = severe.
toxicity_classes = ['none', 'toxic', 'severe']
class_to_index = {label: idx for idx, label in enumerate(toxicity_classes)}

# Encode the true labels with that explicit order. sklearn's LabelEncoder
# sorts classes alphabetically (none, severe, toxic), which would swap the
# toxic and severe columns relative to the probability matrix and corrupt
# every per-class probability metric.
y_true_encoded = np.array(
    [class_to_index[label] for label in test_data['y_true_toxicity']],
    dtype=int,
)

# One-hot encoding whose columns line up with the probability columns
y_true_onehot = np.eye(len(toxicity_classes))[y_true_encoded]

# Compute the probability metrics
prob_metrics = calculator.calculate_probability_metrics(
    y_true_onehot,
    test_data['y_prob_toxicity'],
    task_name='toxicity'
)

print("=" * 50)
print("毒性偵測 - 機率指標")
print("=" * 50)

for name, result in prob_metrics.items():
    print(f"{name:30s}: {result.value:.4f}")

In [None]:
# Plot one Precision-Recall curve per class
from sklearn.metrics import precision_recall_curve, average_precision_score

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Same order as the probability columns built by generate_test_data
labels = ['none', 'toxic', 'severe']
y_true_labels = np.asarray(test_data['y_true_toxicity'])

for i, label in enumerate(labels):
    # Build the binary ground truth by class name so it always matches
    # probability column i. Indexing a LabelEncoder-based one-hot matrix by
    # position would pair 'toxic' scores with 'severe' truths (and vice
    # versa) because LabelEncoder orders classes alphabetically.
    y_true_binary = (y_true_labels == label).astype(int)
    y_score = test_data['y_prob_toxicity'][:, i]

    # PR curve and average precision for this class
    precision, recall, _ = precision_recall_curve(y_true_binary, y_score)
    ap = average_precision_score(y_true_binary, y_score)

    # Draw the curve
    axes[i].plot(recall, precision, lw=2, label=f'AP = {ap:.3f}')
    axes[i].fill_between(recall, precision, alpha=0.3)
    axes[i].set_xlabel('Recall')
    axes[i].set_ylabel('Precision')
    axes[i].set_title(f'{label.capitalize()} 類別 - PR 曲線')
    axes[i].legend(loc='lower left')
    axes[i].grid(True, alpha=0.3)

plt.suptitle('Precision-Recall 曲線（各類別）', fontsize=14)
plt.tight_layout()
plt.show()

## 4. 會話級指標評估

In [None]:
# Build simulated conversation sessions
def generate_sessions(n_sessions=50, random_state=None):
    """Create synthetic chat sessions with per-message toxicity scores.

    Each session draws a random base toxicity and between 3 and 14 messages.
    A message's toxicity is the base plus Gaussian noise (std 0.1), clamped
    to [0, 1]. A message is flagged as an intervention when its toxicity
    exceeds 0.7; after every intervention the base toxicity used for later
    messages in the same session is multiplied by 0.7.

    Args:
        n_sessions: Number of sessions to generate.
        random_state: Optional seed for NumPy's global RNG, mirroring
            ``generate_test_data``. ``None`` (the default) leaves the RNG
            state untouched, so existing callers behave as before.

    Returns:
        list of ``SessionContext`` objects, one per session, each populated
        through ``add_message``.
    """
    if random_state is not None:
        np.random.seed(random_state)

    sessions = []

    for i in range(n_sessions):
        session = SessionContext(session_id=f"session_{i}")
        n_messages = np.random.randint(3, 15)

        # Session-wide toxicity level that the messages fluctuate around
        base_toxicity = np.random.random()

        for j in range(n_messages):
            # Float bounds keep the score a float even when it is clamped
            noisy = base_toxicity + np.random.normal(0, 0.1)
            toxicity = max(0.0, min(1.0, noisy))

            message = {
                'message_id': f"msg_{j}",
                'text': f"Message {j}",
                'scores': {
                    'toxicity': toxicity,
                    'emotion': np.random.choice(['pos', 'neu', 'neg'])
                },
                'intervention': toxicity > 0.7  # intervene on high toxicity
            }

            session.add_message(message)

            # Simulate the intervention taking effect on later messages
            if message['intervention']:
                base_toxicity *= 0.7

        sessions.append(session)

    return sessions

# Generate the session data
sessions = generate_sessions(50)
print(f"生成 {len(sessions)} 個會話")
print(f"平均每會話 {np.mean([len(s.messages) for s in sessions]):.1f} 條訊息")

In [None]:
# Compute session-level metrics on the toxicity score
session_metrics = calculator.calculate_session_metrics(sessions, 'toxicity')

print("=" * 50)
print("會話級指標")
print("=" * 50)

for name, result in session_metrics.items():
    print(f"{name:30s}: {result.value:.4f}")
    # Empty or missing metadata simply yields no detail lines
    details = result.metadata or {}
    for key, value in details.items():
        print(f"  - {key}: {value}")

In [None]:
# Visualise per-session toxicity trends
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Use the first four sessions as examples (one per subplot)
selected_sessions = sessions[:4]

for i, (session, ax) in enumerate(zip(selected_sessions, axes.flat)):
    # Toxicity score of every message, in order
    toxicity_scores = [msg['scores']['toxicity'] for msg in session.messages]
    # Positions of the messages flagged as interventions
    intervention_points = [
        j for j, msg in enumerate(session.messages)
        if msg.get('intervention', False)
    ]

    # Toxicity trend line
    x = range(len(toxicity_scores))
    ax.plot(x, toxicity_scores, 'b-', linewidth=2, label='毒性分數')

    # Mark the intervention points on top of the line
    if intervention_points:
        intervention_scores = [toxicity_scores[p] for p in intervention_points]
        ax.scatter(intervention_points, intervention_scores,
                   color='red', s=100, marker='v', label='介入點', zorder=5)

    # Reference lines: danger threshold first, then safe threshold
    for level, colour, caption in (
        (0.7, 'orange', '危險閾值'),
        (0.3, 'green', '安全閾值'),
    ):
        ax.axhline(y=level, color=colour, linestyle='--', alpha=0.5, label=caption)

    ax.set_xlabel('訊息順序')
    ax.set_ylabel('毒性分數')
    ax.set_title(f'會話 {i+1} - 毒性趨勢')
    ax.set_ylim([0, 1])
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)

plt.suptitle('會話毒性趨勢分析', fontsize=14)
plt.tight_layout()
plt.show()

## 5. 線上收斂監控

In [None]:
# Simulate a training run and feed it to the online monitor
monitor = OnlineMonitor(window_size=20, checkpoint_interval=10)

n_steps = 200
training_history = []

for step in range(n_steps):
    # Exponentially decaying loss with Gaussian noise
    decayed = 2.0 * np.exp(-step/50)
    loss = decayed + 0.1 + np.random.normal(0, 0.05)

    # Linearly improving accuracy / F1 with noise, capped at 0.95 / 0.90
    ramp = step * 0.002
    accuracy = min(0.95, 0.5 + ramp + np.random.normal(0, 0.02))
    f1_score = min(0.90, 0.4 + ramp + np.random.normal(0, 0.02))

    # Step-wise learning-rate decay: x0.95 every 20 steps
    lr = 0.001 * (0.95 ** (step // 20))

    # Record whatever the monitor returns for this step
    stats = monitor.update(loss, accuracy, f1_score, lr)
    training_history.append(stats)

# Print the monitor's summary; floats get four decimals
summary = monitor.get_summary()
print("=" * 50)
print("訓練監控摘要")
print("=" * 50)
for key, value in summary.items():
    shown = f"{value:.4f}" if isinstance(value, float) else f"{value}"
    print(f"{key:25s}: {shown}")

In [None]:
# Visualise the training curves
df_history = pd.DataFrame(training_history)
steps = df_history['step']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
loss_ax, acc_ax = axes[0]
f1_ax, lr_ax = axes[1]

# Loss: raw values, moving average and a +/- one-std band around the average
loss_mean = df_history['loss_avg']
loss_std = df_history['loss_std']
loss_ax.plot(steps, df_history['loss'], 'b-', alpha=0.3, label='即時 Loss')
loss_ax.plot(steps, loss_mean, 'r-', linewidth=2, label='移動平均')
loss_ax.fill_between(steps, loss_mean - loss_std, loss_mean + loss_std,
                     alpha=0.2, color='red')

# Accuracy: raw values and moving average
acc_ax.plot(steps, df_history['accuracy'], 'g-', alpha=0.3, label='即時準確率')
acc_ax.plot(steps, df_history['accuracy_avg'], 'darkgreen', linewidth=2, label='移動平均')

# F1: raw values and moving average
f1_ax.plot(steps, df_history['f1_score'], 'purple', alpha=0.3, label='即時 F1')
f1_ax.plot(steps, df_history['f1_avg'], 'darkviolet', linewidth=2, label='移動平均')

# Learning-rate schedule (single series, so no legend)
lr_ax.plot(steps, df_history['learning_rate'], 'orange', linewidth=2)

# Shared decoration: axis labels, title, optional legend and a light grid
for panel, y_label, title, has_legend in (
    (loss_ax, 'Loss', 'Loss 收斂曲線', True),
    (acc_ax, '準確率', '準確率提升曲線', True),
    (f1_ax, 'F1 分數', 'F1 分數提升曲線', True),
    (lr_ax, '學習率', '學習率衰減', False),
):
    panel.set_xlabel('步驟')
    panel.set_ylabel(y_label)
    panel.set_title(title)
    if has_legend:
        panel.legend()
    panel.grid(True, alpha=0.3)

plt.suptitle('訓練監控視覺化', fontsize=14)
plt.tight_layout()
plt.show()

## 6. 指標匯出

In [None]:
# Export metrics in Prometheus format
prometheus = PrometheusExporter(job_name='cyberpuppy_eval')

# Scalar toxicity metrics only; the confusion matrix is skipped
scalar_metrics = {
    name: result
    for name, result in toxicity_metrics.items()
    if name != 'toxicity_confusion_matrix'
}
for name, result in scalar_metrics.items():
    # The task prefix is dropped from the name and carried as a label
    prometheus.update_metric(
        name=name.replace('toxicity_', ''),
        value=result.value,
        labels={'task': 'toxicity', 'dataset': 'test'},
    )

# Session-level metrics keep their names and get a `type` label
for name, result in session_metrics.items():
    prometheus.update_metric(
        name=name,
        value=result.value,
        labels={'type': 'session'},
    )

# Show the first 1000 characters of the export
print("Prometheus 格式輸出:")
print("=" * 50)
prometheus_output = prometheus.export()
preview = prometheus_output[:1000]
print(preview + "...\n")

In [None]:
# Export metrics and training history to CSV files
csv_exporter = CSVExporter(output_dir='./metrics_output')

# Classification metrics
csv_path = csv_exporter.export_metrics(toxicity_metrics, filename='toxicity_metrics.csv')
print(f"毒性指標已匯出到: {csv_path}")

# Training history as recorded by the online monitor
history_records = monitor.export_history()
history_path = csv_exporter.export_history(history_records, filename='training_history.csv')
print(f"訓練歷史已匯出到: {history_path}")

# Read the metrics file back and preview its first rows
print("\nCSV 內容預覽:")
df_metrics = pd.read_csv(csv_path)
print(df_metrics.head())

## 7. 完整評估報告

In [None]:
# Assemble the full evaluation report
report = EvaluationReport()

# Register the toxicity predictions
prediction_inputs = {
    'y_true': test_data['y_true_toxicity'],
    'y_pred': test_data['y_pred_toxicity'],
    'y_prob': test_data['y_prob_toxicity'],
    'task_name': 'toxicity',
}
report.add_predictions(**prediction_inputs)

# Register the first ten sessions as examples
for session in sessions[:10]:
    report.add_session(session)

# Build the report
full_report = report.generate_report()

summary_text = json.dumps(full_report['summary'], indent=2, ensure_ascii=False)
print("=" * 50)
print("完整評估報告摘要")
print("=" * 50)
print(summary_text)

# Persist the report as UTF-8 JSON, creating the output directory if needed
report_path = './metrics_output/evaluation_report.json'
Path(report_path).parent.mkdir(parents=True, exist_ok=True)
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(full_report, f, ensure_ascii=False, indent=2)
print(f"\n完整報告已儲存到: {report_path}")

## 8. 效能基準測試

In [None]:
# Compare actual results against the target benchmarks
def _lookup_metric(metrics, name, task='toxicity'):
    """Return ``metrics[name]``, falling back to the task-prefixed key.

    Other cells read ``toxicity_metrics`` through task-prefixed keys (for
    example ``toxicity_confusion_matrix``), so a bare name such as
    ``f1_score`` may not exist. The bare name is tried first, then
    ``f"{task}_{name}"``.
    NOTE(review): the real key names are defined by MetricsCalculator and
    are not visible here — confirm and simplify once known.

    Raises:
        KeyError: if neither key exists; the message lists available keys.
    """
    candidates = (name, f"{task}_{name}")
    for key in candidates:
        if key in metrics:
            return metrics[key]
    raise KeyError(
        f"none of {candidates} found; available keys: {sorted(metrics)}"
    )


toxicity_f1_result = _lookup_metric(toxicity_metrics, 'f1_score')

try:
    aucpr_result = _lookup_metric(prob_metrics, 'aucpr')
except KeyError:
    # Keep the previous fallback (F1 stands in for AUCPR) but make it
    # visible, because the chart would otherwise show F1 under an AUCPR label.
    print("⚠️ prob_metrics 中找不到 AUCPR 指標，圖表暫以 F1 分數代替")
    aucpr_result = toxicity_f1_result

benchmarks = {
    '目標': {
        'toxicity_f1': 0.78,
        'emotion_f1': 0.85,
        'aucpr': 0.75,
        'intervention_success': 0.60
    },
    '實際': {
        'toxicity_f1': toxicity_f1_result.value,
        'emotion_f1': 0.83,  # placeholder: no emotion model is evaluated here
        'aucpr': aucpr_result.value,
        'intervention_success': session_metrics['intervention_success_rate'].value
    }
}

# Grouped bar chart: target vs. actual for each metric
fig, ax = plt.subplots(figsize=(10, 6))

metrics = list(benchmarks['目標'])
x = np.arange(len(metrics))
width = 0.35

target_values = [benchmarks['目標'][m] for m in metrics]
actual_values = [benchmarks['實際'][m] for m in metrics]

bars1 = ax.bar(x - width/2, target_values, width, label='目標', color='skyblue', alpha=0.8)
bars2 = ax.bar(x + width/2, actual_values, width, label='實際', color='lightcoral', alpha=0.8)

# Value labels just above each bar
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.3f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom')

# Axis labels, ticks and limits
ax.set_xlabel('指標')
ax.set_ylabel('分數')
ax.set_title('效能基準比較')
ax.set_xticks(x)
ax.set_xticklabels([m.replace('_', ' ').title() for m in metrics])
ax.legend()
ax.set_ylim([0, 1])
ax.grid(True, axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Pass/fail check: a metric passes when actual >= target
print("\n達標檢查:")
print("=" * 50)
for metric in metrics:
    target = benchmarks['目標'][metric]
    actual = benchmarks['實際'][metric]
    status = "✅ 達標" if actual >= target else "❌ 未達標"
    diff = actual - target
    print(f"{metric:20s}: {actual:.3f} vs {target:.3f} ({diff:+.3f}) {status}")

## 總結

本評估報告示範了 CyberPuppy 系統的完整效能分析流程（本 notebook 使用隨機生成的模擬資料，數值僅供流程示範，不代表實際模型效能）：

1. **分類指標**：計算並視覺化了宏 F1、精確度、召回率等指標
2. **機率指標**：評估了 AUCPR 和 ROC-AUC，並繪製 PR 曲線
3. **會話級指標**：分析了會話內毒性變化、介入成功率等
4. **線上監控**：展示了訓練過程的收斂情況與指標趨勢
5. **指標匯出**：支援 Prometheus 和 CSV 格式匯出，便於整合監控系統

所有指標均可透過程式化方式取得，適合整合到 CI/CD 流程中進行自動化評估。