# PCDN流量与正常流量二分类任务

## 项目概述
本项目使用XGBoost对网络流量进行二分类：
- **APP_0**: 正常流量 (标签: 0)
- **APP_1**: PCDN流量 (标签: 1)

## 数据集结构
- `Training_set/`: 训练集
- `Validation_set/`: 验证集  
- `Testing_set/`: 测试集

每个集合包含APP_0（正常流量）和APP_1（PCDN流量）两个类别的CSV文件。


In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import glob
import ast
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import xgboost as xgb

# Configure the plotting style and a font that can render Chinese labels.
# seaborn's set_style() writes its own 'font.sans-serif' list into rcParams,
# so it has to run BEFORE the font override below; in the opposite order the
# SimHei setting is silently replaced by seaborn's default font list.
sns.set_style("whitegrid")
plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
# Draw negative tick labels with the ASCII hyphen instead of U+2212.
plt.rcParams['axes.unicode_minus'] = False
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

print("✅ 所有库导入成功！")


## 1. 数据加载与探索


In [None]:
# 定义数据加载函数
def load_dataset(base_path, dataset_type):
    """
    Load every CSV file of one dataset split into a single DataFrame.

    The split directory is expected to contain the sub-directories ``APP_0``
    (rows get label 0) and ``APP_1`` (rows get label 1). Two helper columns
    are added to every row: ``label`` and ``source_file`` (the CSV file name).

    Files are read in sorted name order so that the row order of the result
    does not depend on the directory listing order of the file system.

    Args:
        base_path: Root directory of the dataset.
        dataset_type: Split directory name
            ('Training_set', 'Validation_set', 'Testing_set').

    Returns:
        DataFrame: All successfully read files concatenated with a fresh
        0..n-1 index, or an empty DataFrame when nothing could be loaded.
    """
    frames = []

    # (sub-directory, label) pairs, processed in this order: APP_0 then APP_1.
    for class_dir, class_label in (('APP_0', 0), ('APP_1', 1)):
        class_path = Path(base_path) / dataset_type / class_dir
        csv_files = sorted(class_path.glob('*.csv'))
        print(f"📁 {dataset_type}/{class_dir} 找到 {len(csv_files)} 个文件")

        for csv_file in csv_files:
            try:
                df = pd.read_csv(csv_file)
            except Exception as e:
                # Best effort: report the unreadable file and keep loading the rest.
                print(f"  ❌ 读取 {csv_file.name} 失败: {e}")
                continue
            df['label'] = class_label
            df['source_file'] = csv_file.name
            frames.append(df)
            print(f"  ✅ {csv_file.name}: {len(df)} 行")

    if not frames:
        print(f"⚠️ {dataset_type} 没有加载到任何数据")
        return pd.DataFrame()

    combined_df = pd.concat(frames, ignore_index=True)
    print(f"🎯 {dataset_type} 总计: {len(combined_df)} 行数据")
    return combined_df

# Load the three dataset splits from the dataset root directory
base_path = 'pcdn_32_pkts_2class_feature_enhance_v17.4_dataset'

print("🚀 开始加载数据集...\n")
loaded_splits = []
for split_dir in ('Training_set', 'Validation_set', 'Testing_set'):
    if loaded_splits:
        # Blank line between the loading reports of consecutive splits
        print()
    loaded_splits.append(load_dataset(base_path, split_dir))
train_df, val_df, test_df = loaded_splits
print("\n📊 数据加载完成！")


In [None]:
# Print an overview of every split: shape, label distribution, source files
separator = "=" * 60
print(separator)
print("📈 数据集概览")
print(separator)

datasets = {'训练集': train_df, '验证集': val_df, '测试集': test_df}

for split_title, split_frame in datasets.items():
    if split_frame.empty:
        print(f"\n{split_title}: 空数据集")
        continue

    print(f"\n{split_title}:")
    print(f"  📏 数据形状: {split_frame.shape}")
    print("  🏷️ 标签分布:")
    per_label = split_frame['label'].value_counts().sort_index()
    for label_value, n_samples in per_label.items():
        if label_value == 0:
            label_text = "正常流量"
        else:
            label_text = "PCDN流量"
        print(f"    {label_value} ({label_text}): {n_samples} 样本")
    print(f"  📂 文件来源: {split_frame['source_file'].unique()}")

# List the columns of the training split and preview its first rows
if not train_df.empty:
    print("\n🔍 数据字段分析")
    print(separator)
    train_columns = list(train_df.columns)
    print(f"总字段数: {len(train_columns)}")
    print(f"字段列表: {train_columns}")

    print("\n📋 训练集前3行数据预览:")
    display(train_df.head(3))


## 2. 数据预处理与特征工程


In [None]:
def preprocess_features(df, label_encoders=None):
    """
    Clean a raw flow table and derive model-ready columns from it.

    Steps:
      1. Report missing values; fill numeric gaps with 0 and gaps in plain
         text columns with 'unknown'. The special columns handled in step 2
         are left alone so that a missing cell is treated as "no data"
         instead of being parsed as the text 'unknown'.
      2. Replace 'payload' by its string length ('payload_length') and each
         list-valued column ('ip_direction', 'pkt_len', 'iat') by the six
         columns '<field>_mean/_std/_max/_min/_sum/_count'. Cells are parsed
         one by one: a missing or unparseable cell yields an empty sequence
         (all statistics 0) and a bare number is treated as a one-element
         sequence, so a single bad cell no longer disables the whole column.
      3. Integer-encode every remaining object column except 'source_file'.

    Args:
        df: Raw DataFrame; it is copied, not modified.
        label_encoders: Optional dict ``column -> fitted encoder`` (anything
            exposing ``classes_``, e.g. the dict returned by an earlier call).
            Columns present in the dict are encoded with the position of each
            value in ``classes_``; values not listed there become -1. Columns
            without an entry get a newly fitted LabelEncoder.

    Returns:
        tuple: ``(processed DataFrame, dict column -> encoder used)``.
    """
    df_processed = df.copy()

    print("🔧 开始数据预处理...")

    list_fields = ['ip_direction', 'pkt_len', 'iat']
    special_fields = list_fields + ['payload']
    stat_names = ['mean', 'std', 'max', 'min', 'sum', 'count']

    # 1. Missing values
    print(f"📊 缺失值统计:")
    missing_stats = df_processed.isnull().sum()
    missing_cols = missing_stats[missing_stats > 0]
    if len(missing_cols) > 0:
        print(missing_cols)
        numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            df_processed[numeric_cols] = df_processed[numeric_cols].fillna(0)
        text_cols = [
            col for col in df_processed.select_dtypes(include=['object']).columns
            if col not in special_fields
        ]
        if text_cols:
            df_processed[text_cols] = df_processed[text_cols].fillna('unknown')
    else:
        print("✅ 无缺失值")

    # 2. Special columns
    for field in special_fields:
        if field not in df_processed.columns:
            continue
        print(f"🔄 处理 {field} 字段...")

        if field == 'payload':
            # A missing payload counts as length 0.
            df_processed[f'{field}_length'] = (
                df_processed[field].fillna('').astype(str).str.len()
            )
            df_processed = df_processed.drop(columns=[field])
            continue

        parsed = [_parse_sequence(cell) for cell in df_processed[field]]
        unparseable = sum(1 for seq in parsed if seq is None)
        if unparseable:
            print(f"⚠️ {field}: {unparseable} 个值无法解析，按空序列处理")

        try:
            stats = [_sequence_stats(seq if seq is not None else []) for seq in parsed]
        except (TypeError, ValueError) as e:
            # Non-numeric or ragged content: leave the column untouched. Nothing
            # has been written yet, so no partial feature set is left behind.
            print(f"⚠️ 处理 {field} 时出错: {e}，保持原始数据")
            continue

        stats_df = pd.DataFrame(stats, columns=stat_names)
        for stat in stat_names:
            # Positional assignment; avoids index alignment on the fresh frame.
            df_processed[f'{field}_{stat}'] = stats_df[stat].to_numpy()
        df_processed = df_processed.drop(columns=[field])

    # 3. Encode the remaining text columns ('source_file' is bookkeeping only)
    categorical_cols = [
        col for col in df_processed.select_dtypes(include=['object']).columns
        if col not in ['source_file']
    ]

    fitted_encoders = {}
    for col in categorical_cols:
        values = df_processed[col].astype(str)
        reference = (label_encoders or {}).get(col)
        if reference is None:
            encoder = LabelEncoder()
            df_processed[col] = encoder.fit_transform(values)
        else:
            encoder = reference
            code_of = {str(cls): code for code, cls in enumerate(reference.classes_)}
            df_processed[col] = values.map(code_of).fillna(-1).astype(int)
        fitted_encoders[col] = encoder

    print(f"🎯 预处理完成！最终特征数: {df_processed.shape[1]}")

    return df_processed, fitted_encoders


def _parse_sequence(value):
    """Turn one raw cell into a list; [] for missing cells, None if unparseable."""
    if isinstance(value, (list, tuple, np.ndarray)):
        return list(value)
    if value is None or value is pd.NA or (isinstance(value, float) and value != value):
        return []
    text = str(value).strip()
    if not text:
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        return None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    if isinstance(parsed, (int, float)) and not isinstance(parsed, bool):
        return [parsed]
    return None


def _sequence_stats(values):
    """Return (mean, std, max, min, sum, count) of a sequence; all 0 when empty."""
    if len(values) == 0:
        return (0, 0, 0, 0, 0, 0)
    arr = np.asarray(values)
    return (np.mean(arr), np.std(arr), np.max(arr), np.min(arr), np.sum(arr), len(values))

# Preprocess every split and make the categorical codes agree across splits.
def _align_to_train_encoding(processed, own_encoders, train_encoders):
    """
    Re-express label-encoded columns of one split in the training vocabulary.

    Each split is encoded by preprocess_features() with encoders fitted on
    that split alone, so the same integer can stand for different categories
    in different splits. For every column that was encoded in both this split
    and the training split, the codes are decoded back to their text values
    and mapped to the position of that value in the training encoder's
    ``classes_``. Values that never occur in the training split become -1.

    Args:
        processed: Preprocessed DataFrame of the split (modified in place).
        own_encoders: Encoders returned by preprocess_features() for this split.
        train_encoders: Encoders returned for the training split.

    Returns:
        DataFrame: ``processed`` with realigned categorical codes.
    """
    for col, own_encoder in own_encoders.items():
        reference = train_encoders.get(col)
        if reference is None or col not in processed.columns:
            # Column was not label-encoded in the training split; leave as is.
            continue
        code_of = {str(cls): code for code, cls in enumerate(reference.classes_)}
        decoded = pd.Series(own_encoder.inverse_transform(processed[col])).astype(str)
        processed[col] = decoded.map(code_of).fillna(-1).astype(int).to_numpy()
    return processed


if not train_df.empty:
    train_processed, encoders = preprocess_features(train_df)
    print("\n✅ 训练集预处理完成")

    # Validation and test splits: same preprocessing, then realign their
    # categorical codes to the encoders fitted on the training split.
    if not val_df.empty:
        val_processed, val_encoders = preprocess_features(val_df)
        val_processed = _align_to_train_encoding(val_processed, val_encoders, encoders)
        print("✅ 验证集预处理完成")

    if not test_df.empty:
        test_processed, test_encoders = preprocess_features(test_df)
        test_processed = _align_to_train_encoding(test_processed, test_encoders, encoders)
        print("✅ 测试集预处理完成")
else:
    print("❌ 训练数据为空，无法进行预处理")


In [None]:
# Build the feature matrices and label vectors used for modelling
if not train_df.empty:
    # Every column except the target and the bookkeeping column is a feature
    non_feature_cols = ('label', 'source_file')
    feature_cols = [name for name in train_processed.columns if name not in non_feature_cols]

    X_train, y_train = train_processed[feature_cols], train_processed['label']

    if not val_df.empty:
        X_val, y_val = val_processed[feature_cols], val_processed['label']

    if not test_df.empty:
        X_test, y_test = test_processed[feature_cols], test_processed['label']

    train_label_counts = y_train.value_counts().to_dict()
    print(f"🎯 特征维度: {X_train.shape}")
    print(f"📊 训练标签分布: {train_label_counts}")
    print(f"🔧 使用的特征数量: {len(feature_cols)}")
    print(f"📝 特征名称前10个: {feature_cols[:10]}")


## 3. XGBoost模型训练


In [None]:
# Train the XGBoost classifier
if train_df.empty or not len(X_train) > 0:
    print("❌ 无法训练模型：数据不足")
else:
    print("🚀 开始XGBoost模型训练...")

    # Keep only the numeric columns as model inputs
    X_train_numeric = X_train.select_dtypes(include=[np.number])
    n_numeric_features = len(X_train_numeric.columns)

    if n_numeric_features == 0:
        print("❌ 没有可用的数值型特征")
    else:
        print(f"📊 使用 {n_numeric_features} 个数值型特征")

        # Hyper-parameters of the classifier
        xgb_params = dict(
            objective='binary:logistic',
            eval_metric='logloss',
            max_depth=6,
            learning_rate=0.1,
            n_estimators=100,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1,
        )
        model = xgb.XGBClassifier(**xgb_params)

        # Monitor the training split and, when available, the validation split
        eval_set = [(X_train_numeric, y_train)]
        if not val_df.empty:
            X_val_numeric = val_processed[X_train_numeric.columns]
            eval_set.append((X_val_numeric, y_val))

        model.fit(X_train_numeric, y_train, eval_set=eval_set, verbose=True)

        print("✅ 模型训练完成！")

        # Remember the exact feature list the model was fitted on
        final_feature_cols = list(X_train_numeric.columns)


## 4. 模型评估与可视化


In [None]:
# Predict on every available split and report accuracy plus per-class metrics
if 'model' not in locals() or train_df.empty:
    print("❌ 模型未训练，无法进行评估")
else:
    print("🎯 模型评估开始...")

    def _score_split(features, labels):
        """Return (predicted labels, positive-class probabilities, accuracy)."""
        predicted = model.predict(features)
        positive_proba = model.predict_proba(features)[:, 1]
        return predicted, positive_proba, (predicted == labels).mean()

    # Training split
    y_train_pred, y_train_proba, train_accuracy = _score_split(X_train_numeric, y_train)
    print(f"📊 训练集准确率: {train_accuracy:.4f}")

    # Validation split (if present)
    if not val_df.empty:
        y_val_pred, y_val_proba, val_accuracy = _score_split(X_val_numeric, y_val)
        print(f"📊 验证集准确率: {val_accuracy:.4f}")

    # Test split (if present)
    if not test_df.empty:
        X_test_numeric = test_processed[final_feature_cols]
        y_test_pred, y_test_proba, test_accuracy = _score_split(X_test_numeric, y_test)
        print(f"📊 测试集准确率: {test_accuracy:.4f}")

    # Detailed classification reports
    report_rule = "=" * 60
    print("\n" + report_rule)
    print("📋 详细分类报告")
    print(report_rule)

    target_names = ['正常流量', 'PCDN流量']

    report_jobs = [("\n🎯 训练集分类报告:", y_train, y_train_pred)]
    if not val_df.empty:
        report_jobs.append(("\n✅ 验证集分类报告:", y_val, y_val_pred))
    if not test_df.empty:
        report_jobs.append(("\n🧪 测试集分类报告:", y_test, y_test_pred))

    for heading, true_labels, predicted_labels in report_jobs:
        print(heading)
        print(classification_report(true_labels, predicted_labels, target_names=target_names))


In [None]:
# Visualise the evaluation results on a 2x2 grid of panels
if 'model' not in locals() or train_df.empty:
    print("❌ 无法生成评估可视化")
else:
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    ax_cm, ax_roc = axes[0, 0], axes[0, 1]
    ax_proba, ax_acc = axes[1, 0], axes[1, 1]

    # Panel 1: confusion matrix of the training split
    cm_train = confusion_matrix(y_train, y_train_pred)
    sns.heatmap(
        cm_train, annot=True, fmt='d', cmap='Blues', ax=ax_cm,
        xticklabels=['正常流量', 'PCDN流量'],
        yticklabels=['正常流量', 'PCDN流量'],
    )
    ax_cm.set_title('🔥 训练集混淆矩阵', fontsize=14, fontweight='bold')
    ax_cm.set_ylabel('实际标签')
    ax_cm.set_xlabel('预测标签')

    # Panel 2: ROC curves; drawn only when the training labels hold both classes
    if len(np.unique(y_train)) > 1:
        fpr_train, tpr_train, _ = roc_curve(y_train, y_train_proba)
        auc_train = roc_auc_score(y_train, y_train_proba)
        ax_roc.plot(fpr_train, tpr_train, label=f'训练集 (AUC = {auc_train:.3f})', linewidth=2)

        if not val_df.empty and len(np.unique(y_val)) > 1:
            fpr_val, tpr_val, _ = roc_curve(y_val, y_val_proba)
            auc_val = roc_auc_score(y_val, y_val_proba)
            ax_roc.plot(fpr_val, tpr_val, label=f'验证集 (AUC = {auc_val:.3f})', linewidth=2)

        if not test_df.empty and len(np.unique(y_test)) > 1:
            fpr_test, tpr_test, _ = roc_curve(y_test, y_test_proba)
            auc_test = roc_auc_score(y_test, y_test_proba)
            ax_roc.plot(fpr_test, tpr_test, label=f'测试集 (AUC = {auc_test:.3f})', linewidth=2)

        # Diagonal reference line
        ax_roc.plot([0, 1], [0, 1], 'k--', alpha=0.5)
        ax_roc.set_xlabel('假正率 (FPR)')
        ax_roc.set_ylabel('真正率 (TPR)')
        ax_roc.set_title('📈 ROC曲线', fontsize=14, fontweight='bold')
        ax_roc.legend()
        ax_roc.grid(True, alpha=0.3)

    # Panel 3: predicted-probability histograms per true class (training split)
    for class_value, class_name, class_color in ((0, '正常流量', 'blue'), (1, 'PCDN流量', 'red')):
        ax_proba.hist(y_train_proba[y_train == class_value], bins=20, alpha=0.7,
                      label=class_name, color=class_color)
    ax_proba.set_xlabel('预测概率')
    ax_proba.set_ylabel('频次')
    ax_proba.set_title('🎯 预测概率分布', fontsize=14, fontweight='bold')
    ax_proba.legend()
    ax_proba.grid(True, alpha=0.3)

    # Panel 4: accuracy of every available split
    bar_specs = [('训练集', train_accuracy, '#3498db')]
    if not val_df.empty:
        bar_specs.append(('验证集', val_accuracy, '#2ecc71'))
    if not test_df.empty:
        bar_specs.append(('测试集', test_accuracy, '#e74c3c'))

    labels = [spec[0] for spec in bar_specs]
    accuracies = [spec[1] for spec in bar_specs]
    colors = [spec[2] for spec in bar_specs]

    bars = ax_acc.bar(labels, accuracies, color=colors)
    ax_acc.set_title('📊 各数据集准确率对比', fontsize=14, fontweight='bold')
    ax_acc.set_ylabel('准确率')
    ax_acc.set_ylim(0, 1.1)

    # Write the numeric value above each bar
    for bar, acc in zip(bars, accuracies):
        ax_acc.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
                    f'{acc:.3f}', ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()


## 5. 特征重要性分析


In [None]:
# Feature-importance analysis: ranking table, four diagnostic plots, summary.
# Defines importance_df, features_80 and top_features at notebook level.
if 'model' in locals() and not train_df.empty:
    print("🔍 特征重要性分析...")

    # Importance scores reported by the fitted model, one per training feature
    feature_importance = model.feature_importances_
    feature_names = final_feature_cols

    # Ranking table, most important feature first
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': feature_importance
    }).sort_values('importance', ascending=False)

    print(f"📊 Top 10 最重要特征:")
    print(importance_df.head(10))

    fig, axes = plt.subplots(2, 2, figsize=(20, 15))

    # 1. Horizontal bar chart of the 20 most important features
    top_20 = importance_df.head(20)
    axes[0,0].barh(range(len(top_20)), top_20['importance'], color='skyblue')
    axes[0,0].set_yticks(range(len(top_20)))
    axes[0,0].set_yticklabels(top_20['feature'])
    axes[0,0].set_xlabel('重要性分数')
    axes[0,0].set_title('🏆 Top 20 特征重要性', fontsize=14, fontweight='bold')
    axes[0,0].invert_yaxis()  # highest-ranked feature at the top

    # 2. Histogram of all importance scores
    axes[0,1].hist(feature_importance, bins=30, color='lightcoral', alpha=0.7, edgecolor='black')
    axes[0,1].set_xlabel('重要性分数')
    axes[0,1].set_ylabel('特征数量')
    axes[0,1].set_title('📊 特征重要性分布', fontsize=14, fontweight='bold')
    axes[0,1].grid(True, alpha=0.3)

    # 3. Cumulative importance over the ranked features
    cumsum_importance = np.cumsum(importance_df['importance'].values)
    axes[1,0].plot(range(1, len(cumsum_importance)+1), cumsum_importance, 'b-', linewidth=2)
    axes[1,0].fill_between(range(1, len(cumsum_importance)+1), cumsum_importance, alpha=0.3)
    axes[1,0].set_xlabel('特征数量')
    axes[1,0].set_ylabel('累积重要性')
    axes[1,0].set_title('📈 累积特征重要性贡献', fontsize=14, fontweight='bold')
    axes[1,0].grid(True, alpha=0.3)

    # Smallest number of top-ranked features reaching 80% of the total importance
    threshold_80 = 0.8 * cumsum_importance[-1]
    features_80 = np.where(cumsum_importance >= threshold_80)[0][0] + 1
    axes[1,0].axhline(y=threshold_80, color='red', linestyle='--', alpha=0.7)
    axes[1,0].axvline(x=features_80, color='red', linestyle='--', alpha=0.7)
    # '\n' must be a real line break; an escaped backslash would be drawn as text.
    axes[1,0].text(features_80+1, threshold_80, f'前{features_80}个特征\n贡献80%重要性',
                  bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8))

    # 4. Pie chart: share of the top 10 features versus all remaining ones
    top_10 = importance_df.head(10)
    other_importance = importance_df.iloc[10:]['importance'].sum()

    pie_data = top_10['importance'].tolist()
    pie_labels = top_10['feature'].tolist()
    if other_importance > 0:
        # Skip the remainder wedge when there is nothing beyond the top 10.
        pie_data.append(other_importance)
        pie_labels.append('其他特征')

    colors = plt.cm.Set3(np.linspace(0, 1, len(pie_data)))
    wedges, texts, autotexts = axes[1,1].pie(pie_data, labels=pie_labels, autopct='%1.1f%%',
                                            colors=colors, startangle=90)
    axes[1,1].set_title('🥧 Top 10 特征重要性占比', fontsize=14, fontweight='bold')

    # Shrink the label and percentage fonts so the pie stays readable
    for text in texts:
        text.set_fontsize(8)
    for autotext in autotexts:
        autotext.set_fontsize(8)
        autotext.set_fontweight('bold')

    plt.tight_layout()
    plt.show()

    # Summary statistics of the importance scores
    print("\n" + "="*60)
    print("📈 特征重要性统计")
    print("="*60)
    print(f"🔢 总特征数量: {len(feature_importance)}")
    print(f"📊 平均重要性: {np.mean(feature_importance):.6f}")
    print(f"📊 重要性标准差: {np.std(feature_importance):.6f}")
    print(f"🏆 最高重要性: {np.max(feature_importance):.6f} ({importance_df.iloc[0]['feature']})")
    print(f"🔻 最低重要性: {np.min(feature_importance):.6f}")
    print(f"🎯 前{features_80}个特征贡献80%重要性")

    # List the 20 highest-ranked features; iterate the ranked rows directly
    # instead of looking every feature up in the table again.
    top_features = top_20['feature'].tolist()
    print(f"\n🌟 建议关注的Top 20特征:")
    for i, (feature, importance_score) in enumerate(zip(top_20['feature'], top_20['importance']), 1):
        print(f"  {i:2d}. {feature:<30} (重要性: {importance_score:.6f})")
else:
    print("❌ 无法进行特征重要性分析")


## 6. 模型性能总结与建议


In [None]:
# Print the final performance summary and follow-up recommendations
if 'model' not in locals() or train_df.empty:
    print("❌ 无法生成模型总结")
else:
    banner = "=" * 80
    has_val = not val_df.empty
    has_test = not test_df.empty

    print("\n" + banner)
    print("🎯 PCDN流量分类模型性能总结")
    print(banner)

    # Dataset sizes and class counts
    n_train_normal = sum(y_train == 0)
    n_train_pcdn = sum(y_train == 1)
    print("\n📊 数据集信息:")
    print(f"  🎓 训练样本: {len(X_train)} 个 (正常流量: {n_train_normal}, PCDN流量: {n_train_pcdn})")
    if has_val:
        print(f"  ✅ 验证样本: {len(X_val)} 个 (正常流量: {sum(y_val==0)}, PCDN流量: {sum(y_val==1)})")
    if has_test:
        print(f"  🧪 测试样本: {len(X_test)} 个 (正常流量: {sum(y_test==0)}, PCDN流量: {sum(y_test==1)})")
    print(f"  🔧 使用特征: {len(final_feature_cols)} 个")

    # Accuracy per split
    print("\n🏆 模型性能:")
    print(f"  📈 训练集准确率: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
    if has_val:
        print(f"  📈 验证集准确率: {val_accuracy:.4f} ({val_accuracy*100:.2f}%)")
    if has_test:
        print(f"  📈 测试集准确率: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

    # AUC per split; only reported when the labels hold both classes
    if len(np.unique(y_train)) > 1:
        print(f"  📊 训练集AUC: {auc_train:.4f}")
        if has_val and len(np.unique(y_val)) > 1:
            print(f"  📊 验证集AUC: {auc_val:.4f}")
        if has_test and len(np.unique(y_test)) > 1:
            print(f"  📊 测试集AUC: {auc_test:.4f}")

    # Five highest-ranked features
    print("\n🌟 关键特征 (Top 5):")
    top_5 = importance_df.head(5)
    for rank, (feature_name, score) in enumerate(zip(top_5['feature'], top_5['importance']), 1):
        print(f"  {rank}. {feature_name:<25} (重要性: {score:.6f})")

    print("\n💡 模型建议:")

    # Advice based on the amount of training data
    total_samples = len(X_train)
    if total_samples >= 1000:
        print(f"  ✅ 数据量充足 ({total_samples}样本)")
    elif total_samples >= 100:
        print(f"  📊 数据量适中 ({total_samples}样本)，可考虑数据增强技术")
    else:
        print(f"  ⚠️  数据量较小 ({total_samples}样本)，建议收集更多数据以提高模型稳定性")

    # Advice based on the minority/majority class ratio
    class_ratio = min(n_train_normal, n_train_pcdn) / max(n_train_normal, n_train_pcdn)
    if class_ratio >= 0.5:
        print(f"  ✅ 类别相对平衡 (比例: {class_ratio:.2f})")
    else:
        print(f"  ⚠️  类别不平衡 (比例: {class_ratio:.2f})，建议使用类别权重或采样技术")

    # Advice based on the train/validation accuracy gap
    if has_val:
        overfitting = train_accuracy - val_accuracy
        if overfitting > 0.1:
            print(f"  ⚠️  可能存在过拟合 (训练-验证差距: {overfitting:.3f})，建议调整正则化参数")
        elif overfitting < -0.05:
            print("  🤔 验证集性能优于训练集，可能数据分布不一致")
        else:
            print(f"  ✅ 模型泛化能力良好 (训练-验证差距: {overfitting:.3f})")

    # Advice based on how concentrated the feature importance is
    mean_importance = importance_df['importance'].mean()
    high_importance_features = len(importance_df[importance_df['importance'] > mean_importance])
    print(f"  🔧 {high_importance_features}/{len(final_feature_cols)} 个特征高于平均重要性")

    if features_80 < len(final_feature_cols) * 0.5:
        print(f"  💡 可考虑特征选择：前{features_80}个特征已贡献80%重要性")

    # Fixed list of suggested next steps
    print("\n🚀 下一步建议:")
    next_steps = (
        "  1. 📈 收集更多样本数据，特别是少数类别",
        "  2. 🔧 基于特征重要性进行特征选择和工程优化",
        "  3. ⚙️  尝试其他算法对比 (Random Forest, SVM, Neural Network)",
        "  4. 🎯 进行超参数调优以进一步提升性能",
        "  5. 📊 在实际场景中部署和监控模型性能",
    )
    for step in next_steps:
        print(step)

    print("\n" + banner)
    print("✅ 分析完成！模型已准备就绪。")
    print(banner)


## 7. 模型保存与加载


In [None]:
# Persist the trained model, its feature list and the importance table
if 'model' not in locals():
    print("❌ 没有训练好的模型可保存")
else:
    import joblib

    model_filename = 'pcdn_traffic_classifier.pkl'
    joblib.dump(model, model_filename)
    print(f"💾 模型已保存为: {model_filename}")

    # Feature list, in the column order the model was fitted on
    import json
    features_filename = 'model_features.json'
    with open(features_filename, 'w') as features_file:
        features_file.write(json.dumps(final_feature_cols, indent=2))
    print(f"📝 特征列表已保存为: {features_filename}")

    # Feature-importance table
    importance_filename = 'feature_importance.csv'
    importance_df.to_csv(importance_filename, index=False, encoding='utf-8-sig')
    print(f"📊 特征重要性已保存为: {importance_filename}")

    print("\n🎉 所有文件保存完成！")

    # Show how the saved artefacts can be loaded again
    print("\n📖 模型加载示例代码:")
    loading_example = """
    import joblib
    import json
    
    # 加载模型
    loaded_model = joblib.load('pcdn_traffic_classifier.pkl')
    
    # 加载特征列表
    with open('model_features.json', 'r') as f:
        feature_columns = json.load(f)
    
    # 对新数据进行预测
    # new_data = preprocess_new_data(raw_data)  # 需要相同的预处理
    # predictions = loaded_model.predict(new_data[feature_columns])
    """
    print(loading_example)
