In [None]:
import os
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from sklearn.base import clone
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_recall_fscore_support,
                             classification_report, roc_auc_score)
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Font setup: use Arial for sans-serif text and disable the Unicode minus glyph.
plt.rcParams.update({
    'font.sans-serif': ['Arial'],
    'axes.unicode_minus': False,
})

# Start-of-run banner.
print("开始Signal1+Signal2特征分类性能分析...")
print("=" * 60)


In [None]:
# ========== Data loading and preprocessing ==========
print("\n>>> 加载骨质疏松数据...")
data_file = r"F:\作图目录20280825\骨质疏松数据.xlsx"
# header=1 uses the second spreadsheet row as the header; only columns B..H are read
# and then renamed to the fixed schema below.
df = pd.read_excel(data_file, header=1, usecols='B:H')
cols = ['signal_1', 'sost_1', 'signal_2', 'sost_2', 'sost_mean', 'l1_4', 'left_hip']
df.columns = cols

print(f"数据维度: {df.shape}")

# Attach class labels purely by row position.
# NOTE(review): this assumes the sheet rows are ordered Health -> Osteopenia ->
# Osteoporosis; confirm against the workbook.
# Only two layouts are known: 103 rows (35 Osteoporosis) and 104 rows (36 Osteoporosis).
# Any other row count is rejected up front instead of failing later with an opaque
# pandas length-mismatch error.
osteoporosis_count_by_rows = {103: 35, 104: 36}
n_rows = len(df)
if n_rows not in osteoporosis_count_by_rows:
    raise ValueError(
        f"Unexpected number of rows ({n_rows}); positional class labels are only "
        f"defined for {sorted(osteoporosis_count_by_rows)} rows."
    )
n_osteoporosis = osteoporosis_count_by_rows[n_rows]
df['class'] = ['Health'] * 35 + ['Osteopenia'] * 33 + ['Osteoporosis'] * n_osteoporosis
print(f"{n_rows}行数据，分配: Health(35) + Osteopenia(33) + Osteoporosis({n_osteoporosis})")

print("类别分布:")
print(df['class'].value_counts())

# Feature matrix: only signal_1 and signal_2 are used; the target is the integer-coded class.
X = df[['signal_1', 'signal_2']].values
y = df['class'].map({'Health': 0, 'Osteopenia': 1, 'Osteoporosis': 2})

print(f"\n特征维度: {X.shape}")
print("特征名称: Signal1(COL.), Signal2(FL.)")
print("目标变量分布:", y.value_counts().sort_index())

# Stratified 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
print(f"训练集大小: {X_train.shape[0]}, 测试集大小: {X_test.shape[0]}")


In [None]:
# ========== Taobao-code style configuration ==========
print("\n>>> 设置淘宝代码风格的颜色方案...")

# Colour palette - kept identical to the Taobao code.
COLORS = dict(
    green_light="#C9DCC4",
    green_transparent="#94C3AA",
    blue_light="#D7E8F3",
    blue_transparent="#8ec1dc",
    teal_transparent="#57B1AB",
    new="#DF9D96",
)

# Two-stop gradients: blue for training-set heatmaps, green for test-set heatmaps.
train_cmap = LinearSegmentedColormap.from_list(
    "train_cmap",
    [COLORS["blue_light"], COLORS["blue_transparent"]],
    N=256,
)
test_cmap = LinearSegmentedColormap.from_list(
    "test_cmap",
    [COLORS["green_light"], COLORS["green_transparent"]],
    N=256,
)

print("✅ 颜色方案配置完成")

# Algorithm registry - kept consistent with the original document.
# Each entry holds the base estimator, its grid-search space (empty means no search),
# whether features are standardised first, and a display colour from the palette.
algo_configs = {
    'SVM': dict(
        model=SVC(probability=True, random_state=42),
        params={'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
        needs_scaling=True,
        color=COLORS["teal_transparent"],
    ),
    'Random Forest': dict(
        model=RandomForestClassifier(random_state=42),
        params={'n_estimators': [50, 100], 'max_depth': [None, 5, 10]},
        needs_scaling=False,
        color=COLORS["green_transparent"],
    ),
    'Logistic Regression': dict(
        model=LogisticRegression(random_state=42, max_iter=1000),
        params={'C': [0.1, 1, 10]},
        needs_scaling=True,
        color=COLORS["blue_transparent"],
    ),
    'KNN': dict(
        model=KNeighborsClassifier(),
        params={'n_neighbors': [3, 5, 7]},
        needs_scaling=True,
        color=COLORS["new"],
    ),
    'Decision Tree': dict(
        model=DecisionTreeClassifier(random_state=42),
        params={'max_depth': [None, 5, 10]},
        needs_scaling=False,
        color=COLORS["green_light"],
    ),
    'LDA': dict(
        model=LinearDiscriminantAnalysis(),
        params={},
        needs_scaling=False,
        color=COLORS["blue_transparent"],
    ),
}

print("✅ 算法配置完成")


In [None]:
# Confusion-matrix plotting helper; in-cell counts use a large bold font for legibility.
def plot_confusion_matrix(y_true, y_pred, labels, title, cmap, save_path_base=None):
    """
    Draw a confusion-matrix heatmap and optionally save it as TIFF and PDF.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class codes, forwarded to ``confusion_matrix``.
        Codes are taken to be ``0 .. len(labels) - 1``.
    labels : sequence of str
        Display names used as tick labels; position ``i`` names class code ``i``.
    title : str
        Figure title.
    cmap : colormap
        Colormap passed to ``seaborn.heatmap``.
    save_path_base : str, optional
        Path without extension. When given, ``<base>.tiff`` (300 dpi) and
        ``<base>.pdf`` are written and the figure is closed; otherwise the
        figure is shown with ``plt.show()``.

    Returns
    -------
    tuple
        ``(tiff_path, pdf_path)`` when files were saved, else ``(None, None)``.
    """
    # Derive the class codes from the label list instead of hard-coding [0, 1, 2],
    # so the matrix always has one row/column per supplied label.
    class_codes = list(range(len(labels)))
    cm = confusion_matrix(y_true, y_pred, labels=class_codes)

    # Explicit figure/axes (8x7 inches) rather than implicit pyplot state.
    fig, ax = plt.subplots(figsize=(8, 7))
    sns.heatmap(cm, annot=True, fmt='g', cmap=cmap,
                xticklabels=labels, yticklabels=labels,
                cbar=False, annot_kws={"size": 28, "color": "black", "weight": "bold"},
                linewidths=1.5, square=True, ax=ax)

    # Title at 18 pt.
    ax.set_title(title, fontsize=18, fontweight='bold', pad=30)

    # Axis labels at 18 pt.
    ax.set_xlabel('Predicted', fontsize=18, fontweight='bold', labelpad=20)
    ax.set_ylabel('Actual', fontsize=18, fontweight='bold', labelpad=20)

    # Tick labels at 16 pt; y tick labels are rotated to run along the axis.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0, ha='center', fontsize=16,
                       verticalalignment='top')
    ax.set_yticklabels(ax.get_yticklabels(), rotation=90, ha='center', fontsize=16,
                       verticalalignment='center')

    # Tick padding.
    ax.tick_params(axis='x', pad=15, length=6)
    ax.tick_params(axis='y', pad=12, length=6)

    # Keep the heatmap area square.
    ax.set_aspect('equal', adjustable='box')

    # Margins around the axes.
    fig.subplots_adjust(bottom=0.22, top=0.85, left=0.22, right=0.95)

    if not save_path_base:
        plt.show()
        return None, None

    tiff_path = f"{save_path_base}.tiff"
    pdf_path = f"{save_path_base}.pdf"
    try:
        # TIFF export.
        fig.savefig(tiff_path, format='tiff', dpi=300, bbox_inches='tight',
                    facecolor='white', pad_inches=0.3)
        # PDF export.
        fig.savefig(pdf_path, format='pdf', bbox_inches='tight',
                    facecolor='white', pad_inches=0.3)
    finally:
        # Release the figure even if writing fails, so repeated calls do not
        # accumulate open figures.
        plt.close(fig)

    print(f"  ✅ 保存: {os.path.basename(tiff_path)}, {os.path.basename(pdf_path)}")
    return tiff_path, pdf_path


In [None]:
def compute_comprehensive_metrics(y_true, y_pred, y_proba=None,
                                  class_names=('Health', 'Osteopenia', 'Osteoporosis')):
    """Compute accuracy, per-class and averaged precision/recall/F1, confusion matrix and AUC.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class codes; codes are taken to be
        ``0 .. len(class_names) - 1``.
    y_proba : array-like, optional
        Class-probability scores passed to ``roc_auc_score`` (one-vs-rest,
        macro average). When omitted, no AUC is computed.
    class_names : sequence of str, optional
        Display names for the classes, in class-code order. Defaults to the
        three classes used in this notebook.

    Returns
    -------
    dict
        Keys: ``accuracy``, ``precision_macro``, ``recall_macro``, ``f1_macro``,
        ``precision_weighted``, ``recall_weighted``, ``f1_weighted``,
        ``precision_per_class``, ``recall_per_class``, ``f1_per_class``,
        ``support_per_class``, ``confusion_matrix``, ``auc_score`` (``None`` when
        unavailable) and ``classification_report`` (text).
    """
    # Pin the label set so per-class arrays and the confusion matrix always have one
    # entry per class, even if a class is absent from this particular split.
    labels = list(range(len(class_names)))

    acc = accuracy_score(y_true, y_pred)

    # Per-class precision, recall, F1 and support.
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average=None, zero_division=0)

    # Macro and weighted averages: one call per averaging mode, unpacked once.
    prec_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average='macro', zero_division=0)
    prec_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average='weighted', zero_division=0)

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # AUC is best-effort: it stays None when no probabilities are given or when
    # roc_auc_score rejects the input.
    auc_score = None
    if y_proba is not None:
        try:
            auc_score = roc_auc_score(y_true, y_proba, multi_class='ovr', average='macro')
        except Exception as exc:
            # Reported with print because this notebook silences the warnings module globally.
            print(f"  [warn] AUC could not be computed: {exc}")

    return {
        'accuracy': acc,
        'precision_macro': prec_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
        'precision_weighted': prec_weighted,
        'recall_weighted': recall_weighted,
        'f1_weighted': f1_weighted,
        'precision_per_class': precision,
        'recall_per_class': recall,
        'f1_per_class': f1,
        'support_per_class': support,
        'confusion_matrix': cm,
        'auc_score': auc_score,
        'classification_report': classification_report(
            y_true, y_pred, labels=labels, target_names=list(class_names), zero_division=0)
    }


In [None]:
def train_algorithm(algo_name, config, X_train, X_test, y_train, y_test):
    """Train one algorithm and return its fitted model, predictions and metrics.

    Parameters
    ----------
    algo_name : str
        Display name, used in log output and stored in the result.
    config : dict
        Entry with keys ``'model'`` (unfitted estimator), ``'params'`` (grid for
        ``GridSearchCV``; empty means fit with the estimator's current settings)
        and ``'needs_scaling'`` (standardise features first).
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Targets for the training and test splits.

    Returns
    -------
    dict
        Keys: ``algorithm``, ``model``, ``scaler`` (``None`` when unscaled),
        ``best_params``, ``train_accuracy``, ``test_accuracy``, ``train_metrics``,
        ``test_metrics``, ``cv_scores``, ``cv_mean``, ``cv_std``, ``y_train``,
        ``y_test``, ``y_train_pred``, ``y_test_pred``, ``y_train_proba``,
        ``y_test_proba``.
    """
    print(f"\n--- 训练 {algo_name} ---")

    # Optional z-score standardisation, fit on the training split only.
    # NOTE(review): the scaler is fit on the whole training split before GridSearchCV and
    # cross_val_score run, so each CV validation fold has already contributed to the
    # scaling statistics. Wrapping scaler + model in a Pipeline would avoid this but
    # would change the returned 'model'/'scaler' objects.
    if config['needs_scaling']:
        print("  应用z-score标准化...")
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    else:
        scaler = None
        X_train_scaled = X_train.copy()
        X_test_scaled = X_test.copy()

    # Work on a clone so the estimator stored in the shared config is never fitted in
    # place; otherwise repeated calls with the same config return the same object.
    model = clone(config['model'])
    param_grid = config['params']

    # Hyper-parameter search (3-fold, accuracy) when a grid is supplied.
    if param_grid:
        print("  参数优化中...")
        grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train_scaled, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        print(f"  最佳参数: {best_params}")
    else:
        best_model = model
        best_model.fit(X_train_scaled, y_train)
        best_params = "Default"
        print("  使用默认参数")

    # Hard predictions.
    y_train_pred = best_model.predict(X_train_scaled)
    y_test_pred = best_model.predict(X_test_scaled)

    # Probabilities, only for estimators that expose them. Only "not supported"
    # errors are tolerated; genuine failures propagate.
    try:
        y_train_proba = best_model.predict_proba(X_train_scaled)
        y_test_proba = best_model.predict_proba(X_test_scaled)
    except (AttributeError, NotImplementedError):
        y_train_proba = None
        y_test_proba = None

    # Performance metrics.
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)

    train_metrics = compute_comprehensive_metrics(y_train, y_train_pred, y_train_proba)
    test_metrics = compute_comprehensive_metrics(y_test, y_test_pred, y_test_proba)

    # 5-fold stratified cross-validation of the selected model on the training split.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=skf, scoring='accuracy')

    print(f"  训练准确率: {train_acc:.4f}")
    print(f"  测试准确率: {test_acc:.4f}")
    # Compare with None explicitly so a legitimate AUC of 0.0 is still printed.
    if test_metrics['auc_score'] is not None:
        print(f"  测试AUC: {test_metrics['auc_score']:.4f}")
    print(f"  5折CV: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    return {
        'algorithm': algo_name,
        'model': best_model,
        'scaler': scaler,
        'best_params': best_params,
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'train_metrics': train_metrics,
        'test_metrics': test_metrics,
        'cv_scores': cv_scores,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'y_train': y_train,
        'y_test': y_test,
        'y_train_pred': y_train_pred,
        'y_test_pred': y_test_pred,
        'y_train_proba': y_train_proba,
        'y_test_proba': y_test_proba
    }


In [None]:
# ========== Train every configured algorithm ==========
print("\n>>> 开始训练和评估六个机器学习算法...")
class_labels = ['Health', 'Osteopenia', 'Osteoporosis']
all_results = []

# Walk the registry in insertion order and collect one result dict per algorithm.
for algo_name in algo_configs:
    config = algo_configs[algo_name]
    result = train_algorithm(algo_name, config, X_train, X_test, y_train, y_test)
    all_results += [result]

print("\n✅ 所有算法训练完成！")


In [None]:
# ========== Render confusion-matrix heatmaps ==========
print("\n>>> 生成混淆矩阵热图(TIFF和PDF格式)...")

# Make sure the destination folder exists before any figure is written.
output_dir = r"F:\作图目录20280825\signal_confusion_matrices"
os.makedirs(output_dir, exist_ok=True)

generated_files = []

for result in all_results:
    algo_name = result['algorithm']
    print(f"\n生成 {algo_name} 混淆矩阵...")

    # File names use the algorithm name with spaces replaced by underscores.
    file_stem = algo_name.replace(' ', '_')

    # Training-split matrix (blue colormap).
    train_title = f"Confusion Matrix(Train) - {algo_name}"
    train_path_base = os.path.join(output_dir, f"{file_stem}_signal_train")
    tiff_train, pdf_train = plot_confusion_matrix(
        y_true=result['y_train'],
        y_pred=result['y_train_pred'],
        labels=class_labels,
        title=train_title,
        cmap=train_cmap,
        save_path_base=train_path_base,
    )

    # Test-split matrix (green colormap).
    test_title = f"Confusion Matrix(Test) - {algo_name}"
    test_path_base = os.path.join(output_dir, f"{file_stem}_signal_test")
    tiff_test, pdf_test = plot_confusion_matrix(
        y_true=result['y_test'],
        y_pred=result['y_test_pred'],
        labels=class_labels,
        title=test_title,
        cmap=test_cmap,
        save_path_base=test_path_base,
    )

    # Record the four paths only when every one of them was produced.
    saved_paths = [tiff_train, pdf_train, tiff_test, pdf_test]
    if all(saved_paths):
        generated_files.extend(saved_paths)

print(f"\n✅ 共生成 {len(generated_files)} 个混淆矩阵图片文件")


In [None]:
# ========== Performance summary ==========
print("\n" + "="*80)
print("Signal1+Signal2特征分类性能汇总")
print("="*80)

# One summary row per trained algorithm, with metrics formatted to four decimals.
performance_data = []
for result in all_results:
    test_metrics = result['test_metrics']
    auc_value = test_metrics['auc_score']
    # Compare with None explicitly: an AUC of exactly 0.0 is falsy and would
    # otherwise be reported as "N/A".
    auc_str = f"{auc_value:.4f}" if auc_value is not None else "N/A"
    performance_data.append({
        '算法': result['algorithm'],
        '训练准确率': f"{result['train_accuracy']:.4f}",
        '测试准确率': f"{result['test_accuracy']:.4f}",
        '精确率(宏平均)': f"{test_metrics['precision_macro']:.4f}",
        '召回率(宏平均)': f"{test_metrics['recall_macro']:.4f}",
        'F1分数(宏平均)': f"{test_metrics['f1_macro']:.4f}",
        'AUC分数': auc_str,
        'CV均值': f"{result['cv_mean']:.4f}",
        'CV标准差': f"{result['cv_std']:.4f}"
    })

perf_df = pd.DataFrame(performance_data)
print(perf_df.to_string(index=False))

print("\n✅ 分析完成！所有混淆矩阵热图已保存，字体大小已优化避免重叠")
