In [None]:
import os
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from sklearn.base import clone
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_recall_fscore_support,
                           classification_report, roc_auc_score)
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Suppress every Python warning for the rest of the session so the printed
# report is not interleaved with library warnings.
warnings.filterwarnings('ignore')

# Figure typography: Arial as the sans-serif face, and no Unicode minus glyph
# on axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['Arial'],
    'axes.unicode_minus': False,
})

print("Starting Signal1+Signal2 feature classification analysis...")
print("=" * 60)

## Data Loading and Preprocessing

In [None]:
print("\n>>> Loading osteoporosis data...")
data_file = r"F:\作图目录20280825\骨质疏松数据.xlsx"
# header=1: the column names are taken from the second row of the sheet;
# only spreadsheet columns B..H are read and then renamed below.
df = pd.read_excel(data_file, header=1, usecols='B:H')
cols = ['signal_1', 'sost_1', 'signal_2', 'sost_2', 'sost_mean', 'l1_4', 'left_hip']
df.columns = cols

print(f"Data dimensions: {df.shape}")

# Add class labels.
# Labels are assigned purely by row position (Health block first, then
# Osteopenia, then Osteoporosis), so the block sizes must match the row count
# exactly. Only the two known layouts are accepted; any other row count cannot
# be labelled by position and is rejected with an explicit message.
class_counts_by_rows = {
    103: (35, 33, 35),
    104: (35, 33, 36),
}
n_rows = len(df)
if n_rows not in class_counts_by_rows:
    raise ValueError(
        f"Unexpected number of rows ({n_rows}); expected one of "
        f"{sorted(class_counts_by_rows)} to assign class labels by position."
    )
n_health, n_osteopenia, n_osteoporosis = class_counts_by_rows[n_rows]
df['class'] = (['Health'] * n_health
               + ['Osteopenia'] * n_osteopenia
               + ['Osteoporosis'] * n_osteoporosis)
print(f"{n_rows} rows: Health({n_health}) + Osteopenia({n_osteopenia}) + Osteoporosis({n_osteoporosis})")

print("Class distribution:")
print(df['class'].value_counts())

# Prepare feature data - only use signal_1 and signal_2
X = df[['signal_1', 'signal_2']].values
# Integer-encode the class names: 0 = Health, 1 = Osteopenia, 2 = Osteoporosis
y = df['class'].map({'Health': 0, 'Osteopenia': 1, 'Osteoporosis': 2})

print(f"\nFeature dimensions: {X.shape}")
print("Feature names: Signal1(COL.), Signal2(FL.)")
print("Target distribution:", y.value_counts().sort_index())

# Split data: 75% train / 25% test, stratified so both parts keep the class
# proportions; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

## Color Configuration

In [None]:
print("\n>>> Setting up color scheme...")

# Named palette (hex RGB strings) used by the figures in this notebook.
COLORS = dict(
    green_light="#C9DCC4",
    green_transparent="#94C3AA",
    blue_light="#D7E8F3",
    blue_transparent="#8ec1dc",
    teal_transparent="#57B1AB",
    new="#DF9D96",
)

# Two-stop gradients running from the light to the darker shade:
# blue for the training heatmaps, green for the test heatmaps.
train_gradient = [COLORS["blue_light"], COLORS["blue_transparent"]]
test_gradient = [COLORS["green_light"], COLORS["green_transparent"]]
train_cmap = LinearSegmentedColormap.from_list("train_cmap", train_gradient, N=256)
test_cmap = LinearSegmentedColormap.from_list("test_cmap", test_gradient, N=256)

print("✅ Color scheme configured")

# Algorithm configuration
def _algo_entry(model, params, needs_scaling, color):
    """Bundle one classifier with its search grid, scaling flag and plot colour."""
    return {
        'model': model,
        'params': params,
        'needs_scaling': needs_scaling,
        'color': color,
    }


# One entry per classifier, keyed by display name.
#   model          - unfitted estimator used as the starting point
#   params         - hyper-parameter grid; an empty dict means "no search"
#   needs_scaling  - whether features are z-score standardised before fitting
#   color          - hex colour associated with the algorithm
# NOTE(review): 'Logistic Regression' and 'LDA' share the colour '#8ec1dc' -
# confirm this is intended if the colours are used to tell algorithms apart.
algo_configs = {
    'SVM': _algo_entry(
        SVC(probability=True, random_state=42),
        {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
        True,
        '#57B1AB',
    ),
    'Random Forest': _algo_entry(
        RandomForestClassifier(random_state=42),
        {'n_estimators': [50, 100], 'max_depth': [None, 5, 10]},
        False,
        '#94C3AA',
    ),
    'Logistic Regression': _algo_entry(
        LogisticRegression(random_state=42, max_iter=1000),
        {'C': [0.1, 1, 10]},
        True,
        '#8ec1dc',
    ),
    'KNN': _algo_entry(
        KNeighborsClassifier(),
        {'n_neighbors': [3, 5, 7]},
        True,
        '#DF9D96',
    ),
    'Decision Tree': _algo_entry(
        DecisionTreeClassifier(random_state=42),
        {'max_depth': [None, 5, 10]},
        False,
        '#C9DCC4',
    ),
    'LDA': _algo_entry(
        LinearDiscriminantAnalysis(),
        {},
        False,
        '#8ec1dc',
    ),
}

print("✅ Algorithm configuration complete")

## Confusion Matrix Function

In [None]:
def plot_confusion_matrix(y_true, y_pred, labels, title, cmap, save_path_base=None):
    """Draw a confusion-matrix heatmap and either save it to disk or show it.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class indices. Classes are expected to be encoded
        as consecutive integers 0 .. len(labels) - 1.
    labels : sequence of str
        Display names of the classes in class-index order. Its length also
        fixes the size of the matrix, so tick labels always match the cells.
    title : str
        Figure title.
    cmap : colormap
        Colormap handed to ``seaborn.heatmap``.
    save_path_base : str or None
        Path without extension. When given, the figure is written to
        ``<save_path_base>.tiff`` (300 dpi) and ``<save_path_base>.pdf`` and
        then closed; when falsy, the figure is shown instead.

    Returns
    -------
    tuple
        ``(tiff_path, pdf_path)`` when the figure was saved, otherwise
        ``(None, None)``.
    """
    # Derive the class indices from `labels` instead of hard-coding three classes.
    class_ids = list(range(len(labels)))
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    # Explicit figure/axes handles: nothing below depends on pyplot's
    # "current figure", and exactly this figure can be closed afterwards.
    fig, ax = plt.subplots(figsize=(8, 7))
    sns.heatmap(cm, annot=True, fmt='g', cmap=cmap,
                xticklabels=labels, yticklabels=labels,
                cbar=False, annot_kws={"size": 28, "color": "black", "weight": "bold"},
                linewidths=1.5, square=True, ax=ax)

    ax.set_title(title, fontsize=18, fontweight='bold', pad=30)
    ax.set_xlabel('Predicted', fontsize=18, fontweight='bold', labelpad=20)
    ax.set_ylabel('Actual', fontsize=18, fontweight='bold', labelpad=20)

    # Horizontal x tick labels, vertical y tick labels, both centred on their cell
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0, ha='center', fontsize=16,
                       verticalalignment='top')
    ax.set_yticklabels(ax.get_yticklabels(), rotation=90, ha='center', fontsize=16,
                       verticalalignment='center')

    ax.tick_params(axis='x', pad=15, length=6)
    ax.tick_params(axis='y', pad=12, length=6)

    ax.set_aspect('equal', adjustable='box')
    fig.subplots_adjust(bottom=0.22, top=0.85, left=0.22, right=0.95)

    if not save_path_base:
        plt.show()
        return None, None

    tiff_path = f"{save_path_base}.tiff"
    pdf_path = f"{save_path_base}.pdf"
    try:
        fig.savefig(tiff_path, format='tiff', dpi=300, bbox_inches='tight',
                    facecolor='white', pad_inches=0.3)
        fig.savefig(pdf_path, format='pdf', bbox_inches='tight',
                    facecolor='white', pad_inches=0.3)
    finally:
        # Release the figure even when writing a file fails, so repeated calls
        # in a loop do not accumulate open figures.
        plt.close(fig)

    print(f"  ✅ Saved: {os.path.basename(tiff_path)}, {os.path.basename(pdf_path)}")
    return tiff_path, pdf_path

## Metrics Computation Function

In [None]:
def compute_comprehensive_metrics(y_true, y_pred, y_proba=None,
                                  class_ids=(0, 1, 2),
                                  class_names=('Health', 'Osteopenia', 'Osteoporosis')):
    """Compute accuracy, precision/recall/F1, confusion matrix, AUC and a text report.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class indices.
    y_proba : array-like or None
        Per-class probabilities; when given, a macro one-vs-rest ROC AUC is
        attempted.
    class_ids : sequence of int
        Class indices to report on, in order. Passing them explicitly keeps the
        per-class arrays and the confusion matrix at a fixed shape even when a
        class is missing from ``y_true``/``y_pred``.
    class_names : sequence of str
        Display names matching ``class_ids``, used in the classification report.

    Returns
    -------
    dict
        Keys: ``accuracy``; ``precision_macro``, ``recall_macro``, ``f1_macro``;
        ``precision_weighted``, ``recall_weighted``, ``f1_weighted``;
        ``precision_per_class``, ``recall_per_class``, ``f1_per_class``,
        ``support_per_class`` (arrays ordered like ``class_ids``);
        ``confusion_matrix``; ``auc_score`` (float, or None when not computed);
        ``classification_report`` (str).
    """
    class_ids = list(class_ids)
    class_names = list(class_names)

    acc = accuracy_score(y_true, y_pred)

    # Per-class scores, one entry per class id
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=class_ids, average=None, zero_division=0)

    # One call per averaging mode; each returns (precision, recall, f1, support)
    prec_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=class_ids, average='macro', zero_division=0)
    prec_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=class_ids, average='weighted', zero_division=0)

    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    auc_score = None
    if y_proba is not None:
        try:
            auc_score = roc_auc_score(y_true, y_proba, multi_class='ovr', average='macro')
        except Exception as exc:
            # AUC stays best-effort (None on failure), but the reason is
            # reported instead of being silently discarded.
            print(f"  AUC could not be computed: {exc}")

    return {
        'accuracy': acc,
        'precision_macro': prec_macro,
        'recall_macro': recall_macro,
        'f1_macro': f1_macro,
        'precision_weighted': prec_weighted,
        'recall_weighted': recall_weighted,
        'f1_weighted': f1_weighted,
        'precision_per_class': precision,
        'recall_per_class': recall,
        'f1_per_class': f1,
        'support_per_class': support,
        'confusion_matrix': cm,
        'auc_score': auc_score,
        'classification_report': classification_report(
            y_true, y_pred, labels=class_ids, target_names=class_names, zero_division=0)
    }

## Algorithm Training Function

In [None]:
def train_algorithm(algo_name, config, X_train, X_test, y_train, y_test):
    """Tune, fit and evaluate a single classifier.

    Parameters
    ----------
    algo_name : str
        Display name used in log output and in the result.
    config : dict
        Entry with the keys ``'model'`` (unfitted estimator), ``'params'``
        (hyper-parameter grid; empty means no search) and ``'needs_scaling'``
        (bool: z-score standardise the features first).
    X_train, X_test : array-like
        Feature matrices of the training and test split.
    y_train, y_test : array-like
        Class indices of the training and test split.

    Returns
    -------
    dict
        ``algorithm``, ``model`` (fitted classifier; it expects inputs already
        transformed by ``scaler`` when that is not None), ``scaler`` (fitted
        StandardScaler or None), ``best_params`` (dict, or the string
        ``"Default"`` when no grid was searched), ``train_accuracy``,
        ``test_accuracy``, ``train_metrics``, ``test_metrics``, ``cv_scores``,
        ``cv_mean``, ``cv_std``, and the labels / predictions / probabilities
        of both splits.
    """
    print(f"\n--- Training {algo_name} ---")

    # Scaler and classifier live in one pipeline so the scaler is re-fitted on
    # the training part of every CV fold. Fitting it once on all of X_train
    # before cross-validating would let validation-fold statistics leak into
    # model selection and into the reported CV score.
    steps = []
    if config['needs_scaling']:
        print(f"  Applying z-score standardization...")
        steps.append(('scaler', StandardScaler()))
    # clone() leaves the estimator stored in `config` unfitted and reusable.
    steps.append(('model', clone(config['model'])))
    pipeline = Pipeline(steps)

    # Grid keys must address the pipeline step that holds the classifier.
    step_prefix = 'model__'
    param_grid = {f"{step_prefix}{name}": values
                  for name, values in config['params'].items()}

    # Parameter optimization
    if param_grid:
        print(f"  Parameter optimization...")
        grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_pipeline = grid_search.best_estimator_
        # Report the parameters under their plain names, without the step prefix
        best_params = {name[len(step_prefix):]: value
                       for name, value in grid_search.best_params_.items()}
        print(f"  Best parameters: {best_params}")
    else:
        best_pipeline = pipeline.fit(X_train, y_train)
        best_params = "Default"
        print("  Using default parameters")

    best_model = best_pipeline.named_steps['model']
    scaler = best_pipeline.named_steps.get('scaler')  # None when no scaling step

    # Predictions (the pipeline applies the fitted scaler, if any, itself)
    y_train_pred = best_pipeline.predict(X_train)
    y_test_pred = best_pipeline.predict(X_test)

    # Probability predictions - only for classifiers that expose predict_proba
    if hasattr(best_pipeline, 'predict_proba'):
        y_train_proba = best_pipeline.predict_proba(X_train)
        y_test_proba = best_pipeline.predict_proba(X_test)
    else:
        y_train_proba = None
        y_test_proba = None

    # Performance metrics
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)

    train_metrics = compute_comprehensive_metrics(y_train, y_train_pred, y_train_proba)
    test_metrics = compute_comprehensive_metrics(y_test, y_test_pred, y_test_proba)

    # Cross validation of the selected configuration on the training split.
    # NOTE: the same data already chose the hyper-parameters above, so this is
    # not a nested CV and the estimate is somewhat optimistic.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(best_pipeline, X_train, y_train, cv=skf, scoring='accuracy')

    print(f"  Training accuracy: {train_acc:.4f}")
    print(f"  Test accuracy: {test_acc:.4f}")
    # `is not None` rather than truthiness: an AUC of 0.0 is still a valid score
    if test_metrics['auc_score'] is not None:
        print(f"  Test AUC: {test_metrics['auc_score']:.4f}")
    print(f"  5-fold CV: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    return {
        'algorithm': algo_name,
        'model': best_model,
        'scaler': scaler,
        'best_params': best_params,
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'train_metrics': train_metrics,
        'test_metrics': test_metrics,
        'cv_scores': cv_scores,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'y_train': y_train,
        'y_test': y_test,
        'y_train_pred': y_train_pred,
        'y_test_pred': y_test_pred,
        'y_train_proba': y_train_proba,
        'y_test_proba': y_test_proba
    }

## Train All Algorithms

In [None]:
print("\n>>> Training and evaluating six machine learning algorithms...")
# Display names of the classes, in class-index order (0, 1, 2)
class_labels = ['Health', 'Osteopenia', 'Osteoporosis']

# One result dict per configured algorithm, in configuration order
all_results = []
for algo_name, config in algo_configs.items():
    all_results.append(
        train_algorithm(algo_name, config, X_train, X_test, y_train, y_test)
    )

print(f"\n✅ All algorithms training completed!")

## Generate Confusion Matrix Heatmaps

In [None]:
print("\n>>> Generating confusion matrix heatmaps (TIFF and PDF formats)...")

output_dir = r"F:\作图目录20280825\signal_confusion_matrices"
os.makedirs(output_dir, exist_ok=True)

generated_files = []

for result in all_results:
    algo_name = result['algorithm']
    print(f"\nGenerating {algo_name} confusion matrices...")

    # Spaces in the algorithm name become underscores in the file names
    file_stem = algo_name.replace(' ', '_')

    # Training split: heatmap drawn with the training colormap
    train_title = f"Confusion Matrix(Train) - {algo_name}"
    train_path_base = os.path.join(output_dir, f"{file_stem}_signal_train")
    tiff_train, pdf_train = plot_confusion_matrix(
        result['y_train'], result['y_train_pred'],
        class_labels, train_title, train_cmap, train_path_base
    )

    # Test split: heatmap drawn with the test colormap
    test_title = f"Confusion Matrix(Test) - {algo_name}"
    test_path_base = os.path.join(output_dir, f"{file_stem}_signal_test")
    tiff_test, pdf_test = plot_confusion_matrix(
        result['y_test'], result['y_test_pred'],
        class_labels, test_title, test_cmap, test_path_base
    )

    # Record the four paths only when every one of them was returned
    saved_paths = [tiff_train, pdf_train, tiff_test, pdf_test]
    if all(saved_paths):
        generated_files.extend(saved_paths)

print(f"\n✅ Generated {len(generated_files)} confusion matrix files")

## Performance Summary

In [None]:
print("\n" + "=" * 80)
print("Signal1+Signal2 Feature Classification Performance Summary")
print("=" * 80)

# One pre-formatted row per algorithm; all scores except the train accuracy
# and the CV columns come from the test split.
performance_data = []
for result in all_results:
    test_metrics = result['test_metrics']
    auc_value = test_metrics['auc_score']
    # Test against None rather than truthiness: an AUC of exactly 0.0 is a
    # real score and must not be reported as "N/A".
    auc_str = f"{auc_value:.4f}" if auc_value is not None else "N/A"
    performance_data.append({
        'Algorithm': result['algorithm'],
        'Train Accuracy': f"{result['train_accuracy']:.4f}",
        'Test Accuracy': f"{result['test_accuracy']:.4f}", 
        'Precision (Macro)': f"{test_metrics['precision_macro']:.4f}",
        'Recall (Macro)': f"{test_metrics['recall_macro']:.4f}",
        'F1 Score (Macro)': f"{test_metrics['f1_macro']:.4f}",
        'AUC Score': auc_str,
        'CV Mean': f"{result['cv_mean']:.4f}",
        'CV Std': f"{result['cv_std']:.4f}"
    })

perf_df = pd.DataFrame(performance_data)
# Plain-text table without the integer index column
print(perf_df.to_string(index=False))

print("\n✅ Analysis complete! All confusion matrix heatmaps saved with optimized font sizes")