In [None]:
# advanced_ml_benchmark.py
# Purpose:
# 1. Train classical ML models using the same train/test split as the 1D-CNN model.
# 2. Evaluate 10 classical ML models on the internal test set and an external CRLM validation set.
# 3. Save and visualize comparative results for both datasets.

print("--- Starting advanced machine learning benchmark ---")

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
# Import required models and tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
import xgboost as xgb

warnings.filterwarnings('ignore')

In [None]:
# --- Step 0: Global settings and path definitions ---
print("\n--- Step 0: Global settings and path definitions ---")

# Seed NumPy's global RNG so repeated runs of the benchmark are comparable.
SEED = 42
np.random.seed(SEED)

# Matplotlib font configuration (replace the font names if they are not
# installed on your system).
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Arial Unicode MS'],
    'axes.unicode_minus': False,
})

# Directory layout; point BASE_DIR at your own environment.
BASE_DIR = r"D:\结直肠癌肝转移Biomarker 诊断\新的策略\Autoencoder"
TRAIN_DATA_DIR = BASE_DIR
VALIDATION_DATA_DIR = os.path.join(BASE_DIR, "validation_datasets")
OUTPUT_DIR = os.path.join(BASE_DIR, "advanced_ml_benchmark_results")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Input files: the three training inputs live in TRAIN_DATA_DIR, the external
# cohort in VALIDATION_DATA_DIR (rename here if your files differ).
EXPRESSION_FILE, METADATA_FILE, FUNC_GENES_FILE = (
    os.path.join(TRAIN_DATA_DIR, file_name)
    for file_name in (
        "expression_data_combat_corrected.csv",
        "metadata_combined.csv",
        "functional_genes_620.txt",
    )
)
EXTERNAL_VALIDATION_FILE = os.path.join(VALIDATION_DATA_DIR, "dat_crlm.csv")

print(f"Results will be saved to: {OUTPUT_DIR}")

In [None]:
# --- Step 1: Load and prepare training and internal test data ---
print("\n--- Step 1: Load and prepare training and internal test data ---")
try:
    expression_data = pd.read_csv(EXPRESSION_FILE, index_col=0)
    metadata = pd.read_csv(METADATA_FILE, index_col=0)
    with open(FUNC_GENES_FILE, 'r', encoding='utf-8') as f:
        # One gene identifier per line; blank lines are skipped.
        functional_genes = [line.strip() for line in f if line.strip()]
    print("✅ Required training files loaded successfully")
except FileNotFoundError as e:
    print(f"❌ Error: training file not found - {e}")
    raise

# Keep only the functional genes that are columns of the expression matrix
# (the lookup below treats columns as genes and rows as samples).
available_functional_genes = [gene for gene in functional_genes if gene in expression_data.columns]
if not available_functional_genes:
    raise ValueError("No functional genes found in expression matrix. Check FUNC_GENES_FILE and EXPRESSION_FILE.")

if 'group' not in metadata.columns:
    raise ValueError("Metadata file must contain a 'group' column.")

X = expression_data[available_functional_genes]
y_raw = metadata['group'].reindex(X.index)

# Samples without a metadata label come back as NaN from reindex. Casting them
# to str would silently create a bogus "nan" class, so drop them explicitly.
has_label = y_raw.notna().to_numpy()
if not has_label.all():
    print(f"⚠️ Dropping {int((~has_label).sum())} samples with no 'group' label in metadata")
    X = X.loc[has_label]
    y_raw = y_raw.loc[has_label]
if X.empty:
    raise ValueError("No labelled samples left after aligning expression data with metadata.")

# Encode labels so that metastasis=1 and the other class=0.
# LabelEncoder assigns codes in sorted (alphabetical) order of the class names,
# which would give e.g. 'metastasis' -> 0 and 'primary' -> 1, the opposite of
# the external validation labels built in Step 2. The positive class is
# therefore chosen with the same 'metastasis' substring rule used there.
y_labels = y_raw.astype(str)
label_encoder = LabelEncoder()
label_encoder.fit(y_labels)  # kept for class discovery; codes come from label_mapping
class_names = [str(c) for c in label_encoder.classes_]
if len(class_names) != 2:
    raise ValueError(f"Expected exactly 2 classes in 'group', found {len(class_names)}: {class_names}")

metastasis_like = [c for c in class_names if 'metastasis' in c.lower()]
if len(metastasis_like) == 1:
    positive_class = metastasis_like[0]
else:
    # Cannot tell which class is the metastasis class: keep LabelEncoder's
    # alphabetical encoding (second class -> 1) and make the ambiguity visible.
    positive_class = class_names[1]
    print("⚠️ Could not identify a unique 'metastasis' class in 'group'; "
          "using alphabetical label order. Verify it matches the external 'status' encoding.")
negative_class = class_names[0] if positive_class == class_names[1] else class_names[1]
label_mapping = {negative_class: 0, positive_class: 1}
y = y_labels.map(label_mapping).to_numpy(dtype=np.int64)
print(f"Label encoding mapping: {label_mapping}")

# IMPORTANT: use the same data split strategy as the CNN model
X_train, X_test_internal, y_train, y_test_internal = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)
print(f"Train/internal test split: {len(X_train)} training samples, {len(X_test_internal)} internal test samples")

# Standardize features: fit on the training set only, then apply the same
# transform to the held-out data so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_internal_scaled = scaler.transform(X_test_internal)
print("✅ Training and internal test data are ready")

In [None]:
# --- Step 2: Load and prepare external validation data ---
print("\n--- Step 2: Load and prepare external validation data ---")
try:
    dat_crlm = pd.read_csv(EXTERNAL_VALIDATION_FILE, index_col=0)
    print(f"✅ External CRLM validation set loaded: {dat_crlm.shape}")
except FileNotFoundError as e:
    print(f"❌ Error: external validation file not found - {e}")
    raise

# Fail fast on a missing label column before doing any feature work.
if 'status' not in dat_crlm.columns:
    raise ValueError("External validation file must contain a 'status' column.")

# Work out which training features the external cohort lacks. With zero overlap
# the models would be scored on an all-constant matrix, so treat that as an
# error (mirrors the gene-overlap check on the training data).
missing_genes = [g for g in available_functional_genes if g not in dat_crlm.columns]
if len(missing_genes) == len(available_functional_genes):
    raise ValueError(
        "No functional genes found in external validation data. "
        "Check EXTERNAL_VALIDATION_FILE and FUNC_GENES_FILE."
    )
if missing_genes:
    print(f"⚠️ {len(missing_genes)} of {len(available_functional_genes)} functional genes are missing "
          f"from the external set and will be filled with 0.0")

# Align columns to the training feature order; genes absent from the external
# file are filled with 0.0.
# NOTE(review): 0.0 is on the raw expression scale, so after StandardScaler it
# becomes (0 - train_mean) / train_std rather than a neutral value — confirm
# this matches how the CNN pipeline handles missing genes.
X_val_external = dat_crlm.reindex(columns=available_functional_genes, fill_value=0.0)

# Prepare external validation labels (binary: 1 if 'metastasis' appears in the
# status text, case-insensitively; otherwise 0).
y_val_external = dat_crlm['status'].apply(lambda x: 1 if 'metastasis' in str(x).lower() else 0).astype(int)
n_positive = int(y_val_external.sum())
print(f"External labels: {n_positive} metastasis, {len(y_val_external) - n_positive} non-metastasis")

# IMPORTANT: use the scaler fitted on the training set to transform external data
X_val_external_scaled = scaler.transform(X_val_external)
print("✅ External validation data are ready")

In [None]:
# --- Step 3: Define and evaluate 10 machine learning models ---
print("\n--- Step 3: Training and evaluating 10 models ---")

# Candidate classifiers keyed by their display name. Dict insertion order is
# the order in which they are fitted and reported. Every estimator that accepts
# a seed receives SEED; all other hyperparameters are library defaults unless
# written out below.
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        random_state=SEED,
    ),
    # probability=True so the SVM exposes predict_proba like the other models.
    "Support Vector Machine": SVC(
        probability=True,
        random_state=SEED,
    ),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=SEED),
    "AdaBoost": AdaBoostClassifier(random_state=SEED),
    "Gaussian Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "LightGBM": lgb.LGBMClassifier(random_state=SEED),
    "XGBoost": xgb.XGBClassifier(
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=SEED,
    ),
}

# Per-model metric dicts are collected here, one list per evaluation cohort.
internal_results, external_results = [], []

def evaluate_model(y_true, y_pred_proba, y_pred):
    """Compute binary classification metrics for one model on one dataset.

    Args:
        y_true: True labels encoded as 0 (negative) / 1 (positive).
        y_pred_proba: Continuous score per sample for the positive class;
            only used for the AUC.
        y_pred: Hard 0/1 predictions per sample.

    Returns:
        dict with float values under the keys "AUC", "Accuracy",
        "Sensitivity", "Specificity", "Precision" and "Recall". AUC is NaN
        when ``y_true`` contains a single class; sensitivity / specificity are
        NaN when there are no positive / negative samples respectively.
    """
    # ROC AUC is undefined (and sklearn raises) when only one class is present.
    has_both_classes = len(np.unique(y_true)) > 1
    auc = roc_auc_score(y_true, y_pred_proba) if has_both_classes else np.nan
    accuracy = accuracy_score(y_true, y_pred)

    # Pin the label set so the matrix is always 2x2. Without it, a cohort in
    # which truth and predictions share a single class collapses to 1x1 and
    # both rates would be reported as NaN even though one is well defined.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else np.nan
    specificity = tn / (tn + fp) if (tn + fp) > 0 else np.nan

    # zero_division=0 reports 0 instead of warning when nothing is predicted
    # (precision) or nothing is actually positive (recall).
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    return {
        "AUC": float(auc),
        "Accuracy": float(accuracy),
        "Sensitivity": float(sensitivity),
        "Specificity": float(specificity),
        "Precision": float(precision),
        "Recall": float(recall)
    }

def _positive_class_scores(model, features):
    """Return one continuous score per sample for the positive class (label 1).

    Preference order: ``predict_proba``, then ``decision_function`` squashed
    through a logistic sigmoid, then hard predictions as a last resort.
    """
    if hasattr(model, "predict_proba"):
        probabilities = model.predict_proba(features)
        # Pick the column that belongs to label 1 instead of assuming it is
        # the second one; fall back to column 1 if classes_ is unavailable.
        class_labels = list(getattr(model, "classes_", [0, 1]))
        positive_column = class_labels.index(1) if 1 in class_labels else 1
        return probabilities[:, positive_column]
    if hasattr(model, "decision_function"):
        # AUC only depends on the ranking; the sigmoid just maps margins to (0, 1).
        margins = model.decision_function(features)
        return 1 / (1 + np.exp(-margins))
    # No continuous output available: say so rather than fabricating constant
    # scores, and rank by the hard predictions instead.
    print(f"⚠️ {type(model).__name__} has no predict_proba/decision_function; "
          f"AUC is computed from hard predictions")
    return np.asarray(model.predict(features), dtype=float)


# Each fitted model is scored on both cohorts with exactly the same procedure.
_evaluation_sets = (
    (X_test_internal_scaled, y_test_internal, internal_results),
    (X_val_external_scaled, y_val_external, external_results),
)

for name, model in models.items():
    print(f"--- Processing: {name} ---")
    # Fit on the (scaled) training split only.
    model.fit(X_train_scaled, y_train)

    for _features, _labels, _results in _evaluation_sets:
        _scores = _positive_class_scores(model, _features)
        _predictions = model.predict(_features)
        _metrics = evaluate_model(_labels, _scores, _predictions)
        _metrics["Model"] = name
        _results.append(_metrics)

# Rank models by AUC, best first (rows with NaN AUC are placed last by pandas).
internal_df = pd.DataFrame(internal_results).sort_values(by="AUC", ascending=False).reset_index(drop=True)
external_df = pd.DataFrame(external_results).sort_values(by="AUC", ascending=False).reset_index(drop=True)


In [None]:
# --- Step 4: Display and save results ---
# Both result tables get the same treatment: a banner, the table itself on
# stdout, then a CSV copy (without the row index) in OUTPUT_DIR.
_banner_rule = "=" * 60
for _heading, _table, _csv_name in (
    ("Benchmark results on internal test set", internal_df, "internal_test_set_benchmark.csv"),
    ("Benchmark results on external CRLM validation set", external_df, "external_crlm_set_benchmark.csv"),
):
    print("\n" + _banner_rule)
    print(_heading)
    print(_banner_rule)
    print(_table)
    _table.to_csv(os.path.join(OUTPUT_DIR, _csv_name), index=False)

print(f"\n✅ All results saved to: {OUTPUT_DIR}")

In [None]:
# --- Step 5: Visualize results ---
print("\n--- Step 5: Visualizing model performance comparisons ---")

fig, axes = plt.subplots(2, 1, figsize=(12, 16))
fig.suptitle("Machine Learning Model Performance Comparison", fontsize=20, fontweight='bold')

# One horizontal AUC bar chart per cohort: internal test set on top, external
# CRLM validation set below. Each tuple is (axis, results, palette, title).
_panels = (
    (axes[0], internal_df, "Blues_r", "AUC on Internal Test Set"),
    (axes[1], external_df, "Greens_r", "AUC on External CRLM Validation Set"),
)
for _ax, _results_df, _palette, _panel_title in _panels:
    sns.barplot(ax=_ax, x="AUC", y="Model", data=_results_df, palette=_palette)
    _ax.set_title(_panel_title, fontsize=16)
    _ax.set_xlabel("AUC (Area Under Curve)", fontsize=12)
    _ax.set_ylabel("Model", fontsize=12)
    _ax.set_xlim(0, 1.0)
    # Write the numeric AUC just to the right of the end of each bar.
    for _row, _auc in enumerate(_results_df["AUC"]):
        _ax.text(_auc + 0.01, _row, f"{_auc:.4f}", va="center")

# Leave the top 4% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plot_output_file = os.path.join(OUTPUT_DIR, "benchmark_comparison_plots.png")
plt.savefig(plot_output_file, dpi=300)
print(f"✅ Performance comparison plot saved to: {plot_output_file}")
plt.show()

print("\n--- Advanced benchmark completed ---")