In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# 載入處理好的資料
df = pd.read_csv("../data/processed/nhanes_2013_2020_ckd_cleaned.csv")


# 要排除的變數（實際存在於資料中的才會排除）
exclude_cols = [
    "SEQN", "id", "ckd",               # 標籤與識別碼
    "URXUMA", "URXUCR", "ACR",         # 尿液衍生欄位（洩漏標籤）
    "URDACT", "URDUCRLC", "URDUMALC", "SDDSRVYR"   # 尿液其他延伸欄位（註解欄位）
]
exclude_cols = [col for col in exclude_cols if col in df.columns]

# 分出 X 和 y
X = df.drop(columns=exclude_cols)
y = df["ckd"]

# 顯示特徵欄位前 10 項與總數
feature_list = X.columns.tolist()
feature_count = len(feature_list)
print(feature_list[:10], f"...（共 {feature_count} 個特徵）")



# 分類欄位（根據 pandas 自動推斷 object 或 category）
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# 數值欄位（剩下的）
numeric_cols = [col for col in X.columns if col not in categorical_cols]

# 建立 ColumnTransformer：對數值欄位做標準化、類別欄位做 one-hot
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols)
    ]
)

# 分訓練 / 測試集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 執行轉換
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# 輸出轉換後的形狀資訊
X_train_transformed.shape, X_test_transformed.shape


['SDMVSTRA', 'DMDEDUC2', 'SDMVPSU', 'year_group', 'RIDEXMON', 'SIAINTRP', 'FIALANG', 'RIDRETH1', 'RIAGENDR', 'RIDRETH3'] ...（共 59 個特徵）


((8620, 61), (2155, 61))

In [8]:
# Logistic model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# 建立並訓練 Logistic Regression（加 class_weight）
logit_model = LogisticRegression(
    penalty='l2',
    C=5.0,
    class_weight={0: 0.8, 1: 1.2},
    solver='liblinear',
    max_iter=1000,
    random_state=42
)

logit_model.fit(X_train_transformed, y_train)

# 預測
y_pred = logit_model.predict(X_test_transformed)
y_pred_proba = logit_model.predict_proba(X_test_transformed)[:, 1]

# 評估
print("📊 Classification Report (Logistic Regression):")
print(classification_report(y_test, y_pred))

print("🔲 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(f"🎯 ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")


📊 Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.72      0.69      0.71      1293
           1       0.57      0.61      0.59       862

    accuracy                           0.66      2155
   macro avg       0.65      0.65      0.65      2155
weighted avg       0.66      0.66      0.66      2155

🔲 Confusion Matrix:
[[893 400]
 [340 522]]
🎯 ROC AUC Score: 0.7091


In [9]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# 建立 SVM（RBF Kernel）
svm_rbf_model = SVC(
    kernel='rbf',
    C=1.0,              # 控制誤差容忍度（大一點 → 越少錯誤，但可能過擬合）
    gamma='scale',      # 'scale' 自動計算 gamma，或改數值如 0.01、0.05 試
    class_weight='balanced',
    probability=True,   # 計算 AUC 要打開
    random_state=42
)

# 訓練
svm_rbf_model.fit(X_train_transformed, y_train)

# 預測
y_pred = svm_rbf_model.predict(X_test_transformed)
y_pred_proba = svm_rbf_model.predict_proba(X_test_transformed)[:, 1]

# 評估
print("📊 Classification Report (SVM RBF):")
print(classification_report(y_test, y_pred))
print("🔲 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print(f"🎯 ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")


📊 Classification Report (SVM RBF):
              precision    recall  f1-score   support

           0       0.74      0.72      0.73      1293
           1       0.60      0.62      0.61       862

    accuracy                           0.68      2155
   macro avg       0.67      0.67      0.67      2155
weighted avg       0.68      0.68      0.68      2155

🔲 Confusion Matrix:
[[932 361]
 [331 531]]
🎯 ROC AUC Score: 0.7238


In [10]:
# XGBoost
import xgboost
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
print(xgboost.__version__)
print(XGBClassifier.__module__)
# ⏳ 計算 scale_pos_weight
neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
scale = neg / pos

# ✅ 建立強化版 XGBoost 模型
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    scale_pos_weight=scale,
    max_depth=4,
    learning_rate=0.03,
    n_estimators=400,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)


xgb_model.fit(X_train_transformed, y=y_train)

# 🔍 預測
y_pred = xgb_model.predict(X_test_transformed)
y_prob = xgb_model.predict_proba(X_test_transformed)[:, 1]

# 📊 評估報告
print("📊 Classification Report (XGBoost):")
print(classification_report(y_test, y_pred))

# 🔲 混淆矩陣
print("🔲 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 🎯 ROC AUC
roc_auc = roc_auc_score(y_test, y_prob)
print(f"🎯 ROC AUC Score: {roc_auc:.4f}")


2.1.4
xgboost.sklearn


Parameters: { "use_label_encoder" } are not used.



📊 Classification Report (XGBoost):
              precision    recall  f1-score   support

           0       0.74      0.72      0.73      1293
           1       0.59      0.62      0.60       862

    accuracy                           0.68      2155
   macro avg       0.66      0.67      0.66      2155
weighted avg       0.68      0.68      0.68      2155

🔲 Confusion Matrix:
[[925 368]
 [331 531]]
🎯 ROC AUC Score: 0.7323


In [11]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# 建立並訓練 Random Forest（注意 class_weight）
rf_model = RandomForestClassifier(
    n_estimators=500,
    max_depth=8,
    class_weight={0: 0.7, 1: 1.3},
    min_samples_leaf=10,
    min_samples_split=20,
    random_state=42
)
rf_model.fit(X_train_transformed, y_train)

# 預測
y_pred = rf_model.predict(X_test_transformed)
y_pred_proba = rf_model.predict_proba(X_test_transformed)

# 評估指標
print("📊 Classification Report (Random Forest):")
print(classification_report(y_test, y_pred))

print("🔲 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(f"🎯 ROC AUC Score: {roc_auc_score(y_test, y_pred_proba[:, 1]):.4f}")


📊 Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       0.75      0.63      0.68      1293
           1       0.55      0.69      0.61       862

    accuracy                           0.65      2155
   macro avg       0.65      0.66      0.65      2155
weighted avg       0.67      0.65      0.66      2155

🔲 Confusion Matrix:
[[812 481]
 [268 594]]
🎯 ROC AUC Score: 0.7284


In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import tensorflow.keras.backend as K

# 讀資料
df = pd.read_csv("../data/processed/nhanes_2013_2020_ckd_cleaned.csv")
exclude_cols = ["SEQN", "id", "ckd", "URXUMA", "URXUCR", "ACR", "URDACT", "URDUCRLC", "URDUMALC"]
exclude_cols = [col for col in exclude_cols if col in df.columns]
X = df.drop(columns=exclude_cols)
y = df["ckd"]

# 標準化 + OneHot
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
numeric_cols = [col for col in X.columns if col not in categorical_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols)
    ]
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Focal Loss 定義
def focal_loss(gamma=2., alpha=0.5):
    def loss(y_true, y_pred):
        y_true = K.cast(y_true, dtype='float32')
        pt = y_true * y_pred + (1 - y_true) * (1 - y_pred)
        return -K.mean(alpha * K.pow(1. - pt, gamma) * K.log(pt + K.epsilon()))
    return loss

# 建立 DNN 模型
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_transformed.shape[1],)),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])
optimizer = Adam(learning_rate=0.0005)
model.compile(optimizer=optimizer, loss=focal_loss(gamma=2., alpha=0.5), metrics=['accuracy'])

# 提早停止
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 訓練（不加 class_weight）
model.fit(
    X_train_transformed, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=[early_stop],
    verbose=2
)

# 預測 + threshold 調整
y_pred_prob = model.predict(X_test_transformed).flatten()
threshold = 0.4  # 調這裡
y_pred = (y_pred_prob > threshold).astype(int)

# 評估
print("DNN + Focal Loss Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred_prob))


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


108/108 - 0s - 4ms/step - accuracy: 0.6404 - loss: 0.0809 - val_accuracy: 0.6473 - val_loss: 0.0802
Epoch 2/100
108/108 - 0s - 957us/step - accuracy: 0.6788 - loss: 0.0760 - val_accuracy: 0.6497 - val_loss: 0.0792
Epoch 3/100
108/108 - 0s - 948us/step - accuracy: 0.6863 - loss: 0.0742 - val_accuracy: 0.6439 - val_loss: 0.0796
Epoch 4/100
108/108 - 0s - 950us/step - accuracy: 0.6972 - loss: 0.0730 - val_accuracy: 0.6485 - val_loss: 0.0800
Epoch 5/100
108/108 - 0s - 972us/step - accuracy: 0.7052 - loss: 0.0717 - val_accuracy: 0.6514 - val_loss: 0.0798
Epoch 6/100
108/108 - 0s - 944us/step - accuracy: 0.7101 - loss: 0.0706 - val_accuracy: 0.6555 - val_loss: 0.0801
Epoch 7/100
108/108 - 0s - 910us/step - accuracy: 0.7171 - loss: 0.0696 - val_accuracy: 0.6578 - val_loss: 0.0803
Epoch 8/100
108/108 - 0s - 912us/step - accuracy: 0.7227 - loss: 0.0685 - val_accuracy: 0.6520 - val_loss: 0.0808
Epoch 9/100
108/108 - 0s - 921us/step - accuracy: 0.7320 - loss: 0.0669 - val_accuracy: 0.6531 - val_l

In [12]:
# DNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.regularizers import l2
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import tensorflow.keras.backend as K
import numpy as np
import tensorflow as tf
import random

seed = 1100
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# Focal Loss 定義
def focal_loss(gamma=2, alpha=0.5):
    def loss(y_true, y_pred):
        y_true = K.cast(y_true, dtype='float32')
        pt = y_true * y_pred + (1 - y_true) * (1 - y_pred)
        return -K.mean(alpha * K.pow(1. - pt, gamma) * K.log(pt + K.epsilon()))
    return loss

# 建構升級版 DNN
model = Sequential([
    Dense(256, kernel_regularizer=l2(0.001), input_shape=(X_train_transformed.shape[1],)),
    LeakyReLU(alpha=0.1),
    Dropout(0.1),
    Dense(128, kernel_regularizer=l2(0.001)),
    LeakyReLU(alpha=0.1),
    Dense(64, kernel_regularizer=l2(0.001)),
    LeakyReLU(alpha=0.1),
    Dense(32, kernel_regularizer=l2(0.001)),
    LeakyReLU(alpha=0.1),
    Dense(1, activation='sigmoid')
])

# 編譯
optimizer = RMSprop(learning_rate=0.0005)
model.compile(optimizer=optimizer, loss=focal_loss(), metrics=['accuracy'])

# Callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

# 訓練
model.fit(
    X_train_transformed, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=2
)

# 預測 + threshold 調整
y_pred_prob = model.predict(X_test_transformed).flatten()
threshold = 0.42
y_pred = (y_pred_prob > threshold).astype(int)

# 評估
print("DNN 升級版 Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred_prob))




Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


108/108 - 0s - 4ms/step - accuracy: 0.6563 - loss: 0.4145 - val_accuracy: 0.6578 - val_loss: 0.3539 - learning_rate: 5.0000e-04
Epoch 2/100
108/108 - 0s - 1ms/step - accuracy: 0.6781 - loss: 0.3006 - val_accuracy: 0.6549 - val_loss: 0.2579 - learning_rate: 5.0000e-04
Epoch 3/100
108/108 - 0s - 1ms/step - accuracy: 0.6898 - loss: 0.2196 - val_accuracy: 0.6549 - val_loss: 0.1920 - learning_rate: 5.0000e-04
Epoch 4/100
108/108 - 0s - 1ms/step - accuracy: 0.6910 - loss: 0.1655 - val_accuracy: 0.6537 - val_loss: 0.1494 - learning_rate: 5.0000e-04
Epoch 5/100
108/108 - 0s - 1ms/step - accuracy: 0.6888 - loss: 0.1314 - val_accuracy: 0.6555 - val_loss: 0.1232 - learning_rate: 5.0000e-04
Epoch 6/100
108/108 - 0s - 1ms/step - accuracy: 0.6904 - loss: 0.1104 - val_accuracy: 0.6578 - val_loss: 0.1075 - learning_rate: 5.0000e-04
Epoch 7/100
108/108 - 0s - 983us/step - accuracy: 0.6897 - loss: 0.0980 - val_accuracy: 0.6578 - val_loss: 0.0980 - learning_rate: 5.0000e-04
Epoch 8/100
108/108 - 0s - 1ms

In [None]:
import shap
import matplotlib.pyplot as plt
import os

# 設定輸出目錄
output_dir = "../outputs"
os.makedirs(output_dir, exist_ok=True)

# 建立 SHAP explainer
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_test_transformed)

# Summary plot 存檔
plt.figure()
shap.summary_plot(shap_values, X_test_transformed, feature_names=feature_list, show=False)
plt.tight_layout()
#plt.show()
plt.savefig(os.path.join(output_dir, "shap_summary.png"), dpi=300)
plt.close()

# Dependence plot（針對第一個變數，也可用名字取代）
plt.figure()
shap.dependence_plot(0, shap_values, X_test_transformed, feature_names=feature_list, show=False)
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "shap_dependence_0.png"), dpi=300)
plt.show()
plt.close()


  shap.summary_plot(shap_values, X_test_transformed, feature_names=feature_list, show=False)


<Figure size 640x480 with 0 Axes>