In [1]:
# ================================================================
# Decision Tree (DT)
# ================================================================

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, LeaveOneOut
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, balanced_accuracy_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.pipeline import Pipeline
import statsmodels.stats.proportion as smp
from itertools import product
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# -----------------------
# 0) Settings
# -----------------------
FILE_PATH = "ML_Analysis_V3.xlsx"   # <-- path to your own workbook
# One binary classifier is tuned and evaluated per target column listed here.
TARGETS = ["Cervical Lordosis Risk","Kyphosis Risk","Lumbar Lordosis Risk","Scoliosis Risk"]
RUN_LOSO = True                     # set to False for a faster run (skips the LeaveOneOut evaluation)
TUNING_SCORING = "balanced_accuracy"  # alternative: "f1_macro"

# -----------------------
# 1) Dataset
# -----------------------
df = pd.read_excel(FILE_PATH, sheet_name="Sheet1")
# Every column that is not a target is used as a feature.
# NOTE(review): this also picks up any ID/free-text columns in the sheet -- confirm the workbook has none.
feature_cols = [c for c in df.columns if c not in TARGETS]

# -----------------------
# 2) Helpers
# -----------------------
def make_adaptive_sampler(y_train):
    """
    Pick an oversampler according to the size of the smallest class in y_train.

    smallest class >= 3 -> SMOTE(k_neighbors = min(5, smallest - 1))
    otherwise           -> RandomOverSampler

    Parameters:
        y_train: training labels (pandas Series or any 1-D array-like).
                 Labels may use any encoding; they are no longer assumed to be 0/1.

    Returns:
        An unfitted SMOTE or RandomOverSampler instance, seeded with random_state=42.
    """
    # pd.Series() lets plain lists / numpy arrays through as well as Series.
    counts = pd.Series(y_train).value_counts()
    # With fewer than two classes present there is no minority class to
    # interpolate, so treat the minority size as 0 and fall back to ROS.
    minority_n = int(counts.min()) if len(counts) >= 2 else 0
    if minority_n >= 3:
        # SMOTE needs k_neighbors < number of minority samples.
        k = min(5, minority_n - 1)
        return SMOTE(k_neighbors=k, random_state=42)
    return RandomOverSampler(random_state=42)

def get_stratified_cv(y, desired_splits):
    """
    Build a shuffled StratifiedKFold whose fold count never exceeds the
    size of the smallest class in y (and is never below 2).

    Returns a tuple (splitter, n_splits_actually_used).
    """
    smallest_class = int(y.value_counts().min())
    if smallest_class > 1:
        # Cap the requested fold count at the smallest class size.
        fold_count = max(2, min(desired_splits, smallest_class))
    else:
        # A class with a single sample cannot be stratified; use the minimum.
        fold_count = 2
    splitter = StratifiedKFold(n_splits=fold_count, shuffle=True, random_state=42)
    return splitter, fold_count

def fit_predict_one_fold_dt(X_train, y_train, X_test, dt_params):
    """
    Train on a single fold (sampler -> decision tree) and predict X_test.

    No scaler is used because tree splits are unaffected by feature scale.
    The oversampler is chosen from y_train only, so the test rows never
    influence the resampling.
    """
    steps = [
        ('sampler', make_adaptive_sampler(y_train)),
        ('dt', DecisionTreeClassifier(**dt_params)),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions

# -----------------------
# 3) Global tuning (fast)
# -----------------------
def global_tune_dt(X, y, desired_splits=5, scoring="balanced_accuracy"):
    """
    Exhaustive grid search for DecisionTreeClassifier hyper-parameters using
    stratified K-fold CV with per-fold oversampling of the training part.

    Parameters:
        X: feature DataFrame (indexed positionally with .iloc).
        y: label Series aligned with X.
        desired_splits: requested fold count; get_stratified_cv may lower it.
        scoring: "balanced_accuracy" selects balanced accuracy; any other
                 value selects macro-averaged F1.

    Returns:
        (best_params, best_score, used_splits) where best_params is the first
        grid entry reaching the highest mean fold score.

    Note: the search sees all rows of X/y, so re-evaluating best_params on
    the same rows afterwards gives an optimistically biased estimate.
    """
    skf, used_splits = get_stratified_cv(y, desired_splits)

    # Small, targeted grid (1080 combinations); a larger grid would overfit
    # the model selection on a small data set.
    grid = {
        "criterion": ["gini", "entropy"],
        "max_depth": [None, 2, 3, 4, 5],
        "min_samples_split": [2, 3, 4, 5],
        "min_samples_leaf": [1, 2, 3],
        "max_features": [None, "sqrt", "log2"],
        "ccp_alpha": [0.0, 0.001, 0.01],
        "class_weight": ["balanced"],      # fixed: compensates class imbalance
        "random_state": [42]
    }
    keys = list(grid)

    # The split and the oversampling depend only on the fold (both are seeded),
    # never on the tree hyper-parameters, so compute them once instead of once
    # per grid combination.
    folds = []
    for tr, va in skf.split(X, y):
        X_tr, y_tr = X.iloc[tr], y.iloc[tr]
        X_res, y_res = make_adaptive_sampler(y_tr).fit_resample(X_tr, y_tr)
        folds.append((X_res, y_res, X.iloc[va], y.iloc[va]))

    use_balanced_accuracy = scoring == "balanced_accuracy"
    best_score, best_params = -np.inf, None

    for combo in product(*grid.values()):
        p = dict(zip(keys, combo))
        fold_scores = []
        for X_res, y_res, X_va, y_va in folds:
            tree = DecisionTreeClassifier(**p)
            tree.fit(X_res, y_res)
            pred = tree.predict(X_va)
            if use_balanced_accuracy:
                s = balanced_accuracy_score(y_va, pred)
            else:
                s = f1_score(y_va, pred, average="macro")
            fold_scores.append(s)

        m = np.mean(fold_scores)
        # Strict '>' keeps the earliest combination on ties.
        if m > best_score:
            best_score, best_params = m, p

    return best_params, best_score, used_splits

# -----------------------
# 4) Evaluation
# -----------------------
def evaluate_scheme_dt(X, y, splitter, scheme_name, dt_params):
    """
    Cross-validate a sampler -> decision-tree pipeline and pool the
    out-of-fold predictions into a single set of metrics.

    Parameters:
        X: feature DataFrame (indexed positionally with .iloc).
        y: label Series aligned with X; the confusion matrix assumes labels 0/1.
        splitter: any scikit-learn style splitter exposing split(X, y).
        scheme_name: label stored in the "Scheme" field of the result.
        dt_params: keyword arguments for DecisionTreeClassifier.

    Returns:
        dict with accuracy, its 95% Wilson confidence interval, weighted
        precision/recall/F1, total support and the TN/FP/FN/TP counts.
    """
    y_true, y_pred = [], []
    # Always pass y: non-stratified splitters such as LeaveOneOut ignore it,
    # while stratified ones other than StratifiedKFold (e.g.
    # RepeatedStratifiedKFold) require it and would fail on split(X) alone.
    for tr, te in splitter.split(X, y):
        X_tr, X_te = X.iloc[tr], X.iloc[te]
        y_tr, y_te = y.iloc[tr], y.iloc[te]
        pred = fit_predict_one_fold_dt(X_tr, y_tr, X_te, dt_params)
        y_true.extend(y_te)
        y_pred.extend(pred)

    rep = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    # labels=[0, 1] pins the matrix to 2x2 so ravel() always yields four counts.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    acc = accuracy_score(y_true, y_pred)

    # Wilson interval on the pooled proportion of correct predictions.
    n = len(y_true)
    k = int((np.array(y_true) == np.array(y_pred)).sum())
    ci_lo, ci_hi = smp.proportion_confint(k, n, alpha=0.05, method='wilson')

    weighted = rep["weighted avg"]
    return {
        "Scheme": scheme_name,
        "Accuracy": acc,
        "95% CI Low": ci_lo, "95% CI High": ci_hi,
        "Weighted Precision": weighted["precision"],
        "Weighted Recall": weighted["recall"],
        "Weighted F1": weighted["f1-score"],
        "Support": int(cm.sum()), "TN": int(tn), "FP": int(fp), "FN": int(fn), "TP": int(tp)
    }

# -----------------------
# 5) Run
# -----------------------
# all_rows collects one metrics dict per (target, CV scheme);
# tuning_rows collects one grid-search summary per target.
all_rows, tuning_rows = [], []

for target in TARGETS:
    print(f"\n=== Decision Tree Tuning & Evaluation for: {target} ===")
    X = df[feature_cols].copy(); y = df[target].copy()

    # Grid search on the full data set for this target.
    best_params, best_cv, used_splits = global_tune_dt(X, y, desired_splits=5, scoring=TUNING_SCORING)
    print(f"Best params (Stratified {used_splits}-Fold): {best_params} | Mean CV ({TUNING_SCORING}): {round(best_cv,3)}")
    tuning_rows.append({"Target": target, "CV (tuning)": f"Stratified {used_splits}-Fold",
                        "Best Params": best_params, f"Mean CV ({TUNING_SCORING})": round(best_cv,3)})

    # NOTE(review): the evaluations below reuse the rows that were used to pick
    # best_params, so the reported scores are optimistic (no nested CV).
    if RUN_LOSO:
        # "LOSO" here is scikit-learn's LeaveOneOut: one row held out per fold.
        row_loso = evaluate_scheme_dt(X, y, LeaveOneOut(), "LOSO", best_params)
        all_rows.append({"Target": target, **row_loso})

    # Fold counts are capped by the smallest class size, so the "5" and "10"
    # schemes can collapse to the same splitter and produce two identical rows.
    cv5, used5   = get_stratified_cv(y, 5)
    cv10, used10 = get_stratified_cv(y, 10)

    row_s5  = evaluate_scheme_dt(X, y, cv5,  f"Stratified {used5}-Fold",  best_params)
    row_s10 = evaluate_scheme_dt(X, y, cv10, f"Stratified {used10}-Fold", best_params)
    all_rows += [{"Target": target, **row_s5}, {"Target": target, **row_s10}]

summary_dt = pd.DataFrame(all_rows)
tuning_summary = pd.DataFrame(tuning_rows)

print("\n=== Decision Tree – Master Table (LOSO + adaptive 5/10-Fold) ===")
print(summary_dt)
print("\n=== Decision Tree – Tuning Summary ===")
print(tuning_summary)

# Optional CSV export:
# summary_dt.to_csv("DT_Balanced_AdaptiveOversampling_AdaptiveCV_AllTargets.csv", index=False)
# tuning_summary.to_csv("DT_Tuning_Summary.csv", index=False)



=== Decision Tree Tuning & Evaluation for: Cervical Lordosis Risk ===
Best params (Stratified 5-Fold): {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'random_state': 42} | Mean CV (balanced_accuracy): 0.92

=== Decision Tree Tuning & Evaluation for: Kyphosis Risk ===
Best params (Stratified 5-Fold): {'criterion': 'gini', 'max_depth': 2, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'random_state': 42} | Mean CV (balanced_accuracy): 0.725

=== Decision Tree Tuning & Evaluation for: Lumbar Lordosis Risk ===
Best params (Stratified 4-Fold): {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'random_state': 42} | Mean CV (balanced_accuracy): 0.634

=== Decision Tree Tuning & Evaluation for: Scoliosis Risk ===
