In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (confusion_matrix, classification_report,
                             accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, average_precision_score)

RANDOM_STATE = 0
TEST_SIZE = 1/3

# 0) Load + clean
df = pd.read_csv("CKD.csv")
df = df.replace(["?", "NA", "na", "NaN", "nan", ""], np.nan)

# 1) Target + features
y = (df["classification"] == "yes").astype(int)  # 1=yes, 0=no
X = df.drop(columns=["classification"])

# 2) Train/test split (STRATIFY!)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# 3) Train-based IMPUTATION (simple & leakage-safe)
num_cols = X_train_raw.select_dtypes(exclude="object").columns
cat_cols = X_train_raw.select_dtypes(include="object").columns

# numeric -> median
num_medians = X_train_raw[num_cols].median()
X_train_raw[num_cols] = X_train_raw[num_cols].fillna(num_medians)
X_test_raw[num_cols]  = X_test_raw[num_cols].fillna(num_medians)

# categorical -> mode from train
for c in cat_cols:
    mode_val = X_train_raw[c].mode(dropna=True)
    if len(mode_val) == 0:
        # if an all-NaN cat col exists, fill with a placeholder
        fillv = "missing"
    else:
        fillv = mode_val.iloc[0]
    X_train_raw[c] = X_train_raw[c].fillna(fillv)
    X_test_raw[c]  = X_test_raw[c].fillna(fillv)

# 4) One-hot encode TRAIN, align TEST to TRAIN columns
X_train = pd.get_dummies(X_train_raw, dtype=float, drop_first=True)
X_test  = pd.get_dummies(X_test_raw,  dtype=float, drop_first=True)
X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

# 5) Scaled copies (for models that like scaling)
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# 6) Helper: fit + evaluate
def evaluate_classifier(name, estimator, param_grid, Xtr, ytr, Xte, yte, scoring="roc_auc"):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    grid = GridSearchCV(estimator, param_grid, cv=cv, n_jobs=-1, verbose=1, scoring=scoring, refit=True)
    grid.fit(Xtr, ytr)

    best = grid.best_estimator_
    y_pred = best.predict(Xte)
    # get scores for ROC/PR AUC
    if hasattr(best, "decision_function"):
        y_score = best.decision_function(Xte)
    elif hasattr(best, "predict_proba"):
        y_score = best.predict_proba(Xte)[:, 1]
    else:
        y_score = y_pred  # fallback (rare)

    print(f"\n===== {name} =====")
    print("Best CV score (", scoring, "):", grid.best_score_)
    print("Best params:", grid.best_params_)
    print("\nConfusion matrix:\n", confusion_matrix(yte, y_pred))
    print("\nClassification report:\n", classification_report(yte, y_pred))
    print("Accuracy :", accuracy_score(yte, y_pred))
    print("Precision:", precision_score(yte, y_pred))
    print("Recall   :", recall_score(yte, y_pred))
    print("F1       :", f1_score(yte, y_pred))
    try:
        print("ROC AUC  :", roc_auc_score(yte, y_score))
        print("PR  AUC  :", average_precision_score(yte, y_score))
    except Exception:
        pass
    return best, grid.best_params_




In [4]:
# 1) LOGISTIC REGRESSION 
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE)
param_grid_lr = {
    "C": np.logspace(-3, 3, 13),
    "solver": ["liblinear", "lbfgs"],
    "penalty": ["l2"],
}
best_lr, params_lr = evaluate_classifier(
    "LogisticRegression",
    lr, param_grid_lr,
    X_train_scaled, y_train, X_test_scaled, y_test,
    scoring="roc_auc"
)


# 2) RANDOM FOREST 
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=RANDOM_STATE, class_weight="balanced", n_jobs=-1)
param_grid_rf = {
    "n_estimators": [200, 400, 600],
    "max_depth": [None, 5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
}
best_rf, params_rf = evaluate_classifier(
    "RandomForest",
    rf, param_grid_rf,
    X_train, y_train, X_test, y_test,
    scoring="roc_auc"
)


Fitting 5 folds for each of 26 candidates, totalling 130 fits

===== LogisticRegression =====
Best CV score ( roc_auc ): 1.0
Best params: {'C': np.float64(0.1), 'penalty': 'l2', 'solver': 'lbfgs'}

Confusion matrix:
 [[50  0]
 [ 4 79]]

Classification report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96        50
           1       1.00      0.95      0.98        83

    accuracy                           0.97       133
   macro avg       0.96      0.98      0.97       133
weighted avg       0.97      0.97      0.97       133

Accuracy : 0.9699248120300752
Precision: 1.0
Recall   : 0.9518072289156626
F1       : 0.9753086419753086
ROC AUC  : 0.9995180722891566
PR  AUC  : 0.9997165131112689
Fitting 5 folds for each of 324 candidates, totalling 1620 fits

===== RandomForest =====
Best CV score ( roc_auc ): 1.0
Best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}



In [5]:
# 3) DECISION TREE 
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
param_grid_dt = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [None, 4, 6, 8, 10, 12],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}
best_dt, params_dt = evaluate_classifier(
    "DecisionTree",
    dt, param_grid_dt,
    X_train, y_train, X_test, y_test,
    scoring="roc_auc"
)


# 4) k-NEAREST NEIGHBORS 
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
param_grid_knn = {
    "n_neighbors": list(range(3, 32, 2)),
    "weights": ["uniform", "distance"],
    "p": [1, 2],  # 1=Manhattan, 2=Euclidean
}
best_knn, params_knn = evaluate_classifier(
    "kNN",
    knn, param_grid_knn,
    X_train_scaled, y_train, X_test_scaled, y_test,
    scoring="roc_auc"
)



Fitting 5 folds for each of 162 candidates, totalling 810 fits

===== DecisionTree =====
Best CV score ( roc_auc ): 0.9711051693404634
Best params: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10}

Confusion matrix:
 [[48  2]
 [ 2 81]]

Classification report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96        50
           1       0.98      0.98      0.98        83

    accuracy                           0.97       133
   macro avg       0.97      0.97      0.97       133
weighted avg       0.97      0.97      0.97       133

Accuracy : 0.9699248120300752
Precision: 0.9759036144578314
Recall   : 0.9759036144578314
F1       : 0.9759036144578314
ROC AUC  : 0.9756626506024098
PR  AUC  : 0.9771266510446963
Fitting 5 folds for each of 60 candidates, totalling 300 fits

===== kNN =====
Best CV score ( roc_auc ): 1.0
Best params: {'n_neighbors': 11, 'p': 1, 'weights': 'uniform'}

Confusion matrix:


In [6]:

# 5) NAIVE BAYES (GaussianNB,
from sklearn.naive_bayes import GaussianNB

# GaussianNB tolerates mixed (continuous + OHE) features; scaling is OK.
gnb = GaussianNB()
param_grid_gnb = {
    "var_smoothing": np.logspace(-12, -6, 7)
}
best_gnb, params_gnb = evaluate_classifier(
    "GaussianNB",
    gnb, param_grid_gnb,
    X_train_scaled, y_train, X_test_scaled, y_test,
    scoring="roc_auc"
)

Fitting 5 folds for each of 7 candidates, totalling 35 fits

===== GaussianNB =====
Best CV score ( roc_auc ): 1.0
Best params: {'var_smoothing': np.float64(1e-12)}

Confusion matrix:
 [[49  1]
 [ 2 81]]

Classification report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97        50
           1       0.99      0.98      0.98        83

    accuracy                           0.98       133
   macro avg       0.97      0.98      0.98       133
weighted avg       0.98      0.98      0.98       133

Accuracy : 0.9774436090225563
Precision: 0.9878048780487805
Recall   : 0.9759036144578314
F1       : 0.9818181818181818
ROC AUC  : 0.9897590361445783
PR  AUC  : 0.9878101465936793


In [13]:
!pip install xgboost lightgbm catboost
from xgboost import XGBClassifier

    # Handle mild imbalance
    # class balance (pos = 1)
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
spw = neg / pos if pos else 1.0  

xgb = XGBClassifier(
    random_state=RANDOM_STATE,
    tree_method="hist",
    n_estimators=300,
    eval_metric="logloss",
    scale_pos_weight=spw,
    n_jobs=-1,
)
param_grid_xgb = [
    # gbtree
    {
        "booster": ["gbtree"],
        "max_depth": [3, 4, 5],
        "learning_rate": [0.05, 0.1, 0.2],
        "subsample": [0.7, 0.9, 1.0],
        "colsample_bytree": [0.7, 0.9, 1.0],
        "min_child_weight": [1, 3, 5],
        "gamma": [0, 1],
    },
    # dart
    {
        "booster": ["dart"],
        "max_depth": [3, 4, 5],
        "learning_rate": [0.05, 0.1, 0.2],
        "subsample": [0.7, 0.9, 1.0],
        "colsample_bytree": [0.7, 0.9, 1.0],
        "min_child_weight": [1, 3, 5],
        "gamma": [0, 1],
        "rate_drop": [0.0, 0.1, 0.2],
        "skip_drop": [0.0, 0.5],
        "normalize_type": ["tree", "forest"],
        "sample_type": ["uniform", "weighted"],
    },
]
best_xgb, params_xgb = evaluate_classifier(
    "XGBoost",
    xgb, param_grid_xgb,
    X_train, y_train, X_test, y_test,
    scoring="roc_auc"
)

Collecting xgboost
  Downloading xgboost-3.0.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.3.0-py3-none-any.whl.metadata (8.5 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost)
  Downloading narwhals-2.1.1-py3-none-any.whl.metadata (11 kB)
Downloading xgboost-3.0.4-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ------- -------------------------------- 10.2/56.8 MB 58.3 MB/s eta 0:00:01
   ---------------- ----------------------- 23.1/56.8 MB 58.5 MB/s eta 0:00:01
   ------------------------ --------------- 35.1/56.8 MB 58.8 MB/s eta 0:00:01
   --------------------------------- ------ 46.9/56.8 M

KeyboardInterrupt: 