In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import (confusion_matrix, classification_report,
                             roc_auc_score, accuracy_score, precision_score, recall_score, f1_score)

# 0) Load + clean (simple)
dataset = pd.read_csv("CKD.csv")
dataset = dataset.replace(["?", "NA", "na", "NaN", "nan", ""], np.nan)  # optional, but safer

# 1) Target/Features (keep your style)
y = (dataset["classification"] == "yes").astype(int)  # 1=yes, 0=no
X = dataset.drop(columns=["classification"])

# 2) Train/test split with stratify (important for your mild imbalance)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=0, stratify=y
)

# 3) One-hot encode using TRAIN set columns; align TEST to TRAIN
X_train = pd.get_dummies(X_train_raw, dtype=float, drop_first=True)
X_test  = pd.get_dummies(X_test_raw,  dtype=float, drop_first=True)

# align columns: keep train’s columns, drop unseen test-only columns, fill missing with 0
X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

# 4) Scale features (scaling dummies is not strictly needed, but harmless and keeps SVM happy)
sc = StandardScaler(with_mean=True, with_std=True)
X_train = sc.fit_transform(X_train)
X_test  = sc.transform(X_test)

# 5) Grid search SVC (same grid you used)
param_grid = {
    "kernel": ["rbf", "linear"],
    "C": [0.5, 1, 3, 10],
    "gamma": ["scale", 0.1, 0.01],  # ignored for linear
}
svc = SVC(probability=True, random_state=0)  # optional: class_weight="balanced"
grid = GridSearchCV(
    svc,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1,
    scoring="f1_weighted"   # or "roc_auc" if you prefer AUC as your selector
)
grid.fit(X_train, y_train)

# 6) Evaluate
y_pred = grid.predict(X_test)

# Use SCORES for ROC AUC (not hard labels)
if hasattr(grid.best_estimator_, "decision_function"):
    y_score = grid.decision_function(X_test)
else:
    y_score = grid.predict_proba(X_test)[:, 1]

cm = confusion_matrix(y_test, y_pred)
print(cm)
print(classification_report(y_test, y_pred))
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1       :", f1_score(y_test, y_pred))
print("ROC AUC  :", roc_auc_score(y_test, y_score))

print("\nBest params:", grid.best_params_)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[[49  1]
 [ 1 82]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98        50
           1       0.99      0.99      0.99        83

    accuracy                           0.98       133
   macro avg       0.98      0.98      0.98       133
weighted avg       0.98      0.98      0.98       133

Accuracy : 0.9849624060150376
Precision: 0.9879518072289156
Recall   : 0.9879518072289156
F1       : 0.9879518072289156
ROC AUC  : 0.9990361445783132

Best params: {'C': 0.5, 'gamma': 'scale', 'kernel': 'rbf'}
