# 1. Import Libraries

In [0]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import mlflow
import mlflow.sklearn

# 2. Load Data

In [0]:

# Load the raw kidney-disease table and normalise its missing-value markers.
DATA_PATH = "/dbfs/FileStore/tables/kidney_disease.csv"
NUMERIC_COLS = ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']


def _strip_text(value):
    """Return `value` without surrounding whitespace if it is a string, else unchanged."""
    return value.strip() if isinstance(value, str) else value


df = pd.read_csv(DATA_PATH)

# Trim text cells before looking for the "?" marker, so padded variants such as
# "\t?" or " yes" collapse onto their canonical value instead of surviving as
# distinct categories.
# NOTE(review): public copies of this dataset are reported to contain such
# tab/space-padded cells - confirm against this particular file.
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(_strip_text)

# "?" (and cells that were whitespace only) mean "missing".
df = df.replace(["?", ""], np.nan)

# Force the measurement columns to numeric; anything unparsable becomes NaN.
for col in NUMERIC_COLS:
    df[col] = pd.to_numeric(df[col], errors='coerce')


# 3. Preprocessing

In [0]:

# Build the modelling table: drop the row identifier, keep complete rows only,
# encode the target as 0/1, one-hot encode the features and split train/test.

# errors='ignore' keeps this cell re-runnable once 'id' has already been removed.
df = df.drop(columns=['id'], errors='ignore')

# Complete-case analysis: every row with at least one missing value is discarded.
# TODO(review): check how many rows survive; consider imputation if too few remain.
df = df.dropna()

LABEL_MAP = {'ckd': 1, 'notckd': 0}


def _encode_label(value):
    """Map a raw class label to 0/1; values that are not strings pass through unchanged."""
    if isinstance(value, str):
        # Tolerate padded labels such as "ckd\t"; unknown strings become NaN.
        return LABEL_MAP.get(value.strip(), np.nan)
    return value


encoded = df['classification'].map(_encode_label)
if encoded.isna().any():
    # Fail loudly instead of letting an unmapped string label leak into `y`.
    unknown = sorted(df.loc[encoded.isna(), 'classification'].astype(str).unique())
    raise ValueError(f"Unexpected classification labels: {unknown}")
df = df.assign(classification=encoded.astype(int))

X = df.drop('classification', axis=1)
y = df['classification']

# One-hot encode the remaining categorical (text) columns.
X = pd.get_dummies(X)

# Stratify so the train and test parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)






# 4. Define Models

In [0]:

# Candidate classifiers, keyed by the display name that is also used to look up
# each model's search grid and to name its MLflow run.
# The stochastic learners are seeded (same seed as the train/test split) so that
# repeated runs of the notebook log comparable results.
# NOTE(review): `use_label_encoder` is kept as in the original; newer XGBoost
# releases reportedly deprecate or ignore it - confirm against the installed version.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# 5. GridSearch Tuning Parameters

In [0]:
# Hyper-parameter search space for each model, keyed by the same display names
# as `models`. The `clf__` prefix addresses the pipeline step named 'clf'.
grid_params = {
    "Logistic Regression": dict(clf__C=[0.1, 1, 10]),
    "Random Forest": dict(
        clf__n_estimators=[100, 200],
        clf__max_depth=[5, 10],
    ),
    "XGBoost": dict(
        clf__n_estimators=[100, 200],
        clf__learning_rate=[0.05, 0.1],
        clf__max_depth=[3, 5],
    ),
}

# 6. Train, Tune and Log with MLflow

In [0]:
# All runs below are recorded under this MLflow experiment.
mlflow.set_experiment("/Users/zhao.xinyuan@northeastern.edu/ckd_model_comparison")

# For every candidate model: grid-search a scaler+classifier pipeline on the
# training split, score the best pipeline on the test split, and log the chosen
# parameters, the metrics and the fitted pipeline to one MLflow run.
for name, model in models.items():
    print(f"\n\n Tuning and Training: {name}")

    # The scaler is a pipeline step, so it is fitted together with the classifier.
    pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('clf', model)])
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=grid_params[name],
        cv=3,
        scoring='accuracy',
    )

    with mlflow.start_run(run_name=name):
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_

        # Evaluate on the held-out test split.
        preds = best_model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        report = classification_report(y_test, preds, output_dict=True)

        mlflow.log_param("model_name", name)
        mlflow.log_params(grid.best_params_)
        mlflow.log_metric("accuracy", acc)

        # Log each nested report entry as "<entry>_<metric>"; entries whose value
        # is not a dict are skipped.
        for label, scores in report.items():
            if not isinstance(scores, dict):
                continue
            for metric_name, metric_value in scores.items():
                mlflow.log_metric(f"{label}_{metric_name}", metric_value)

        mlflow.sklearn.log_model(best_model, f"model_{name.replace(' ', '_').lower()}")
        print(f"{name} logged to MLflow with accuracy: {acc:.4f}")

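**Result:** accuracy = 1.0000 for all three models.

A perfect score for every model is suspicious. It suggests one of the following:

- The dataset is very small or very simple (easily separable). Rows with any missing value are dropped before the split, and only 20% of the remaining rows are used for testing, so the score rests on few samples.
- There may be data leakage (e.g., label information leaking into the features).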
