In [25]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from mlflow.models.signature import infer_signature
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import matplotlib.pyplot as plt
import os

In [26]:
# Load preprocessed dataset
df = pd.read_csv("/Users/touka/Desktop/BAU/forth year/s2/AIN3009/project/Mlflow_project/data/diabetes_cleaned.csv")

In [27]:
# Separate Features and Target
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Split into Train/Test Sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [28]:
# Define a function to log classification metrics and artifacts to MLflow
def log_classification_metrics(y_true, y_pred, prefix=""):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    mlflow.log_metric(f"{prefix}accuracy", acc)
    mlflow.log_metric(f"{prefix}precision", prec)
    mlflow.log_metric(f"{prefix}recall", rec)
    mlflow.log_metric(f"{prefix}f1_score", f1)

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues', values_format='d')
    cm_filename = f"{prefix}confusion_matrix.png"
    plt.savefig(cm_filename)
    mlflow.log_artifact(cm_filename)
    plt.close()

    # Classification Report
    report = classification_report(y_true, y_pred)
    report_filename = f"{prefix}classification_report.txt"
    with open(report_filename, "w") as f:
        f.write(report)
    mlflow.log_artifact(report_filename)

    # Clean up
    os.remove(cm_filename)
    os.remove(report_filename)

In [30]:
# Train a baseline Logistic Regression model and log parameters and model to MLflow
mlflow.set_tracking_uri("file:///Users/touka/Desktop/BAU/forth year/s2/AIN3009/project/Mlflow_project/mlruns")
mlflow.set_experiment("Diabetes_Prediction_Experiment")

with mlflow.start_run(run_name="LR-Baseline"):
    mlflow.set_tag("model", "LogisticRegression")
    mlflow.set_tag("type", "baseline")

    model = LogisticRegression(max_iter=1000, C=1.0, penalty="l2", solver="lbfgs", random_state=42)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    mlflow.log_params({"max_iter": 1000, "C": 1.0, "penalty": "l2", "solver": "lbfgs"})
    log_classification_metrics(y_test, y_pred)

    signature = infer_signature(X_test_scaled, y_pred)
    mlflow.sklearn.log_model(model, "lr_model", signature=signature)
    mlflow.register_model(f"runs:/{mlflow.active_run().info.run_id}/lr_model", "DiabetesModel")

Registered model 'DiabetesModel' already exists. Creating a new version of this model...
Created version '1' of model 'DiabetesModel'.


In [31]:
# Hyperopt Tuning
lr_space = {
    "C": hp.loguniform("C", -4, 2),
    "penalty": hp.choice("penalty", ["l1", "l2"]),
    "solver": hp.choice("solver", ["liblinear", "saga"])
}

def lr_objective(params):
    if params["penalty"] == "l1" and params["solver"] not in ["liblinear", "saga"]:
        return {"loss": float("inf"), "status": STATUS_OK}

    with mlflow.start_run(run_name=f"LR-Tune-{params['penalty']}", nested=True):
        mlflow.set_tag("model", "LogisticRegression")
        mlflow.set_tag("type", "tuning")

        model = LogisticRegression(max_iter=1000, random_state=42, **params)
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        mlflow.log_params(params)
        log_classification_metrics(y_test, y_pred)

        signature = infer_signature(X_test_scaled, y_pred)
        mlflow.sklearn.log_model(model, "lr_model", signature=signature)

        return {"loss": -accuracy_score(y_test, y_pred), "status": STATUS_OK}

lr_trials = Trials()
best_lr = fmin(fn=lr_objective, space=lr_space, algo=tpe.suggest, max_evals=5, trials=lr_trials)

penalties = ["l1", "l2"]
solvers = ["liblinear", "saga"]

final_lr_params = {
    "C": best_lr["C"],
    "penalty": penalties[best_lr["penalty"]],
    "solver": solvers[best_lr["solver"]],
    "max_iter": 1000
}

final_lr = LogisticRegression(**final_lr_params, random_state=42)
final_lr.fit(X_train_scaled, y_train)
final_lr_pred = final_lr.predict(X_test_scaled)

# log the best Logistic Regression model
with mlflow.start_run(run_name="LR-Best-Tuned"):
    mlflow.set_tag("model", "LogisticRegression")
    mlflow.set_tag("type", "best_tuned")

    mlflow.log_params(final_lr_params)
    log_classification_metrics(y_test, final_lr_pred)

    signature = infer_signature(X_test_scaled, final_lr_pred)
    mlflow.sklearn.log_model(final_lr, "lr_model", signature=signature)
    mlflow.register_model(f"runs:/{mlflow.active_run().info.run_id}/lr_model", "DiabetesModel")


100%|██████████| 5/5 [00:27<00:00,  5.45s/trial, best loss: -0.7581227436823105]


Registered model 'DiabetesModel' already exists. Creating a new version of this model...
Created version '2' of model 'DiabetesModel'.
