In [24]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from mlflow.models.signature import infer_signature
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import os
import json


In [25]:
df = pd.read_csv("/Users/touka/Desktop/BAU/forth year/s2/AIN3009/project/Mlflow_project/data/diabetes_cleaned.csv")

In [26]:
# Separate Features and Target
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

# Split into Train/Test Sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Scale features 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
# Define a function to log classification metrics and artifacts to MLflow
def log_classification_metrics(y_true, y_pred, prefix=""):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    mlflow.log_metric(f"{prefix}accuracy", acc)
    mlflow.log_metric(f"{prefix}precision", prec)
    mlflow.log_metric(f"{prefix}recall", rec)
    mlflow.log_metric(f"{prefix}f1_score", f1)

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap="Blues", values_format='d')
    cm_file = f"{prefix}confusion_matrix.png"
    plt.savefig(cm_file)
    mlflow.log_artifact(cm_file)
    plt.close()

    # Classification Report
    report = classification_report(y_true, y_pred)
    report_file = f"{prefix}classification_report.txt"
    with open(report_file, "w") as f:
        f.write(report)
    mlflow.log_artifact(report_file)

    # Clean up
    os.remove(cm_file)
    os.remove(report_file)

In [28]:
# Train a baseline Logistic Regression model and log parameters and model to MLflow
mlflow.set_tracking_uri("file:///Users/touka/Desktop/BAU/forth year/s2/AIN3009/project/Mlflow_project/mlruns")
mlflow.set_experiment("Diabetes_Prediction_Experiment")

with mlflow.start_run(run_name="SVM_Training"):
    mlflow.set_tag("model", "SVM")
    mlflow.set_tag("type", "baseline")
    
    model = SVC(kernel="rbf", C=1.0, gamma="scale", probability=True, random_state=42)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    mlflow.log_params({"kernel": "rbf", "C": 1.0, "gamma": "scale"})
    log_classification_metrics(y_test, y_pred)

    signature = infer_signature(X_test_scaled, y_pred)
    mlflow.sklearn.log_model(model, "svm_model", signature=signature)
    mlflow.register_model(f"runs:/{mlflow.active_run().info.run_id}/svm_model", "DiabetesModel")


Registered model 'DiabetesModel' already exists. Creating a new version of this model...
Created version '3' of model 'DiabetesModel'.


In [29]:
# Hyperopt Tuning
space = {
    "C": hp.loguniform("C", -4, 2),
    "gamma": hp.loguniform("gamma", -4, 1),
    "kernel": hp.choice("kernel", ["linear", "rbf"])
}

def objective(params):
    kernel_name = params["kernel"]
    with mlflow.start_run(run_name=f"SVM-Tune-{kernel_name}", nested=True):
        mlflow.set_tag("model", "SVM")
        mlflow.set_tag("type", "tuning")

        model = SVC(**params, probability=True, random_state=42)
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        mlflow.log_params(params)
        log_classification_metrics(y_test, y_pred)

        signature = infer_signature(X_test_scaled, y_pred)
        mlflow.sklearn.log_model(model, "svm_model", signature=signature)

        acc = accuracy_score(y_test, y_pred)
        return {"loss": -acc, "status": STATUS_OK}

trials = Trials()
best_params = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=5, trials=trials)

# Map kernel index to string
kernels = ["linear", "rbf"]
best_params["kernel"] = kernels[best_params["kernel"]]

# log the best SVM model
final_model = SVC(**best_params, probability=True, random_state=42)
final_model.fit(X_train_scaled, y_train)
final_pred = final_model.predict(X_test_scaled)

with mlflow.start_run(run_name="SVM-Best-Tuned"):
    mlflow.set_tag("model", "SVM")
    mlflow.set_tag("type", "best_tuned")

    mlflow.log_params(best_params)
    log_classification_metrics(y_test, final_pred)

    signature = infer_signature(X_test_scaled, final_pred)
    mlflow.sklearn.log_model(final_model, "svm_model", signature=signature)
    mlflow.register_model(f"runs:/{mlflow.active_run().info.run_id}/svm_model", "DiabetesModel")


100%|██████████| 5/5 [00:28<00:00,  5.75s/trial, best loss: -0.9187725631768953]


Registered model 'DiabetesModel' already exists. Creating a new version of this model...
Created version '4' of model 'DiabetesModel'.
