In [25]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from mlflow.models.signature import infer_signature
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import matplotlib.pyplot as plt
import os

In [26]:
# Location of the cleaned diabetes dataset. Set the DIABETES_DATA_PATH environment
# variable to run this notebook on another machine; when it is unset, the original
# local path is used unchanged.
DATA_PATH = os.environ.get(
    "DIABETES_DATA_PATH",
    "/Users/touka/Desktop/BAU/forth year/s2/AIN3009/project/Mlflow_project/data/diabetes_cleaned.csv",
)
df = pd.read_csv(DATA_PATH)

In [27]:
# The "Outcome" column is the target; every remaining column is used as a feature.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [28]:
# Define a function to log classification metrics and artifacts to MLflow
def log_classification_metrics(y_true, y_pred, prefix=""):
    """Log classification metrics and evaluation artifacts to the active MLflow run.

    Logs accuracy, precision, recall and F1 (each computed with the scorer's
    default settings) as metrics, a confusion-matrix plot as a PNG artifact and
    the text classification report as a TXT artifact.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by the model.
    prefix : str, optional
        String prepended to every metric name and artifact file name
        (e.g. ``"val_"`` gives ``val_accuracy`` and ``val_confusion_matrix.png``).
    """
    # One batched call instead of four separate log_metric round-trips.
    mlflow.log_metrics({
        f"{prefix}accuracy": accuracy_score(y_true, y_pred),
        f"{prefix}precision": precision_score(y_true, y_pred),
        f"{prefix}recall": recall_score(y_true, y_pred),
        f"{prefix}f1_score": f1_score(y_true, y_pred),
    })

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot(cmap='Blues', values_format='d')
    try:
        # log_figure uploads the figure directly, so no scratch PNG is written to
        # the working directory and nothing is left behind if logging fails.
        mlflow.log_figure(disp.figure_, f"{prefix}confusion_matrix.png")
    finally:
        # Always release the figure, even when logging raises.
        plt.close(disp.figure_)

    # Classification Report (logged straight from memory, no temporary file)
    report = classification_report(y_true, y_pred)
    mlflow.log_text(report, f"{prefix}classification_report.txt")

In [29]:
# Train a baseline Random Forest model and log its parameters, metrics and model to MLflow.
# NOTE(review): the tracking URI below is an absolute path on the author's machine;
# adjust it when running the notebook elsewhere.
mlflow.set_tracking_uri("file:///Users/touka/Desktop/BAU/forth year/s2/AIN3009/project/Mlflow_project/mlruns")
mlflow.set_experiment("Diabetes_Prediction_Experiment")

# Single source of truth, so the logged parameters always match the fitted model.
baseline_params = {"n_estimators": 100, "max_depth": 5}

with mlflow.start_run(run_name="RF-Training") as baseline_run:
    mlflow.set_tag("model", "RandomForest")
    mlflow.set_tag("type", "baseline")

    rf = RandomForestClassifier(**baseline_params, random_state=42)
    rf.fit(X_train, y_train)

    # Predict and evaluate on the held-out test split
    y_pred = rf.predict(X_test)

    mlflow.log_params(baseline_params)
    log_classification_metrics(y_test, y_pred)

    # Log the fitted model with its inferred input/output signature, then register
    # the logged artifact under the "DiabetesModel" registry name.
    signature = infer_signature(X_test, y_pred)
    mlflow.sklearn.log_model(rf, "rf_model", signature=signature)
    mlflow.register_model(f"runs:/{baseline_run.info.run_id}/rf_model", "DiabetesModel")


Registered model 'DiabetesModel' already exists. Creating a new version of this model...
Created version '5' of model 'DiabetesModel'.


In [30]:
# Hyperopt Tuning
# Candidate values for each hyperparameter. With hp.choice, fmin reports the *index*
# of the selected option, so this one mapping is used both to build the search space
# and to decode fmin's result; the two can no longer drift apart.
rf_space_mapping = {
    "n_estimators": [50, 100, 200],
    "max_depth": [3, 5, 7, 10],
    "min_samples_split": [2, 4, 6]
}

rf_space = {
    name: hp.choice(name, options) for name, options in rf_space_mapping.items()
}


def rf_objective(params):
    """Train and evaluate one Random Forest candidate and log it as an MLflow run.

    Parameters
    ----------
    params : dict
        Hyperparameter values sampled from ``rf_space``; passed as keyword
        arguments to ``RandomForestClassifier``.

    Returns
    -------
    dict
        ``{"loss": <negative accuracy>, "status": STATUS_OK}``; the accuracy is
        negated because fmin minimises the loss.
    """
    # NOTE(review): nested=True is requested, but no parent run is started in this
    # cell -- confirm whether the trials are meant to be grouped under one.
    with mlflow.start_run(run_name=f"RF-Tune-{params['n_estimators']}-{params['max_depth']}", nested=True):
        mlflow.set_tag("model", "RandomForest")
        mlflow.set_tag("type", "tuning")

        model = RandomForestClassifier(**params, random_state=42)
        model.fit(X_train, y_train)
        # NOTE(review): the loss is measured on X_test, so hyperparameters are
        # selected on the test split; consider a validation split or
        # cross-validation on the training data instead.
        y_pred = model.predict(X_test)

        mlflow.log_params(params)
        log_classification_metrics(y_test, y_pred)

        signature = infer_signature(X_test, y_pred)
        mlflow.sklearn.log_model(model, "rf_model", signature=signature)

        return {"loss": -accuracy_score(y_test, y_pred), "status": STATUS_OK}


rf_trials = Trials()
best_rf = fmin(fn=rf_objective, space=rf_space, algo=tpe.suggest, max_evals=5, trials=rf_trials)

# Decode the option indices returned by fmin into concrete hyperparameter values.
best_rf_params = {
    name: options[best_rf[name]] for name, options in rf_space_mapping.items()
}

# Refit a Random Forest with the selected hyperparameters and predict on the test split
final_rf = RandomForestClassifier(random_state=42, **best_rf_params)
final_rf.fit(X_train, y_train)
final_rf_pred = final_rf.predict(X_test)

# Record the tuned model in its own run and register it under "DiabetesModel"
with mlflow.start_run(run_name="RF-Best-Tuned") as tuned_run:
    mlflow.set_tag("model", "RandomForest")
    mlflow.set_tag("type", "best_tuned")

    mlflow.log_params(best_rf_params)
    log_classification_metrics(y_test, final_rf_pred)

    signature = infer_signature(X_test, final_rf_pred)
    mlflow.sklearn.log_model(final_rf, "rf_model", signature=signature)

    # Register the artifact that was just logged in this run
    tuned_model_uri = f"runs:/{tuned_run.info.run_id}/rf_model"
    mlflow.register_model(tuned_model_uri, "DiabetesModel")

  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]




 20%|██        | 1/5 [00:05<00:21,  5.35s/trial, best loss: -0.759927797833935]




 40%|████      | 2/5 [00:10<00:16,  5.43s/trial, best loss: -0.9638989169675091]




 60%|██████    | 3/5 [00:16<00:10,  5.36s/trial, best loss: -0.9638989169675091]




 80%|████████  | 4/5 [00:21<00:05,  5.37s/trial, best loss: -0.9638989169675091]




100%|██████████| 5/5 [00:28<00:00,  5.79s/trial, best loss: -0.9638989169675091]


Registered model 'DiabetesModel' already exists. Creating a new version of this model...
Created version '6' of model 'DiabetesModel'.
