In [8]:
import os
import joblib
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import mlflow
from mlflow.tracking import MlflowClient
import mlflow.sklearn
from mlflow.models import infer_signature
from sklearn.metrics import accuracy_score

In [9]:
MLFLOW_URI = "http://127.0.0.1:8100"
EXPERIMENT_NAME = "IRIS_Classifier"
REGISTERED_MODEL_NAME = "iris_dt"
ARTIFACT_PATH = "iris_model"  # model artifact path in mlflow run
LOCAL_MODEL_DIR = "models"
TEST_DATA = "data/test.csv"

os.makedirs(LOCAL_MODEL_DIR, exist_ok=True)

mlflow.set_tracking_uri(MLFLOW_URI)
client = MlflowClient(mlflow.get_tracking_uri())
all_experiments = client.search_experiments()

print(all_experiments)

[<Experiment: artifact_location='mlflow-artifacts:/914216624959645619', creation_time=1761392399205, experiment_id='914216624959645619', last_update_time=1761392399205, lifecycle_stage='active', name='IRIS_Classifier', tags={'mlflow.experimentKind': 'custom_model_development'}>, <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1761386550352, experiment_id='0', last_update_time=1761386550352, lifecycle_stage='active', name='Default', tags={}>]


In [10]:
mlflow.set_experiment(EXPERIMENT_NAME)

all_experiments = client.search_experiments()
print(all_experiments)

[<Experiment: artifact_location='mlflow-artifacts:/914216624959645619', creation_time=1761392399205, experiment_id='914216624959645619', last_update_time=1761392399205, lifecycle_stage='active', name='IRIS_Classifier', tags={'mlflow.experimentKind': 'custom_model_development'}>, <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1761386550352, experiment_id='0', last_update_time=1761386550352, lifecycle_stage='active', name='Default', tags={}>]


In [11]:

def train_and_store(data_path: str, n_iter=20, random_state=42):
    df = pd.read_csv(data_path)
    print(f"Dataset shape: {df.shape}")

    X = df[['sepal_length','sepal_width','petal_length','petal_width']]
    y = df['species']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, stratify=y, random_state=random_state
    )

    # Search space for DecisionTree hyperparams
    # param_dist = {
    #     "max_depth": [None, 2, 3, 4, 5],
    #     "min_samples_split": [2, 4],
    #     "min_samples_leaf": [1, 2],
    #     "criterion": ["gini", "entropy"]
    # }
    param_dist = {
        "max_depth": [5],
    }

    base = DecisionTreeClassifier(random_state=random_state)

    search = RandomizedSearchCV(
        base,
        param_dist,
        n_iter=n_iter,
        scoring="accuracy",
        cv=3,
        random_state=random_state,
        n_jobs=-1,
        verbose=1
    )

    # Fit search
    search.fit(X_train, y_train)

    best = search.best_estimator_
    best_params = search.best_params_
    best_score = search.best_score_
    print("Best params:", best_params)
    print("Best CV score:", best_score)

    # Evaluate on test set
    preds = best.predict(X_test)
    acc = metrics.accuracy_score(y_test, preds)
    precision = metrics.precision_score(y_test, preds, average="weighted", zero_division=0)
    recall = metrics.recall_score(y_test, preds, average="weighted", zero_division=0)
    f1 = metrics.f1_score(y_test, preds, average="weighted", zero_division=0)

    metrics_dict = {
        "cv_best_score": float(best_score),
        "accuracy": float(acc),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1)
    }

    # Save locally
    local_model_path = os.path.join(LOCAL_MODEL_DIR, "model.joblib")
    joblib.dump(best, local_model_path)
    print(f"Saved model locally at {local_model_path}")

    # MLflow logging and register
    signature = infer_signature(X_test, best.predict(X_test))

    with mlflow.start_run() as run:
        run_id = run.info.run_id
        
        # Log best params and metrics
        mlflow.log_params(best_params)
        mlflow.log_metrics(metrics_dict)
        
        # Log model and register it
        mlflow.sklearn.log_model(
            sk_model=best,
            name=ARTIFACT_PATH,
            registered_model_name=REGISTERED_MODEL_NAME,
            input_example=X_test.iloc[:5],
            signature=signature
        )
        
        # Optionally log the local joblib as an artifact too
        mlflow.log_artifact(local_model_path, artifact_path="local_model")

        print("MLflow run created:", run_id)
        print("Metrics logged:", metrics_dict)

    return local_model_path, X_test, y_test


In [12]:
data_path = os.getenv("DATA_PATH", "data/data.csv")
model_path, X_test, y_test = train_and_store(data_path)
print(f"Model saved to: {model_path}")

Dataset shape: (114, 5)
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best params: {'max_depth': 5}
Best CV score: 0.8137254901960783
Saved model locally at models/model.joblib


Registered model 'iris_dt' already exists. Creating a new version of this model...
2025/10/25 14:39:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: iris_dt, version 4


MLflow run created: 37b4630dedf94989b14194b020bf1991
Metrics logged: {'cv_best_score': 0.8137254901960783, 'accuracy': 0.75, 'precision': 0.7833333333333333, 'recall': 0.75, 'f1': 0.7555555555555555}
üèÉ View run lyrical-fawn-574 at: http://127.0.0.1:8100/#/experiments/914216624959645619/runs/37b4630dedf94989b14194b020bf1991
üß™ View experiment at: http://127.0.0.1:8100/#/experiments/914216624959645619
Model saved to: models/model.joblib


Created version '4' of model 'iris_dt'.


In [15]:

def get_latest_model_uri(model_name: str):
    # Search all model versions for this registered model
    versions = client.search_model_versions(f"name = '{model_name}'")
    if not versions:
        raise ValueError(f"No versions found for model '{model_name}'")

    # Each version object has version number as string, convert to int
    latest = max(versions, key=lambda v: int(v.version))
    latest_version = latest.version
    model_uri = f"models:/{model_name}/{latest_version}"
    return model_uri, latest_version



In [16]:
if not os.path.exists(TEST_DATA):
    print("Test data not found:", TEST_DATA)
    sys.exit(2)

try:
    model_uri, version = get_latest_model_uri(REGISTERED_MODEL_NAME)
    print(f"Selected model version {version}")
except Exception as e:
    print("Error finding model in registry:", str(e))
    sys.exit(2)

    
model_uri = f"models:/{REGISTERED_MODEL_NAME}/{version}"
print("Loading model_uri:", model_uri)

Selected model version 4
Loading model_uri: models:/iris_dt/4


In [1]:
!gsutil mb -l "us-central1" -p "southern-lane-473106-m4" "gs://mlops-southern-lane-473106-m4-week-5"

Creating gs://mlops-southern-lane-473106-m4-week-5/...
ServiceException: 409 A Cloud Storage bucket named 'mlops-southern-lane-473106-m4-week-5' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.
