## Train using MLflow

In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import joblib
import mlflow
import mlflow.sklearn

In [8]:
# Initialize MLflow
# Point MLflow at the local file-based store "mlruns" and select the experiment
# that every run in this notebook is recorded under.
TRACKING_URI = "file:mlruns"
EXPERIMENT_NAME = "cal-housing-reg"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

2025/07/06 21:05:49 INFO mlflow.tracking.fluent: Experiment with name 'cal-housing-reg' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/abc/Repos/ML-end-to-end/notebooks/mlruns/307506043393679682', creation_time=1751861149718, experiment_id='307506043393679682', last_update_time=1751861149718, lifecycle_stage='active', name='cal-housing-reg', tags={}>

In [3]:
# Load the California housing dataset and pull out the feature and target arrays
data = fetch_california_housing()
X = data["data"]
y = data["target"]

In [4]:
# Split Data: hold out 20% of the rows for testing; the fixed seed keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train model v1 with MLflow
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

# Single source of truth for the v1 hyperparameters: the same dict builds the
# estimator and is logged to MLflow, so the logged params cannot drift from the
# values the model was actually trained with.
rf_params_v1 = {"n_estimators": 100, "random_state": 42}

with mlflow.start_run() as run1:
    # Train model
    model = RandomForestRegressor(**rf_params_v1)
    model.fit(X_train, y_train)

    y_preds_train = model.predict(X_train)
    y_preds_test = model.predict(X_test)

    # Score both splits so the train/test gap is visible in the tracking UI
    rmse_train = root_mean_squared_error(y_train, y_preds_train)
    rmse_test = root_mean_squared_error(y_test, y_preds_test)

    mae_train = mean_absolute_error(y_train, y_preds_train)
    mae_test = mean_absolute_error(y_test, y_preds_test)

    r2_train = r2_score(y_train, y_preds_train)
    r2_test = r2_score(y_test, y_preds_test)

    # Log params and metrics
    mlflow.log_param("model_type", "RandomForestRegression")
    mlflow.log_params(rf_params_v1)
    mlflow.log_metrics(
        {
            "rmse_train": rmse_train,
            "rmse_test": rmse_test,
            "mae_train": mae_train,
            "mae_test": mae_test,
            "r2_train": r2_train,
            "r2_test": r2_test,
        }
    )

    # Log the model together with a one-row input example.
    # Pass registered_model_name="..." here instead to register at log time.
    input_example = X_test[0:1]
    mlflow.sklearn.log_model(model, name="model_v1", input_example=input_example)

    # Keep the run id: the registration step builds its model URI from it
    run1_id = run1.info.run_id

    # # Save local copy
    # joblib.dump(model, "../models/model_mlflow_v1.pkl")


In [None]:
# Register the model v1
model_name = "housing_price_model"
model_version_alias = "staging"

# The training run logged this model under name="model_v1", so the run-relative
# path in the URI must be "model_v1" (not "model") for the lookup to resolve.
result = mlflow.register_model(
    model_uri=f"runs:/{run1_id}/model_v1",
    name=model_name,
)

client = mlflow.MlflowClient()

# Set model version alias on the version that was just created. Reading it from
# `result` instead of hard-coding "1" keeps the alias correct when the notebook
# is re-run and the registry hands out a higher version number.
client.set_registered_model_alias(
    model_name, model_version_alias, str(result.version)
)

In [None]:
# Train model v2 with MLflow
# Single source of truth for the v2 hyperparameters: the same dict builds the
# estimator and is logged to MLflow, so the logged params cannot drift from the
# values the model was actually trained with.
rf_params_v2 = {"n_estimators": 1000, "random_state": 42}

with mlflow.start_run() as run2:
    # Train model
    model = RandomForestRegressor(**rf_params_v2)
    model.fit(X_train, y_train)

    y_preds_train = model.predict(X_train)
    y_preds_test = model.predict(X_test)

    # Score both splits so the train/test gap is visible in the tracking UI
    rmse_train = root_mean_squared_error(y_train, y_preds_train)
    rmse_test = root_mean_squared_error(y_test, y_preds_test)

    mae_train = mean_absolute_error(y_train, y_preds_train)
    mae_test = mean_absolute_error(y_test, y_preds_test)

    r2_train = r2_score(y_train, y_preds_train)
    r2_test = r2_score(y_test, y_preds_test)

    # Log params and metrics
    mlflow.log_param("model_type", "RandomForestRegression")
    mlflow.log_params(rf_params_v2)
    mlflow.log_metrics(
        {
            "rmse_train": rmse_train,
            "rmse_test": rmse_test,
            "mae_train": mae_train,
            "mae_test": mae_test,
            "r2_train": r2_train,
            "r2_test": r2_test,
        }
    )

    # Log the model together with a one-row input example.
    # Pass registered_model_name="..." here instead to register at log time.
    input_example = X_test[0:1]
    mlflow.sklearn.log_model(model, name="model_v2", input_example=input_example)

    # Keep the run id: the registration step builds its model URI from it
    run2_id = run2.info.run_id

    # # Save local copy
    # joblib.dump(model, "../models/model_mlflow_v2.pkl")



In [None]:
# Register the model v2
model_name = "housing_price_model"
model_version_alias = "prod"

# The training run logged this model under name="model_v2", so the run-relative
# path in the URI must be "model_v2" (not "model") for the lookup to resolve.
result = mlflow.register_model(
    model_uri=f"runs:/{run2_id}/model_v2",
    name=model_name,
)

# Set model version alias on the version that was just created. Reading it from
# `result` instead of hard-coding "2" keeps the alias correct when the notebook
# is re-run and the registry hands out a higher version number.
# `client` is the MlflowClient created in the v1 registration cell.
client.set_registered_model_alias(
    model_name, model_version_alias, str(result.version)
)

In [None]:
# Check registered models: print the alias -> version mapping of each match
client = mlflow.tracking.MlflowClient()
# Loop variable is `registered_model`, not `model`, so the trained estimator
# bound to `model` by the training cells is not overwritten.
for registered_model in client.search_registered_models(filter_string="name LIKE 'housing_price_model'"):
    print(registered_model.aliases)

{'prod': '2', 'staging': '1'}
