In [2]:
import os
import pickle
import click
import mlflow

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Experiment that is searched for the tuned runs to retrain.
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
# Experiment that receives the retrained models (set as the active experiment below).
EXPERIMENT_NAME = "random-forest-best-models"
# Hyperparameters that train_and_log_model casts to int before building the forest.
RF_PARAMS = [
    'max_depth',
    'n_estimators',
    'min_samples_split',
    'min_samples_leaf',
    'random_state',
    'n_jobs',
]

# Point MLflow at the tracking server, select the target experiment and
# switch on scikit-learn autologging for every fit that follows.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()


def load_pickle(filename):
    """Read `filename` in binary mode and return the unpickled object.

    Unpickling can execute arbitrary code, so only use this on files
    that come from a trusted source.
    """
    with open(filename, "rb") as handle:
        payload = pickle.load(handle)
    return payload


def train_and_log_model(data_path, params):
    """Retrain a random forest with the given hyperparameters and log its RMSE.

    Loads ``train.pkl``, ``val.pkl`` and ``test.pkl`` from ``data_path`` (each
    is unpickled into an ``(X, y)`` pair), fits a ``RandomForestRegressor``
    inside a new MLflow run and logs the ``val_rmse`` and ``test_rmse`` metrics.

    Args:
        data_path: Directory containing the three pickle files.
        params: Mapping that holds at least every key listed in ``RF_PARAMS``.
            Values may be strings (as returned by ``run.data.params``); they
            are converted with ``int``. The mapping is not modified, and keys
            outside ``RF_PARAMS`` are ignored.

    Raises:
        KeyError: If ``params`` lacks one of the ``RF_PARAMS`` keys.
        ValueError: If a value cannot be parsed as an integer (e.g. ``"None"``).
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    # Build a fresh dict instead of casting in place: the caller's mapping
    # (typically a run's logged params) stays untouched, and only the known
    # hyperparameters reach the estimator. Doing this before start_run also
    # avoids creating a failed run when the parameters are invalid.
    rf_params = {name: int(params[name]) for name in RF_PARAMS}

    with mlflow.start_run():
        rf = RandomForestRegressor(**rf_params)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets.
        # RMSE is taken as sqrt(MSE) explicitly because the `squared=False`
        # argument was deprecated in scikit-learn 1.4 and removed in 1.6.
        val_rmse = mean_squared_error(y_val, rf.predict(X_val)) ** 0.5
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = mean_squared_error(y_test, rf.predict(X_test)) ** 0.5
        mlflow.log_metric("test_rmse", test_rmse)





2024/05/25 22:08:43 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-models' does not exist. Creating a new experiment.


In [16]:
client = MlflowClient()

# Retrieve the five best hyperparameter-search runs (lowest logged rmse first)
experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
if experiment is None:
    # get_experiment_by_name returns None for an unknown name; fail with a
    # clear message instead of an AttributeError on `.experiment_id`.
    raise ValueError(
        f"MLflow experiment {HPO_EXPERIMENT_NAME!r} was not found on the tracking server"
    )
runs = client.search_runs(
    # search_runs is documented to take a list of experiment ids
    experiment_ids=[experiment.experiment_id],
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

# Show the hyperparameters of each selected run
for run in runs:
    print(run.data.params)


{'n_estimators': '12', 'max_depth': '19', 'min_samples_split': '10', 'min_samples_leaf': '4', 'random_state': '42', 'n_jobs': '-1'}
{'n_estimators': '34', 'max_depth': '15', 'min_samples_split': '2', 'min_samples_leaf': '4', 'random_state': '42', 'n_jobs': '-1'}
{'n_estimators': '25', 'max_depth': '20', 'min_samples_split': '8', 'min_samples_leaf': '3', 'random_state': '42', 'n_jobs': '-1'}
{'n_estimators': '28', 'max_depth': '16', 'min_samples_split': '3', 'min_samples_leaf': '3', 'random_state': '42', 'n_jobs': '-1'}
{'n_estimators': '16', 'max_depth': '4', 'min_samples_split': '2', 'min_samples_leaf': '4', 'random_state': '42', 'n_jobs': '-1'}


In [17]:
# Retrain and log one model per selected run, using that run's logged
# hyperparameters and the pickled datasets under ./output
for run in runs:
    train_and_log_model(data_path="./output", params=run.data.params)



In [28]:
# Rank the retrained runs by test RMSE, lowest first, so that the best
# run ends up at index 0 of the result
best_experiments = client.get_experiment_by_name(EXPERIMENT_NAME)
experiment_id = best_experiments.experiment_id
best_model = client.search_runs(
    experiment_ids=experiment_id,
    order_by=["metrics.test_rmse ASC"],
    max_results=5,
    filter_string="",
)

print(best_model[0])


<Run: data=<RunData: metrics={'test_rmse': 2.7968922677815167,
 'training_mean_absolute_error': 1.7709469996668474,
 'training_mean_squared_error': 6.471086504822739,
 'training_r2_score': 0.2618068680873662,
 'training_root_mean_squared_error': 2.54383303399078,
 'training_score': 0.2618068680873662,
 'val_rmse': 2.486169593903545}, params={'bootstrap': 'True',
 'ccp_alpha': '0.0',
 'criterion': 'squared_error',
 'max_depth': '15',
 'max_features': '1.0',
 'max_leaf_nodes': 'None',
 'max_samples': 'None',
 'min_impurity_decrease': '0.0',
 'min_samples_leaf': '4',
 'min_samples_split': '2',
 'min_weight_fraction_leaf': '0.0',
 'monotonic_cst': 'None',
 'n_estimators': '34',
 'n_jobs': '-1',
 'oob_score': 'False',
 'random_state': '42',
 'verbose': '0',
 'warm_start': 'False'}, tags={'estimator_class': 'sklearn.ensemble._forest.RandomForestRegressor',
 'estimator_name': 'RandomForestRegressor',
 'mlflow.log-model.history': '[{"run_id": "dc6e5ca9a82943e3a0ac9b05d51103e3", '
             

In [46]:
# Re-run the best-run query: fetch up to five runs of EXPERIMENT_NAME ordered
# by ascending test RMSE, so best_model[0] is the run with the lowest value.
# NOTE(review): this repeats the search from the previous cell — consider
# keeping only one of the two.
best_experiments = client.get_experiment_by_name(EXPERIMENT_NAME)
best_model = client.search_runs(
    experiment_ids=best_experiments.experiment_id,
    filter_string="",
    max_results=5,
    order_by=["metrics.test_rmse ASC"]
)

print(best_model[0])

<Run: data=<RunData: metrics={'test_rmse': 2.7968922677815167,
 'training_mean_absolute_error': 1.7709469996668474,
 'training_mean_squared_error': 6.471086504822739,
 'training_r2_score': 0.2618068680873662,
 'training_root_mean_squared_error': 2.54383303399078,
 'training_score': 0.2618068680873662,
 'val_rmse': 2.486169593903545}, params={'bootstrap': 'True',
 'ccp_alpha': '0.0',
 'criterion': 'squared_error',
 'max_depth': '15',
 'max_features': '1.0',
 'max_leaf_nodes': 'None',
 'max_samples': 'None',
 'min_impurity_decrease': '0.0',
 'min_samples_leaf': '4',
 'min_samples_split': '2',
 'min_weight_fraction_leaf': '0.0',
 'monotonic_cst': 'None',
 'n_estimators': '34',
 'n_jobs': '-1',
 'oob_score': 'False',
 'random_state': '42',
 'verbose': '0',
 'warm_start': 'False'}, tags={'estimator_class': 'sklearn.ensemble._forest.RandomForestRegressor',
 'estimator_name': 'RandomForestRegressor',
 'mlflow.log-model.history': '[{"run_id": "dc6e5ca9a82943e3a0ac9b05d51103e3", '
             

In [36]:
# Run id of the first (top-ranked) entry in best_model
print(best_model[0].info.run_id)

dc6e5ca9a82943e3a0ac9b05d51103e3


In [47]:
# test_rmse metric logged for the first (top-ranked) entry in best_model
best_test_rmse = best_model[0].data.metrics['test_rmse']
print(best_test_rmse)

2.7968922677815167


In [37]:
# Register the model artifact of the top-ranked run in the model registry.
# The URI points at the "model" artifact path of that run.
run_id = best_model[0].info.run_id
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="random-nyc-taxi-models", tags={"model": "best"})

Registered model 'random-nyc-taxi-models' already exists. Creating a new version of this model...
2024/05/25 22:40:42 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random-nyc-taxi-models, version 1
Created version '1' of model 'random-nyc-taxi-models'.


<ModelVersion: aliases=[], creation_timestamp=1716651642842, current_stage='None', description='', last_updated_timestamp=1716651642842, name='random-nyc-taxi-models', run_id='dc6e5ca9a82943e3a0ac9b05d51103e3', run_link='', source='mlflow-artifacts:/5/dc6e5ca9a82943e3a0ac9b05d51103e3/artifacts/model', status='READY', status_message='', tags={'model': 'best'}, user_id='', version='1'>