### MLFlow model registry

The key idea in this notebook is to do the same done in the ui.

In [46]:
from mlflow.tracking import MlflowClient
from mlflow import register_model, set_tracking_uri
import mlflow

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.get_experiment(1)

<Experiment: artifact_location='/home/yezergm/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1743783492519, experiment_id='1', last_update_time=1743783492519, lifecycle_stage='active', name='NYC-taxi-duration-prediction', tags={}>

In [3]:
client.create_experiment("fake-exp-02")
client.get_experiment(3)

<Experiment: artifact_location='/home/yezergm/mlops-zoomcamp/02-experiment-tracking/mlruns/3', creation_time=1744365418165, experiment_id='3', last_update_time=1744365418165, lifecycle_stage='active', name='fake-exp-02', tags={}>

In [2]:
from mlflow.entities import ViewType
runs = client.search_runs(
    experiment_ids="1",
    filter_string="metrics.rmse < 10.0",
    order_by=["metrics.rmse ASC"],
    run_view_type=ViewType.ACTIVE_ONLY,
)
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}, {run.info.run_name}")


run id: 3463b2d27a214a959b40ab09c3307d55, rmse: 6.3372, rebellious-calf-803
run id: b20a13d5349c4884abe7878647f4b3e0, rmse: 6.6505, dazzling-shrew-25
run id: 82edfac7c99c4e7398548e465fbc371e, rmse: 6.7113, overjoyed-bird-183
run id: 30eb57108c0041b6b8ff08d1479a6ded, rmse: 8.0407, nimble-asp-409


In [52]:
# register models with commands
run_id = "30eb57108c0041b6b8ff08d1479a6ded"
model_uri = f"runs:/{run_id}/models_mlflow" # go to run annd check if the path exists
set_tracking_uri(MLFLOW_TRACKING_URI)
result = register_model(
    model_uri=model_uri,
    name="nyc-taxi-regressor",
)

# we can add an aliases in ui


Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
Created version '8' of model 'nyc-taxi-regressor'.


In [44]:
# It is important to delete the alias before deleting the model version
# whether not the model can see and list the aliases of former registered models
client.delete_registered_model_alias(name="nyc-taxi-regressor", alias="tontito")

### Comparing versions of registered models with new data, to select the new champion model.

steps
1. Load the test dataset, which corresponds to the NYC Green Taxi data from the month of March 2021.
2. Download the DictVectorizer that was fitted using the training data and saved to MLflow as an artifact, and load it with pickle.
3. Preprocess the test set using the DictVectorizer so we can properly feed the regressors.
4. Make predictions on the test set using the model versions that are currently in the "Staging" and "Production" stages, and compare their performance.
Based on the results, update the "Production" model version accordingly.

In [54]:
import pandas as pd
from sklearn.metrics import root_mean_squared_error
import shutil
import pickle
from mlflow.tracking import MlflowClient
from mlflow import register_model, set_tracking_uri
import mlflow

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)


def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    cat = ['PU_DO']
    num = ['trip_distance']
    target = 'duration'
    df_dicts = df[cat + num].to_dict(orient='records')
    X_ = dv.transform(df_dicts)
    y_ = df[target].values
    return X_, y_

def test_model(model_path, X_test, y_test):
    model = mlflow.pyfunc.load_model(model_path)
    y_pred = model.predict(X_test)
    return {"rmse": root_mean_squared_error(y_test, y_pred)}

##### get metrics over the registered models over the new data

In [58]:
model_name = "nyc-taxi-regressor"
data_path = "data/green_tripdata_2021-03.parquet"

model_ranking = {}
registered_model =client.get_registered_model(model_name)
print(registered_model)
for key, value in registered_model.aliases.items():
    print(f"Aliases: {key} -- Version: {value}")
    retrieved_model = client.get_model_version(model_name, value)
    model_path = client.get_model_version_download_uri(name=model_name, version=value)
    print(client.get_model_version_download_uri(name=model_name, version=value))
    shutil.rmtree("preprocessor")
    mlflow.artifacts.download_artifacts(
        f"runs:/{run_id}/preprocessor/preprocessor.b",
        dst_path = "preprocessor")
    with open("preprocessor/preprocessor.b", "rb") as f_in:
        dv = pickle.load(f_in)
    df = read_dataframe(data_path)
    X_test, y_test = preprocess(df, dv)
    rmse = test_model(model_path=model_path, X_test=X_test, y_test=y_test)
    model_ranking[key] = {'run_id': run_id, 'version': value, 'rmse': rmse}

model_ranking




<RegisteredModel: aliases={'challenger': 2, 'champion': 1, 'toletito': 6}, creation_timestamp=1744122608228, description='This is a model to calculate taxi ride duration', last_updated_timestamp=1744378081892, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1744377032759, current_stage='None', description='', last_updated_timestamp=1744377032759, name='nyc-taxi-regressor', run_id='82edfac7c99c4e7398548e465fbc371e', run_link='', source='/home/yezergm/mlops-zoomcamp/02-experiment-tracking/mlruns/1/82edfac7c99c4e7398548e465fbc371e/artifacts/models_mlflow', status='READY', status_message=None, tags={}, user_id=None, version=6>], name='nyc-taxi-regressor', tags={}>
Aliases: challenger -- Version: 2
/home/yezergm/mlops-zoomcamp/02-experiment-tracking/mlruns/1/b20a13d5349c4884abe7878647f4b3e0/artifacts/models_mlflow


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Aliases: champion -- Version: 1
/home/yezergm/mlops-zoomcamp/02-experiment-tracking/mlruns/1/3463b2d27a214a959b40ab09c3307d55/artifacts/models_mlflow


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Aliases: toletito -- Version: 6
/home/yezergm/mlops-zoomcamp/02-experiment-tracking/mlruns/1/82edfac7c99c4e7398548e465fbc371e/artifacts/models_mlflow


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

{'challenger': {'run_id': '30eb57108c0041b6b8ff08d1479a6ded',
  'version': 2,
  'rmse': {'rmse': 6.579025207611266}},
 'champion': {'run_id': '30eb57108c0041b6b8ff08d1479a6ded',
  'version': 1,
  'rmse': {'rmse': 6.278375017835054}},
 'toletito': {'run_id': '30eb57108c0041b6b8ff08d1479a6ded',
  'version': 6,
  'rmse': {'rmse': 6.63079976082962}}}