In [2]:
from mlflow.tracking import MlflowClient


MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [6]:
# Indirectly list all available experiments
client.search_experiments()

[<Experiment: artifact_location='file:///e:/Projects/Git/mlops_selflearn/02-experiment-tracking/mlruns/1', creation_time=1684855005802, experiment_id='1', last_update_time=1684855005802, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1684854568996, experiment_id='0', last_update_time=1684854568996, lifecycle_stage='active', name='Default', tags={}>]

In [7]:
# Create a new experiment
client.create_experiment(name="jupyter-created-experiment")

'2'

In [11]:
# Search for runs in the experiment
from mlflow.entities import ViewType

runs = client.search_runs(
    experiment_ids=1,
    filter_string='metrics.rmse < 6.8',
    run_view_type= ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=['metrics.rmse ASC']
)

for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

run id: a2d9f5d0f19e430fa3779542d3269539, rmse: 6.2726
run id: e5cb769d83304c9d926b0f3af20f6444, rmse: 6.2765
run id: 100f36fdff744e1689c204a596a2a01a, rmse: 6.2835
run id: 1e9ea5af29224725886fd97139814715, rmse: 6.2859
run id: c59f76789a7a4dbdb03bf3a9d85856d8, rmse: 6.2870


In [15]:
runs = client.search_runs(
    experiment_ids=1,
    filter_string='tags.model="final_xgboost"',
    run_view_type= ViewType.ACTIVE_ONLY,
    max_results=1
)

for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

run id: 92a3c7a4793b473382de2221a55f029e, rmse: 6.3055


In [12]:
import mlflow

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [18]:
# Register model
run_id = '92a3c7a4793b473382de2221a55f029e'
mlflow.register_model(model_uri=f'runs:/{run_id}/model', name='nyc-taxi-regressor')

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
2023/05/27 13:07:12 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 1
Created version '1' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1685164032205, current_stage='None', description=None, last_updated_timestamp=1685164032205, name='nyc-taxi-regressor', run_id='92a3c7a4793b473382de2221a55f029e', run_link=None, source='file:///e:/Projects/Git/mlops_selflearn/02-experiment-tracking/mlruns/1/92a3c7a4793b473382de2221a55f029e/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [19]:
client.search_registered_models()

[<RegisteredModel: aliases={}, creation_timestamp=1685163992244, description=None, last_updated_timestamp=1685164032205, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1685164032205, current_stage='None', description=None, last_updated_timestamp=1685164032205, name='nyc-taxi-regressor', run_id='92a3c7a4793b473382de2221a55f029e', run_link=None, source='file:///e:/Projects/Git/mlops_selflearn/02-experiment-tracking/mlruns/1/92a3c7a4793b473382de2221a55f029e/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>], name='nyc-taxi-regressor', tags={}>]

In [21]:
model_name = "nyc-taxi-regressor"
latest_ver = client.get_latest_versions(name=model_name)

for ver in latest_ver:
    print(f"{ver.version=}, {ver.current_stage=}")

ver.version=1, ver.current_stage='None'


In [23]:
# Stage the model
model_version=1
new_stage='staging'
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

In [24]:
# Update model desc
from datetime import datetime

date = datetime.today().date()
client.update_model_version(
    name=model_name,
    version=1,
    description=f'The model version {model_version} was transitioned to {new_stage} on {date}.'
)

<ModelVersion: aliases=[], creation_timestamp=1685164032205, current_stage='Staging', description='The model version 1 was transitioned to staging on 2023-05-27.', last_updated_timestamp=1685164615494, name='nyc-taxi-regressor', run_id='92a3c7a4793b473382de2221a55f029e', run_link=None, source='file:///e:/Projects/Git/mlops_selflearn/02-experiment-tracking/mlruns/1/92a3c7a4793b473382de2221a55f029e/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

#### Simulated end-to-end run

In [38]:
from sklearn.metrics import mean_squared_error
import pandas as pd


def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    categorical = ['PU_DO']
    numerical = ['trip_distance']

    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.fit_transform(train_dicts)

def test_model(name, stage, X_test, y_test):
        # Default model load uri
        # model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
        
        # Note: when using custom log model folder, need to specify the custom folder directly
        model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}/../models_mlflow")
        y_pred = model.predict(X_test)
        return {"rmse": mean_squared_error(y_test, y_pred, squared=False)}


In [42]:
df = read_dataframe("data/green_tripdata_2021-02.parquet")

In [27]:
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

  client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')


'e:\\Projects\\Git\\mlops_selflearn\\02-experiment-tracking\\preprocessor'

In [28]:
import pickle

with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [43]:
X_test = preprocess(df, dv)

In [44]:
target = "duration"
y_test = df[target].values

In [47]:
%time test_model(name=model_name, stage='Staging', X_test=X_test, y_test=y_test)

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


CPU times: total: 10.5 s
Wall time: 803 ms


{'rmse': 12.87621730657822}

In [48]:
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage="production",
    archive_existing_versions=True
)

<ModelVersion: aliases=[], creation_timestamp=1685164032205, current_stage='Production', description='The model version 1 was transitioned to staging on 2023-05-27.', last_updated_timestamp=1685166587490, name='nyc-taxi-regressor', run_id='92a3c7a4793b473382de2221a55f029e', run_link=None, source='file:///e:/Projects/Git/mlops_selflearn/02-experiment-tracking/mlruns/1/92a3c7a4793b473382de2221a55f029e/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>