In [1]:
from mlflow.tracking import MlflowClient

# Tracking/registry backend: the SQLite database file `mlflow.db`.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
# Client object used by the cells below to query runs and manage registered models.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [4]:
# Create the experiment only when it does not exist yet. `create_experiment`
# fails if the name is already taken, which would break a re-run of the notebook.
# Either way the cell evaluates to (and displays) the experiment ID.
existing_experiment = client.get_experiment_by_name("my-cool-experiment")
(
    existing_experiment.experiment_id
    if existing_experiment is not None
    else client.create_experiment(name="my-cool-experiment")
)

'2'

In [12]:
from mlflow.entities import ViewType

# Query experiment '1' for at most 5 active runs whose logged `rmse` metric is
# below 6.3, sorted by ascending rmse.
runs = client.search_runs(
    experiment_ids='1',
    filter_string="metrics.rmse < 6.3",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [13]:
# Print each returned run's ID next to its rmse, formatted to 4 decimal places.
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

run id: 8ddc753de4e140a3879c6ae6e7846613, rmse: 6.2934
run id: 0c76291eef3945dba694c7ff4b9c19ff, rmse: 6.2934
run id: d9757abe8c7a4e0ca64d267dd621cff6, rmse: 6.2934
run id: a5637640b97344d19851cc956aba8b2a, rmse: 6.2934


In [14]:
import mlflow

# Point the module-level `mlflow` API at the same backend URI that `client` uses.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [40]:
# Register the "model" artifact of one specific run under the registry name
# 'nyc-taxi-regressor'. The run ID is hard-coded; when the registered model
# already exists, this call adds a new version (see the log output below).
run_id = "0c76291eef3945dba694c7ff4b9c19ff"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name='nyc-taxi-regressor')

Registered model 'nyc-taxi-regressor' already exists. Creating a new version of this model...
2023/05/29 23:16:41 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 4
Created version '4' of model 'nyc-taxi-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1685369801527, current_stage='None', description=None, last_updated_timestamp=1685369801527, name='nyc-taxi-regressor', run_id='0c76291eef3945dba694c7ff4b9c19ff', run_link=None, source='/Users/ryujaesung/kade/mlops-zoomcamp/week2/experiment-tracking/mlruns/1/0c76291eef3945dba694c7ff4b9c19ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [41]:
# Ask the registry for the latest versions of 'nyc-taxi-regressor' and print
# each one's name, version number and current stage.
latest_versions = client.get_latest_versions(name="nyc-taxi-regressor")
for version in latest_versions:
    print(f"name: {version.name}, version: {version.version}, stage: {version.current_stage}")

name: nyc-taxi-regressor, version: 2, stage: Staging
name: nyc-taxi-regressor, version: 3, stage: Production
name: nyc-taxi-regressor, version: 4, stage: None


In [42]:
# Move version 4 of 'nyc-taxi-regressor' to the "Production" stage.
# archive_existing_versions=False: versions already in that stage are not
# archived by this call, so more than one version can sit in Production.
client.transition_model_version_stage(
    name="nyc-taxi-regressor",
    version=4,
    stage="Production",
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1685369801527, current_stage='Production', description=None, last_updated_timestamp=1685369806096, name='nyc-taxi-regressor', run_id='0c76291eef3945dba694c7ff4b9c19ff', run_link=None, source='/Users/ryujaesung/kade/mlops-zoomcamp/week2/experiment-tracking/mlruns/1/0c76291eef3945dba694c7ff4b9c19ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [43]:
# Attach a human-readable description to version 4 of the registered model.
client.update_model_version(
    name="nyc-taxi-regressor",
    version=4,
    description="This model version was registered via the MLflow API."
)

<ModelVersion: aliases=[], creation_timestamp=1685369801527, current_stage='Production', description='This model version was registered via the MLflow API.', last_updated_timestamp=1685369808458, name='nyc-taxi-regressor', run_id='0c76291eef3945dba694c7ff4b9c19ff', run_link=None, source='/Users/ryujaesung/kade/mlops-zoomcamp/week2/experiment-tracking/mlruns/1/0c76291eef3945dba694c7ff4b9c19ff/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=4>

In [22]:
import pandas as pd
from sklearn.metrics import mean_squared_error

def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for scoring.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps only
    trips lasting between 1 and 60 minutes inclusive, and casts the pick-up and
    drop-off location IDs to strings.

    Parameters
    ----------
    filename : str or path-like
        Parquet file with ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        The filtered frame, independent of the frame that was read.
    """
    df = pd.read_parquet(filename)

    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() makes the filtered rows an independent frame, so the column
    # assignment below does not write into a slice (SettingWithCopyWarning).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df

def preprocess(df, dv):
    """Turn a trips frame into the feature matrix produced by ``dv``.

    A combined pick-up/drop-off key column ``PU_DO`` is written onto ``df``
    itself (the caller's frame is modified), then ``PU_DO`` and
    ``trip_distance`` are converted to one dict per row and passed to
    ``dv.transform``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PULocationID``, ``DOLocationID`` and ``trip_distance``.
    dv : object
        Anything exposing ``transform(list_of_dicts)``.

    Returns
    -------
    Whatever ``dv.transform`` returns for the row records.
    """
    pickup_dropoff = df['PULocationID'] + "_" + df['DOLocationID']
    df['PU_DO'] = pickup_dropoff

    # Categorical feature first, numerical second (same column order as before).
    feature_columns = ["PU_DO", "trip_distance"]
    records = df[feature_columns].to_dict(orient="records")

    return dv.transform(records)

def test_model(stage, X_test, y_test, name="nyc-taxi-regressor"):
    """Score the registered model found at ``models:/{name}/{stage}``.

    Parameters
    ----------
    stage : str
        Registry stage to load the model from (e.g. "Production").
    X_test
        Feature matrix passed to the loaded model's ``predict``.
    y_test
        True target values; expected to be 1-D, as in this notebook.
    name : str
        Registered model name.

    Returns
    -------
    dict
        ``{"rmse": <root mean squared error of the predictions>}``.
    """
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    # Take the square root of the MSE by hand: the `squared=False` argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6, so it fails on current
    # releases. For a 1-D target both forms give the same value.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    return {"rmse": rmse}

In [23]:
# Load and clean the 2021-01 green trip data that the registered model is scored on below.
df = read_dataframe("../../data/green_tripdata_2021-01.parquet")

In [35]:
# Download the run's "preprocessor" artifact into the current directory; the
# cell evaluates to the local path of the download.
# `MlflowClient.download_artifacts` is deprecated in MLflow 2.x, so use its
# replacement `mlflow.artifacts.download_artifacts`, which resolves the run
# through the tracking URI configured with `mlflow.set_tracking_uri`.
mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path="preprocessor", dst_path=".")

  client.download_artifacts(run_id=run_id, path="preprocessor", dst_path=".")


'/Users/ryujaesung/kade/mlops-zoomcamp/week2/experiment-tracking/preprocessor'

In [36]:
import pickle

# Unpickle the downloaded preprocessor; it is passed to `preprocess` as `dv`
# (presumably the fitted vectorizer logged with the run — confirm).
# NOTE(review): pickle.load can execute arbitrary code, so only load artifacts
# that come from runs you trust.
with open("./preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [37]:
# Build the feature matrix with the loaded preprocessor.
# Note: `preprocess` also adds a `PU_DO` column to `df` in place.
X_test = preprocess(df, dv)

In [38]:
# Ground-truth values to score against: the `duration` column computed in `read_dataframe`.
target = "duration"
y_test = df[target].values

In [44]:
# Score the model resolved from the "Production" stage and time the call.
%time test_model("Production", X_test, y_test)

 - mlflow (current: 2.3.2, required: mlflow==2.3)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


CPU times: user 22.3 s, sys: 479 ms, total: 22.8 s
Wall time: 3.59 s


{'rmse': 4.2308439867650565}