# get model from mlflow registry

based on https://github.com/DataTalksClub/mlops-zoomcamp/blob/main/02-experiment-tracking/model-registry.ipynb and adapted to this project

## prepare mlflow connection

In [6]:
import mlflow
import os
from dotenv import load_dotenv
from mlflow.tracking import MlflowClient

# Load variables from a .env file into the process environment; they are read below via os.getenv.
load_dotenv()

# NOTE: please adapt this to your setup
PATH_TO_THIS_REPO = "/home/ubuntu/mlops_zoomcamp_homework/"

# Build the tracking URI from the environment, falling back to localhost:5000.
tracking_server_ip = os.getenv("TRACKING_SERVER_HOST", "localhost")
tracking_server_port = os.getenv("TRACKING_SERVER_HOST_PORT", "5000")
MLFLOW_TRACKING_URI = f"http://{tracking_server_ip}:{tracking_server_port}"
model_storage_path = os.path.join(PATH_TO_THIS_REPO, "model")
print("storing model to path", model_storage_path)

# exist_ok=True makes this a no-op when the directory already exists,
# so no separate (and racy) os.path.exists check is needed.
os.makedirs(model_storage_path, exist_ok=True)

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

storing model to path /home/ubuntu/mlops_zoomcamp_homework/model


## get experiments

In [7]:
# Show the experiments known to the tracking server.
# search_experiments() is the replacement for the deprecated list_experiments().
client.search_experiments()

# Example (not executed): load the model logged by a run, downloading it to model_storage_path.
#run_str = f"runs:/{run_id}/model"
#learn = mlflow.pyfunc.load_model(run_str, dst_path=model_storage_path)

  client.list_experiments()


[<Experiment: artifact_location='s3://mlflow-artifacts-remote-xydo/0', creation_time=None, experiment_id='0', last_update_time=None, lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='s3://mlflow-artifacts-remote-xydo/1', creation_time=None, experiment_id='1', last_update_time=None, lifecycle_stage='active', name='my-experiment-1', tags={}>,
 <Experiment: artifact_location='s3://mlflow-artifacts-remote-xydo/2', creation_time=None, experiment_id='2', last_update_time=None, lifecycle_stage='active', name='project_resnet50_first', tags={}>,
 <Experiment: artifact_location='s3://mlflow-artifacts-remote-xydo/3', creation_time=None, experiment_id='3', last_update_time=None, lifecycle_stage='active', name='project_resnet50_parallel', tags={}>,
 <Experiment: artifact_location='s3://mlflow-artifacts-remote-xydo/4', creation_time=None, experiment_id='4', last_update_time=None, lifecycle_stage='active', name='project_resnet50_power', tags={}>,
 <Experiment: artif

In [13]:
# Select the ID of the experiment whose runs should be searched (an experiment ID, not a run ID).
# NOTE: adapt this to your setup; pick the ID of the experiment you want from the list above.
# In my case the experiment is 'project_resnet50_v1'.
EXPERIMENT_ID = "6"  # for experiment name project_resnet50_v1

## get runs from experiment

In [15]:
from mlflow.entities import ViewType

# Fetch up to five active runs of the selected experiment with accuracy > 0.5,
# best accuracy first.
runs = client.search_runs(
    experiment_ids=[EXPERIMENT_ID],  # the API documents a list of experiment IDs
    filter_string="metrics.accuracy > 0.5",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.accuracy DESC"],
)

# Fail with a clear message instead of leaving best_run_id undefined
# (or silently stale from an earlier execution) when nothing matches.
if not runs:
    raise ValueError(
        f"no active run with accuracy > 0.5 found in experiment {EXPERIMENT_ID}"
    )

for run in runs:
    print(f"run id: {run.info.run_id}, accuracy: {run.data.metrics['accuracy']:.4f}")

# The results are ordered by accuracy (descending), so the first run is the best one.
best_run_id = runs[0].info.run_id
best_run_id

run id: e1c3003940e14f3f872dea8521bb1cd6, accuracy: 0.7838
run id: 5e8554e66b654f54ae52a7aeb9ff8b0d, accuracy: 0.5135


'e1c3003940e14f3f872dea8521bb1cd6'

In [None]:
# NOTE: store the model URI of the best run in the .env file so it can be used during deployment, in the format runs:/e1c3003940e14f3f872dea8521bb1cd6/model

## register best model in model registry

In [16]:
import mlflow

# Point the fluent mlflow API at the same tracking server that `client` uses.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
model_uri = f"runs:/{best_run_id}/model"
model_name = "best_resnet50_model"
# Register under model_name (not a repeated literal) so that the following cells,
# which look the model up via model_name, always refer to the model registered here.
mlflow.register_model(model_uri=model_uri, name=model_name)


Successfully registered model 'best_resnet50_model'.
2022/09/25 07:02:14 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: best_resnet50_model, version 1
Created version '1' of model 'best_resnet50_model'.


<ModelVersion: creation_timestamp=1664089334596, current_stage='None', description='', last_updated_timestamp=1664089334596, name='best_resnet50_model', run_id='e1c3003940e14f3f872dea8521bb1cd6', run_link='', source='s3://mlflow-artifacts-remote-xydo/6/e1c3003940e14f3f872dea8521bb1cd6/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [17]:
# Ask the registry for the latest versions of the registered model and list their stages.
latest_versions = client.get_latest_versions(name=model_name)

for registered in latest_versions:
    summary = f"version: {registered.version}, stage: {registered.current_stage}"
    print(summary)

version: 1, stage: None


In [20]:
# Move one version of the registered model into the 'Production' stage.
# NOTE: set model_version according to the versions printed by the previous cell.
model_version = 1
new_stage = "Production"

# Versions already in the target stage are left where they are (no archiving).
promotion = dict(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False,
)
client.transition_model_version_stage(**promotion)

<ModelVersion: creation_timestamp=1664089334596, current_stage='Production', description='', last_updated_timestamp=1664089429007, name='best_resnet50_model', run_id='e1c3003940e14f3f872dea8521bb1cd6', run_link='', source='s3://mlflow-artifacts-remote-xydo/6/e1c3003940e14f3f872dea8521bb1cd6/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

## TODO compare the new model with the currently deployed model and move it to the production stage
From here it can be picked up by a CI/CD stage.

1) get the test dataset
2) evaluate both models on it
3) if new model from "Staging" is better, move it to "Production" stage and remove old model from it


In [None]:
# NOTE: execute these cells only if you already have a model in production and want to replace it
# TODO update these cells (from MLOps Zoomcamp) matching this project

from sklearn.metrics import mean_squared_error
import pandas as pd


def read_dataframe(filename):
    """Load a trip-records CSV and derive each trip's duration in minutes.

    Parameters
    ----------
    filename : str or path-like
        CSV file passed to ``pd.read_csv``. It must contain the columns
        ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``, ``PULocationID``
        and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The rows whose duration lies between 1 and 60 minutes (inclusive),
        with an added float ``duration`` column and the two location ID
        columns converted to strings.
    """
    df = pd.read_csv(filename)

    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorized conversion; unlike a row-wise apply it also yields a float
    # column when the file has no data rows, so the filter below still works.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Copy the filtered rows so the assignment below writes to an independent
    # frame instead of a slice of the original (avoids SettingWithCopyWarning).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv):
    """Turn ``df`` into feature dictionaries and vectorize them with ``dv``.

    A combined pickup/drop-off column ``PU_DO`` is written into ``df`` itself.
    One dict per row, holding ``PU_DO`` and ``trip_distance``, is then handed
    to ``dv.transform`` and its result is returned unchanged.
    """
    # NOTE: this adds the column to the caller's frame, not to a copy.
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    feature_columns = ['PU_DO', 'trip_distance']
    records = df[feature_columns].to_dict(orient='records')
    return dv.transform(records)


def test_model(name, stage, X_test, y_test):
    """Score the registered model ``name`` in ``stage`` on a test set.

    Loads ``models:/{name}/{stage}`` through ``mlflow.pyfunc.load_model``,
    predicts on ``X_test`` and compares the predictions with ``y_test``.

    Returns
    -------
    dict
        ``{"rmse": <root mean squared error>}``.
    """
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    # Take the square root explicitly: the `squared=False` flag of
    # mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
    # For a single target column this gives the same value as squared=False did.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    return {"rmse": rmse}