# get model from mlflow registry

based on https://github.com/DataTalksClub/mlops-zoomcamp/blob/main/02-experiment-tracking/model-registry.ipynb and adapted to this project

## prepare mlflow connection

In [1]:
import mlflow
import os
from dotenv import load_dotenv
from mlflow.tracking import MlflowClient

# Pull TRACKING_SERVER_HOST / TRACKING_SERVER_HOST_PORT from a local .env file (if any).
load_dotenv()

# Alternative for a purely local setup: MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

tracking_server_ip = os.getenv("TRACKING_SERVER_HOST", "localhost")
tracking_server_port = os.getenv("TRACKING_SERVER_HOST_PORT", "5000")
MLFLOW_TRACKING_URI = f"http://{tracking_server_ip}:{tracking_server_port}"

# The sub-directory must be relative: os.path.join() throws away everything before an
# absolute component, so "/model" would resolve to the filesystem root instead of the cwd.
model_storage_path = os.path.join(os.getcwd(), "model")
print("storing model to path", model_storage_path)
# exist_ok=True keeps the cell re-runnable once the directory has been created.
os.makedirs(model_storage_path, exist_ok=True)

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

ModuleNotFoundError: No module named 'dotenv'

## get experiments

In [None]:
# MlflowClient.list_experiments() was removed in MLflow 2.0 in favour of
# search_experiments(); prefer the new name and fall back to the old one so the
# cell works with whichever MLflow version is installed.
fetch_experiments = getattr(client, "search_experiments", None) or client.list_experiments
fetch_experiments()

# Sketch for downloading a logged model into model_storage_path
# (run_id has to be set to a concrete run first):
# run_str = f"runs:/{run_id}/model"
# learn = mlflow.pyfunc.load_model(run_str, dst_path=model_storage_path)

In [None]:
# Experiment whose runs are searched in the next section (passed as `experiment_ids`).
# NOTE(review): this value looks like an experiment *name*, while search_runs filters
# on experiment IDs — confirm it matches the ID reported by the tracking server.
EXPERIMENT_ID = "project_resnet50_v1"

## get runs from experiment

In [2]:
from mlflow.entities import ViewType

# Fetch the five most accurate active runs of the experiment, best first.
runs = client.search_runs(
    experiment_ids=[EXPERIMENT_ID],
    filter_string="metrics.accuracy > 0.05",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    # MLflow only understands ASC / DESC as sort direction.
    order_by=["metrics.accuracy DESC"],
)

# Fail with a clear message instead of a NameError on `best_run_id` further down.
if not runs:
    raise ValueError(
        f"no active runs with accuracy > 0.05 found for experiment {EXPERIMENT_ID!r}"
    )

for run in runs:
    print(f"run id: {run.info.run_id}, accuracy: {run.data.metrics['accuracy']:.4f}")

# Results are sorted by accuracy in descending order, so the first run is the best one.
best_run_id = runs[0].info.run_id
best_run_id

NameError: name 'client' is not defined

In [None]:
# store run_id of best run in .env file to be used during deployment

## register best model in model registry

In [None]:
import mlflow

# Point the fluent mlflow API at the same tracking server the client uses.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# The model artifact of the best run is registered under `model_name`; the same
# variable is reused by the following cells, so pass it here instead of repeating
# the literal.
model_uri = f"runs:/{best_run_id}/model"
model_name = "best_resnet50_model"
mlflow.register_model(model_uri=model_uri, name=model_name)


In [None]:
# Ask the registry for the latest versions of the model and list each with its stage.
latest_versions = client.get_latest_versions(name=model_name)

for registered in latest_versions:
    version_number, stage_label = registered.version, registered.current_stage
    print(f"version: {version_number}, stage: {stage_label}")

In [None]:
# Move the most recently registered version of the model to the "Staging" stage.
# The version number is looked up in the registry rather than hard-coded, so the
# cell keeps working as further versions get registered.
registered_versions = client.search_model_versions(f"name='{model_name}'")
if not registered_versions:
    raise ValueError(f"no registered versions found for model {model_name!r}")

# Version numbers come back as strings; compare them numerically.
model_version = max(int(mv.version) for mv in registered_versions)
new_stage = "Staging"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    # Keep whatever is currently in Staging; nothing is archived automatically.
    archive_existing_versions=False
)

## compare new model with current deployed model and move to production stage
from here it can be picked up by a CI/CD stage

1) get test dataset
2) evaluate both models on it
3) if new model from "Staging" is better, move it to "Production" stage and remove old model from it


In [None]:

from sklearn.metrics import mean_squared_error
import pandas as pd


def read_dataframe(filename):
    """Load a trip CSV and return the rows with a plausible trip duration.

    Args:
        filename: Path or file-like object accepted by ``pandas.read_csv``. The
            file must contain the columns ``lpep_pickup_datetime``,
            ``lpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Returns:
        A new DataFrame with the two datetime columns parsed, an added
        ``duration`` column (minutes, float), only rows whose duration lies in
        ``[1, 60]`` minutes, and both location ID columns converted to ``str``.
    """
    df = pd.read_csv(filename)

    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorised timedelta -> minutes conversion instead of a per-row apply().
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame, so the column
    # assignment below does not trigger SettingWithCopyWarning.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv):
    """Turn a trip DataFrame into the feature matrix produced by ``dv``.

    A combined ``PU_DO`` column (pickup and drop-off location joined with
    ``_``) is added to ``df`` in place; the ``PU_DO`` and ``trip_distance``
    columns are then converted to a list of per-row dicts and handed to
    ``dv.transform``.

    Args:
        df: DataFrame with ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns. It is modified: ``PU_DO`` is added.
        dv: Object exposing ``transform(records)``.

    Returns:
        Whatever ``dv.transform`` returns for the row dicts.
    """
    # Note: this writes into the caller's frame.
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    feature_columns = ['PU_DO', 'trip_distance']
    row_dicts = df[feature_columns].to_dict(orient='records')
    return dv.transform(row_dicts)


def test_model(name, stage, X_test, y_test):
    """Evaluate a registered model on a test set.

    Loads the model registered under ``name`` in the given registry ``stage``
    via ``mlflow.pyfunc`` and scores its predictions for ``X_test`` against
    ``y_test``.

    Args:
        name: Registered model name.
        stage: Registry stage to load the model from (used in the
            ``models:/{name}/{stage}`` URI).
        X_test: Features passed to the model's ``predict``.
        y_test: Ground-truth targets.

    Returns:
        ``{"rmse": <root mean squared error>}``.
    """
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    # mean_squared_error(..., squared=False) was deprecated in scikit-learn 1.4 and
    # removed in 1.6; taking the square root manually works on every version
    # (identical result for single-output targets).
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    return {"rmse": rmse}