# MLflow intro

### What to track?
* Source code
* Environment
* Data
* Model
* Hyperparameters
* Metrics

### What does MLflow do?
MLflow is a Python package with four main components: Tracking, Models, Model Registry, and Projects.

### Installing MLflow

pip: `pip install mlflow`

conda: `conda install -c conda-forge mlflow`

To run the MLflow UI locally, use the command below. Run it from the directory that contains `mlflow.db`, because the SQLite URI in the command is a relative path:

```
mlflow ui --backend-store-uri sqlite:///mlflow.db --port 5004
```

Setup MLflow experiment:

Import mlflow in notebook, set the database address and experiment name.

In [1]:
import mlflow
# Store run metadata in the local SQLite file, then select the experiment
# under which the runs of this notebook are recorded.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name="nyc_taxis")

<Experiment: artifact_location='/Users/yihanzhou/PycharmProjects/mlops-zoomcamp/00-my-work/mlruns/1', creation_time=1716360482562, experiment_id='1', last_update_time=1716360482562, lifecycle_stage='active', name='nyc_taxis', tags={}>

In [2]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
import pickle
import os

In [3]:
# Directory holding the green-taxi parquet files. Set the MLOPS_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset the original location is used, so existing setups keep working.
data_dir = os.environ.get(
    "MLOPS_DATA_DIR",
    "/Users/yihanzhou/PycharmProjects/mlops-zoomcamp/data",
)
data_2401_path = os.path.join(data_dir, "green_tripdata_2024-01.parquet")  # read into df_train
data_2402_path = os.path.join(data_dir, "green_tripdata_2024-02.parquet")  # read into df_val

In [4]:
def read_dataframe(path):
    """Load one parquet file of green-taxi trips and prepare it for modelling.

    A ``duration`` column (minutes between ``lpep_pickup_datetime`` and
    ``lpep_dropoff_datetime``) is derived, only trips lasting strictly more
    than 1 and strictly less than 60 minutes are kept, and the two
    location-ID columns are cast to strings.

    Args:
        path: Location of the parquet file, passed straight to
            ``pd.read_parquet``.

    Returns:
        A DataFrame holding the filtered trips, including the ``duration``
        column and string-typed ``PULocationID`` / ``DOLocationID``.
    """
    df = pd.read_parquet(path)

    # Vectorised timedelta -> minutes instead of a per-row Python lambda.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() makes the filtered result an independent frame, so the column
    # assignment below does not write into a slice of the original
    # (which triggers pandas' SettingWithCopyWarning).
    df = df[(df['duration'] > 1.0) & (df['duration'] < 60.0)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df

In [5]:
# The 2024-01 file becomes the training frame, the 2024-02 file the validation frame.
df_train = read_dataframe(path=data_2401_path)
df_val = read_dataframe(path=data_2402_path)

In [6]:
# Cross feature: pickup and drop-off location IDs joined as "<PU>_<DO>",
# added to both the training and the validation frame.
for trips in (df_train, df_val):
    trips['PU_DOLocationID'] = trips['PULocationID'] + "_" + trips['DOLocationID']

In [7]:
# Columns fed to the model: three categorical IDs plus the trip distance.
categorical = ['PULocationID','DOLocationID','PU_DOLocationID']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# One dict per row is the input format DictVectorizer expects.
train_dicts = df_train[feature_columns].to_dict(orient="records")
val_dicts = df_val[feature_columns].to_dict(orient="records")

# Fit the vectorizer on the training rows only, then reuse it for validation.
dv = DictVectorizer()
x_train = dv.fit_transform(train_dicts)
x_val = dv.transform(val_dicts)

In [8]:
# Regression target: the `duration` column of each frame, as plain arrays.
target = 'duration'
y_train, y_val = df_train[target].values, df_val[target].values

# Track model run

```
mlflow.{model}.autolog() ## make sure this is before mlflow.start_run()
with mlflow.start_run() as run:
	# .... modeling....

	mlflow.set_tag('tag_name', 'tag_value')
	mlflow.log_param('param_name', param_obj) or mlflow.log_params(params_dict)
	mlflow.sklearn.log_model(model_obj, artifact_path = 'path')
	mlflow.log_metric('metric_name', metric_obj)
	mlflow.log_artifact(artifact_obj or local_artifact_path, artifact_path)

mlflow.end_run() ## not necessary if using `with mlflow.start_run() as run:`
```

In [9]:
# Untracked baseline: plain linear regression evaluated on the validation set.
lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_val)

# RMSE as the square root of the MSE. The `squared=False` argument was
# removed from mean_squared_error in scikit-learn 1.6, so this form works on
# both old and new releases.
mean_squared_error(y_val, y_pred) ** 0.5



5.919781845355132

In [16]:
import os

# Write the fitted `lr` estimator to a pickle file so that it can be attached
# to an MLflow run as an artifact afterwards.
base_dir = "/Users/yihanzhou/PycharmProjects/mlops-zoomcamp/00-my-work/wk2/models"
model_filename = "lin_reg.bin"

# Create the target folder when it is missing; an existing folder is fine.
os.makedirs(base_dir, exist_ok=True)

model_path = os.path.join(base_dir, model_filename)
with open(model_path, mode='wb') as model_file:
    pickle.dump(lr, model_file)

In [17]:
# Tracked run: Lasso regression with its parameters, metric and pickled model.
with mlflow.start_run():
    mlflow.set_tag("developer", "yihan")
    # Log the path variables instead of repeating the literals, so the
    # recorded paths always match the files that were actually read.
    mlflow.log_param("train-data-path", data_2401_path)
    mlflow.log_param("val-data-path", data_2402_path)

    alpha = 0.01
    mlflow.log_param("alpha", alpha)

    lr = Lasso(alpha=alpha)
    lr.fit(x_train, y_train)

    y_pred = lr.predict(x_val)

    # sqrt(MSE): `squared=False` was removed in scikit-learn 1.6.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Save the model trained in THIS run before logging it; otherwise the
    # artifact would be whatever an earlier cell pickled to `model_path`.
    with open(model_path, 'wb') as f_out:
        pickle.dump(lr, f_out)

    # Log while the run is still active. Calling mlflow.end_run() before
    # this line makes MLflow open a second, implicit run for the artifact.
    # The `with` block ends the run on exit, so no explicit end_run() is needed.
    mlflow.log_artifact(local_path=model_path, artifact_path="models_pickle")



In [13]:
# Shell escape: list the contents of the notebook's current working directory.
!ls

mlflow.db                preprocessor             wk2_notes.ipynb
mlruns                   wk2_homework.ipynb
models                   wk2_model_registry.ipynb


In [None]:
# Untracked baseline: Ridge regression evaluated on the validation set.
lr = Ridge(alpha=0.01)
lr.fit(x_train, y_train)

y_pred = lr.predict(x_val)

# sqrt(MSE): `squared=False` was removed from mean_squared_error in
# scikit-learn 1.6, so take the root explicitly.
mean_squared_error(y_val, y_pred) ** 0.5

### Track model run with XGBoost

```
mlflow.{model}.autolog() ## make sure this is before mlflow.start_run()
with mlflow.start_run() as run:
	# .... modeling....

	mlflow.set_tag('tag_name', 'tag_value')
	mlflow.log_param('param_name', param_obj) or mlflow.log_params(params_dict)
	mlflow.xgboost.log_model(model_obj, artifact_path = 'path')
	mlflow.log_metric('metric_name', metric_obj)
	mlflow.log_artifact(artifact_obj or local_artifact_path, artifact_path)

mlflow.end_run() ## not necessary if using `with mlflow.start_run() as run:`
```

In [None]:
import xgboost as xgb

# hyperopt uses Bayesian methods to find the best set of hyperparameters; alternatives include Optuna and Ray Tune (more comprehensive).

# fmin      -> finds the parameter values that minimise the objective's output
# tpe       -> the algorithm that controls the search logic
# hp        -> defines the search space
# STATUS_OK -> signal returned at the end of each run to tell the library about its status
# Trials    -> tracks information about each run
# scope     -> used to define integer-valued ranges
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials 
from hyperopt.pyll import scope 

In [None]:
# Wrap the vectorised features and targets in XGBoost's DMatrix container:
# one for training, one for validation.
train = xgb.DMatrix(data=x_train, label=y_train)
valid = xgb.DMatrix(data=x_val, label=y_val)

In [None]:
def objective(params):
    """Train one XGBoost candidate and report its validation RMSE.

    Every call opens its own MLflow run, tags it ``model=xgboost`` and logs
    both ``params`` and the resulting ``rmse`` metric. Training uses the
    notebook-level ``train`` / ``valid`` DMatrix objects and stops early when
    the validation score has not improved for 50 rounds.

    Args:
        params: Dictionary of XGBoost training parameters for this trial.

    Returns:
        A dict with ``loss`` (the validation RMSE) and ``status``
        (``STATUS_OK``), the result format passed back to hyperopt.
    """
    with mlflow.start_run():
        mlflow.set_tag('model', 'xgboost')
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, "validation")],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # sqrt(MSE): `squared=False` was removed from mean_squared_error in
        # scikit-learn 1.6.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Search space that hyperopt samples from on every trial.
search_space = {
    # Tree depth: quniform yields floats, scope.int casts them because
    # max_depth must be an integer.
    "max_depth": scope.int(hp.quniform("max_depth", 4, 100, 1)),
    # loguniform(-3, 0) samples from [exp(-3), exp(0)] ~= [0.05, 1].
    "learning_rate": hp.loguniform("learning_rate", -3, 0),
    "reg_alpha": hp.loguniform('reg_alpha', -5, -1),
    "reg_lambda": hp.loguniform('reg_lambda', -6, -1),
    "min_child_weight": hp.loguniform('min_child_weight', -1, 3),
    # "reg:linear" is a deprecated alias; "reg:squarederror" is the current
    # name of the same squared-error objective.
    "objective": "reg:squarederror",
    "seed": 42,
}

# Minimise `objective` over the space with the TPE algorithm for 50 trials;
# the Trials object records the outcome of every evaluation.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

# Save model

## Autologging

Instead of logging parameters by hand, specifying each one and passing it explicitly, we can use MLflow's autologging feature. There are two ways to use autologging. The first is to enable it globally in the code/notebook using 
```python
mlflow.autolog()
```

or by enabling the framework-specific autologger; ex with XGBoost:

```python
mlflow.xgboost.autolog()
```
Both must be done before running the experiments.

The autologger then not only stores the model parameters for ease of use, it also stores other files inside the `model` (can be specified) folder inside our experiment artifact folder, these files include:
+ `conda.yaml` and `requirements.txt`: Files which define the current environment for use with either `conda` or `pip` respectively
+ `MLmodel` an internal MLflow file for organization
+ Other framework-specific files such as the model itself


In [58]:
mlflow.xgboost.autolog() # ensure that mlflow.xgboost.autolog() is called before any DMatrix objects are instantiated

params = {
        "learning_rate":0.05962727037069885,
        "max_depth":8,
        "min_child_weight": 1.6262090424037392,
        # "reg:linear" is a deprecated alias of the squared-error objective.
        "objective": "reg:squarederror",
        "reg_alpha": 0.21113459659178727,
        "reg_lambda": 0.24977063291390747,
        "seed": 42}

# One explicit run for everything. Without it, autolog opens and closes its
# own run around xgb.train(), and the log_artifact/log_model calls below
# implicitly start a second run that is never ended (which then makes the
# next `mlflow.start_run()` fail). Inside an active run, autolog logs to
# that run and leaves closing it to the `with` block.
with mlflow.start_run():
    train = xgb.DMatrix(x_train, label=y_train)
    valid = xgb.DMatrix(x_val, label=y_val)
    booster = xgb.train(
                params=params,
                dtrain=train,
                num_boost_round=1000,
                evals=[(valid,"validation")],
                early_stopping_rounds=50
            )

    # Store the fitted DictVectorizer next to the model; it is needed to
    # transform raw records at inference time.
    prep_path = "models/preprocessor.b"
    os.makedirs(os.path.dirname(prep_path), exist_ok=True)
    with open(prep_path, 'wb') as file:
        pickle.dump(dv, file)
    mlflow.log_artifact(prep_path, artifact_path = "preprocessor")

    # Explicit copy under "models_mlflow", the artifact path used by the
    # loading cells of this notebook.
    model_info = mlflow.xgboost.log_model(booster, artifact_path = "models_mlflow")

# Display the ModelInfo of the logged model as the cell output.
model_info


[0]	validation-rmse:8.75440
[1]	validation-rmse:8.44120
[2]	validation-rmse:8.15312
[3]	validation-rmse:7.88895
[4]	validation-rmse:7.64594
[5]	validation-rmse:7.42486
[6]	validation-rmse:7.22222
[7]	validation-rmse:7.03797
[8]	validation-rmse:6.87173
[9]	validation-rmse:6.71989
[10]	validation-rmse:6.58330
[11]	validation-rmse:6.45792
[12]	validation-rmse:6.34404
[13]	validation-rmse:6.24138
[14]	validation-rmse:6.14943




[15]	validation-rmse:6.06460
[16]	validation-rmse:5.98775
[17]	validation-rmse:5.91877
[18]	validation-rmse:5.85717
[19]	validation-rmse:5.80083
[20]	validation-rmse:5.74932
[21]	validation-rmse:5.70417
[22]	validation-rmse:5.66419
[23]	validation-rmse:5.62669
[24]	validation-rmse:5.59157
[25]	validation-rmse:5.56037
[26]	validation-rmse:5.53387
[27]	validation-rmse:5.50760
[28]	validation-rmse:5.48418
[29]	validation-rmse:5.46260
[30]	validation-rmse:5.44421
[31]	validation-rmse:5.42660
[32]	validation-rmse:5.41038
[33]	validation-rmse:5.39503
[34]	validation-rmse:5.38172
[35]	validation-rmse:5.37006
[36]	validation-rmse:5.35918
[37]	validation-rmse:5.34890
[38]	validation-rmse:5.33952
[39]	validation-rmse:5.33033
[40]	validation-rmse:5.32135
[41]	validation-rmse:5.31492
[42]	validation-rmse:5.30652
[43]	validation-rmse:5.29838
[44]	validation-rmse:5.29253
[45]	validation-rmse:5.28650
[46]	validation-rmse:5.28110
[47]	validation-rmse:5.27610
[48]	validation-rmse:5.27167
[49]	validatio



<mlflow.models.model.ModelInfo at 0x33ca08a70>

Instead of autolog we can also log the model using `log_model`, before doing that we need to disable the global setting for autolog:
`mlflow.xgboost.autolog(disable=True)`
## Log model
we add a line to our `with mlflow.start_run()` block:

```python
mlflow.<framework>.log_model(model, artifact_path="models_mlflow")
```

where `<framework>` could be `sklearn`, `xgboost`...etc.
The `artifact_path` defines where in the `artifact_uri` the model is stored.

This will save our model inside `models_mlflow` directory in the experiment folder. (Using Autologging would store more data on parameters as well as the model. i.e: This is redundant when using the autologger).

## Log other artifacts
Sometimes we may want to save some artifacts, e.g. `DictVectorizer` object, for inference. In that case we save the artifact as:
```
    prep_path = "models/preprocessor.b"
    os.makedirs(os.path.dirname(prep_path), exist_ok=True)
    with open(prep_path, 'wb') as file:
        pickle.dump(dv, file)
    mlflow.log_artifact(prep_path, artifact_path = "preprocessor")
```

In [57]:
# Turn XGBoost autologging off so that the next run logs everything explicitly.
mlflow.xgboost.autolog(disable=True)

In [None]:
# Manually tracked XGBoost run: parameters, validation RMSE, preprocessor
# and model are all logged by hand.
with mlflow.start_run():
    params = {
        "learning_rate":0.05962727037069885,
        "max_depth":8,
        "min_child_weight": 1.6262090424037392,
        # "reg:linear" is a deprecated alias of the squared-error objective.
        "objective": "reg:squarederror",
        "reg_alpha": 0.21113459659178727,
        "reg_lambda": 0.24977063291390747,
        "seed": 42}

    train = xgb.DMatrix(x_train, label=y_train)
    valid = xgb.DMatrix(x_val, label=y_val)

    mlflow.log_params(params)

    booster = xgb.train(
                params=params,
                dtrain=train,
                num_boost_round=1000,
                evals=[(valid,"validation")],
                early_stopping_rounds=50
            )

    # Compute the metric from THIS booster. Logging `rmse` without
    # recomputing it would record a stale value left over from an earlier
    # cell (or raise NameError on a fresh kernel).
    y_pred = booster.predict(valid)
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # log the preprocessor
    prep_path = "models/preprocessor.b"
    os.makedirs(os.path.dirname(prep_path), exist_ok=True)
    with open(prep_path, 'wb') as file:
        pickle.dump(dv, file)
    mlflow.log_artifact(prep_path, artifact_path = "preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path = "models_mlflow")


# Retrieve model

To find the prediction code:
"experiment_name > Artifacts > models_mlflow > Make Predictions"

What to evaluate when deciding which model is good for production:
* Duration: How long is the training time?
* Metrics: Which one achieved the lowest error?
* Model size: How large is the stored model?

In [None]:
# Model URI in the form runs:/<run_id>/<artifact_path>.
logged_model = 'runs:/904f34a4b7c54aefa27e589d795c8277/models_mlflow'

# Load through the generic pyfunc flavour, which yields a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)

In [None]:
# Method 1: the model loaded as a generic Python function (PyFuncModel); display it.
loaded_model

In [None]:
# Method 2: load the same artifact through the XGBoost flavour to get the
# native XGBoost model object, then display it.
xgboost = mlflow.xgboost.load_model(model_uri=logged_model)
xgboost

In [None]:
# Score the validation DMatrix with the model loaded via the XGBoost flavour.
y_pred = xgboost.predict(valid)

In [None]:
# Show only the first ten predictions rather than the whole array.
y_pred[:10]

# Save both model and the dv

In [45]:
import mlflow.sklearn
from sklearn.pipeline import make_pipeline

In [38]:
# Tracked run that stores the preprocessor and the model as separate artifacts.
train_dicts = df_train[categorical+numerical].to_dict(orient="records")
with mlflow.start_run():
    mlflow.log_param("train-data-path", data_2401_path)
    mlflow.log_param("val-data-path", data_2402_path)

    alpha = 0.01
    mlflow.log_param("alpha", alpha)

    dv = DictVectorizer()
    x_train = dv.fit_transform(train_dicts)

    lr = Lasso(alpha=alpha)
    lr.fit(x_train, y_train)

    # The estimator was fitted on the vectorised matrix, so the validation
    # records must go through the same fitted DictVectorizer before
    # predict(); passing the raw list of dicts fails.
    x_val = dv.transform(val_dicts)
    y_pred = lr.predict(x_val)
    # sqrt(MSE): `squared=False` was removed in scikit-learn 1.6.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # 1. save preprocessor
    prep_path = "models/preprocessor.b"
    os.makedirs(os.path.dirname(prep_path), exist_ok=True)
    with open(prep_path, 'wb') as file:
        pickle.dump(dv, file)
    mlflow.log_artifact(prep_path, artifact_path = "preprocessor")

    # 2. save model
    # (log_model, unlike a plain log_artifact, also makes the UI show the
    # prediction code snippet for the model)
    mlflow.sklearn.log_model(lr, artifact_path="models_pickle")
    # No explicit mlflow.end_run(): the `with` block closes the run.
    



In [46]:
# Alternative: bundle vectorizer and estimator into one scikit-learn
# pipeline, so that a single logged artifact accepts raw record dicts.
train_dicts = df_train[categorical+numerical].to_dict(orient="records")
with mlflow.start_run():
    mlflow.log_param("train-data-path", data_2401_path)
    mlflow.log_param("val-data-path", data_2402_path)
    alpha = 0.01
    mlflow.log_param("alpha", alpha)

    pipeline = make_pipeline(
        DictVectorizer(),
        Lasso(alpha=alpha)
    )
    pipeline.fit(train_dicts, y_train)

    # The pipeline vectorises internally, so raw dicts are the right input here.
    y_pred = pipeline.predict(val_dicts)

    # sqrt(MSE): `squared=False` was removed in scikit-learn 1.6.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # save pipeline
    mlflow.sklearn.log_model(pipeline, artifact_path="models_pickle")
    # No explicit mlflow.end_run(): the `with` block closes the run.
    



# Load both model and the dv

In [40]:
import mlflow.artifacts
from mlflow.tracking import MlflowClient

# NOTE(review): the UI command earlier in these notes starts the server on
# port 5004 - confirm which port the tracking server really listens on.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5005/"
RUN_ID = '6866dfdb9a044eee85683af944fb732b'

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# 1. Load the preprocessor
# MlflowClient.download_artifacts is deprecated; mlflow.artifacts.download_artifacts
# is its replacement and returns the local path of the downloaded file.
prep_path = mlflow.artifacts.download_artifacts(
    run_id=RUN_ID,
    artifact_path='preprocessor/preprocessor.b',
    tracking_uri=MLFLOW_TRACKING_URI,
)
# SECURITY: pickle.load can execute arbitrary code; only unpickle artifacts
# from a tracking server you trust.
with open(prep_path, 'rb') as f_in:
    dv = pickle.load(f_in)

# 2. Load the model
# NOTE(review): a runs:/ URI is resolved against the globally configured
# tracking URI, not MLFLOW_TRACKING_URI above - confirm both point at the
# same store.
logged_model = f'runs:/{RUN_ID}/models_pickle'
model = mlflow.pyfunc.load_model(logged_model)

# Load pipeline

In [47]:
from mlflow.tracking import MlflowClient

# Tracking-server address and the ID of the run to load from.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5005/"
RUN_ID = 'bca046633ac94a0dacad64f65d5901fb'

# Build the runs:/ URI of the "models_pickle" artifact and load it through
# the generic pyfunc flavour.
logged_model = 'runs:/{}/models_pickle'.format(RUN_ID)
model = mlflow.pyfunc.load_model(model_uri=logged_model)