### Q1. Install MLflow

In [2]:
# Install MLflow into the environment of the running kernel (%pip rather than
# !pip) and pin it to the version this notebook was executed with, so a fresh
# run reproduces the same behaviour.
%pip install mlflow==2.13.0



In [5]:
import mlflow
import pandas as pd

import os

In [6]:
# Show the version string of the imported MLflow package.
mlflow.__version__

'2.13.0'

### Q2. Download and preprocess the data

In [None]:
# Run the preprocessing script on the raw data and write its results to
# ./output. Both paths are relative to the notebook's working directory so the
# cell does not depend on one machine's absolute filesystem layout.
!python preprocess_data.py --raw_data_path ./data --dest_path ./output

In [13]:
# List the files written by the preprocessing step. The directory is given
# relative to the notebook's working directory (the same ./output passed as
# --dest_path to preprocess_data.py) instead of a machine-specific absolute path.
output_dir = './output'
output_list = os.listdir(output_dir)
output_list

['val.pkl', 'dv.pkl', 'train.pkl', 'test.pkl']

In [14]:
# Number of entries found in the preprocessing output directory.
len(output_list)

4

### Q3. Train a model with autolog

In [3]:
# Run the training script on the preprocessed data in ./output.
# In the recorded run the script printed an RMSE value.
!python train.py --data_path ./output

RMSE: 5.431162180141208


In [7]:
# Look up the most recently started run of the "Default" experiment and print
# every parameter that was logged for it.
client = mlflow.tracking.MlflowClient()

experiment = client.get_experiment_by_name("Default")
if experiment is None:
    # get_experiment_by_name returns None for an unknown name; fail with a
    # clear message instead of an AttributeError on `.experiment_id`.
    raise RuntimeError('Experiment "Default" was not found in the current tracking store.')
experiment_id = experiment.experiment_id

# search_runs is documented to take a list of experiment IDs.
runs = client.search_runs(
    experiment_ids=[experiment_id],
    order_by=["start_time desc"],
    max_results=1,
)
if not runs:
    # Guard against an empty result rather than raising IndexError on runs[0].
    raise RuntimeError('Experiment "Default" has no runs; run train.py first.')
run = runs[0]

print("Logged parameters:")
for key, value in run.data.params.items():
    print(f"{key}: {value}")

Logged parameters:
warm_start: False
max_samples: None
n_estimators: 100
random_state: 0
n_jobs: None
oob_score: False
min_impurity_decrease: 0.0
verbose: 0
max_features: auto
min_samples_split: 2
max_leaf_nodes: None
max_depth: 10
min_weight_fraction_leaf: 0.0
ccp_alpha: 0.0
min_samples_leaf: 1
criterion: squared_error
bootstrap: True


### Q4. Launch the tracking server locally

In [8]:
# Print the current working directory of the notebook's shell.
!pwd

/workspaces/mlops-zoomcamp-2024/02-experiment-tracking


In [9]:
# Create the directories used by the tracking server for its SQLite database
# and its artifacts. -p makes the cell safe to re-run: mkdir no longer fails
# when the directories already exist.
!mkdir -p mlruns_db mlruns_artifacts

In [6]:
# List the contents of the working directory.
!ls

data		      mlruns		output		    train.py
homework-week2.ipynb  mlruns_artifacts	preprocess_data.py
hpo.py		      mlruns_db		register_model.py


In [1]:
# Start the MLflow tracking server with a SQLite backend store in mlruns_db/
# and ./mlruns_artifacts as the default artifact root, on port 5000.
# The command runs in the foreground, so this cell keeps the kernel busy until
# it is interrupted (the recorded run was stopped with ^C).
# NOTE(review): --host 0.0.0.0 listens on all network interfaces; confirm this
# is intended rather than 127.0.0.1 for a purely local server.
!mlflow server \
    --backend-store-uri sqlite:///mlruns_db/mlflow.db \
    --default-artifact-root ./mlruns_artifacts \
    --serve-artifacts \
    --host 0.0.0.0 \
    --port 5000

[2024-06-15 23:06:52 +0000] [4717] [INFO] Starting gunicorn 22.0.0
[2024-06-15 23:06:52 +0000] [4717] [INFO] Listening at: http://0.0.0.0:5000 (4717)
[2024-06-15 23:06:52 +0000] [4717] [INFO] Using worker: sync
[2024-06-15 23:06:52 +0000] [4718] [INFO] Booting worker with pid: 4718
[2024-06-15 23:06:52 +0000] [4719] [INFO] Booting worker with pid: 4719
[2024-06-15 23:06:52 +0000] [4720] [INFO] Booting worker with pid: 4720
[2024-06-15 23:06:52 +0000] [4721] [INFO] Booting worker with pid: 4721
^C
[2024-06-15 23:11:02 +0000] [4717] [INFO] Handling signal: int
[2024-06-15 23:11:02 +0000] [4721] [INFO] Worker exiting (pid: 4721)
[2024-06-15 23:11:02 +0000] [4720] [INFO] Worker exiting (pid: 4720)
[2024-06-15 23:11:02 +0000] [4718] [INFO] Worker exiting (pid: 4718)
[2024-06-15 23:11:02 +0000] [4719] [INFO] Worker exiting (pid: 4719)


### Q5. Tune model hyperparameters

In [8]:
# Install hyperopt into the environment of the running kernel (%pip rather
# than !pip), pinned to the version that was installed in the recorded run.
%pip install hyperopt==0.2.7

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
     |████████████████████████████████| 1.6 MB 25.3 MB/s eta 0:00:01
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
     |████████████████████████████████| 200 kB 77.9 MB/s eta 0:00:01
Installing collected packages: py4j, hyperopt
Successfully installed hyperopt-0.2.7 py4j-0.10.9.7


In [10]:
# Run the hyperparameter tuning script.
# The recorded run completed 15 trials and reported the best loss.
!python hpo.py

100%|██████████| 15/15 [00:59<00:00,  3.95s/trial, best loss: 5.335419588556921]


### Q6. Promote the best model to the model registry

In [11]:
# Run the model registration script. In the recorded run it created the
# 'random-forest-best-models' experiment and registered version 1 of the
# model 'rf-best-model'.
!python register_model.py

2024/06/15 23:31:52 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-models' does not exist. Creating a new experiment.
Successfully registered model 'rf-best-model'.
2024/06/15 23:32:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf-best-model, version 1
Created version '1' of model 'rf-best-model'.
