## Q1. Install MLflow

In [1]:
!conda config -q --append channels conda-forge
!conda install -q --yes --file "https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/main/02-experiment-tracking/requirements.txt"

Channels:
 - defaults
 - conda-forge
Platform: osx-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



## Q2. Download and preprocess the data

In [2]:
import os

def download_file(url, subpath=''):
  """Download ``url`` into ``subpath`` and return the local file path.

  The local file name is the last ``/``-separated segment of ``url``.

  Args:
    url: Address of the resource to fetch (any scheme ``urllib`` can open).
    subpath: Directory to store the file in; created if missing. The
      default ``''`` stores the file in the current working directory.

  Returns:
    The path of the downloaded file, i.e. ``os.path.join(subpath, name)``.

  Raises:
    ValueError: if no file name can be derived from ``url``.
    urllib.error.URLError: if the resource cannot be fetched (this includes
      HTTP error statuses, which are raised instead of being saved to disk).
  """
  # Function-scope imports keep this cell self-contained.
  import shutil
  import urllib.request

  filename = url.split('/')[-1]
  if not filename:
    raise ValueError(f"cannot derive a file name from URL: {url!r}")
  path = os.path.join(subpath, filename)
  if subpath:
    os.makedirs(subpath, exist_ok=True)  # same effect as `mkdir -p`
  # No shell is involved, so special characters in the URL or path (&, ?,
  # spaces) cannot be misinterpreted. The request is opened before the target
  # file, so a failed request does not truncate an existing download.
  with urllib.request.urlopen(url) as response, open(path, 'wb') as target:
    shutil.copyfileobj(response, target)
  return path

In [3]:
# Fetch the homework preprocessing script into the working directory.
download_file("https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/main/cohorts/2024/02-experiment-tracking/homework/preprocess_data.py")

# Fetch the green-taxi trip files for each listed month into ./data and
# print the local path of every download.
months = ('2023-01', '2023-02', '2023-03')
for month in months:
  data_path = download_file(
    f'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_{month}.parquet',
    "data",
  )
  print(data_path)

data/green_tripdata_2023-01.parquet
data/green_tripdata_2023-02.parquet
data/green_tripdata_2023-03.parquet


In [4]:
# Run preprocess_data.py in a shell subprocess, passing ./data as the raw-data
# directory and ./output as the destination directory.
# NOTE(review): `!python` is resolved through PATH — confirm it is the same
# interpreter/environment as the kernel.
!python preprocess_data.py --raw_data_path ./data --dest_path ./output

In [5]:
# The first item yielded by os.walk describes the top directory itself as
# (dirpath, dirnames, filenames); index 2 is the list of non-directory entries
# directly inside ./output.
files = next(os.walk("output"))[2]
len(files)

4

## Q3. Train a model with autolog

In [6]:
# download_file("https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/main/cohorts/2024/02-experiment-tracking/homework/train.py")

An additional package must be installed to use the MLflow UI. See https://github.com/mlflow/mlflow/issues/1951

In [7]:
!conda install mlflow-ui-dbg

Channels:
 - defaults
 - conda-forge
Platform: osx-64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.



In [8]:
# Run the local train.py script in a shell subprocess.
# NOTE(review): train.py is not downloaded by any active cell in this notebook
# (the download above is commented out) — it must already exist in the working
# directory; presumably it is a locally edited copy, confirm.
!python train.py



In [9]:
from mlflow import MlflowClient
# No tracking URI is passed, so the client uses MLflow's default tracking
# location — presumably the store that train.py logged to; TODO confirm.
client = MlflowClient()

In [10]:
# Ask the default client for a single run of experiment "0" and show the
# `min_samples_split` parameter that was logged for it.
runs = client.search_runs(experiment_ids="0", filter_string="", max_results=1)
first_run = runs[0]
first_run.data.params['min_samples_split']

'2'

## Q4. Launch the tracking server locally

```bash
mlflow server --backend-store-uri 'sqlite:///backend.db' --default-artifact-root 'artifacts'
```

## Q5. Tune model hyperparameters

In [11]:
# download_file("https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/main/cohorts/2024/02-experiment-tracking/homework/hpo.py")

In [12]:
# Run the local hpo.py script in a shell subprocess; the progress bar it prints
# reports the number of trials and the best loss found.
# NOTE(review): hpo.py is not downloaded by any active cell (the download above
# is commented out) — it must already exist in the working directory.
!python hpo.py

100%|██████████| 15/15 [00:58<00:00,  3.88s/trial, best loss: 5.335419588556921]


In [13]:
# Client bound to the same SQLite backend store that the `mlflow server`
# command in Q4 is started with.
mlclient = MlflowClient(tracking_uri="sqlite:///backend.db")

In [14]:
# Sort the runs of experiment "1" by ascending `rmse` metric, keep only the
# first (lowest-RMSE) run, and show its RMSE.
runs = mlclient.search_runs(
    experiment_ids="1",
    filter_string="",
    order_by=["metrics.rmse ASC"],
    max_results=1,
)
best_hpo_run = runs[0]
best_hpo_run.data.metrics['rmse']

5.335419588556921

## Q6. Promote the best model to the model registry

In [15]:
# download_file("https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/main/cohorts/2024/02-experiment-tracking/homework/register_model.py")

In [16]:
# Run the local register_model.py script in a shell subprocess; per its log
# output it creates a new version of the registered model 'best-model-ever'.
# NOTE(review): register_model.py is not downloaded by any active cell (the
# download above is commented out) — it must already exist in the working
# directory.
!python register_model.py

Registered model 'best-model-ever' already exists. Creating a new version of this model...
2024/05/27 13:04:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: best-model-ever, version 2
Created version '2' of model 'best-model-ever'.


In [17]:
# Sort the runs of experiment "2" by ascending `test_rmse` metric, keep only
# the first (lowest test-RMSE) run, and show its test RMSE.
runs = mlclient.search_runs(
    experiment_ids="2",
    filter_string="",
    order_by=["metrics.test_rmse ASC"],
    max_results=1,
)
best_test_run = runs[0]
best_test_run.data.metrics['test_rmse']

5.567408012462019