# Week 2 - Experiment Tracking and Model Registry with MLflow

## 1. Installation

In [1]:
# Sanity check: print the version of the MLflow CLI available to this notebook.
!mlflow --version

mlflow, version 1.26.0


## 2. Download and preprocess

Use the green taxi trip records from 2021-01 to 2021-03.

In [26]:
# URL format: https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2021-01.parquet
# looking for 2021-01, 02, 03
# use requests

import requests
from pathlib import Path

def download_from_url(url: str, file_dir: str, timeout: float = 30.0) -> None:
    """Stream a file from ``url`` into ``file_dir`` unless it is already there.

    The body is fetched in chunks, so the whole file never has to fit in
    memory. The destination directory is created when it does not exist.
    Data is first written to a sibling ``<name>.part`` file and only renamed
    to its final name once the download has finished, so an interrupted run
    cannot leave a truncated file that later calls mistake for a complete one.

    Args:
    url: string
        direct url to the file for download, e.g.
        https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2021-01.parquet

    file_dir: string or path-like
        path to the download destination directory; created (including
        missing parents) if necessary

    timeout: float
        seconds passed to ``requests.get`` as its ``timeout`` argument, so a
        stalled connection raises instead of hanging the notebook cell

    Returns:
        None. Progress and failures are reported with ``print``.

    Raises:
        Exceptions raised by ``requests`` (connection errors, timeouts) and
        by file I/O propagate to the caller. An HTTP error status
        (``r.ok`` is false) is only printed, not raised.
    """
    file_dir = Path(file_dir)
    # A Path instance is always truthy, so `if not file_dir` can never detect
    # a missing directory. Create it unconditionally and idempotently instead.
    file_dir.mkdir(parents=True, exist_ok=True)

    # Path(...).name yields the last component of the URL, i.e. the filename.
    local_path = file_dir / Path(url).name

    if local_path.exists():
        print(f'File already exists:\n{local_path}')
        return

    # Temporary name used while the download is still in progress.
    partial_path = local_path.with_name(local_path.name + '.part')

    # The context manager releases the streamed connection on every exit path.
    with requests.get(url, stream=True, timeout=timeout) as r:
        if not r.ok:
            # HTTP status 4xx/5xx
            print(f'Download failed with code {r.status_code}\n{r.text}')
            return

        print('Saving to ', local_path)
        try:
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 8):
                    f.write(chunk)
            # Publish the file under its real name only once it is complete.
            partial_path.replace(local_path)
        except BaseException:
            # Also covers KeyboardInterrupt from an interrupted cell: discard
            # the incomplete data, then let the original error surface.
            if partial_path.exists():
                partial_path.unlink()
            raise

In [27]:
# Files are saved to the `data` directory two levels above the current
# working directory.
dest_path = Path.cwd().parents[1] / 'data'

# One URL per month, with the month zero-padded to two digits, e.g.
# https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2021-01.parquet
parquet_urls = [
    f'https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2021-{month:02}.parquet'
    for month in (1, 2, 3)
]

# Fetch each monthly file in turn.
for url in parquet_urls:
    download_from_url(url, dest_path)

File already exists:
/home/klang/mlops-notes/data/green_tripdata_2021-01.parquet
File already exists:
/home/klang/mlops-notes/data/green_tripdata_2021-02.parquet
File already exists:
/home/klang/mlops-notes/data/green_tripdata_2021-03.parquet


### Preprocess

Run `preprocess_data.py` and examine the outputs

In [28]:
!python preprocess_data.py --raw_data_path ~/mlops-notes/data --dest_path ./output

In [29]:
# `output` directory under the current working directory.
output_path = Path.cwd().joinpath('output')
# Display every entry in that directory whose name matches `*.*`.
[artifact for artifact in output_path.glob('*.*')]

[PosixPath('/home/klang/mlops-notes/notebooks/w2-mlflow/output/dv.pkl'),
 PosixPath('/home/klang/mlops-notes/notebooks/w2-mlflow/output/valid.pkl'),
 PosixPath('/home/klang/mlops-notes/notebooks/w2-mlflow/output/test.pkl'),
 PosixPath('/home/klang/mlops-notes/notebooks/w2-mlflow/output/train.pkl')]

## Train with autolog

Use a random forest regressor via `train.py`. The script loads the outputs from the previous step, trains the model on `train.pkl`, and calculates the RMSE on `valid.pkl`.

Modify the script so that MLflow's **autolog** is enabled, then launch the MLflow UI to confirm that the run is being tracked.

In [None]:
import pandas as pd
import xgboost as xgb
import mlflow
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

import pickle

In [30]:
# Run the training script in a shell from the notebook.
!python train.py