In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

# Single source of truth for the tracking backend: the fluent API (`mlflow.*`)
# and the explicit client below must point at the same store, so the URI is
# spelled out exactly once.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

# Configure the fluent API. `set_experiment` creates the experiment when it
# does not exist yet (see the INFO line in this cell's output).
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("nyc-taxi-experiment")

# Client used by the cells below to look up experiments and search their runs.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

2023/05/31 20:16:12 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


In [3]:
# Question 1: report the installed MLflow version by calling its command-line
# interface through an IPython shell escape.
print("Question 1:")
!mlflow --version

Question 1:
mlflow, version 2.3.2


In [4]:
# 2
!python3 preprocess_data.py --raw_data_path ../data/ --dest_path ../output/
print("Question 2: {}".format(os.path.getsize('../output/dv.pkl')))

Question 2: 153660


In [5]:
# 3
!python3 train.py --data_path ../output/

experiment = client.get_experiment_by_name('nyc-taxi-experiment')
run = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY
)[0]
print("Question 3: {}".format(run.data.params['max_depth']))

2023/05/31 20:16:18 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
Question 3: 10


In [6]:
# 4
!python3 ./hpo.py

experiment = client.get_experiment_by_name('random-forest-hyperopt')
best_run = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.RMSE ASC"]
)[0]
print("Question 4: {}".format(best_run.data.metrics['RMSE']))

2023/05/31 20:16:33 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.
[32m[I 2023-05-31 20:16:33,163][0m A new study created in memory with name: no-name-1babc9af-2a0e-4c21-88c0-49aec0cca758[0m
[32m[I 2023-05-31 20:16:35,563][0m Trial 0 finished with value: 2.451379690825458 and parameters: {'n_estimators': 25, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 2.451379690825458.[0m
[32m[I 2023-05-31 20:16:36,341][0m Trial 1 finished with value: 2.4667366020368333 and parameters: {'n_estimators': 16, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 2.451379690825458.[0m
[32m[I 2023-05-31 20:16:38,541][0m Trial 2 finished with value: 2.449827329704216 and parameters: {'n_estimators': 34, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 2 with value: 2.449827329704216.[0m
[32m[I 2023-05-31 20:16

In [7]:
# 5
!python3 ./register_model.py

experiment = client.get_experiment_by_name('random-forest-best-models')
best_run = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.test_rmse ASC"]
)[0]
print("Question 5: {}".format(best_run.data.metrics['test_rmse']))

2023/05/31 20:16:48 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-models' does not exist. Creating a new experiment.
Successfully registered model 'nyc-taxi-regressor'.
2023/05/31 20:17:10 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 1
Created version '1' of model 'nyc-taxi-regressor'.
Question 5: 2.2854691906481364
