# Car Price Modelling - Error Metric Logging

This notebook focuses on using MLflow's `mlflow.log_artifact` functionality to save CSV files of error metrics for each model run under a dedicated `artifact_path` in MLflow.

In [48]:
import os, sys

import pandas as pd
import numpy as np
import json
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio
import mlflow
from mlflow.models.signature import infer_signature
from dotenv import load_dotenv, find_dotenv
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import tempfile 

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate, cross_val_score
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK, Trials


# Derive the repository root from the current working directory (everything up
# to and including the "fortunato-wheels-engine" folder) and make it importable
# so the project's `src` package resolves.
cur_dir = os.getcwd()
SRC_PATH = cur_dir[: cur_dir.index("fortunato-wheels-engine") + len("fortunato-wheels-engine")]
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

from src.data.car_ads import CarAds
from src.logs import get_logger
from src.data.training_preprocessing import preprocess_ads_for_training
from src.evaluate import price_model
from src.training.custom_components import MultiHotEncoder

# Pull environment variables (e.g. AZURE_MLFLOW_URI) from the nearest .env file.
load_dotenv(find_dotenv())

logger = get_logger(__name__)

AZURE_MLFLOW_URI = os.environ.get("AZURE_MLFLOW_URI")
if not AZURE_MLFLOW_URI:
    # Surface the misconfiguration instead of silently tracking somewhere else.
    logger.warning(
        "AZURE_MLFLOW_URI is not set; MLflow will not be pointed at the Azure tracking server."
    )
mlflow.set_tracking_uri(AZURE_MLFLOW_URI)

# Style and figure size are set in ONE call: `sns.set(...)` is an alias of
# `sns.set_theme(...)`, so a separate `sns.set(rc=...)` would reset the style
# back to its default and discard "whitegrid".
sns.set_theme(style="whitegrid", rc={"figure.figsize": (8, 12)})
# set context to notebook
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
# matplotlib's generic family name is hyphenated ("sans-serif").
plt.rcParams["font.family"] = "sans-serif"

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load in current car ads

In [7]:
# Read the ads from the processed CSV dump stored inside the repository.
# (Calling `ads.get_car_ads()` without `data_dump` is the alternative that was
# left disabled here — presumably it loads from another source; confirm.)
car_ads_dump_path = os.path.join(
    SRC_PATH, "data", "processed", "car-ads-dump_2023-07-18.csv"
)

ads = CarAds()
ads.get_car_ads(data_dump=car_ads_dump_path)

2023-08-14 16:16:17,285 - src.data.car_ads - INFO - Loading car ads from /Users/jonah/Documents/fortunato/fortunato-wheels-engine/data/processed/car-ads-dump_2023-07-18.csv...


  self.df = pd.read_csv(data_dump, parse_dates=["listed_date"])


In [8]:
# Initial preprocessing of the loaded ads.
# In the recorded run this logged that the top 50 vehicle options (by count)
# were kept and that preprocessing took ~212s, so expect this cell to be slow.
ads.preprocess_ads()

2023-08-14 16:20:38,360 - src.data.car_ads - INFO - Vehicle option preprocessing complete, kept top 50 options by count.
2023-08-14 16:20:38,366 - src.data.car_ads - INFO - Done preprocessing car ads, took 212s.


In [9]:
# Columns carried through to modelling. Note that "price" (the prediction
# target) is part of this list; it is separated from the inputs further down.
model_features = [
    "age_at_posting",
    "mileage_per_year",
    "make",
    "model",
    "price",
    "wheel_system",
    "options_list",
]

# Training-specific preprocessing of the ads, excluding new-vehicle ads.
preprocessed_ads = preprocess_ads_for_training(
    ads.df,
    model_features=model_features,
    exclude_new_vehicle_ads=True,
)

# 80/20 split with a fixed seed, stratified on the vehicle model so both
# partitions share the same model mix.
train_df, test_df = train_test_split(
    preprocessed_ads,
    stratify=preprocessed_ads["model"],
    test_size=0.2,
    random_state=42,
)

# Restrict both partitions to the modelling columns, drop any row with a
# missing value, and renumber the rows from zero.
train_df, test_df = (
    partition[model_features].dropna().reset_index(drop=True)
    for partition in (train_df, test_df)
)

# Separate the inputs from the target.
X_train, y_train = train_df.drop(columns=["price"]), train_df["price"]
X_test, y_test = test_df.drop(columns=["price"]), test_df["price"]

2023-08-14 16:25:26,661 - src.data.training_preprocessing - INFO - Preprocessing ads for training, starting with 3779395 ads
2023-08-14 16:25:54,156 - src.data.training_preprocessing - INFO - Preprocessing ads for training, ending with 1737985 ads


In [10]:
# Sanity check on the training frame: column dtypes, non-null counts and memory use.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1390388 entries, 0 to 1390387
Data columns (total 7 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   age_at_posting    1390388 non-null  int64  
 1   mileage_per_year  1390388 non-null  float64
 2   make              1390388 non-null  object 
 3   model             1390388 non-null  object 
 4   price             1390388 non-null  float64
 5   wheel_system      1390388 non-null  object 
 6   options_list      1390388 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 74.3+ MB


In [11]:
# Preview the first rows of the training frame (features plus the `price` target).
train_df.head()

Unnamed: 0,age_at_posting,mileage_per_year,make,model,price,wheel_system,options_list
0,5,17145.0,Chevrolet,Cruze,19597.0,FWD,[none-listed]
1,5,41345.6,Toyota,Corolla,9123.0,FWD,"[bluetooth, backup-camera]"
2,3,37570.666667,Hyundai,Sonata,15601.0,FWD,"[navigation-system, bluetooth]"
3,3,14539.333333,Chevrolet,Trax,20944.0,FWD,"[navigation-system, bluetooth, backup-camera, ..."
4,3,30886.333333,Ford,Fusion,20568.0,FWD,"[bluetooth, backup-camera]"


In [12]:
# Feature groups, split by the encoding each one receives.
numeric_features = ["age_at_posting", "mileage_per_year"]
categorical_features = ["model", "wheel_system", "make"]
multi_label_features = ["options_list"]

# One transformer per feature group:
#   num   -> standard scaling
#   cat   -> one-hot encoding; categories unseen during fit are ignored
#   multi -> project MultiHotEncoder applied to the list-valued options column
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("multi", MultiHotEncoder(), multi_label_features),
    ]
)

## Load and Fit model

In [16]:
from azureml.core import Workspace
from azureml.core.model import Model

# Registered model version to download and evaluate.
PRICE_PREDICTION_MODEL_VER = 3

# Download target: <parent of cwd>/models/all-vehicles-price-prediction/<version>
PRICE_PREDICTION_MODEL_PATH = os.path.join(
    os.pardir, "models", "all-vehicles-price-prediction", f"{PRICE_PREDICTION_MODEL_VER}"
)

In [17]:
# Connect to the Azure ML workspace described by the deployment config file.
# NOTE(review): the path starts at `os.pardir`, i.e. it is resolved relative to
# the PARENT of the current working directory — this assumes the notebook is
# run from a folder one level below the repo root; confirm.
ws = Workspace.from_config(
    path=os.path.join(
        os.pardir, "src", "deployment", "price_prediction_api", "config.json"
    )
)

# Handle to the registered model. It is deliberately NOT named `price_model`:
# that name is the evaluation module imported from `src.evaluate`, which later
# cells call (`price_model.calculate_evaluation_metrics...`). Rebinding it to an
# Azure `Model` object would break those calls.
registered_price_model = Model(
    ws, "all-vehicles-price-prediction", version=PRICE_PREDICTION_MODEL_VER
)

# Download the model files; `exist_ok=True` is passed so an already-populated
# target directory is accepted.
registered_price_model.download(
    target_dir=PRICE_PREDICTION_MODEL_PATH,
    exist_ok=True,
)

Performing interactive authentication. Please follow the instructions on the terminal.


The default web browser has been opened at https://login.microsoftonline.com/organizations/oauth2/v2.0/authorize. Please continue the login in the web browser. If no web browser is available or if the web browser fails to open, use device code flow with `az login --use-device-code`.
The following tenants require Multi-Factor Authentication (MFA). Use 'az login --tenant TENANT_ID' to explicitly login to a tenant.
11dd9329-1350-4ea4-a890-e686013a0091 'Default Directory'


AzureMLException: AzureMLException:
	Message: No subscriptions found for jonah.hamilton@gmail.com.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "No subscriptions found for jonah.hamilton@gmail.com."
    }
}

In [18]:
# Load the model from file
# The pickle is expected under <download dir>/model/model.pkl.
# NOTE(review): `pickle.load` can execute arbitrary code embedded in the file;
# only unpickle artifacts that come from a trusted source (here, presumably the
# team's own model registry — confirm).
model_path = os.path.join(PRICE_PREDICTION_MODEL_PATH, "model", "model.pkl")

with open(model_path, "rb") as f:
    model = pickle.load(f)

In [19]:
# Score the held-out ads with the loaded model.
y_pred = model.predict(X_test)

# Independent copy of the test frame with the predictions (rounded to one
# decimal place) appended as `predicted_price`.
full_df = test_df.copy(deep=True)
full_df["predicted_price"] = y_pred.round(1)
full_df

Unnamed: 0,age_at_posting,mileage_per_year,make,model,price,wheel_system,options_list,predicted_price
0,3,8368.666667,Ford,Escape,28484.0,AWD,"[leather-seats, sunroof-moonroof, navigation-s...",25831.8
1,11,18272.727273,Ford,F-150,11044.0,unknown,[none-listed],14642.5
2,12,16250.000000,GMC,Acadia,9500.0,unknown,[none-listed],11060.7
3,5,10565.400000,Buick,Encore,18723.0,FWD,"[navigation-system, bluetooth, backup-camera, ...",18702.8
4,9,15126.333333,GMC,Acadia,9377.0,FWD,"[navigation-system, bluetooth, backup-camera, ...",11747.8
...,...,...,...,...,...,...,...,...
347592,4,13000.000000,BMW,4 Series,34999.0,unknown,"[air-conditioning, bluetooth, cruise-control, ...",38563.7
347593,5,8603.600000,Buick,Encore,16330.0,FWD,"[navigation-system, bluetooth, backup-camera]",18847.1
347594,3,8830.000000,Volkswagen,Tiguan,24800.0,AWD,"[bluetooth, backup-camera]",25209.4
347595,4,60445.250000,Toyota,Tundra,31800.0,unknown,[bluetooth],30223.1


In [22]:
# Overall error metrics for the test-set predictions (the recorded output
# reports rmse, mape and r2).
price_model.calculate_evaluation_metrics(y_test, y_pred)

Unnamed: 0,metric,value
0,rmse,5105.0
1,mape,0.1219
2,r2,0.9167


In [None]:
# NOTE(review): this repeats the overall-metrics call from the previous cell and
# has no recorded execution — candidate for removal.
price_model.calculate_evaluation_metrics(y_test, y_pred)

In [50]:
# Error metrics broken down per make (the recorded output has one row per make
# with count, RMSE, MAPE and R2).
price_model.calculate_evaluation_metrics_by_make(full_df)

Unnamed: 0,make,count,RMSE,MAPE,R2
0,Ford,48871,6143.5,0.141,0.8868
1,GMC,13319,5630.6,0.1327,0.9166
2,Buick,5916,2882.4,0.0821,0.9234
3,Audi,5322,4339.2,0.1109,0.9289
4,BMW,10094,5626.7,0.1382,0.9218
5,Chevrolet,41036,5014.8,0.1159,0.9203
6,Nissan,26017,3105.7,0.1095,0.9096
7,Volkswagen,9944,3017.4,0.1269,0.9195
8,Hyundai,15834,3249.2,0.1241,0.8507
9,Kia,12591,2937.1,0.1131,0.8512


In [51]:
# Error metrics broken down per make/model pair (the recorded output has one
# row per pair with count, RMSE, MAPE and R2).
price_model.calculate_evaluation_metrics_by_model(full_df)

Unnamed: 0,make,model,count,RMSE,MAPE,R2
0,Ford,Escape,7351,3092.2,0.1062,0.8765
1,Ford,F-150,11541,8869.4,0.1971,0.7736
2,GMC,Acadia,2668,3836.2,0.1004,0.9022
3,Buick,Encore,2228,2374.9,0.0683,0.7523
4,Audi,A4,1223,3747.2,0.1479,0.9303
...,...,...,...,...,...,...
264,Lincoln,Navigator,381,4763.9,0.0844,0.9784
265,Tesla,Model S,282,9030.9,0.0831,0.8311
266,Land Rover,Range Rover Velar,216,6842.7,0.0778,0.7356
267,Alfa Romeo,Giulia,210,6437.1,0.0788,0.6061


In [43]:
# TODO: turn into function and add to price_model.py
# Per make/model summary of the training data: mean price, mean mileage per
# year (both rounded to 1 decimal place) and the number of ads in the group.
# All three statistics come from a single groupby via named aggregation, so the
# counts are aligned with their group by construction rather than by attaching
# the result of a second groupby positionally.
(
    train_df.groupby(["make", "model"])
    .agg(
        mean_price=("price", "mean"),
        mean_mileage=("mileage_per_year", "mean"),
        count=("price", "size"),
    )
    .reset_index()
    .round({"mean_price": 1, "mean_mileage": 1})
)

Unnamed: 0,make,model,mean_price,mean_mileage,count
0,Acura,ILX,24109.5,12991.0,1462
1,Acura,MDX,32942.2,14683.1,6165
2,Acura,RDX,33613.7,14806.9,4928
3,Acura,TL,11213.2,13299.9,1734
4,Acura,TLX,32749.3,14390.8,3009
...,...,...,...,...,...
264,Volkswagen,Passat,17666.4,13600.2,8238
265,Volkswagen,Tiguan,25601.5,14968.4,7908
266,Volvo,S60,27617.5,11957.4,1967
267,Volvo,XC60,37182.1,14673.0,2406


## Setting up Hyperopt


In [57]:
# first metric is the one to minimize
metrics = ["neg_mean_absolute_percentage_error", "neg_root_mean_squared_error", "r2"]


def objective(params, n_rows=1000):
    """Hyperopt objective: cross-validate one candidate model and log it to MLflow.

    Parameters
    ----------
    params : dict
        One sample from the search space. The "type" entry selects the
        estimator ("gradient_boosting" or "xgboost"); every other entry is
        passed to that estimator's constructor. The dict is NOT modified.
    n_rows : int, default 1000
        Number of leading rows of the train and test frames used for
        cross-validation, fitting and evaluation. The default reproduces the
        previously hard-coded subset size.

    Returns
    -------
    dict
        ``{"loss": <positive mean test MAPE>, "status": STATUS_OK}`` for a
        supported model type, or a result with ``"status": "fail"`` (the value
        of hyperopt's ``STATUS_FAIL``) for an unsupported one.

    Side effects
    ------------
    Starts an MLflow run and logs the CV metrics, the hyperparameters, the
    fitted pipeline and two CSV artifacts (metrics by model and by make) under
    the ``evaluate/`` artifact path.
    """
    # Work on a copy so the dict handed in by the caller is left untouched.
    model_params = dict(params)
    model_type = model_params.pop("type", None)

    if model_type == "gradient_boosting":
        regressor = GradientBoostingRegressor(**model_params)
    elif model_type == "xgboost":
        regressor = XGBRegressor(**model_params)
    else:
        # A bare `return 0` would be recorded by hyperopt as a successful trial
        # with loss 0 — the best possible value for a MAPE-based loss — so an
        # unsupported type is reported as a failed trial instead.
        return {
            "status": "fail",
            "error": f"unsupported model type: {model_type!r}",
        }

    # Subsets shared by cross-validation, the final fit and the evaluation.
    X_fit, y_fit = X_train.head(n_rows), y_train.head(n_rows)
    X_eval, eval_df = X_test.head(n_rows), test_df.head(n_rows)

    pipe = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("regressor", regressor),
        ]
    )

    # manually run cross_validate and get train/test rmse, mape, and r2;
    # rows are the cross_validate result keys, columns are "mean" and "std"
    model_cv_results = (
        pd.DataFrame(
            cross_validate(
                pipe,
                X_fit,
                y_fit,
                cv=5,
                scoring=metrics,
                return_train_score=True,
                n_jobs=-1,
            )
        )
        .agg(["mean", "std"])
        .T
    )

    # log metrics to mlflow
    with mlflow.start_run():
        # log train and test mean/std for each metric
        for m in metrics:
            for stat in ("mean", "std"):
                for split in ("train", "test"):
                    mlflow.log_metric(
                        f"{m}_{split}_{stat}",
                        model_cv_results.loc[f"{split}_{m}", stat],
                    )

        # log params (without the "type" entry, which is logged separately)
        mlflow.log_params(model_params)
        # log the type of model
        mlflow.log_param("model_type", model_type)

        fit_model = pipe.fit(X_fit, y_fit)

        # log model
        mlflow.sklearn.log_model(
            fit_model,
            "model",
            signature=infer_signature(X_fit, y_fit),
        )

        # predict on test set
        y_pred = fit_model.predict(X_eval)

        # add predicted price to the test rows, round to 1 decimal place
        full_df = eval_df.copy(deep=True).assign(predicted_price=y_pred.round(1))

        # calculate evaluation metrics by model
        metrics_by_model = price_model.calculate_evaluation_metrics_by_model(full_df)

        # calculate evaluation metrics by make
        metrics_by_make = price_model.calculate_evaluation_metrics_by_make(full_df)

        # TODO: also log a per make/model summary of the training data once it
        # has been turned into a function in price_model.py

        # Write the CSVs to a throwaway directory; they must be logged before
        # the directory is removed when the `with` block exits.
        with tempfile.TemporaryDirectory() as tmpdir:
            # Save model metrics to CSV file
            model_metrics_fname = os.path.join(tmpdir, "metrics_by_model.csv")
            metrics_by_model.to_csv(model_metrics_fname, index=False)

            # Save make metrics to CSV file
            make_metrics_fname = os.path.join(tmpdir, "metrics_by_make.csv")
            metrics_by_make.to_csv(make_metrics_fname, index=False)

            # Log metrics files as artifacts
            mlflow.log_artifact(model_metrics_fname, artifact_path="evaluate/")
            mlflow.log_artifact(make_metrics_fname, artifact_path="evaluate/")

    # the scorer is the NEGATIVE mape, so flip the sign to get a positive loss
    # for hyperopt to minimize
    return {
        "loss": -model_cv_results.loc["test_" + metrics[0], "mean"],
        "status": STATUS_OK,
    }

In [55]:
# Hyperopt search space. The outer `hp.choice` selects one candidate
# configuration; only a gradient-boosting configuration is listed at the
# moment. The "type" entry is read (and stripped) by `objective` to pick the
# estimator; the remaining entries are passed to the estimator's constructor.
# NOTE(review): the hyperopt label for `min_samples_split` is
# "dtree_min_samples_split", unlike the other labels which match their
# parameter names — presumably results are reported under the label; confirm
# this is intended.
search_space = hp.choice(
    "classifier_type",
    [
        {
            "type": "gradient_boosting",
            "max_features": hp.choice("max_features", ["sqrt", "log2"]),
            "max_depth": hp.uniformint("max_depth", 15, 30),
            "min_samples_split": hp.uniformint("dtree_min_samples_split", 20, 40),
            "n_estimators": hp.uniformint("n_estimators", 150, 300),
        }
    ],
)

In [46]:
# Preview the model inputs (the training frame without the `price` column).
X_train.head()

Unnamed: 0,age_at_posting,mileage_per_year,make,model,wheel_system,options_list
0,5,17145.0,Chevrolet,Cruze,FWD,[none-listed]
1,5,41345.6,Toyota,Corolla,FWD,"[bluetooth, backup-camera]"
2,3,37570.666667,Hyundai,Sonata,FWD,"[navigation-system, bluetooth]"
3,3,14539.333333,Chevrolet,Trax,FWD,"[navigation-system, bluetooth, backup-camera, ..."
4,3,30886.333333,Ford,Fusion,FWD,"[bluetooth, backup-camera]"


In [58]:
# Runs are logged to the "sandbox" experiment; the disabled alternative is
# mlflow.set_experiment("price-prediction-v3-gradboost").
mlflow.set_experiment("sandbox")
# sklearn autologging is turned off; `objective` does its own explicit logging.
mlflow.sklearn.autolog(disable=True)

# Tree-structured Parzen Estimator suggestions.
search_algorithm = tpe.suggest

# Single evaluation of the objective, tracked in a fresh in-memory Trials object.
best_hyperparams = fmin(
    objective,
    search_space,
    algo=search_algorithm,
    max_evals=1,
    trials=Trials(),
)

  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]

  inputs = _infer_schema(model_input)





























































job exception: name 'train_data_metrics' is not defined



  0%|          | 0/1 [00:12<?, ?trial/s, best loss=?]


NameError: name 'train_data_metrics' is not defined