# Car Price Modelling & Analysis

Building on the initial round of training and model exploration, this notebook focuses on improving the MAPE and RMSE scores of the model. XGBoost will still be utilized, but MAPE will be prioritized over RMSE and the 'options_list' feature will be added to the model.

In [9]:
import os, sys

import pandas as pd
import numpy as np
import json
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio
import mlflow
from mlflow.models.signature import infer_signature
from dotenv import load_dotenv, find_dotenv
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate, cross_val_score
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Locate the repository root by cutting the working directory right after the
# repo folder name, then make the repo importable (needed for the `src.*` imports).
_repo_dir_name = "fortunato-wheels-engine"
cur_dir = os.getcwd()
_repo_root_end = cur_dir.index(_repo_dir_name) + len(_repo_dir_name)
SRC_PATH = cur_dir[:_repo_root_end]
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

from src.data.car_ads import CarAds
from src.logs import get_logger
from src.data.training_preprocessing import preprocess_ads_for_training

logger = get_logger(__name__)

# Load variables from a local .env file (if one is found) before reading the
# environment. Values already present in the environment are not overridden.
load_dotenv(find_dotenv())

AZURE_MLFLOW_URI = os.environ.get("AZURE_MLFLOW_URI")
mlflow.set_tracking_uri(AZURE_MLFLOW_URI)

# `sns.set` is an alias of `sns.set_theme`; calling it after `set_theme` with only
# `rc` silently resets the style back to the default "darkgrid". Apply the style
# and the figure size in a single call so "whitegrid" actually takes effect.
sns.set_theme(style="whitegrid", rc={"figure.figsize": (8, 12)})
# set context to notebook
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
plt.rcParams["font.family"] = "sans serif"

# Enable IPython's autoreload extension in mode 2, which reloads modules before
# each cell execution so edits to the project's `src` package are picked up.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load in current car ads

In [2]:
# Processed CSV snapshot of the car ads, stored inside the repository.
car_ads_dump_path = os.path.join(SRC_PATH, "data", "processed", "car-ads-dump_2023-07-18.csv")

ads = CarAds()
ads.get_car_ads(data_dump=car_ads_dump_path)
# Alternative kept for reference (no data dump passed): ads.get_car_ads()

2023-08-02 19:51:09,589 - src.data.car_ads - INFO - Loading car ads from /Users/jonah/Documents/fortunato/fortunato-wheels-engine/data/processed/car-ads-dump_2023-07-18.csv...


  self.df = pd.read_csv(data_dump, parse_dates=["listed_date"])


In [3]:
# initial preprocessing (slow: the recorded run below took ~230 s)
ads.preprocess_ads()

2023-08-02 19:55:46,415 - src.data.car_ads - INFO - Vehicle option preprocessing complete, kept top 50 options by count.
2023-08-02 19:55:46,422 - src.data.car_ads - INFO - Done preprocessing car ads, took 230s.


In [4]:
# Columns used for modelling: "price" is the target, every other column is a predictor.
model_features = [
    "age_at_posting",
    "mileage_per_year",
    "make",
    "model",
    "price",
    "wheel_system",
    "options_list",
]

# Project-level preprocessing restricted to the selected features, new-vehicle ads excluded.
preprocessed_ads = preprocess_ads_for_training(
    ads.df, model_features=model_features, exclude_new_vehicle_ads=True
)

# 80/20 split, stratified on the vehicle model so both sets share the same model mix.
train_df, test_df = train_test_split(
    preprocessed_ads, test_size=0.2, random_state=42, stratify=preprocessed_ads["model"]
)

# Keep only the modelling columns and drop any row that still has a missing value.
train_df, test_df = (
    split[model_features].dropna().reset_index(drop=True) for split in (train_df, test_df)
)

# Separate predictors from the target.
X_train, y_train = train_df.drop(columns=["price"]), train_df["price"]
X_test, y_test = test_df.drop(columns=["price"]), test_df["price"]

2023-08-02 20:02:25,912 - src.data.training_preprocessing - INFO - Preprocessing ads for training, starting with 3779395 ads
2023-08-02 20:02:54,161 - src.data.training_preprocessing - INFO - Preprocessing ads for training, ending with 1737985 ads


In [6]:
# Column dtypes, non-null counts and memory footprint of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1390388 entries, 0 to 1390387
Data columns (total 7 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   age_at_posting    1390388 non-null  int64  
 1   mileage_per_year  1390388 non-null  float64
 2   make              1390388 non-null  object 
 3   model             1390388 non-null  object 
 4   price             1390388 non-null  float64
 5   wheel_system      1390388 non-null  object 
 6   options_list      1390388 non-null  object 
dtypes: float64(2), int64(1), object(4)
memory usage: 74.3+ MB


In [5]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,age_at_posting,mileage_per_year,make,model,price,wheel_system,options_list
0,5,17145.0,Chevrolet,Cruze,19597.0,FWD,[none-listed]
1,5,41345.6,Toyota,Corolla,9123.0,FWD,"[bluetooth, backup-camera]"
2,3,37570.666667,Hyundai,Sonata,15601.0,FWD,"[navigation-system, bluetooth]"
3,3,14539.333333,Chevrolet,Trax,20944.0,FWD,"[navigation-system, bluetooth, backup-camera, ..."
4,3,30886.333333,Ford,Fusion,20568.0,FWD,"[bluetooth, backup-camera]"


In [44]:
from __future__ import annotations

from typing import Any, Callable, Sequence

import numpy as np
import numpy.typing as npt
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils.validation import check_is_fitted


# Class used from bryant1410L: https://github.com/scikit-learn/scikit-learn/issues/11309#issuecomment-1528042914

class MultiHotEncoder(BaseEstimator, TransformerMixin):
    """Wraps `MultiLabelBinarizer` in a form that can work with `ColumnTransformer`. It makes it accept multiple inputs.

    One binarizer is fitted per input column and the per-column indicator matrices are
    concatenated column-wise, in input column order.

    Note that the input `X` has to be a `pandas.DataFrame`.

    Parameters
    ----------
    binarizer_creator : callable, optional
        Zero-argument factory returning a new binarizer that provides ``fit``, ``transform``
        and a ``classes_`` attribute. Defaults to `MultiLabelBinarizer`.
    dtype : dtype-like, optional
        Passed to ``astype`` on each binarizer's output before concatenation.

    Attributes
    ----------
    columns : list
        Column names of the frame seen during `fit`.
    binarizers : list
        Fitted binarizers, one per input column.
    classes_, categories_ : list
        ``classes_`` of each fitted binarizer, one entry per input column (both names refer
        to the same list).

    All attributes above are created by `fit` only, so `check_is_fitted` correctly reports an
    unfitted encoder.
    """

    def __init__(self, binarizer_creator: Callable[[], Any] | None = None, dtype: npt.DTypeLike | None = None) -> None:
        # Only constructor parameters are stored here; fitted state (in particular the
        # trailing-underscore attributes) must not exist before `fit`, otherwise
        # `check_is_fitted` would consider a fresh instance already fitted.
        self.binarizer_creator = binarizer_creator or MultiLabelBinarizer
        self.dtype = dtype

    def fit(self, X: pd.DataFrame, y: Any = None) -> MultiHotEncoder:  # noqa
        """Fit one binarizer per column of `X`.

        Fitted state is rebuilt from scratch on every call, so refitting an instance replaces
        (rather than appends to) the state of a previous fit.

        Parameters
        ----------
        X : pandas.DataFrame
            Each column holds, per row, an iterable of labels.
        y : Any, optional
            Ignored; present for pipeline compatibility.

        Returns
        -------
        MultiHotEncoder
            The fitted encoder (``self``).
        """
        self.columns = X.columns.to_list()
        self.binarizers = [self.binarizer_creator().fit(X[column_name]) for column_name in X]
        self.categories_ = self.classes_ = [binarizer.classes_ for binarizer in self.binarizers]

        return self

    def transform(self, X: pd.DataFrame) -> np.ndarray:
        """Binarize every column of `X` and concatenate the results column-wise.

        Columns are paired with the fitted binarizers by position.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the encoder has not been fitted.
        ValueError
            If `X` does not have as many columns as the frame used in `fit`.
        """
        check_is_fitted(self)

        if len(self.classes_) != X.shape[1]:
            raise ValueError(f"The fit transformer deals with {len(self.classes_)} columns "
                             f"while the input has {X.shape[1]}.")

        return np.concatenate([binarizer.transform(X[c]).astype(self.dtype)
                               for c, binarizer in zip(X, self.binarizers)], axis=1)

    def get_feature_names_out(self, input_features: Sequence[str] = None) -> np.ndarray:
        """Return the output column names, formatted as ``"<input column>_<label>"``.

        Parameters
        ----------
        input_features : sequence of str, optional
            Names to use for the input columns; defaults to the column names seen in `fit`.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the encoder has not been fitted.
        ValueError
            If `input_features` does not have one entry per fitted column.
        """
        check_is_fitted(self)

        cats = self.categories_

        if input_features is None:
            input_features = self.columns
        elif len(input_features) != len(cats):
            raise ValueError(f"input_features should have length equal to number of features ({len(cats)}),"
                             f" got {len(input_features)}")

        # f-string formatting also copes with non-string column names.
        return np.asarray([f"{feature}_{label}"
                           for feature, labels in zip(input_features, cats)
                           for label in labels])

In [59]:
# Feature groups, one per encoding strategy.
multi_label_features = ["options_list"]
categorical_features = ["model", "wheel_system", "make"]
numeric_features = ["age_at_posting", "mileage_per_year"]

# Column transformer: scale numeric columns, one-hot encode single-valued categoricals
# (categories unseen during fit are ignored) and multi-hot encode the options list.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("multi", MultiHotEncoder(), multi_label_features),
    ],
)

In [60]:
# Smoke test: fit/transform the first 10 training rows and check the output shape.
preprocessor.fit_transform(X_train[:10]).shape

(10, 33)

In [68]:
# Plain MultiLabelBinarizer on the first 10 option lists, for comparison with the
# MultiHotEncoder output further down.
mlb = MultiLabelBinarizer()

mlb.fit_transform(X_train['options_list'][:10])


array([[0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 1, 0, 1, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 1, 1, 1, 0, 1, 0],
       [0, 1, 1, 0, 0, 1, 1, 0, 0, 1]])

In [69]:
# Labels learned by the binarizer.
mlb.classes_

array(['air-conditioning', 'backup-camera', 'bluetooth', 'cruise-control',
       'heated-seats', 'leather-seats', 'navigation-system',
       'none-listed', 'remote-start', 'sunroof-moonroof'], dtype=object)

In [67]:
# Same 10 rows through MultiHotEncoder; it requires a DataFrame, hence the wrapping.
mhe = MultiHotEncoder()

mhe.fit_transform(pd.DataFrame(X_train['options_list'][:10]))


array([[0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 1., 0., 1., 1., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0., 1., 1., 0., 0., 1.]])

In [70]:
# Output column names, built as "<input column>_<label>".
mhe.get_feature_names_out()

array(['options_list_air-conditioning', 'options_list_backup-camera',
       'options_list_bluetooth', 'options_list_cruise-control',
       'options_list_heated-seats', 'options_list_leather-seats',
       'options_list_navigation-system', 'options_list_none-listed',
       'options_list_remote-start', 'options_list_sunroof-moonroof'],
      dtype='<U30')

In [62]:
from sklearn.pipeline import make_pipeline

# Quick sanity check of the full pipeline on the first 100 training rows.
# Only MAPE is scored here; RMSE and R2 are added for the hyperparameter search.
metrics = ["neg_mean_absolute_percentage_error"]

gbr_pipe = make_pipeline(preprocessor, GradientBoostingRegressor(random_state=123))

_cv_raw_scores = cross_validate(
    gbr_pipe,
    X_train[:100],
    y_train[:100],
    cv=5,
    scoring=metrics,
    return_train_score=True,
    n_jobs=-1,
)

# One row per timing/score, with its mean and standard deviation over the 5 folds.
model_cv_results = pd.DataFrame(_cv_raw_scores).agg(["mean", "std"]).T

In [63]:
# Mean and std across the 5 folds of fit/score times and train/test (negated) MAPE.
model_cv_results

Unnamed: 0,mean,std
fit_time,0.029618,0.000964
score_time,0.002594,0.000116
test_neg_mean_absolute_percentage_error,-0.339801,0.100092
train_neg_mean_absolute_percentage_error,-0.067729,0.007637


## Setting up Hyperopt


In [None]:
# first metric is the one to minimize
metrics = ["neg_mean_absolute_percentage_error", "neg_root_mean_squared_error", "r2"]

# NOTE(review): STATUS_OK (and hp / fmin / tpe / Trials used by the following cells) are not
# imported in any visible cell — presumably `from hyperopt import ...`; confirm and add it to
# the import cell so the notebook survives Restart & Run All.


def objective(params):
    """Hyperopt objective: cross-validate one sampled configuration and log it to MLflow.

    Runs 5-fold cross-validation of ``preprocessor`` + regressor on ``X_train``/``y_train``
    for every scorer in ``metrics``, logs the train/test mean and std of each scorer, the
    hyperparameters and a pipeline refitted on the full training set to a new MLflow run.

    Parameters
    ----------
    params : dict
        Sampled configuration. The ``'type'`` key selects the regressor
        (``'gradient_boosting'`` or ``'xgboost'``); every other key is passed to the
        regressor's constructor. The dict is not modified.

    Returns
    -------
    dict
        ``{'loss': ..., 'status': STATUS_OK}`` where ``loss`` is the negated mean test score
        of ``metrics[0]``.

    Raises
    ------
    ValueError
        If ``params['type']`` is not a supported model type. (Previously ``0`` was returned,
        which hyperopt interprets as a loss of 0, i.e. a perfect trial.)
    """
    # Work on a copy so the dict owned by the caller is left untouched.
    model_params = dict(params)
    classifier_type = model_params.pop('type')

    regressor_classes = {
        'gradient_boosting': GradientBoostingRegressor,
        'xgboost': XGBRegressor,
    }
    if classifier_type not in regressor_classes:
        raise ValueError(
            f"Unknown model type {classifier_type!r}; expected one of {sorted(regressor_classes)}."
        )
    clf = regressor_classes[classifier_type](**model_params)

    pipe = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("regressor", clf),
        ]
    )

    # manually run cross_validate and get train/test rmse, mape, and r2
    model_cv_results = pd.DataFrame(cross_validate(
        pipe,
        X_train,
        y_train,
        cv=5,
        scoring=metrics,
        return_train_score=True,
        n_jobs=-1,
    )).agg(["mean", "std"]).T

    # log metrics to mlflow
    with mlflow.start_run():

        # log train and test for each metric
        for m in metrics:
            mlflow.log_metric(f"{m}_train_mean", model_cv_results.loc[f"train_{m}", "mean"])
            mlflow.log_metric(f"{m}_test_mean", model_cv_results.loc[f"test_{m}", "mean"])
            mlflow.log_metric(f"{m}_train_std", model_cv_results.loc[f"train_{m}", "std"])
            mlflow.log_metric(f"{m}_test_std", model_cv_results.loc[f"test_{m}", "std"])

        # log params (without the 'type' selector, which is logged separately below)
        mlflow.log_params(model_params)
        # log the type of model
        mlflow.log_param("model_type", classifier_type)

        # refit on the whole training set so the logged model has seen all training rows
        fit_model = pipe.fit(X_train, y_train)

        # log model
        mlflow.sklearn.log_model(fit_model, "model", signature=infer_signature(X_train, y_train))

    # make negative mape positive so it minimizes it
    result = {'loss': -model_cv_results.loc["test_" + metrics[0], "mean"], 'status': STATUS_OK}

    return result

In [None]:
# Hyperparameter ranges for the gradient boosting regressor; the 'type' key is read by
# `objective` to pick the model class and is not passed to the regressor itself.
gradient_boosting_space = {
    'type': 'gradient_boosting',
    'max_features': hp.choice('max_features', ['sqrt', 'log2']),
    'max_depth': hp.uniformint('max_depth', 15, 30),
    'min_samples_split': hp.uniformint('dtree_min_samples_split', 20, 40),
    'n_estimators': hp.uniformint('n_estimators', 150, 300),
}

# Single-entry choice for now; further model types can be appended to the list.
search_space = hp.choice('classifier_type', [gradient_boosting_space])

In [None]:
# Preview the predictor columns used by the objective.
X_train.head()

In [None]:
# Every trial is logged manually inside `objective`, so sklearn autologging is disabled.
mlflow.set_experiment("price-prediction-v3-gradboost")
mlflow.sklearn.autolog(disable=True)

# Tree-structured Parzen Estimator search.
search_algorithm = tpe.suggest

# NOTE(review): for `hp.choice` parameters hyperopt's `fmin` reports the index of the chosen
# option rather than its value — confirm before reusing `best_hyperparams` directly.
best_hyperparams = fmin(
    fn=objective,
    space=search_space,
    algo=search_algorithm,
    max_evals=2,
    trials=Trials(),
)

## Begin new training round with best hyperparameters and log to mlflow

In [None]:
# Hyperparameters for the final model — change to the new best params from the search.
# The dict must be defined here: the pipeline below and `mlflow.log_params` both read
# `params`, and no other cell defines it at notebook level.
params = {
    'max_features': "log2",
    'max_depth': 25,
    'min_samples_split': 35,
    'n_estimators': 400,
}

mlflow.set_experiment("price-prediction-v3")
mlflow.sklearn.autolog(disable=True)

pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", GradientBoostingRegressor(**params)),
    ]
)

# manually run cross_validate and get train/test rmse, mape, and r2
# (`metrics` is the scorer list defined in the Hyperopt setup cell)
model_cv_results = pd.DataFrame(cross_validate(
    pipe,
    X_train,
    y_train,
    cv=5,
    scoring=metrics,
    return_train_score=True,
    n_jobs=-1,
)).agg(["mean", "std"]).T


# log metrics to mlflow
with mlflow.start_run():

    # log train and test for each metric
    for m in metrics:
        mlflow.log_metric(f"{m}_train_mean", model_cv_results.loc[f"train_{m}", "mean"])
        mlflow.log_metric(f"{m}_test_mean", model_cv_results.loc[f"test_{m}", "mean"])
        mlflow.log_metric(f"{m}_train_std", model_cv_results.loc[f"train_{m}", "std"])
        mlflow.log_metric(f"{m}_test_std", model_cv_results.loc[f"test_{m}", "std"])

    # log params
    mlflow.log_params(params)
    # log the type of model
    mlflow.log_param("model_type", "gradient_boosting")

    # refit on the whole training set so the logged model has seen all training rows
    fit_model = pipe.fit(X_train, y_train)

    # log model
    mlflow.sklearn.log_model(fit_model, "model", signature=infer_signature(X_train, y_train))