# Car Price Modelling & Analysis

Building on the initial round of training and model exploration, this notebook focuses on improving the MAPE and RMSE scores of the model. XGBoost will still be used, but MAPE will be prioritized over RMSE, and the 'options_list' feature will be added to the model.

In [None]:
import os, sys

import pandas as pd
import numpy as np
import json
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio
import mlflow
from mlflow.models.signature import infer_signature
from dotenv import load_dotenv, find_dotenv
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK, Trials

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate, cross_val_score
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Find the repository root by cutting the working directory off right after
# the repo folder name, then make it importable so `src.*` modules resolve.
repo_name = "fortunato-wheels-engine"
cur_dir = os.getcwd()
# str.index raises ValueError when the notebook is run outside the repo
repo_end = cur_dir.index(repo_name) + len(repo_name)
SRC_PATH = cur_dir[:repo_end]
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

from src.data.car_ads import CarAds
from src.logs import get_logger
from src.data.training_preprocessing import preprocess_ads_for_training
from src.training.custom_components import MultiHotEncoder

# set AZUREML_ARTIFACTS_DEFAULT_TIMEOUT to 900 seconds
os.environ["AZUREML_ARTIFACTS_DEFAULT_TIMEOUT"] = "900"

# load variables from the nearest .env file into the process environment
load_dotenv(find_dotenv())

logger = get_logger(__name__)

# point MLflow at the tracking server named in the environment
# (os.environ.get returns None when AZURE_MLFLOW_URI is unset)
AZURE_MLFLOW_URI = os.environ.get("AZURE_MLFLOW_URI")
mlflow.set_tracking_uri(AZURE_MLFLOW_URI)

# Style and figure size are set in ONE call: `sns.set` is an alias of
# `sns.set_theme`, so a separate `sns.set(rc=...)` would re-apply the default
# "darkgrid" style and silently discard the "whitegrid" choice.
sns.set_theme(style="whitegrid", rc={"figure.figsize": (8, 12)})
# set context to notebook
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
plt.rcParams["font.family"] = "sans serif"

%load_ext autoreload
%autoreload 2

## Load in current car ads

In [None]:
# Load the ads from a CSV dump stored under <repo>/data/processed; the
# no-argument call is kept below (commented out) as the alternative.
ads = CarAds()
ads_dump_path = os.path.join(SRC_PATH, "data", "processed", "car-ads-dump_2023-07-18.csv")
ads.get_car_ads(data_dump=ads_dump_path)
# ads.get_car_ads()

In [None]:
# initial preprocessing of the loaded ads, done by the CarAds object itself
ads.preprocess_ads()

In [None]:
# Columns used for modelling; "price" is the target, the rest are predictors.
model_features = [
    "age_at_posting",
    "mileage_per_year",
    "make",
    "model",
    "price",
    "wheel_system",
    "options_list",
]

# Training-specific preprocessing restricted to the selected features,
# with new-vehicle ads excluded.
preprocessed_ads = preprocess_ads_for_training(
    ads.df,
    model_features=model_features,
    exclude_new_vehicle_ads=True,
)

# 80/20 split stratified on the vehicle model. Each half is then reduced to
# the model features, rows with any null dropped, and the index reset.
# NOTE(review): rows are dropped AFTER the split, so the final ratio can
# drift from 80/20 if nulls are present - confirm this is acceptable.
train_df, test_df = (
    part[model_features].dropna().reset_index(drop=True)
    for part in train_test_split(
        preprocessed_ads,
        test_size=0.2,
        random_state=42,
        stratify=preprocessed_ads["model"],
    )
)

# Separate predictors from the price target
X_train, y_train = train_df.drop(columns=["price"]), train_df["price"]
X_test, y_test = test_df.drop(columns=["price"]), test_df["price"]

In [None]:
# Print the column dtypes and non-null counts of the training split
train_df.info()

In [None]:
# Preview the first rows of the training split
train_df.head()

In [None]:
# Column groups, one per encoding strategy
numeric_features = ["age_at_posting", "mileage_per_year"]
categorical_features = ["model", "wheel_system", "make"]
multi_label_features = ["options_list"]

# (name, transformer, columns) triples for the column transformer:
#   num   -> standardised numeric columns
#   cat   -> one-hot columns; categories unseen during fit are ignored
#   multi -> project MultiHotEncoder applied to the options column
transformer_steps = [
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("multi", MultiHotEncoder(), multi_label_features),
]

# make column transformer
preprocessor = ColumnTransformer(transformers=transformer_steps)

In [None]:
# Smoke test: fit the column transformer on the first 10 training rows and
# show the shape of the encoded output. Note this fits `preprocessor` in place.
preprocessor.fit_transform(X_train[:10]).shape

In [None]:
mlb = MultiLabelBinarizer()

mlb.fit_transform(X_train['options_list'][:10])


In [None]:
# Labels learned by the fitted MultiLabelBinarizer
mlb.classes_

In [None]:
# Same ten rows through the project's MultiHotEncoder, passed as a
# one-column DataFrame rather than a Series.
mhe = MultiHotEncoder()

mhe.fit_transform(X_train[['options_list']].head(10))


In [None]:
# Output feature names reported by the fitted MultiHotEncoder
mhe.get_feature_names_out()

In [None]:
# from sklearn.pipeline import make_pipeline

# metrics = ["neg_mean_absolute_percentage_error","neg_root_mean_squared_error", "r2"]
# # metrics = ["neg_mean_absolute_percentage_error"]

# gbr_pipe = make_pipeline(
#     preprocessor,
#     GradientBoostingRegressor(
#         random_state=123
#     )
# )

# model_cv_results = pd.DataFrame(cross_validate(
#         gbr_pipe,
#         X_train[:100],
#         y_train[:100],
#         cv=5,
#         scoring=metrics,
#         return_train_score=True,
#         n_jobs=-1,
#     )).agg(["mean", "std"]).T

# model_cv_results

## Setting up Hyperopt


In [None]:
# first metric is the one to minimize
metrics = ["neg_mean_absolute_percentage_error", "neg_root_mean_squared_error", "r2"]


def objective(params):
    """Hyperopt objective: cross-validate one candidate model and log it to MLflow.

    Builds a pipeline of the shared ``preprocessor`` plus the requested
    regressor, runs 5-fold cross-validation on ``X_train``/``y_train`` for
    every scorer in ``metrics``, and inside a single MLflow run logs the
    mean/std of each train and test score, the hyperparameters, the model
    type, and the pipeline refit on the full training set.

    Args:
        params: Sampled hyperparameters. Must contain a ``"type"`` key
            (``"gradient_boosting"`` or ``"xgboost"``); every other entry is
            passed to the regressor constructor. The dict is not modified.

    Returns:
        dict with ``"loss"`` (the negated mean CV test score of
        ``metrics[0]``) and ``"status"`` set to ``STATUS_OK``.

    Raises:
        KeyError: If ``params`` has no ``"type"`` entry.
        ValueError: If ``params["type"]`` names an unsupported model.
    """
    classifier_type = params["type"]
    # Work on a copy so the caller's dict keeps its "type" entry
    model_params = {key: value for key, value in params.items() if key != "type"}

    if classifier_type == "gradient_boosting":
        clf = GradientBoostingRegressor(**model_params)
    elif classifier_type == "xgboost":
        clf = XGBRegressor(**model_params)
    else:
        # Returning a bare 0 here would be read by hyperopt as a perfect loss
        # and make a misconfigured search space look like the best trial.
        raise ValueError(f"Unknown model type: {classifier_type!r}")

    pipe = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("regressor", clf),
        ]
    )
    # start mlflow run for training timing tracking
    with mlflow.start_run():

        # manually run cross_validate and get train/test rmse, mape, and r2;
        # rows are "<train|test>_<metric>", columns are "mean" and "std"
        model_cv_results = pd.DataFrame(cross_validate(
            pipe,
            X_train,
            y_train,
            cv=5,
            scoring=metrics,
            return_train_score=True,
            n_jobs=-1,
        )).agg(["mean", "std"]).T

        # log train and test mean/std for each metric
        for m in metrics:
            mlflow.log_metric(f"{m}_train_mean", model_cv_results.loc[f"train_{m}", "mean"])
            mlflow.log_metric(f"{m}_test_mean", model_cv_results.loc[f"test_{m}", "mean"])
            mlflow.log_metric(f"{m}_train_std", model_cv_results.loc[f"train_{m}", "std"])
            mlflow.log_metric(f"{m}_test_std", model_cv_results.loc[f"test_{m}", "std"])

        # log params
        mlflow.log_params(model_params)
        # log the type of model
        mlflow.log_param("model_type", classifier_type)

        # cross_validate only fits clones, so refit on all training data
        # before storing the model artifact
        fit_model = pipe.fit(X_train, y_train)

        # log model
        mlflow.sklearn.log_model(fit_model, "model", signature=infer_signature(X_train, y_train))

    # the scorer is negated ("neg_..."), so flip the sign to get a
    # positive loss that hyperopt can minimize
    result = {"loss": -model_cv_results.loc["test_" + metrics[0], "mean"], "status": STATUS_OK}

    return result

## Gradient Boosting Regressor Modelling

In [None]:
# Sampling ranges for the gradient boosting regressor. The 'type' entry is
# read by `objective` to pick the regressor class; all other entries are
# passed to its constructor.
gbr_space = {
    'type': 'gradient_boosting',
    'max_features': hp.choice('max_features', ['sqrt', 'log2']),
    'max_depth': hp.uniformint('max_depth', 15, 30),
    'min_samples_split': hp.uniformint('dtree_min_samples_split', 20, 40),
    'n_estimators': hp.uniformint('n_estimators', 150, 300),
}

# hp.choice over a one-element list: only this model family is searched
search_space = hp.choice('classifier_type', [gbr_space])

In [None]:
# X_train.head()

In [None]:
# Send runs to the "price-prediction-v3" experiment and switch sklearn
# autologging off; `objective` does its own MLflow logging.
mlflow.set_experiment("price-prediction-v3")
mlflow.sklearn.autolog(disable=True)

search_algorithm = tpe.suggest
gbr_trials = Trials()

# 20 evaluations of `objective` over the current search space
best_hyperparams = fmin(
    fn=objective,
    space=search_space,
    algo=search_algorithm,
    max_evals=20,
    trials=gbr_trials,
)

## XGBoost Model Training

In [None]:
# Sampling ranges for the XGBoost regressor. The 'type' entry is read by
# `objective` to pick the regressor class; all other entries are passed to
# its constructor.
xgb_space = {
    'type': 'xgboost',
    'max_depth': hp.uniformint('max_depth', 15, 35),
    'min_child_weight': hp.uniformint('min_child_weight', 0, 10),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'n_estimators': hp.uniformint('n_estimators', 150, 300),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'gamma': hp.uniform('gamma', 0, 9),
}

# hp.choice over a one-element list: only this model family is searched
search_space = hp.choice('classifier_type', [xgb_space])

In [None]:
# Send runs to the "price-prediction-v3" experiment and switch sklearn
# autologging off; `objective` does its own MLflow logging.
mlflow.set_experiment("price-prediction-v3")
mlflow.sklearn.autolog(disable=True)

search_algorithm = tpe.suggest
xgb_trials = Trials()

# 50 evaluations of `objective` over the current search space
best_hyperparams = fmin(
    fn=objective,
    space=search_space,
    algo=search_algorithm,
    max_evals=50,
    trials=xgb_trials,
)