# Car Price Modelling & Analysis

This notebook contains the initial exploration of modelling approaches to tackle the following challenges:

1. A user is looking for a specific make/model of car and wants to know how much it might cost.
2. A user is looking for a specific type of car and wondering what makes/models fit within that range (e.g. SUV, mini-van, truck)

The results should be easily interpretable by users. Expected user flow is as follows:

1. User is looking for make/model
   1. User comes to the site
   2. Selects type of vehicle they're looking for
      - From: SUV, Truck, Van, Sedan, Sports Car
   3. They can then optionally select a make/model, which will show the average price by make and/or model (?)
   4. Optionally, they can enter a desired vehicle age (in years) and a desired mileage range.
   5. With the inputs entered the following cases are displayed:
       - Just type of vehicle: a break down by make is shown with price by age of vehicle
       - Type of vehicle and make: a break down by model of vehicle price and age of vehicle
       - Type/make/model: details on price by age as well as mileage
       - Type/budget price: for all makes and models, find the age that is closest to the budget amount. How do we optimize for both age and mileage? Perhaps use mileage per year?
         - Possibly leave out mileage and include the average mileage per year plus a confidence interval, to say "this age with ~X-Y km"?


In [None]:
import os, sys

import pandas as pd
import numpy as np
import json
import plotly.express as px
import mlflow
from mlflow.models.signature import infer_signature
from dotenv import load_dotenv, find_dotenv
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate, cross_val_score
from xgboost import XGBRegressor
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK, Trials

# Locate the repository root from the current working directory so that the
# project's `src` package can be imported regardless of where the notebook lives.
cur_dir = os.getcwd()
SRC_PATH = cur_dir[: cur_dir.index("fortunato-wheels-engine") + len("fortunato-wheels-engine")]
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

from src.data.car_ads import CarAds
from src.logs import get_logger

logger = get_logger(__name__)

# Read a .env file (if one can be found) BEFORE looking up the tracking URI;
# otherwise AZURE_MLFLOW_URI is only available when exported in the shell.
load_dotenv(find_dotenv())

AZURE_MLFLOW_URI = os.environ.get("AZURE_MLFLOW_URI")
if AZURE_MLFLOW_URI is None:
    # set_tracking_uri(None) does not fail, so make the missing config visible
    logger.warning("AZURE_MLFLOW_URI is not set, MLflow will use its default tracking location")
mlflow.set_tracking_uri(AZURE_MLFLOW_URI)

# Style and default figure size are set in ONE call: `sns.set` is an alias of
# `sns.set_theme`, so a second call would reset the style back to its default.
sns.set_theme(style="whitegrid", rc={"figure.figsize": (8, 12)})
# set context to notebook
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
plt.rcParams["font.family"] = "sans serif"

%load_ext autoreload
%autoreload 2

In [None]:
ads = CarAds()
# ads.get_car_ads(sources=["cargurus", "kijiji"])
ads.get_car_ads(data_dump=os.path.join(SRC_PATH, "data", "processed", "car-ads-dump_2023-07-18.csv"))

In [None]:
ads.df.info()

In [None]:
ads.preprocess_ads()

In [None]:
# export the preprocessed data to CSV (currently disabled)
# NOTE(review): this path is the same file the ads are loaded from in the cell
# above, so enabling the export would overwrite the input dump - confirm intent.
# output_path = os.path.join(SRC_PATH, "data", "processed", "car-ads-dump_2023-07-18.csv")

# ads.export_to_csv(output_path)

# EDA

What are the most common manufacturers and models of vehicles?

In [None]:
def plot_top_categories(ads_df, column, axis_label, title_noun, ax, top_n=10):
    """Draw a horizontal count plot of the most frequent values of one column.

    Parameters
    ----------
    ads_df : pd.DataFrame
        Frame holding one row per ad.
    column : str
        Column whose values are counted (e.g. "make" or "model").
    axis_label : str
        Label written on the y axis.
    title_noun : str
        Plural noun used in the title (e.g. "Manufacturers").
    ax : matplotlib Axes
        Axes to draw on.
    top_n : int
        Number of most frequent values to show.
    """
    most_frequent = ads_df[column].value_counts().iloc[:top_n].index
    sns.countplot(y=column, data=ads_df, order=most_frequent, ax=ax, palette="winter")

    ax.set_xlabel("Number of Ads")
    ax.set_ylabel(axis_label)
    # hide the x ticks and tick labels; the counts are written next to each bar instead
    ax.tick_params(axis="x", which="both", bottom=False, top=False, labelbottom=False)
    ax.set_title(
        f"Top {top_n} {title_noun} by Number of Ads in the\nFortunato Wheels DataBase",
        fontsize=23,
    )

    # write the number of ads (in thousands) to the right of each bar
    for bar in ax.patches:
        bar_length = bar.get_width()
        ax.text(
            bar_length + 1000,
            bar.get_y() + bar.get_height() / 2 + 0.1,
            "{:1.0f}".format(bar_length / 1000) + "k",
            ha="left",
        )


# one figure with the make and model plots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 12))

plot_top_categories(ads.df, "make", "Make", "Manufacturers", ax1)
plot_top_categories(ads.df, "model", "Model", "Models", ax2)

# add spacing between the two plots
plt.subplots_adjust(wspace=0.5)

plt.show()


What is the reported condition on most cars?

In [None]:
ads.df.condition.value_counts()

In [None]:
# plot number of ads posted per month from 2015-2023
MONTH_NAMES = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

# Build the year / month-name columns explicitly so that every legend entry is
# tied to the data it describes. Passing a bare list of labels to `ax.legend`
# pairs labels with artists purely by position, which mislabels the legend
# (for example when a month has no ads at all).
listed_dates = ads.df.listed_date.dropna()
ads_per_month = pd.DataFrame(
    {
        "year": listed_dates.dt.year,
        "month": listed_dates.dt.month_name(),
    }
)

fig, ax = plt.subplots(figsize=(24, 12))
sns.countplot(
    x="year",
    hue="month",
    hue_order=MONTH_NAMES,  # calendar order rather than order of appearance
    data=ads_per_month,
    palette="winter",
    ax=ax,
)

ax.set_xlabel("Year")
ax.set_ylabel("Number of Ads")
ax.set_title("Number of Ads Posted per Month from 2015-2023", fontsize=23)
# reposition and retitle the legend seaborn created, keeping its handles and labels
sns.move_legend(ax, "upper left", title="Month")

plt.show()

In [None]:
# Yearly listing volume: one bar per calendar year found in `listed_date`.
fig, ax = plt.subplots(figsize=(24, 12))

listing_year = ads.df.listed_date.dt.year
sns.countplot(x=listing_year, data=ads.df, palette="winter", ax=ax)

ax.set(xlabel="Year", ylabel="Number of Ads")
ax.set_title("Number of Ads Posted per Year from 2015-2023", fontsize=23)

plt.show()

In [None]:
# How has the average asking price of used cars changed month by month since 2022?

# Ads without an ad_id would be skipped by the "count" aggregation below, so
# give them a placeholder id. The column is assigned back explicitly: a chained
# `fillna(..., inplace=True)` on an attribute access is not guaranteed to
# modify the underlying frame.
ads.df["ad_id"] = ads.df["ad_id"].fillna(-1)

# months with this many ads or fewer are considered too noisy to plot
MIN_ADS_PER_MONTH = 10

price_df = (
    ads.df[
        (ads.df.condition == "Used")
        & (ads.df.listed_date.dt.year >= 2022)
        & (ads.df.listed_date.dt.year <= 2024)
    ]
    .groupby([pd.Grouper(key="listed_date", freq="M")])
    # mean price and number of ads in each month
    .agg({"price": "mean", "ad_id": "count"})
    .reset_index()
)

# Use the SAME filtered frame for the line, the shaded area, the axis limits
# and the annotations, so that no annotation points at a month that is not drawn.
plotted_prices = price_df[price_df.ad_id > MIN_ADS_PER_MONTH].reset_index(drop=True)

peak_month = plotted_prices.loc[plotted_prices["price"].idxmax()]

# Select July 2023 by date rather than by position; fall back to the latest
# plotted month if July 2023 is not present in the data.
in_july_2023 = (plotted_prices.listed_date.dt.year == 2023) & (
    plotted_prices.listed_date.dt.month == 7
)
latest_month = (
    plotted_prices[in_july_2023].iloc[-1] if in_july_2023.any() else plotted_prices.iloc[-1]
)

# headline figure is computed from the data so it cannot go stale
price_drop_pct = (1 - latest_month["price"] / peak_month["price"]) * 100

fig, ax = plt.subplots(1, 1, figsize=(15, 8))

sns.lineplot(
    x="listed_date",
    y="price",
    data=plotted_prices,
    markers=True,
    marker="o",
    color="#0d6efd",
    ax=ax,
)
ax.set_xlabel("Year-Month")
ax.set_ylabel("Avg. Used Car Price ($CAD)")
ax.set_title(
    f"Average Used Car Price is Down ~{price_drop_pct:.0f}% "
    f"from {peak_month['listed_date'].year} Peak\n ",
    fontsize=26,
)
ax.set_ylim(bottom=0, top=plotted_prices.price.max() * 1.15)
ax.set_xlim(
    left=plotted_prices.listed_date.min(), right=plotted_prices.listed_date.max()
)
# shade the area below the line
ax.fill_between(
    plotted_prices.listed_date,
    plotted_prices.price,
    color="#0d6efd",
    alpha=0.3,
    label=None
)

# put a text and marker at the highest average used car price
ax.annotate(
    f"Peak Avg. Used Car Price:\n${peak_month['price']/1000:.0f}k CAD",
    xy=(peak_month["listed_date"], peak_month["price"]),
    xytext=(peak_month["listed_date"], peak_month["price"] * 1.05),
    arrowprops=dict(facecolor="black", shrink=0.05),
    fontsize=18,
    weight="bold",
    horizontalalignment="center",
)

# annotate the most recent reference month (July 2023 when available)
ax.annotate(
    f"{latest_month['listed_date']:%B %Y}\nAvg. Used Car Price:\n${latest_month['price']/1000:.0f}k CAD",
    xy=(latest_month["listed_date"], latest_month["price"]),
    xytext=(latest_month["listed_date"], latest_month["price"] * 1.15),
    arrowprops=dict(facecolor="black", shrink=0.05),
    fontsize=18,
    weight="bold",
    horizontalalignment="right",
)

plt.show()

In [None]:
# Number of new and used car ads posted per semi-month ("SM") period since mid-February 2023.
# NOTE: despite its name, `price_df` holds ad COUNTS here (column "ad_id").
price_df = (
    ads.df[
        ads.df.condition.isin(["Used", "New"])
        & (ads.df.listed_date > "2023-02-15")
        & (ads.df.listed_date.dt.year <= 2024)
    ]
    .groupby(["condition", pd.Grouper(key="listed_date", freq="SM")])
    # `size` counts every ad in the group. Counting the non-null "ad_id" values
    # instead would silently skip ads without an id unless an earlier cell had
    # already filled them in.
    .size()
    .reset_index(name="ad_id")
)

fig, ax = plt.subplots(1, 1, figsize=(15, 8))

sns.lineplot(
    x="listed_date",
    y="ad_id",
    hue="condition",
    data=price_df,
    markers=True,
    marker="o",
    # set colors for each line
    palette={"Used": "#0d6efd", "New": "#198754"},
    ax=ax,
)
ax.set_xlabel("Year-Month")
ax.set_ylabel("No. of Car Ads Posted")
ax.set_title(
    "New & Used Car Ads being Posted Are Decreasing in 2023\n ",
    fontsize=26,
)
ax.set_xlim(
    left=price_df.listed_date.min(), right=price_df.listed_date.max()
)
# Rotate the x labels by 30 degrees. Changing the properties of the existing
# labels keeps the date formatter in charge of the label text.
plt.setp(ax.get_xticklabels(), rotation=30, ha="right")

plt.show()

In [None]:
# Where location is a list containing a single dict, unwrap it to the dict.
# Empty lists are left untouched instead of raising IndexError.
ads.df.location = ads.df.location.apply(
    lambda loc: loc[0] if isinstance(loc, list) and loc else loc
)

# Copy the province out of the kijiji location dicts. Missing or non-dict
# locations give None rather than raising TypeError / KeyError.
is_kijiji = ads.df.source == "kijiji"
ads.df.loc[is_kijiji, "province"] = ads.df.loc[is_kijiji, "location"].apply(
    lambda loc: loc.get("stateProvince") if isinstance(loc, dict) else None
)

In [None]:
ads.df.columns

In [None]:
ads.df[ads.df.source == "kijiji"].location.head()

In [None]:
# Where location is a list containing a single dict, unwrap it to the dict.
# Empty lists are left untouched instead of raising IndexError.
ads.df.location = ads.df.location.apply(
    lambda loc: loc[0] if isinstance(loc, list) and loc else loc
)

# Copy the province out of the kijiji location dicts. Missing or non-dict
# locations give None rather than raising TypeError / KeyError.
is_kijiji = ads.df.source == "kijiji"
ads.df.loc[is_kijiji, "province"] = ads.df.loc[is_kijiji, "location"].apply(
    lambda loc: loc.get("stateProvince") if isinstance(loc, dict) else None
)

# Number of kijiji ads per province, largest first. The "province" column set
# just above is reused, so the location dicts are not parsed a second time;
# ads without a province are left out by groupby.
ads_by_province = (
    ads.df[is_kijiji]
    .groupby("province")
    .agg({"ad_id": "count"})
    .reset_index()
    .sort_values("ad_id", ascending=False)
)

ads_by_province

In [None]:
# Bar chart of the ten provinces with the most ads (ads_by_province is already sorted).
fig, ax = plt.subplots(figsize=(24, 12))

top_provinces = ads_by_province.head(10)
sns.barplot(x="province", y="ad_id", data=top_provinces, palette="winter", ax=ax)

ax.set(xlabel="Province", ylabel="Number of Ads")
ax.set_title(
    "Top 10 Provinces by Number of Ads in the\nFortunato Wheels DataBase",
    fontsize=23,
)

plt.show()

# Price Prediction for All Makes/Models

In [None]:
def preprocess_ads_for_training(
        ads_df:pd.DataFrame,
        model_features = [
            "age_at_posting",
            "mileage_per_year",
            "make",
            "model",
            "wheel_system",
        ],
        min_num_ads = 1000,
        max_age_at_posting = 20,
        min_price = 1000,
        max_price = 250000,
    ):
    """Select and clean the columns of the ads frame used to train a price model.

    Parameters
    ----------
    ads_df : pd.DataFrame
        Ads frame; must contain "model", "price" and every name in `model_features`.
    model_features : list of str
        Columns to keep. "model" and "price" are always added to the selection.
        The list passed in is never modified.
    min_num_ads : int
        Only models with MORE than this many ads are kept.
    max_age_at_posting : int
        Ads with an age above this value are removed (applied only when
        "age_at_posting" is a selected feature).
    min_price, max_price : int
        Exclusive price bounds; ads with a missing price are removed as well.

    Returns
    -------
    pd.DataFrame
        A filtered copy with the selected columns (including "price").
    """

    logger.info(f"Preprocessing ads for training, starting with {len(ads_df)} ads")

    # Work on a private copy of the feature list: appending to the caller's list
    # (or to the shared default argument) would leak state between calls.
    selected_columns = list(model_features)
    for required_column in ("model", "price"):
        if required_column not in selected_columns:
            selected_columns.append(required_column)

    preprocessed_df = ads_df[selected_columns].copy()

    # keep only models with more than min_num_ads ads
    model_counts = preprocessed_df["model"].value_counts()
    models_to_keep = model_counts[model_counts > min_num_ads].index
    preprocessed_df = preprocessed_df[preprocessed_df["model"].isin(models_to_keep)]

    # remove NaN models and "other"
    preprocessed_df = preprocessed_df[~preprocessed_df["model"].isna()]
    preprocessed_df = preprocessed_df[preprocessed_df["model"].str.lower() != "other"]

    # remove ads with prices outside of (min_price, max_price); NaN prices fail both tests
    price_in_range = (preprocessed_df["price"] > min_price) & (preprocessed_df["price"] < max_price)
    preprocessed_df = preprocessed_df[price_in_range]

    if "age_at_posting" in selected_columns:
        # remove cars older than max_age_at_posting years
        preprocessed_df = preprocessed_df[preprocessed_df["age_at_posting"] <= max_age_at_posting]

    # take an explicit copy so the column assignments below never write to a view
    preprocessed_df = preprocessed_df.copy()

    if "wheel_system" in selected_columns:
        # replace NaN wheel_system with "unknown"
        preprocessed_df["wheel_system"] = preprocessed_df["wheel_system"].fillna("unknown")

    if "mileage_per_year" in selected_columns:
        # Where ads have an age_at_posting of zero set mileage_per_year to 0.
        # This is only possible when the age column was selected; previously a
        # feature list without "age_at_posting" raised a KeyError here.
        if "age_at_posting" in preprocessed_df.columns:
            preprocessed_df.loc[preprocessed_df["age_at_posting"] == 0, "mileage_per_year"] = 0
        # drop any other mileage per year NaNs
        preprocessed_df = preprocessed_df[~preprocessed_df["mileage_per_year"].isna()]

    logger.info(f"Preprocessing ads for training, ending with {len(preprocessed_df)} ads")

    return preprocessed_df

In [None]:
# Columns used for modelling; "price" is the target and is split off below.
model_features = [
    "age_at_posting",
    "mileage_per_year",
    "make",
    "model",
    "price",
    "wheel_system",
    # "province"
]

preprocessed_ads = preprocess_ads_for_training(ads.df, model_features=model_features)

# 80/20 split, stratified on the vehicle model
train_df, test_df = train_test_split(
    preprocessed_ads,
    test_size=0.2,
    random_state=42,
    stratify=preprocessed_ads["model"],
)

# keep only the selected columns and drop any row that still has a null value
train_df, test_df = (
    split[model_features].dropna().reset_index(drop=True)
    for split in (train_df, test_df)
)

# separate the features from the target
X_train, y_train = train_df.drop(columns=["price"]), train_df["price"]
X_test, y_test = test_df.drop(columns=["price"]), test_df["price"]

In [None]:
# Histogram of training ads for the 60 most common models, ordered by frequency.
top_models = train_df.model.value_counts().index[:60]

fig = px.histogram(
    train_df.loc[train_df.model.isin(top_models)],
    x="model",
    title="Number of Ads by Model",
    color="model",
    labels={"model": "Model"},
    color_discrete_sequence=px.colors.qualitative.Dark24,
    height=500,
    category_orders={"model": top_models},
)
fig.show()

In [None]:
# Feature groups fed to the column transformer.
numeric_features = ["age_at_posting", "mileage_per_year"]
categorical_features = ["model", "wheel_system", "make"] #, "province"]

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

## Baselines: Dummy Regressor & Linear Regression

In [None]:
model_results = {}
metrics = ["neg_root_mean_squared_error", "r2", "neg_mean_absolute_percentage_error"]

In [None]:
# Baseline 1: DummyRegressor behind the shared preprocessor.
dummy_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", DummyRegressor())]
)

# 5-fold cross-validation, summarised as mean/std per metric (metrics as rows)
dummy_cv_scores = cross_validate(
    dummy_pipe,
    X_train,
    y_train,
    cv=5,
    scoring=metrics,
    return_train_score=True,
)
model_results["dummy"] = pd.DataFrame(dummy_cv_scores).agg(["mean", "std"]).T

In [None]:
# Baseline 2: Ridge regression behind the shared preprocessor.
linreg_pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", Ridge())]
)

# 5-fold cross-validation of the ridge pipeline, summarised as mean/std per metric
ridge_cv_scores = cross_validate(
    linreg_pipe,
    X_train,
    y_train,
    cv=5,
    scoring=metrics,
    return_train_score=True,
)
model_results["ridge"] = pd.DataFrame(ridge_cv_scores).agg(["mean", "std"]).T

In [None]:
# Side-by-side comparison of the baselines: model names on the outer column
# level, ('mean', 'std') on the inner level.
all_model_scores = pd.concat(model_results, axis='columns')

# keep only the 'mean' columns (inner level 1), leaving one column per model
mean_scores = all_model_scores.xs('mean', axis='columns', level=1)

# `.style` does not honor earlier `.round()` calls, so precision is set here;
# axis=None colors cells relative to the whole table rather than per row/column
mean_scores.style.format(precision=4).background_gradient(axis=None)

In [None]:
model_results["ridge"].loc["test_" + metrics[0]]["mean"]

In [None]:
y_train.head()

## Setting up Hyperopt


In [None]:
metrics = ["neg_root_mean_squared_error", "r2", "neg_mean_absolute_percentage_error"]

def objective(params):
    """Hyperopt objective: cross-validate one candidate model and log it to MLflow.

    Parameters
    ----------
    params : dict
        One sample from the search space. The 'type' key selects the regressor
        ('gradient_boosting', 'xgboost', 'rf' or 'ridge'); every other key is
        passed to that regressor's constructor. The dict itself is not modified.

    Returns
    -------
    dict
        {'loss': <mean cross-validated test RMSE>, 'status': STATUS_OK}

    Raises
    ------
    ValueError
        If 'type' is not one of the supported regressors. Returning a loss of 0
        here would make hyperopt treat the unsupported candidate as the best one.
    """
    # copy so the dict handed over by hyperopt is left untouched
    model_params = dict(params)
    classifier_type = model_params.pop('type')

    regressors = {
        'gradient_boosting': GradientBoostingRegressor,
        'xgboost': XGBRegressor,
        'rf': RandomForestRegressor,
        'ridge': Ridge,
    }
    if classifier_type not in regressors:
        raise ValueError(
            f"Unsupported model type {classifier_type!r}, expected one of {sorted(regressors)}"
        )
    clf = regressors[classifier_type](**model_params)

    pipe = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("regressor", clf),
        ]
    )

    # manually run cross_validate and get train/test rmse, mape, and r2
    model_cv_results = pd.DataFrame(cross_validate(
        pipe,
        X_train,
        y_train,
        cv=5,
        scoring=metrics,
        return_train_score=True,
        n_jobs=-1,
    )).agg(["mean", "std"]).T


    # log metrics to mlflow
    with mlflow.start_run():

        # log train and test for each metric
        for m in metrics:
            mlflow.log_metric(f"{m}_train_mean", model_cv_results.loc[f"train_{m}"]["mean"])
            mlflow.log_metric(f"{m}_test_mean", model_cv_results.loc[f"test_{m}"]["mean"])
            mlflow.log_metric(f"{m}_train_std", model_cv_results.loc[f"train_{m}"]["std"])
            mlflow.log_metric(f"{m}_test_std", model_cv_results.loc[f"test_{m}"]["std"])

        # log params
        mlflow.log_params(model_params)
        # log the type of model
        mlflow.log_param("model_type", classifier_type)

        # refit on the full training set so the logged model has seen all of it
        fit_model = pipe.fit(X_train, y_train)

        # log model
        mlflow.sklearn.log_model(fit_model, "model", signature=infer_signature(X_train, y_train))

    # the scorer returns negative rmse; flip the sign so hyperopt minimizes the rmse
    result = { 'loss': -model_cv_results.loc["test_" + metrics[0]]["mean"], 'status': STATUS_OK}

    return result


In [None]:
# Hyperopt search space for the first tuning run. Only the gradient boosting
# entry is active; the rf and ridge entries below are commented out.
# NOTE(review): LabelEncoder is imported here but is not used in this cell.
from sklearn.preprocessing import LabelEncoder

# Each dict is one candidate model family; 'type' selects the regressor inside
# `objective` and the remaining keys become its constructor arguments.
# NOTE(review): the commented-out rf entry reuses the hyperopt labels
# 'max_depth' and 'max_features' of the active entry - hyperopt presumably
# needs unique labels, so rename them before re-enabling (TODO confirm).
search_space = hp.choice('classifier_type', [
    {
        'type': 'gradient_boosting',
        'max_features': hp.choice('max_features', ['sqrt', 'log2']),
        'max_depth': hp.uniformint('max_depth', 15, 30),
        'min_samples_split': hp.uniformint('dtree_min_samples_split', 20, 40),
        'n_estimators': hp.uniformint('n_estimators', 150, 300),
    },
    # {
    #     'type': 'rf',
    #     'max_depth': hp.uniformint('max_depth', 5, 50),
    #     'max_features': hp.choice('max_features', ['sqrt', 'log2']),
    #     'min_samples_split': hp.uniform('min_samples_split', 0.1, 1),
    # },
    # {
    #     'type': 'ridge',
    #     'alpha': hp.uniform('alpha', 0.1, 100),
    # }
])

In [None]:
X_train.head()

In [None]:
# Tune the gradient boosting model. Every evaluation is logged explicitly by
# `objective`, so sklearn autologging is switched off.
mlflow.set_experiment("price-prediction-v2-gradboost")
mlflow.sklearn.autolog(disable=True)

search_algorithm = tpe.suggest

best_hyperparams = fmin(
    fn=objective,
    space=search_space,
    algo=search_algorithm,
    max_evals=2,
    trials=Trials(),
)

## Hist Gradient Boosting Model

In [None]:
metrics = ["neg_root_mean_squared_error", "r2", "neg_mean_absolute_percentage_error"]

def objective(params):
    """Hyperopt objective for the hist-gradient-boosting / XGBoost search.

    NOTE: this redefines the `objective` declared earlier in the notebook; it
    supports a different set of model types and uses `hist_preprocessor`.

    Parameters
    ----------
    params : dict
        One sample from the search space. The 'type' key selects the regressor
        ('hist_grad_boost' or 'xgboost'); every other key is passed to that
        regressor's constructor. The dict itself is not modified.

    Returns
    -------
    dict
        {'loss': <mean cross-validated test RMSE>, 'status': STATUS_OK}

    Raises
    ------
    ValueError
        If 'type' is not one of the supported regressors. Returning a loss of 0
        here would make hyperopt treat the unsupported candidate as the best one.
    """
    # copy so the dict handed over by hyperopt is left untouched
    model_params = dict(params)
    classifier_type = model_params.pop('type')

    regressors = {
        'hist_grad_boost': HistGradientBoostingRegressor,
        'xgboost': XGBRegressor,
    }
    if classifier_type not in regressors:
        raise ValueError(
            f"Unsupported model type {classifier_type!r}, expected one of {sorted(regressors)}"
        )
    clf = regressors[classifier_type](**model_params)

    # NOTE(review): HistGradientBoostingRegressor presumably needs dense input,
    # while a one-hot encoding step can yield a sparse matrix - confirm before
    # enabling 'hist_grad_boost' in the search space.
    pipe = Pipeline(
        steps=[
            ("preprocessor", hist_preprocessor),
            ("regressor", clf),
        ]
    )

    # manually run cross_validate and get train/test rmse, mape, and r2
    model_cv_results = pd.DataFrame(cross_validate(
        pipe,
        X_train,
        y_train,
        cv=5,
        scoring=metrics,
        return_train_score=True,
        n_jobs=-1,
    )).agg(["mean", "std"]).T


    # log metrics to mlflow
    with mlflow.start_run():

        # log train and test for each metric
        for m in metrics:
            mlflow.log_metric(f"{m}_train_mean", model_cv_results.loc[f"train_{m}"]["mean"])
            mlflow.log_metric(f"{m}_test_mean", model_cv_results.loc[f"test_{m}"]["mean"])
            mlflow.log_metric(f"{m}_train_std", model_cv_results.loc[f"train_{m}"]["std"])
            mlflow.log_metric(f"{m}_test_std", model_cv_results.loc[f"test_{m}"]["std"])

        # log params
        mlflow.log_params(model_params)
        # log the type of model
        mlflow.log_param("model_type", classifier_type)

        # refit on the full training set so the logged model has seen all of it
        fit_model = pipe.fit(X_train, y_train)

        # log model
        mlflow.sklearn.log_model(fit_model, "model", signature=infer_signature(X_train, y_train))

    # the scorer returns negative rmse; flip the sign so hyperopt minimizes the rmse
    result = { 'loss': -model_cv_results.loc["test_" + metrics[0]]["mean"], 'status': STATUS_OK}

    return result


In [None]:
# make column transformer
# NOTE(review): LabelEncoder is imported here but is not used in this cell.
from sklearn.preprocessing import LabelEncoder

# Same two steps as `preprocessor`, but the one-hot encoder is explicitly asked
# for sparse output; this is the transformer the second `objective` uses.
hist_preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(sparse_output=True, handle_unknown="ignore"), categorical_features),
    ]
)

# Hyperopt search space for the second tuning run. Only the xgboost entry is
# active; 'type' selects the regressor inside `objective` and the remaining
# keys become its constructor arguments.
search_space = hp.choice('classifier_type', [
    # {
    #     'type': 'hist_grad_boost',
    #     "max_iter": hp.uniformint("max_iter", 50, 300),
    #     'max_bins': hp.uniformint('max_bins', 50, 255),
    #     'max_depth': hp.uniformint('max_depth', 10, 30),
    #     'min_samples_leaf': hp.uniformint('min_samples_leaf', 10, 30),
    #     # "categorical_features": hp.choice("categorical_features", [[False, False, True, True, True, True]]),
    # },
    {
        'type': 'xgboost',
        'max_depth': hp.uniformint('max_depth', 15, 40),
        'min_child_weight': hp.uniformint('min_child_weight', 1, 10),
        'subsample': hp.uniform('subsample', 0.5, 1),
        'n_estimators': hp.uniformint('n_estimators', 150, 400),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
        'gamma': hp.uniform('gamma', 0.1, 1),
    }
])

In [None]:
X_train.head()

In [None]:
# Tune the XGBoost model. Every evaluation is logged explicitly by
# `objective`, so sklearn autologging is switched off.
mlflow.set_experiment("price-prediction-v2-xgboost")
mlflow.sklearn.autolog(disable=True)

search_algorithm = tpe.suggest

best_hyperparams = fmin(
    fn=objective,
    space=search_space,
    algo=search_algorithm,
    max_evals=400,
    trials=Trials(),
)

# Train final model with best hyperparameters and log to MLflow

In [None]:
params = {
        'max_features': "log2",
        'max_depth': 25,
        'min_samples_split': 35,
        'n_estimators': 400
    }

In [None]:
# Cross-validate and then fit the final gradient boosting model with the
# hand-picked `params`, logging metrics, parameters and the model to MLflow.
mlflow.set_experiment("price-prediction-v1")
mlflow.sklearn.autolog(disable=True)

final_regressor = GradientBoostingRegressor(**params)
pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", final_regressor)]
)

# 5-fold cross-validation: train/test rmse, mape and r2 summarised as mean/std
cv_scores = cross_validate(
    pipe,
    X_train,
    y_train,
    cv=5,
    scoring=metrics,
    return_train_score=True,
    n_jobs=-1,
)
model_cv_results = pd.DataFrame(cv_scores).agg(["mean", "std"]).T

with mlflow.start_run():

    # one MLflow metric per (metric, statistic, split) combination
    for m in metrics:
        for stat in ("mean", "std"):
            for split in ("train", "test"):
                mlflow.log_metric(
                    f"{m}_{split}_{stat}", model_cv_results.loc[f"{split}_{m}"][stat]
                )

    # hyperparameters and model family
    mlflow.log_params(params)
    mlflow.log_param("model_type", "gradient_boosting")

    # fit on the full training set and store the fitted pipeline
    fit_model = pipe.fit(X_train, y_train)
    mlflow.sklearn.log_model(fit_model, "model", signature=infer_signature(X_train, y_train))

In [None]:
mlflow.set_experiment("price-prediction-quantiles-v1")
mlflow.sklearn.autolog(disable=True)
import time

# Kept under its own name: overwriting the global `params` would make a re-run
# of the final-model cell silently train with the quantile loss.
quantile_params = {
        'max_features': "log2",
        'max_depth': 25,
        'min_samples_split': 35,
        'n_estimators': 400,
        "loss": "quantile",
    }

# one MLflow run holds both quantile models
with mlflow.start_run():

    # log params
    mlflow.log_params(quantile_params)
    # log the type of model
    mlflow.log_param("model_type", "quant_gradient_boosting")

    # fit one model per quantile: 5th and 95th percentile of price
    for a in [0.05, 0.95]:

        pipe = Pipeline(
            steps=[
                ("preprocessor", preprocessor),
                ("regressor", GradientBoostingRegressor(alpha=a, **quantile_params)),
            ]
        )

        print(f"Fitting q{a}...")
        start_time = time.time()
        fit_model = pipe.fit(X_train, y_train)
        print(f"Done fitting, took {time.time() - start_time:.0f}s, logging model...")
        # log model under "model_q5" / "model_q95"
        mlflow.sklearn.log_model(fit_model, f"model_q{a*100:.0f}", signature=infer_signature(X_train, y_train))

In [None]:
# export to csv the makes/models used in the model to be loaded in the website
models_dir = os.path.join(SRC_PATH, "models")
# to_csv does not create missing directories, so make sure the target exists
os.makedirs(models_dir, exist_ok=True)

X_train[["make", "model"]].drop_duplicates().to_csv(
    os.path.join(models_dir, "prediction-vehicle-make-model-config.csv"), index=False
)

In [None]:
X_train.make.unique()

# Train Model with Trim Options and Description

To try and improve the model performance, we will try to include the trim options and description of the vehicle in the model. 

The kijiji ads' options are in the `features` column, whereas the cargurus ads' options are in the `major_options` column. Depending on whether the data is loaded from MongoDB or from CSV, the list of options can be either a string or a list. A list is preferred, so that the options can be exploded into one row per option and only the most common ones kept.

The code below is a work in progress and has the following known bugs as of July 21, 2023:
- the function `get_car_options` drops some columns of the dataframe, such as `source`; the cause is unknown, but it may be related to how empty/NaN rows or values are handled
- the cargurus ads don't have unique ad IDs attached, so the index is reset and treated as a unique ID for analysis purposes; this should be improved.

In [None]:
ads.df.columns

In [None]:
def get_car_options(ads_df:pd.DataFrame, top_n_options:int=50):
    """
    Add one-hot encoded columns for the most common car options to the ads.

    The options are read from the `major_options` column for cargurus ads and
    from the `features` column for kijiji ads; ads from any other source get no
    options. Each cell may hold a list of options or the string form of such a
    list.

    Parameters
    ----------
    ads_df : pd.DataFrame
        Ads frame with `source`, `major_options` and `features` columns.
        It is not modified.
    top_n_options : int
        Number of most frequent options to encode.

    Returns
    -------
    pd.DataFrame
        One row per ad with every original column kept unchanged, plus
        `unique_id` (the former index), `options_list` (the normalized options
        of the ad, or None) and one 0/1 column `options_list_<option>` for each
        of the most common options.
    """

    def _normalize_options(raw_options):
        """Turn a list, or the string form of a list, into lowercase dash-separated option names."""
        if isinstance(raw_options, (list, tuple)):
            items = [str(item) for item in raw_options]
        elif isinstance(raw_options, str):
            items = (
                raw_options.strip("['']")
                .replace("'", "")
                .replace(", ", ",")
                .split(",")
            )
        else:
            # NaN / None: this ad has no options
            return None
        cleaned = [
            item.strip().replace(" ", "-").replace("/", "-").lower() for item in items
        ]
        # drop empty names (e.g. produced by the string "[]")
        cleaned = [item for item in cleaned if item]
        return cleaned or None

    # reset the index to use as unique id's for each ad as cargurus doesn't have unique id's
    # (this also returns a new frame, so the caller's frame is left untouched)
    ads_df = ads_df.reset_index().rename(columns={"index": "unique_id"})

    # pick the raw options column that belongs to each ad's source
    raw_options = ads_df["major_options"].where(
        ads_df["source"] == "cargurus",
        ads_df["features"].where(ads_df["source"] == "kijiji"),
    )
    ads_df["options_list"] = raw_options.map(_normalize_options)

    # one row per (ad, option); the index still identifies the ad
    exploded_options = ads_df["options_list"].explode()

    most_common_options = (
        exploded_options
        .value_counts()[:top_n_options]
        .index
        .to_list()
    )

    print(f"Keeping the top {top_n_options} options by count: {most_common_options}")

    kept_options = exploded_options[exploded_options.isin(most_common_options)]

    # Collapse the per-option rows back to one row of 0/1 indicators per ad.
    # Only the indicator columns are aggregated: summing the exploded ad frame
    # itself drops every non-numeric column and multiplies numeric columns
    # such as price by the number of options of the ad.
    option_indicators = (
        pd.get_dummies(kept_options, prefix="options_list")
        .groupby(level=0)
        .max()
        .astype(int)
        # ads without any of the common options get all zeros
        .reindex(ads_df.index, fill_value=0)
    )

    return pd.concat([ads_df, option_indicators], axis=1)

In [None]:
tmp_df = get_car_options(ads.df.copy(deep=True), top_n_options=10)

In [None]:
tmp_df.columns

In [None]:
tmp_df[["source", "features", "major_options", "options_list"]].tail()

In [None]:
# Plot how many ads carry each of the most common options.
# `ads.df` has no "options_list" column (get_car_options was given a copy), so
# the counts are taken from the one-hot "options_list_<option>" columns of
# `tmp_df`, the frame returned by get_car_options.
OPTION_PREFIX = "options_list_"

option_counts = (
    tmp_df.filter(like=OPTION_PREFIX)
    .sum()
    .sort_values(ascending=False)
    .head(30)
)
# strip the prefix so the axis shows the plain option names
option_counts.index = option_counts.index.str[len(OPTION_PREFIX):]

fig, ax = plt.subplots(figsize=(24, 12))
sns.barplot(x=option_counts.values, y=option_counts.index, ax=ax)

ax.set_xlabel("Number of Ads")
ax.set_ylabel("Options")
ax.set_title(f"Top {len(option_counts)} Options by Count", fontsize=23)

plt.show()