In [None]:
import os, sys

import pandas as pd
import numpy as np
import json
import plotly.express as px
import mlflow
from mlflow.models.signature import infer_signature
from dotenv import load_dotenv, find_dotenv
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate, cross_val_score
from xgboost import XGBRegressor


cur_dir = os.getcwd()
SRC_PATH = cur_dir[: cur_dir.index("fortunato-wheels-engine") + len("fortunato-wheels-engine")]
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

from src.data.car_ads import CarAds
from src.logs import get_logger

logger = get_logger(__name__)

AZURE_MLFLOW_URI = os.environ.get("AZURE_MLFLOW_URI")
mlflow.set_tracking_uri(AZURE_MLFLOW_URI)

sns.set_theme(style="whitegrid")
sns.set(rc={"figure.figsize": (8, 12)})
# set context to notebook
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
plt.rcParams["font.family"] = "sans serif"

%load_ext autoreload
%autoreload 2

## Load in current car adds

In [None]:
ads = CarAds()
# ads.get_car_ads(sources=["cargurus", "kijiji"])
ads.get_car_ads(data_dump=os.path.join(SRC_PATH, "data", "processed", "car-ads-dump_2023-07-18.csv"))

In [None]:
ads.df.info()

In [None]:
ads.df.describe()

In [None]:
ads.preprocess_ads()

In [None]:
def preprocess_ads_for_training(
        ads_df:pd.DataFrame,
        model_features = [
            "age_at_posting",
            "mileage_per_year",
            "make",
            "model",
            "wheel_system",
        ],
        min_num_ads = 1000,
        max_age_at_posting = 20,
        min_price = 1000,
        max_price = 250000,
    ):

    logger.info(f"Preprocessing ads for training, starting with {len(ads_df)} ads")

    if "model" not in model_features:
        model_features.append("model")

    if "price" not in model_features:
        model_features = model_features + ["price"]

    preprocessed_df = ads_df[model_features].copy()

    # remove models with less than min_num_ads
    model_counts = preprocessed_df["model"].value_counts()
    models_to_keep = model_counts[model_counts > min_num_ads].index
    preprocessed_df = preprocessed_df[preprocessed_df["model"].isin(models_to_keep)]

    # remove NaN models and "other"
    preprocessed_df = preprocessed_df[~preprocessed_df["model"].isna()]
    preprocessed_df = preprocessed_df[preprocessed_df["model"].str.lower() != "other"]

    # remove ads with prices outside of min_price and max_price
    preprocessed_df = preprocessed_df.query("price > @min_price & price < @max_price")

    if "age_at_posting" in model_features:
        # remove cars older than max_age_at_posting years
        preprocessed_df = preprocessed_df[preprocessed_df["age_at_posting"] <= max_age_at_posting]

    if "wheel_system" in model_features:
        # replace NaN wheel_system with "unknown"
        preprocessed_df["wheel_system"] = preprocessed_df["wheel_system"].fillna("unknown")

    if "mileage_per_year" in model_features:
        # where ads have an age_at_posting of zero set mileage_per_year to 0
        preprocessed_df.loc[preprocessed_df["age_at_posting"] == 0, "mileage_per_year"] = 0
        # drop any other mileage per year NaNs
        preprocessed_df = preprocessed_df[~preprocessed_df["mileage_per_year"].isna()]

    logger.info(f"Preprocessing ads for training, ending with {len(preprocessed_df)} ads")

    return preprocessed_df

In [None]:
model_features = [
    "age_at_posting",
    "mileage_per_year",
    "make",
    "model",
    "price",
    "wheel_system"
]

preprocessed_ads = preprocess_ads_for_training(ads.df, model_features=model_features)

train_df, test_df = train_test_split(
    preprocessed_ads,
    test_size=0.2,
    random_state=42,
    stratify=preprocessed_ads["model"],
)

# with features selected drop all with null values
train_df = train_df[model_features].dropna().reset_index(drop=True)
test_df = test_df[model_features].dropna().reset_index(drop=True)

X_train = train_df.drop(columns=["price"])
y_train = train_df["price"]
X_test = test_df.drop(columns=["price"])
y_test = test_df["price"]

In [None]:
# plot how many ads there are by the top make_name values
fig = px.histogram(
    # ads.loc[ads.make_name.isin(ads.make_name.value_counts().index[:15])],
    train_df.loc[train_df.model.isin(train_df.model.value_counts().index[:60])],
    x="model",
    title="Number of Ads by Model",
    color="model",
    labels={"model": "Model"},
    color_discrete_sequence=px.colors.qualitative.Dark24,
    height=500,
    category_orders={"model": train_df.model.value_counts().index[:60]}
)
fig.show()

In [None]:
# plot how many ads there are by the top 20 make values
fig = px.histogram(
    # ads.loc[ads.make_name.isin(ads.make_name.value_counts().index[:15])],
    train_df.loc[train_df.make.isin(train_df.make.value_counts().index[:20])],
    x="make",
    title="Number of Ads by Make",
    color="make",
    labels={"make": "Make"},
    color_discrete_sequence=px.colors.qualitative.Dark24,
    height=500,
    category_orders={"make": train_df.make.value_counts().index[:20]}
)
fig.show()

In [None]:
train_df.age_at_posting.value_counts()

In [None]:
train_df.age_at_posting.value_counts().sort_values(ascending=False)

In [None]:
# plot how many ads there are by the top 20 make values
fig = px.histogram(
    # ads.loc[ads.make_name.isin(ads.make_name.value_counts().index[:15])],
    train_df.loc[train_df.make.isin(train_df.make.value_counts().index[:20])],
    x="make",
    title="Number of Ads by Make",
    color="make",
    labels={"make": "Make"},
    color_discrete_sequence=px.colors.qualitative.Dark24,
    height=500,
    category_orders={"make": train_df.make.value_counts().index[:20]}
)
fig.show()

## Load and Fit model

In [None]:
# Load the model from file
model_path = os.path.join(SRC_PATH, "models", "job_sleepy_star_599wg0mt", "model.pkl")

with open(model_path, 'rb') as f:
    model = pickle.load(f)

# Fit model 
model.fit(X_train, y_train)


In [None]:
# Predict on test set
y_pred = model.predict(X_test)

# Score model (MAPE, RMSE, r2) 
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

In [None]:
rmse

In [None]:
# add predicted price to test_df, round to 1 decimal place

full_df = test_df.copy(deep=True)
full_df['predicted_price'] = y_pred.round(1)
full_df

In [None]:
fig = px.scatter(full_df, x='price', y='predicted_price', opacity=0.5)
fig.update_layout(
    xaxis_title="Actual Prices",
    yaxis_title="Predicted Prices",
    title="Scatter plot: Actual Prices vs. Predicted Prices",
    width=800,
    height=400,
)
fig.show()

In [None]:
fig = px.scatter(full_df.loc[full_df.make.isin(full_df.make.value_counts().index[:20])], 
                x='price',
                y='predicted_price', 
                color='make',
                opacity=0.5,
                facet_col='make',
                facet_col_wrap=5
                )
fig.update_layout(
    xaxis_title="Actual Prices",
    yaxis_title="Predicted Prices",
    title="Scatter plot: Actual Prices vs. Predicted Prices",
    width=800,
    height=400,
)
fig.show()

In [None]:
# get unique make model combinations
unique_make_models = full_df[['make', 'model']].drop_duplicates()
unique_makes = full_df['make'].unique()

mape_by_make_model = {}
rmse_by_make_model = {}

mape_by_make = {}
rmse_by_make= {}

# calcualte metrics for each make model combination
for make, model in unique_make_models.values:
    subset_data = full_df[(full_df['make'] == make) & (full_df['model'] == model)]
    y_actual_subset = subset_data['price']
    y_predicted_subset = subset_data['predicted_price']
        
    mape = np.mean(np.abs((y_actual_subset - y_predicted_subset) / y_actual_subset)) * 100 
    rmse = np.sqrt(mean_squared_error(y_actual_subset, y_predicted_subset))
        
    mape_by_make_model[(make, model)] = mape
    rmse_by_make_model[(make, model)] = rmse
    
# calcualte metrics for each make
for make in unique_makes:
    subset_data = full_df[full_df['make'] == make]
    y_actual_subset = subset_data['price']
    y_predicted_subset = subset_data['predicted_price']
        
    mape = np.mean(np.abs((y_actual_subset - y_predicted_subset) / y_actual_subset)) * 100 
    rmse = np.sqrt(mean_squared_error(y_actual_subset, y_predicted_subset))
     
    mape_by_make[make] = mape
    rmse_by_make[make] = rmse


# save results to dataframes
make_model_metrics = (
    pd.DataFrame({'MAPE': mape_by_make_model, 'RMSE': rmse_by_make_model})
    .reset_index()
    .rename(columns={'level_0': 'make', 'level_1': 'model'})
)

make_metrics = (
    pd.DataFrame({'MAPE': mape_by_make, 'RMSE': rmse_by_make})
    .reset_index()
    .rename(columns={'index': 'make'})
)

In [None]:

# plot MAPE and RMSE by make
fig_mape = px.bar(make_metrics,
                x='make', 
                y='MAPE',
                color='make',
                opacity=0.5,
                )
fig_mape.update_layout(title='MAPE by Make', 
                       xaxis_title='Make', 
                       yaxis_title='MAPE', 
                       font=dict(size=14),
                       xaxis={'categoryorder':'total descending'})

fig_rmse = px.bar(make_metrics,
                x='make', 
                y='RMSE',
                color='make',
                opacity=0.5,
                )
fig_rmse.update_layout(title='RMSE by Make', 
                       xaxis_title='Make', 
                       yaxis_title='RMSE', 
                       font=dict(size=14),
                       xaxis={'categoryorder':'total descending'})

fig_mape.show()
fig_rmse.show()

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

#create subplot with two rows
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.1)

# add MAPE bar chart to subplot
fig_mape = go.Bar(x=make_metrics['make'], y=make_metrics['MAPE'], name='MAPE')
fig.add_trace(fig_mape, row=1, col=1)

# add RMSE bar chart to subplot
fig_rmse = go.Bar(x=make_metrics['make'], y=make_metrics['RMSE'], name='RMSE')
fig.add_trace(fig_rmse, row=2, col=1)

# update layout
fig.update_layout(title='MAPE and RMSE by Make', 
                  xaxis_title='Make', 
                  font=dict(size=14))

# update y-axis titles
fig.update_yaxes(title_text='MAPE', row=1, col=1)
fig.update_yaxes(title_text='RMSE', row=2, col=1)

fig.show()

In [135]:
full_df['age_at_posting'].value_counts()

 0     262503
 3      74861
-1      56142
 1      56069
 2      46690
 4      32013
 5      25702
 6      21960
 7      18908
 8      16125
 9      14016
 10     11724
 12      9389
 11      9253
 13      8651
 14      6201
 15      5989
 16      4876
 17      3563
 18      2641
 19      2007
 20      1635
-2          4
Name: age_at_posting, dtype: int64

In [136]:
# create bins for age_at_posting column
bins = pd.IntervalIndex.from_tuples([(-2, -1), (-1, 1), (1, 3), (3, 5), (5, 8), (8, 10), (10, 15), (15, 20)])

# create age_bins column
full_df['age_bins'] = pd.cut(full_df['age_at_posting'], bins)

# print value counts for age_bins column
print(full_df['age_bins'].value_counts())

(-1, 1]     318572
(1, 3]      121551
(3, 5]       57715
(5, 8]       56993
(-2, -1]     56142
(10, 15]     39483
(8, 10]      25740
(15, 20]     14722
Name: age_bins, dtype: int64
