In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    FunctionTransformer,
    StandardScaler,
    OneHotEncoder
)
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

import bentoml

## Load data and split into train / test sets

In [2]:
# Read the raw listings and trim surrounding spaces from the model names.
data = pd.read_csv("../data/ford.csv").assign(
    model=lambda frame: frame["model"].str.strip(" ")
)

In [3]:
# Hold out 20% of the rows for testing (fixed seed) and give each split a
# fresh 0..n-1 index.
train_data, test_data = (
    split.reset_index(drop=True)
    for split in train_test_split(data, test_size=0.2, random_state=1234)
)

In [4]:
"""
Replace each model category with its relative frequency in the train data.
- Open question: will this mapping be saved and reused for serving?
"""

# Share of training rows taken by each model name, as a plain dict
# {model name -> relative frequency}.
model_frequencies = train_data["model"].value_counts() / len(train_data)
model_replace_rules = model_frequencies.to_dict()

# Show the mapping, one model per line.
for k, v in model_replace_rules.items():
    print(f"{k} = {v:0.4f}")

Fiesta = 0.3647
Focus = 0.2567
Kuga = 0.1242
EcoSport = 0.0626
C-MAX = 0.0298
Mondeo = 0.0294
Ka+ = 0.0290
B-MAX = 0.0195
S-MAX = 0.0164
Grand C-MAX = 0.0140
Galaxy = 0.0128
Edge = 0.0116
KA = 0.0104
Tourneo Custom = 0.0044
Puma = 0.0043
Grand Tourneo Connect = 0.0035
Mustang = 0.0034
Tourneo Connect = 0.0018
Fusion = 0.0010
Streetka = 0.0001
Ranger = 0.0001
Escort = 0.0001


In [5]:
"""
Fuel types other than "Petrol" and "Diesel" will be replaced with "Others".
"""

# Row count per fuel type in the training split.
train_data["fuelType"].value_counts()

Petrol      9722
Diesel      4628
Hybrid        19
Electric       2
Other          1
Name: fuelType, dtype: int64

In [6]:
"""
Keep all 3 of these transmission classes as they are.
"""

# Row count per transmission type in the training split.
train_data["transmission"].value_counts()

Manual       12392
Automatic     1108
Semi-Auto      872
Name: transmission, dtype: int64

## Transformations for each column

In [7]:
"""
Input: mileage, tax
Transform: log(1+x) => StandardScaler
"""

# Two-step numeric transform: apply log(1 + x) first, then standardize.
log_stdz_transf = Pipeline(
    steps=[
        ("Log1P", FunctionTransformer(func=np.log1p)),
        ("Scaler", StandardScaler()),
    ]
)

In [8]:
"""
Input: transmission
Transform: OneHotEncoder
- handle_unknown = 'ignore' (categories never observed during fitting are encoded as all zeros)
"""

# Dense one-hot encoding of the transmission column.
# NOTE(review): newer scikit-learn releases renamed `sparse` to
# `sparse_output` — confirm against the installed version before upgrading.
ts_transf = OneHotEncoder(sparse=False, handle_unknown="ignore")

In [9]:
"""
Input: fuelType
Transform: ReplaceMinorClass => OneHotEncoder
- ReplaceMinorClass: replace minor classes with "Others"
"""

def _replace_minor_fuel_type(x, **kwargs):
    major_types = kwargs["major_types"]
    x = pd.DataFrame(x, columns=["fuelType"]).copy()
    x[~x.fuelType.isin(major_types)] = "Others"
    return x.values


# Fuel type handling: collapse everything except Petrol/Diesel into "Others",
# then one-hot encode the resulting classes as a dense array.
fuel_type_transf = Pipeline(
    steps=[
        (
            "Replacer",
            FunctionTransformer(
                func=_replace_minor_fuel_type,
                kw_args=dict(major_types=["Petrol", "Diesel"]),
            ),
        ),
        ("OneHotEncoder", OneHotEncoder(sparse=False, handle_unknown="ignore")),
    ]
)

In [10]:
"""
Input: model
Transform: NumericConverter
- NumericConverter: convert model to its corresponding frequencies in train data
"""

def _convert_model_to_numeric(x, **kwargs):
    model_replace_rules = kwargs["model_replace_rules"]
    x = pd.DataFrame(x, columns=["model"]).copy()
    x[~x.model.isin(model_replace_rules.keys())] = 0.0
    for k, v in model_replace_rules.items():
        x[x == k] = v
    return x.values.astype("float")


# Wrap the frequency encoder as a transformer; the mapping computed from the
# train data is passed along as a keyword argument.
model_name_transf = FunctionTransformer(
    func=_convert_model_to_numeric,
    kw_args=dict(model_replace_rules=model_replace_rules),
)

In [11]:
"""
Input: year, mpg, engineSize
Transform: StandardScaler()
"""

# Scaler with default settings, applied to the year, mpg and engineSize
# columns when the ColumnTransformer is assembled.
stdz_transf = StandardScaler()

In [12]:
# Route each group of input columns to its transform; any column that is not
# listed here is dropped.
transformers = ColumnTransformer(
    transformers=[
        ("LogStandardize", log_stdz_transf, ["mileage", "tax"]),
        ("OneHot", ts_transf, ["transmission"]),
        ("ReplaceOneHot", fuel_type_transf, ["fuelType"]),
        ("StringToNumeric", model_name_transf, ["model"]),
        ("Standardize", stdz_transf, ["year", "mpg", "engineSize"]),
    ],
    n_jobs=1,
    remainder="drop",
)

## Build pipeline and fit regression model

In [13]:
# XGBoost regressor with 100 trees and a fixed random_state; max_depth is
# left as None.
model = XGBRegressor(n_estimators=100, max_depth=None, random_state=214)

In [14]:
# End-to-end estimator: feature transforms followed by the regressor.
# NOTE(review): the step names look misspelled; they are kept verbatim because
# anything that looks steps up by name would depend on them — confirm before
# renaming.
pipeline = Pipeline(
    steps=[
        ("FeatureEngineerring", transformers),
        ("Regressorm", model),
    ]
)

In [15]:
# Separate the feature columns from the `price` target for both splits.
X_train, y_train = train_data.drop(columns="price"), train_data["price"].values
X_test, y_test = test_data.drop(columns="price"), test_data["price"].values

In [16]:
# Fit the transforms and the regressor on the training split.
pipeline.fit(X_train, y_train)

## Train / test error metrics

In [17]:
# Predict from the feature frames (without the `price` target) and clip
# negative predictions to zero.
# `np.inf` is used because the `np.infty` alias was removed in NumPy 2.0.
pred_train = np.clip(pipeline.predict(X_train), a_min=0.0, a_max=np.inf)
pred_test = np.clip(pipeline.predict(X_test), a_min=0.0, a_max=np.inf)

In [18]:
def mean_absolute_percentile_error(y_true, y_pred):
    """Return the mean of ``|y_true - y_pred| / |y_true|`` as a fraction.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        Mean absolute relative error (multiply by 100 for a percentage).
        Entries whose ratio is undefined or infinite (for example a zero or
        missing target) are left out of the mean; NaN is returned when no
        entry is usable.
    """
    y_true = np.squeeze(np.asarray(y_true, dtype=float))
    y_pred = np.squeeze(np.asarray(y_pred, dtype=float))
    # Division by a zero target gives inf (or NaN for 0/0); silence the
    # warnings here and filter those entries out explicitly below.
    with np.errstate(divide="ignore", invalid="ignore"):
        ratios = np.abs((y_true - y_pred) / y_true)
    usable = np.isfinite(ratios)
    if not usable.any():
        return float("nan")
    return ratios[usable].mean()

In [19]:
# Report RMSE, MAE and MAPE for each split with one shared code path instead
# of a copy-pasted block per split; the printed text is unchanged.
for split_name, y_actual, y_predicted in (
    ("Train", y_train, pred_train),
    ("Test", y_test, pred_test),
):
    rmse = mean_squared_error(y_actual, y_predicted) ** (1 / 2.)
    mae = mean_absolute_error(y_actual, y_predicted)
    mape = mean_absolute_percentile_error(y_actual, y_predicted) * 100
    print(f"{split_name} RMSE (Euro): {rmse:0.2f}")
    print(f"{split_name} MAE (Euro): {mae:0.2f}")
    print(f"{split_name} MAPE (%): {mape:0.2f}")

Train RMSE (Euro): 867.03
Train MAE (Euro): 643.81
Train MAPE (%): 5.54
Test RMSE (Euro): 1216.98
Test MAE (Euro): 816.38
Test MAPE (%): 7.21


## Save bentoml model

In [20]:
# Store the fitted pipeline in the BentoML model store; the call is the
# cell's last expression so its return value is displayed.
# NOTE(review): the pipeline references helper functions defined in this
# notebook — confirm they can be resolved where the saved model is loaded.
bentoml.sklearn.save_model(name="ford_used_car_price", model=pipeline)

Model(tag="ford_used_car_price:lv5ywatoeowusasc", path="/opt/project/bentoml/models/ford_used_car_price/lv5ywatoeowusasc/")