In [2]:
import os

import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    FunctionTransformer,
    StandardScaler,
    OneHotEncoder
)
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

import bentoml

In [31]:
# Read the raw listings and strip surrounding spaces from the model names
# in the same expression, so `data` is never seen in its un-normalised form.
data = (
    pd.read_csv("../data/ford.csv")
    .assign(model=lambda frame: frame["model"].str.strip(" "))
)

In [41]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(
    data,
    random_state=1234,
    test_size=0.2,
)

In [50]:
# Frequency encoding table for the car model: each model name maps to its
# share of the rows in the training split.
# Open question left by the author: should this mapping be saved and reused for serving?
model_replace_rules = (
    train_data["model"].value_counts().div(len(train_data)).to_dict()
)

# Show the table with four decimals, one model per line.
for model_name, share in model_replace_rules.items():
    print(f"{model_name} = {share:0.4f}")

Fiesta = 0.3647
Focus = 0.2567
Kuga = 0.1242
EcoSport = 0.0626
C-MAX = 0.0298
Mondeo = 0.0294
Ka+ = 0.0290
B-MAX = 0.0195
S-MAX = 0.0164
Grand C-MAX = 0.0140
Galaxy = 0.0128
Edge = 0.0116
KA = 0.0104
Tourneo Custom = 0.0044
Puma = 0.0043
Grand Tourneo Connect = 0.0035
Mustang = 0.0034
Tourneo Connect = 0.0018
Fusion = 0.0010
Streetka = 0.0001
Ranger = 0.0001
Escort = 0.0001


In [53]:
# Class balance of the fuel type in the training split.
train_data["fuelType"].value_counts()

Petrol      9722
Diesel      4628
Hybrid        19
Electric       2
Other          1
Name: fuelType, dtype: int64

In [57]:
# Class balance of the transmission type in the training split.
train_data["transmission"].value_counts()

Manual       12392
Automatic     1108
Semi-Auto      872
Name: transmission, dtype: int64

In [56]:
# Feature: mileage
# Steps: log(1 + x), then standard scaling.
mileage_transformer = Pipeline(
    steps=[
        ("Log1P", FunctionTransformer(np.log1p)),
        ("Scaler", StandardScaler()),
    ]
)

In [58]:
'''
Input: transmission
Transform: OneHotEncode
- handle_unknown = 'ignore' => encode data never has been observed to zeros
'''

transmission_transformer = OneHotEncoder(handle_unknown="ignore", sparse=False)

In [None]:
# Feature: fuelType
# Steps: fold the rare fuel types into one bucket, then one-hot encode.
#
# Each SimpleImputer treats one rare fuel type as the "missing" value and
# rewrites it to the constant "other". The encoder therefore learns a single
# "other" category instead of one column per rare type; categories it has
# never seen at fit time are encoded as all zeros (handle_unknown="ignore").
#
# NOTE(review): the data also contains a capitalised "Other" fuel type (see the
# fuelType value counts), which remains a separate category from the lowercase
# "other" bucket used here — confirm whether the two should be merged.

# Defined in this cell so it runs on a fresh kernel (Restart & Run All).
DROP_FUEL_TYPE = ["Electric", "Hybrid"]

# Bound to a name like the sibling transformers; the original cell left a
# dangling one-element tuple because of a trailing comma.
fuel_type_transformer = Pipeline(
    [
        (
            f"inputer_{fuel_type}",
            SimpleImputer(
                missing_values=fuel_type,
                strategy="constant",
                fill_value="other"
            )
        )
        for fuel_type in DROP_FUEL_TYPE
    ] + [
        (
            "OneHotEncoder",
            OneHotEncoder(
                handle_unknown="ignore",
                sparse=False
            ),
        )
    ]
)

In [29]:
# Distinct model names in the full dataset, ignoring surrounding spaces.
data["model"].str.strip(" ").unique()

array(['Fiesta', 'Focus', 'Puma', 'Kuga', 'EcoSport', 'C-MAX', 'Mondeo',
       'Ka+', 'Tourneo Custom', 'S-MAX', 'B-MAX', 'Edge',
       'Tourneo Connect', 'Grand C-MAX', 'KA', 'Galaxy', 'Mustang',
       'Grand Tourneo Connect', 'Fusion', 'Ranger', 'Streetka', 'Escort',
       'Transit Tourneo'], dtype=object)

In [24]:
# Distinct transmission types in the full dataset.
data["transmission"].unique()

array(['Automatic', 'Manual', 'Semi-Auto'], dtype=object)

In [27]:
# Distinct fuel types in the full dataset.
data["fuelType"].unique()

array(['Petrol', 'Diesel', 'Hybrid', 'Electric', 'Other'], dtype=object)

In [None]:
# Column-wise feature engineering for the price model.
#
# Every entry is (name, transformer, columns); columns that are not listed are
# dropped (remainder="drop").

# Rare fuel types that are folded into the constant "other" before encoding.
# Defined in this cell so it runs on a fresh kernel (Restart & Run All).
DROP_FUEL_TYPE = ["Electric", "Hybrid"]

feature_eng_tfm = ColumnTransformer(
    [
        (
            "Mileage",
            # log(1 + x), then standard scaling
            Pipeline([
                ("Log1P", FunctionTransformer(np.log1p)),
                ("Scaler", StandardScaler())
            ]),
            ["mileage"]
        ),
        (
            "transmission",
            # Unseen categories are encoded as all zeros.
            OneHotEncoder(
                handle_unknown="ignore",
                sparse=False
            ),
            ["transmission"]
        ),
        (
            "fuelType",
            # One SimpleImputer per rare fuel type rewrites it to the constant
            # "other", so the encoder learns a single bucket for all of them.
            # NOTE(review): the data also has a capitalised "Other" fuel type,
            # which stays a separate category from "other" — confirm intended.
            Pipeline(
                [
                    (
                        f"inputer_{fuel_type}",
                        SimpleImputer(
                            missing_values=fuel_type,
                            strategy="constant",
                            fill_value="other"
                        )
                    )
                    for fuel_type in DROP_FUEL_TYPE
                ] + [
                    (
                        "OneHotEncoder",
                        OneHotEncoder(
                            handle_unknown="ignore",
                            sparse=False
                        ),
                    )
                ]
            ),
            ["fuelType"]
        ),
        (
            "model",
            # Frequency encoding: one SimpleImputer per model name replaces
            # that name with its share of the training rows. Uses the
            # `model_replace_rules` table computed from `train_data` in this
            # notebook, whose keys match the stripped model names.
            # NOTE(review): a model name missing from the table is passed
            # through unchanged (as a string) — confirm how unseen models
            # should be handled.
            Pipeline(
                [
                    (
                        f"inputer_{model_name}",
                        SimpleImputer(
                            missing_values=model_name,
                            strategy="constant",
                            fill_value=frequency
                        )
                    )
                    for model_name, frequency
                    in model_replace_rules.items()
                ]
            ),
            ["model"]
        ),
        (
            "year",
            Pipeline([
                ("Scaler", StandardScaler())
            ]),
            ["year"]
        ),
        (
            "engineSize",
            Pipeline([
                ("Scaler", StandardScaler())
            ]),
            ["engineSize"]
        ),
        (
            "tax",
            # log(1 + x), then standard scaling
            Pipeline([
                ("Log1P", FunctionTransformer(np.log1p)),
                ("Scaler", StandardScaler())
            ]),
            ["tax"]
        ),
        (
            "mpg",
            Pipeline([
                ("Scaler", StandardScaler())
            ]),
            ["mpg"]
        )
    ],
    remainder="drop"
)


In [None]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error

import bentoml

from datetime import datetime
import os

# Frequency-encoding table for the `model` column: each key is a model name and
# each value is the frequency it is replaced with. The values were calculated
# beforehand and are hard-coded here rather than derived in this script.
# Dict order is also the order of the generated imputer steps.
# NOTE(review): most keys carry a leading space — presumably matching the raw
# CSV values before any stripping; confirm against data/ford_train.csv.
# NOTE(review): 'Focus' (no leading space) and ' Focus' are separate keys with
# very different values — confirm this is intended and not a data-quality issue.
# NOTE(review): 'Other' maps to 0, but nothing visible here rewrites unknown
# model names to 'Other' first — confirm how unseen models are handled.
MODEL_REPLACE_RULES = {
    'Focus': 7.4e-05,
    ' Ranger': 7.4e-05,
    ' Transit Tourneo': 7.4e-05,
    ' Escort': 7.4e-05,
    ' Streetka': 0.000148,
    ' Fusion': 0.000891,
    ' Tourneo Connect': 0.002004,
    ' Mustang': 0.003414,
    ' Tourneo Custom': 0.003637,
    ' Grand Tourneo Connect': 0.003637,
    ' Puma': 0.004527,
    ' KA': 0.01091,
    ' Edge': 0.011578,
    ' Galaxy': 0.012097,
    ' Grand C-MAX': 0.012765,
    ' S-MAX': 0.016328,
    ' B-MAX': 0.018703,
    ' C-MAX': 0.029761,
    ' Ka+': 0.030058,
    ' Mondeo': 0.030503,
    ' EcoSport': 0.063307,
    ' Kuga': 0.124091,
    ' Focus': 0.254861,
    ' Fiesta': 0.366558,
    'Other': 0
}

# Fuel types that the fuelType pipeline rewrites to the constant "other"
# before one-hot encoding.
DROP_FUEL_TYPE = ["Electric", "Hybrid"]

if __name__ == "__main__":
    # Entry point: fit the price-prediction pipeline on the training CSV,
    # report MSE and MAE on the test CSV, then store the fitted pipeline in the
    # BentoML model store under the name "ford_price_predictor".

    # `__file__` is not defined when this code runs inside a notebook kernel,
    # so fall back to the current working directory in that case.
    if "__file__" in globals():
        BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    else:
        BASE_DIR = os.getcwd()

    TRAIN_PATH = os.path.join(BASE_DIR, "data/ford_train.csv")
    TEST_PATH = os.path.join(BASE_DIR, "data/ford_test.csv")

    ford_df = pd.read_csv(TRAIN_PATH)

    # Define the transformer pipeline for the dataframe.
    # Every entry is (name, transformer, columns); unlisted columns are dropped.
    # NOTE(review): `sparse=` on OneHotEncoder was renamed to `sparse_output=`
    # in scikit-learn 1.2 and removed in 1.4; kept as-is to match the
    # scikit-learn version this file was written for.
    feature_eng_tfm = ColumnTransformer(
        [
            (
                "Mileage",
                # log(1 + x), then standard scaling
                Pipeline([
                    ("Log1P", FunctionTransformer(np.log1p)),
                    ("Scaler", StandardScaler())
                ]),
                ["mileage"]
            ),
            (
                "transmission",
                # Categories unseen at fit time are encoded as all zeros.
                OneHotEncoder(
                    handle_unknown="ignore",
                    sparse=False
                ),
                ["transmission"]
            ),
            (
                "fuelType",
                # One SimpleImputer per entry of DROP_FUEL_TYPE rewrites that
                # fuel type to the constant "other", so the encoder learns a
                # single bucket for all of them.
                # NOTE(review): the dataset also has a capitalised "Other"
                # fuel type, which stays a separate category from "other" —
                # confirm whether the two should be merged.
                Pipeline(
                    [
                        (
                            f"inputer_{fuel_type}",
                            SimpleImputer(
                                missing_values=fuel_type,
                                strategy="constant",
                                fill_value="other"
                            )
                        )
                        for fuel_type in DROP_FUEL_TYPE
                    ] + [
                        (
                            "OneHotEncoder",
                            OneHotEncoder(
                                handle_unknown="ignore",
                                sparse=False
                            ),
                        )
                    ]
                ),
                ["fuelType"]
            ),
            (
                "model",
                # Frequency encoding: one SimpleImputer per key of
                # MODEL_REPLACE_RULES replaces that model name with its
                # precomputed frequency.
                # NOTE(review): a model name that is not a key of the table is
                # passed through unchanged — confirm how unseen models should
                # be handled.
                Pipeline(
                    [
                        (
                            f"inputer_{model_name}",
                            SimpleImputer(
                                missing_values=model_name,
                                strategy="constant",
                                fill_value=frequency
                            )
                        )
                        for model_name, frequency
                        in MODEL_REPLACE_RULES.items()
                    ]
                ),
                ["model"]
            ),
            (
                "year",
                Pipeline([
                    ("Scaler", StandardScaler())
                ]),
                ["year"]
            ),
            (
                "engineSize",
                Pipeline([
                    ("Scaler", StandardScaler())
                ]),
                ["engineSize"]
            ),
            (
                "tax",
                # log(1 + x), then standard scaling
                Pipeline([
                    ("Log1P", FunctionTransformer(np.log1p)),
                    ("Scaler", StandardScaler())
                ]),
                ["tax"]
            ),
            (
                "mpg",
                Pipeline([
                    ("Scaler", StandardScaler())
                ]),
                ["mpg"]
            )
        ],
        remainder="drop"
    )

    # Define the regressor model
    xgb_reg = XGBRegressor(
        max_depth=None,
        n_estimators=100,
        random_state=214,
    )

    ml_model = Pipeline([
        ("FeatureEngineering", feature_eng_tfm),
        ("Regressor", xgb_reg)
    ])

    # Train the model to predict the price of the car
    print(f"[{datetime.now()}] Training the model...")

    ml_model.fit(
        ford_df.drop(columns=["price"]),
        ford_df["price"]
    )

    # Evaluate on the test data. The target column is dropped before
    # predicting so the model input matches what was used for fitting.
    print(f"[{datetime.now()}] Calculating MSE on the test data...")
    ford_df_test = pd.read_csv(TEST_PATH)
    y_pred = ml_model.predict(ford_df_test.drop(columns=["price"]))
    # MSE
    mse = mean_squared_error(ford_df_test["price"], y_pred)
    print(f"[{datetime.now()}] MSE on the test data: {mse}")
    # MAE
    mae = mean_absolute_error(ford_df_test["price"], y_pred)
    print(f"[{datetime.now()}] MAE on the test data: {mae}")

    # Save the model to the BentoML model store; the log line now sits right
    # before the save instead of before the evaluation.
    print(f"[{datetime.now()}] Saving the model...")
    model_id = bentoml.sklearn.save_model(
        "ford_price_predictor",
        ml_model
    )

    # Report what was stored, followed by the pipeline itself.
    print(f"[{datetime.now()}] Model saved - {model_id}")
    print(f"[{datetime.now()}] Model - {ml_model}")