In [46]:
import os
import pickle
import numpy as np
import pandas as pd
import warnings

from feature_engine.encoding import OneHotEncoder
from feature_engine.imputation import (
    ArbitraryNumberImputer,
    CategoricalImputer,
)
from feature_engine.selection import DropConstantFeatures
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [47]:
# Column groups used to build the model inputs.
CATEGORICAL_COLUMNS = ["holiday_name"]
NUMERICAL_COLUMNS = [
    "holiday",
    "shutdown",
    "mini_shutdown",
    "shops_closed",
    "winter_school_holidays",
    "school_holidays",
    "blackout",
    "mov_change",
    "frankfurt_shutdown",
    "year",
    "month",
    "day_of_month",
    "day_of_week",
    "day_of_year",
    "weekend",
]
DATE_COLUMNS = ["date"]

# Model inputs: numerical columns followed by categorical ones.
# DATE_COLUMNS is deliberately not part of the feature set.
FEATURE_COLUMNS = NUMERICAL_COLUMNS + CATEGORICAL_COLUMNS
# Backward-compatible alias for the original (misspelled) name, which the
# rest of the notebook still references.
FEATRURE_COLUMNS = FEATURE_COLUMNS

# Name of the single target column (a plain string, despite the plural name).
TARGET_COLUMNS = "orders"
# Column(s) that define the slice for which one model is trained.
TRAIN_LEVEL = ["warehouse"]
# Directory where the pickled models are written and read back from.
MODEL_PATH = "model_registry"

# 1. Preprocessing


In [48]:
# Read the train and test CSV files, parsing the `date` column as datetimes.
PATH = "../data/"
df_train_new = pd.read_csv(PATH + "train_new.csv", parse_dates=["date"])
df_test_new = pd.read_csv(PATH + "test_new.csv", parse_dates=["date"])


In [49]:
# Keep only the training rows whose `missing` flag is 0,
# printing the frame shape before and after the filter.
print(df_train_new.shape)
df_train_new = df_train_new[df_train_new["missing"] == 0]
print(df_train_new.shape)

(7710, 23)
(7340, 23)


In [50]:
# Keep only the test rows whose `missing` flag is 0,
# printing the frame shape before and after the filter.
print(df_test_new.shape)
df_test_new = df_test_new[df_test_new["missing"] == 0]
print(df_test_new.shape)

(427, 23)
(397, 23)


# 3. Model Training


In [51]:
# Preprocessing steps followed by the regressor; steps run in the order listed.
pipeline = Pipeline(
    steps=[
        # 1) Fill gaps in the categorical columns using the imputer's
        #    "missing" method.
        (
            "missing_categoricals",
            CategoricalImputer(
                variables=CATEGORICAL_COLUMNS,
                imputation_method="missing",
            ),
        ),
        # 2) Fill gaps in the numerical columns with the constant 0.
        (
            "missing_numerical",
            ArbitraryNumberImputer(
                arbitrary_number=0,
                variables=NUMERICAL_COLUMNS,
            ),
        ),
        # 3) One-hot encode the categorical columns (top_categories=6).
        (
            "onehot_encoding",
            OneHotEncoder(
                variables=CATEGORICAL_COLUMNS,
                top_categories=6,
                ignore_format=True,
            ),
        ),
        # 4) Remove features flagged as constant by the selector (tol=1).
        ("drop_constant", DropConstantFeatures(tol=1)),
        # 5) Gradient-boosted regressor with a fixed seed.
        ("xgb", XGBRegressor(n_estimators=100, random_state=42)),
    ]
)

In [52]:
def train_model_core(
    df: pd.DataFrame,
    train_level: list,
    feature_columns: list,
    target_column: str,
    pipeline: "Pipeline",
    model_dir: str = "model_registry",
) -> pd.DataFrame:
    """Fit the pipeline on a single training-level slice and pickle the result.

    The level values are read from the first row of ``df``, so ``df`` is
    expected to hold exactly one level (e.g. one warehouse) and at least one
    row.

    Any exception raised while fitting or saving is caught, printed, and
    reported in the ``error`` column instead of being propagated, so a batch
    run over many levels is not aborted by one failing level.

    Note:
        ``pipeline.fit`` is called directly on the object passed in, so the
        caller's pipeline is fitted by this call.

    Args:
        df (pd.DataFrame): A dataframe including level columns, features, and the target column.
        train_level (list): Training level columns, e.g. ``['warehouse', 'item_class']``.
        feature_columns (list): Feature columns.
        target_column (str): Target column; must be a single string.
        pipeline (Pipeline): Estimator exposing ``fit(X, y)``; the value it returns is pickled.
        model_dir (str, optional): Directory for saving the model pickle files.
            Created if it does not exist. Defaults to "model_registry".

    Returns:
        pd.DataFrame: A single-row dataframe with the level values, the joined
        level name (``train_level``), the saved ``model_path`` (empty string if
        nothing was saved) and ``error`` (empty string on success, otherwise
        ``"<ExceptionType>: <message>"``).
    """
    error = ""
    model_path = ""
    train_level_value = df[train_level].iloc[0, :].to_list()

    try:
        X = df[feature_columns]
        y = df[target_column]

        model = pipeline.fit(X, y)

        # Save the fitted model to the model registry.
        os.makedirs(model_dir, exist_ok=True)
        model_string = "_".join(
            f"{a}_{b}" for a, b in zip(train_level, train_level_value)
        )
        candidate_path = os.path.join(model_dir, f"model_{model_string}.pkl")
        with open(candidate_path, "wb") as f:
            pickle.dump(model, f)
        # Only report a path once the pickle has actually been written.
        model_path = candidate_path

    except Exception as exc:
        # The name bound by `except ... as` is deleted when the handler exits,
        # so the message has to be copied into a separate variable here.
        error = f"{type(exc).__name__}: {exc}"
        print(error)

    train_df = pd.DataFrame(
        [train_level_value + ["_".join(train_level)] + [model_path] + [error]],
        columns=[*train_level, "train_level", "model_path", "error"],
    )

    return train_df

In [53]:
def score_model_core(
    df: pd.DataFrame,
    train_level: list,
    feature_columns: list,
    target_column: str,
    model_dir: str = "model_registry",
) -> pd.DataFrame:
    """Load the pickled model for one training-level slice and predict with it.

    The level values are read from the first row of ``df``, so ``df`` is
    expected to hold exactly one level (e.g. one warehouse) and at least one
    row. The input dataframe is copied and never modified.

    Any exception raised while loading the model or predicting is caught and
    reported in the ``error`` column; in that case ``forecast`` and
    ``model_path`` are filled with NaN.

    Args:
        df (pd.DataFrame): A dataframe including level columns and features.
        train_level (list): Training level columns, e.g. ``['warehouse', 'item_class']``.
        feature_columns (list): Feature columns.
        target_column (str): Target column name. Kept for signature
            compatibility; it is not read, so the scoring data does not need
            to contain it.
        model_dir (str, optional): Directory holding the model pickle files.
            Defaults to "model_registry".

    Returns:
        pd.DataFrame: A copy of the input with ``forecast``, ``model_path`` and
        ``error`` columns appended (``error`` is an empty string on success,
        otherwise ``"<ExceptionType>: <message>"``).
    """
    error = ""
    df = df.copy()
    train_level_value = df[train_level].iloc[0, :].to_list()

    try:
        X = df[feature_columns]

        model_string = "_".join(
            f"{a}_{b}" for a, b in zip(train_level, train_level_value)
        )
        model_path = os.path.join(model_dir, f"model_{model_string}.pkl")
        # SECURITY: unpickling can execute arbitrary code; only load model
        # files from a registry directory you trust.
        with open(model_path, "rb") as f:
            model = pickle.load(f)

        forecast = model.predict(X)

    except Exception as exc:
        # The name bound by `except ... as` is deleted when the handler exits,
        # so the message has to be copied into a separate variable here.
        error = f"{type(exc).__name__}: {exc}"
        forecast = np.nan
        model_path = np.nan

    df["forecast"] = forecast
    df["model_path"] = model_path
    df["error"] = error
    return df

## 3.1. Single Model Testing


In [54]:
# Slice one warehouse out of the train and test frames for a quick check
# of the train/score functions.
warehouse = "Prague_1"
_df = df_train_new[df_train_new["warehouse"] == warehouse]
_df2 = df_test_new[df_test_new["warehouse"] == warehouse]

In [55]:
# Train on the single-warehouse slice; the returned one-row report is the cell output.
train_model_core(
    df=_df,
    train_level=TRAIN_LEVEL,
    feature_columns=FEATRURE_COLUMNS,
    target_column=TARGET_COLUMNS,
    pipeline=pipeline,
    model_dir=MODEL_PATH,
)

Unnamed: 0,warehouse,train_level,model_path,error
0,Prague_1,warehouse,model_registry\model_warehouse_Prague_1.pkl,


In [56]:
# Count missing values per column in the single-warehouse test slice.
_df2.isna().sum()

date                       0
warehouse                  0
orders                    61
holiday_name               0
holiday                    0
shutdown                   0
mini_shutdown              0
shops_closed               0
winter_school_holidays     0
school_holidays            0
blackout                   0
mov_change                 0
frankfurt_shutdown         0
precipitation              0
snow                       0
id                         0
missing                    0
year                       0
month                      0
day_of_month               0
day_of_week                0
day_of_year                0
weekend                    0
dtype: int64

In [57]:
# Score the single-warehouse test slice with the model saved for that warehouse.
score_model_core(
    df=_df2,
    train_level=TRAIN_LEVEL,
    feature_columns=FEATRURE_COLUMNS,
    target_column=TARGET_COLUMNS,
    model_dir=MODEL_PATH,
)

Unnamed: 0,date,warehouse,orders,holiday_name,holiday,shutdown,mini_shutdown,shops_closed,winter_school_holidays,school_holidays,...,missing,year,month,day_of_month,day_of_week,day_of_year,weekend,forecast,model_path,error
0,2024-03-16,Prague_1,,no_holiday,0,0,0,0,0,0,...,0,2024,3,16,5,76,1,10271.309570,model_registry\model_warehouse_Prague_1.pkl,
1,2024-03-17,Prague_1,,no_holiday,0,0,0,0,0,0,...,0,2024,3,17,6,77,1,10328.383789,model_registry\model_warehouse_Prague_1.pkl,
2,2024-03-18,Prague_1,,no_holiday,0,0,0,0,0,0,...,0,2024,3,18,0,78,0,9720.911133,model_registry\model_warehouse_Prague_1.pkl,
3,2024-03-19,Prague_1,,no_holiday,0,0,0,0,0,0,...,0,2024,3,19,1,79,0,9441.130859,model_registry\model_warehouse_Prague_1.pkl,
4,2024-03-20,Prague_1,,no_holiday,0,0,0,0,0,0,...,0,2024,3,20,2,80,0,9291.542969,model_registry\model_warehouse_Prague_1.pkl,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,2024-05-11,Prague_1,,no_holiday,0,0,0,0,0,0,...,0,2024,5,11,5,132,1,10292.479492,model_registry\model_warehouse_Prague_1.pkl,
57,2024-05-12,Prague_1,,no_holiday,0,0,0,0,0,0,...,0,2024,5,12,6,133,1,10035.849609,model_registry\model_warehouse_Prague_1.pkl,
58,2024-05-13,Prague_1,,no_holiday,0,0,0,0,0,0,...,0,2024,5,13,0,134,0,9903.984375,model_registry\model_warehouse_Prague_1.pkl,
59,2024-05-14,Prague_1,,no_holiday,0,0,0,0,0,0,...,0,2024,5,14,1,135,0,9321.391602,model_registry\model_warehouse_Prague_1.pkl,


## 3.2. Batch Model Training


In [58]:
# Train one model per TRAIN_LEVEL group and stack the one-row reports.
# The groups are iterated explicitly instead of using
# `groupby(...).apply(..., include_groups=True)`: that flag is deprecated in
# recent pandas, and train_model_core needs the grouping column(s) to be
# present in each group's frame.
pd.concat(
    [
        train_model_core(
            group_df,
            TRAIN_LEVEL,
            FEATRURE_COLUMNS,
            TARGET_COLUMNS,
            pipeline,
            model_dir=MODEL_PATH,
        )
        for _, group_df in df_train_new.groupby(TRAIN_LEVEL)
    ],
    ignore_index=True,
)

Unnamed: 0,warehouse,train_level,model_path,error
0,Brno_1,warehouse,model_registry\model_warehouse_Brno_1.pkl,
1,Budapest_1,warehouse,model_registry\model_warehouse_Budapest_1.pkl,
2,Frankfurt_1,warehouse,model_registry\model_warehouse_Frankfurt_1.pkl,
3,Munich_1,warehouse,model_registry\model_warehouse_Munich_1.pkl,
4,Prague_1,warehouse,model_registry\model_warehouse_Prague_1.pkl,
5,Prague_2,warehouse,model_registry\model_warehouse_Prague_2.pkl,
6,Prague_3,warehouse,model_registry\model_warehouse_Prague_3.pkl,


In [59]:
# Score every TRAIN_LEVEL group with its own saved model and stack the results.
# The groups are iterated explicitly instead of using
# `groupby(...).apply(..., include_groups=True)`: that flag is deprecated in
# recent pandas, and score_model_core needs the grouping column(s) to be
# present in each group's frame.
pd.concat(
    [
        score_model_core(
            group_df,
            TRAIN_LEVEL,
            FEATRURE_COLUMNS,
            TARGET_COLUMNS,
            model_dir=MODEL_PATH,
        )
        for _, group_df in df_test_new.groupby(TRAIN_LEVEL)
    ],
    ignore_index=True,
)

Unnamed: 0,date,warehouse,orders,holiday_name,holiday,shutdown,mini_shutdown,shops_closed,winter_school_holidays,school_holidays,...,missing,year,month,day_of_month,day_of_week,day_of_year,weekend,forecast,model_path,error
0,2024-03-16,Brno_1,,no_holiday,0,0,0,0,0,0,...,0,2024,3,16,5,76,1,9377.536133,model_registry\model_warehouse_Brno_1.pkl,
1,2024-03-17,Brno_1,,no_holiday,0,0,0,0,0,0,...,0,2024,3,17,6,77,1,8303.705078,model_registry\model_warehouse_Brno_1.pkl,
2,2024-03-18,Brno_1,,no_holiday,0,0,0,0,0,0,...,0,2024,3,18,0,78,0,8439.435547,model_registry\model_warehouse_Brno_1.pkl,
3,2024-03-19,Brno_1,,no_holiday,0,0,0,0,0,0,...,0,2024,3,19,1,79,0,8443.680664,model_registry\model_warehouse_Brno_1.pkl,
4,2024-03-20,Brno_1,,no_holiday,0,0,0,0,0,0,...,0,2024,3,20,2,80,0,8422.693359,model_registry\model_warehouse_Brno_1.pkl,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392,2024-05-11,Prague_3,,no_holiday,0,0,0,0,0,0,...,0,2024,5,11,5,132,1,5515.541016,model_registry\model_warehouse_Prague_3.pkl,
393,2024-05-12,Prague_3,,no_holiday,0,0,0,0,0,0,...,0,2024,5,12,6,133,1,5364.634766,model_registry\model_warehouse_Prague_3.pkl,
394,2024-05-13,Prague_3,,no_holiday,0,0,0,0,0,0,...,0,2024,5,13,0,134,0,5219.994141,model_registry\model_warehouse_Prague_3.pkl,
395,2024-05-14,Prague_3,,no_holiday,0,0,0,0,0,0,...,0,2024,5,14,1,135,0,5173.205566,model_registry\model_warehouse_Prague_3.pkl,
