In [1]:
import pandas as pd
from feature_engine.datetime import DatetimeFeatures
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
import os
import pickle
import numpy as np
import warnings
import pandas as pd
from feature_engine.encoding import MeanEncoder, OneHotEncoder, OrdinalEncoder
from feature_engine.imputation import (
    ArbitraryNumberImputer,
    CategoricalImputer,
    MeanMedianImputer,
)
from feature_engine.selection import DropConstantFeatures
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sktime.transformations.series.summarize import WindowSummarizer
from feature_engine.imputation import DropMissingData
from feature_engine.selection import DropFeatures
from feature_engine.timeseries.forecasting import WindowFeatures, LagFeatures

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Column groups used to build the model inputs.
CATEGORICAL_COLUMNS = ["holiday_name"]
NUMERICAL_COLUMNS = [
    "holiday",
    "shutdown",
    "mini_shutdown",
    "shops_closed",
    "winter_school_holidays",
    "school_holidays",
    "blackout",
    "mov_change",
    "frankfurt_shutdown",
    # Calendar features derived from `date` are currently disabled:
    # "date_year",
    # "date_month",
    # "date_day_of_month",
    # "date_day_of_week",
    # "date_day_of_year",
    # "date_weekend",
]
DATE_COLUMNS = ["date"]

# NOTE: despite the plural name this is a single column name (a string), not a list.
TARGET_COLUMNS = "orders"

# Full list of input columns: numerical, then categorical, then date columns.
FEATURE_COLUMNS = NUMERICAL_COLUMNS + CATEGORICAL_COLUMNS + DATE_COLUMNS
# Backward-compatible alias for the original misspelled name, which other
# cells still reference. Prefer FEATURE_COLUMNS in new code.
FEATRURE_COLUMNS = FEATURE_COLUMNS

# Columns that identify one training group (one model is trained per group).
TRAIN_LEVEL = ["warehouse"]
# Directory where fitted models are pickled.
MODEL_PATH = "model_registry"

In [3]:
# Read the train and test CSVs, parsing `date` as datetime and tagging every
# row with the split it came from.
PATH = "./data/"
df_train = pd.read_csv(f"{PATH}train_new.csv", parse_dates=["date"]).assign(
    split="train"
)
df_test = pd.read_csv(f"{PATH}test_new.csv", parse_dates=["date"]).assign(
    split="test"
)

# Restrict the test frame to the train frame's columns, in the train frame's
# order. A train column that is absent from the test file raises a KeyError.
cols = df_train.columns
df_test = df_test.loc[:, cols]

# Stack both splits row-wise into a single frame.
df = pd.concat([df_train, df_test])

In [4]:
# Preprocessing pipeline applied to the feature matrix.
# NOTE: no estimator step is attached here; the XGBRegressor step is disabled:
#   ("xgb", XGBRegressor(n_estimators=100, random_state=42))
pipeline = Pipeline(
    steps=[
        # Fill gaps in the categorical columns using the "missing" method
        # (per the original note, gaps normally show up in the test data).
        (
            "missing_categoricals",
            CategoricalImputer(
                variables=CATEGORICAL_COLUMNS,
                imputation_method="missing",
            ),
        ),
        # Fill gaps in the numerical columns with 0.
        (
            "missing_numerical",
            ArbitraryNumberImputer(
                arbitrary_number=0,
                variables=NUMERICAL_COLUMNS,
            ),
        ),
        # One-hot encode the categorical columns, limited to 6 top categories.
        (
            "onehot_encoding",
            OneHotEncoder(
                variables=CATEGORICAL_COLUMNS,
                top_categories=6,
                ignore_format=True,
            ),
        ),
        # Remove constant features.
        ("drop_constant", DropConstantFeatures(tol=1)),
    ]
)

In [5]:
# Pipeline that derives window and lag features from the target column.
# The drop-target step is disabled:
#   ("drop_missing", DropFeatures(features_to_drop=[TARGET_COLUMNS]))
pipeline_lags = Pipeline(
    steps=[
        # Mean of the target over windows of 1, 7 and 365
        # (labelled day, week and year in the original notebook).
        (
            "lag_window",
            WindowFeatures(
                variables=[TARGET_COLUMNS],
                window=[1, 7, 365],
                functions=["mean"],
                freq="D",
            ),
        ),
        # Lagged copies of the target for periods 1, 7 and 365.
        # NOTE(review): no `freq` is passed here, unlike the window step above;
        # confirm both steps shift by the same unit when dates have gaps.
        (
            "lag_features",
            LagFeatures(
                variables=[TARGET_COLUMNS],
                periods=[1, 7, 365],
            ),
        ),
    ]
)

In [6]:
def train_model_core_recursive(
    df: pd.DataFrame,
    train_level: list,
    feature_columns: list,
    target_column: str,
    pipeline_lags: Pipeline,
    pipeline: Pipeline,
    model_dir: str = "model_registry",
) -> pd.DataFrame:
    """Train and persist the model for a single training level (one group).

    Lag/window features are generated from ``df`` with ``pipeline_lags``, rows
    for which any generated feature is missing are dropped, ``pipeline`` is
    fitted on the result and the fitted object is pickled into ``model_dir``.
    Any exception raised during these steps is caught, printed and reported
    in the ``error`` column of the returned frame instead of propagating.

    Args:
        df (pd.DataFrame): Data of a single group. Must contain the
            ``train_level`` columns, a ``date`` column, the feature columns
            and the target column, and must have at least one row.
        train_level (list): Training level columns, e.g.
            ``['warehouse', 'item_class']``.
        feature_columns (list): Feature columns passed to ``pipeline``
            (the generated lag/window columns are appended automatically).
        target_column (str): Target column; must be a single string.
        pipeline_lags (Pipeline): Pipeline that adds lag/window features. It is
            fitted on ``df`` indexed by ``date``.
        pipeline (Pipeline): Pipeline fitted on the features and target. The
            passed object itself is fitted (no clone is made), so it is
            mutated by this call.
        model_dir (str, optional): Directory for the model pickle files.
            Created if missing. Defaults to "model_registry".

    Returns:
        pd.DataFrame: A single-row dataframe with the train level values, the
        joined train level name (``train_level``), the pickle path
        (``model_path``, empty on failure) and the error message (``error``,
        empty on success).
    """
    # Read outside the try block on purpose: without the level values there is
    # nothing meaningful to report, so a failure here is allowed to propagate.
    train_level_value = df[train_level].iloc[0, :].to_list()

    error = ""
    try:
        # The lag pipeline needs the dates as index; restore the column afterwards.
        lagged = pipeline_lags.fit_transform(df.set_index("date")).reset_index()

        # Columns created by the lag pipeline; rows where any of them is
        # missing (e.g. not enough history) are dropped.
        new_cols = [col for col in lagged.columns if col not in df.columns]
        lagged = lagged.dropna(subset=new_cols, how="any")

        X = lagged[feature_columns + new_cols]
        y = lagged[target_column]

        model = pipeline.fit(X, y)

        # Save the fitted model to the model registry.
        os.makedirs(model_dir, exist_ok=True)
        model_string = "_".join(
            f"{a}_{b}" for a, b in zip(train_level, train_level_value)
        )
        model_path = os.path.join(model_dir, f"model_{model_string}.pkl")
        with open(model_path, "wb") as f:
            pickle.dump(model, f)

    except Exception as exc:
        # Python unbinds the `as` name when the handler exits, so the message
        # has to be copied into a separate variable to be usable below.
        print(exc)
        error = str(exc)
        model_path = ""

    train_df = pd.DataFrame(
        [train_level_value + ["_".join(train_level)] + [model_path] + [error]],
        columns=[*train_level, "train_level", "model_path", "error"],
    )

    return train_df

In [7]:
# Pick one warehouse for a single-group trial run: the entry at position 6 of
# the distinct warehouse values (listed in order of first appearance).
warehouse = df_train["warehouse"].unique()[6]

# Training rows of that warehouse, then only those that have a target value.
_df = df_train.query(f"warehouse== '{warehouse}'")
_df = _df.query(f"not {TARGET_COLUMNS}.isnull()")

# Test rows of the same warehouse.
_df2 = df_test.query(f"warehouse== '{warehouse}'")

In [8]:
# Trial run: train and persist the model for the single selected warehouse.
# The returned one-row summary frame is displayed as the cell output.
train_model_core_recursive(
    df=_df,
    train_level=TRAIN_LEVEL,
    feature_columns=FEATRURE_COLUMNS,
    target_column=TARGET_COLUMNS,
    pipeline_lags=pipeline_lags,
    pipeline=pipeline,
    model_dir=MODEL_PATH,
)

Unnamed: 0,warehouse,train_level,model_path,error
0,Budapest_1,warehouse,model_registry\model_warehouse_Budapest_1.pkl,


In [10]:
# Train one model per training level (warehouse) on the rows that have a target.
# The groups are iterated explicitly instead of using
# `groupby(...).apply(..., include_groups=True)`: the training function needs
# the grouping column inside each group, which `apply` only provides through
# the deprecated `include_groups=True` behaviour. Each group yielded by the
# iteration always contains the grouping columns.
train_results = [
    train_model_core_recursive(
        group,
        TRAIN_LEVEL,
        FEATRURE_COLUMNS,
        TARGET_COLUMNS,
        pipeline_lags,
        pipeline,
        model_dir=MODEL_PATH,
    )
    for _, group in df_train.query(f"not {TARGET_COLUMNS}.isnull()").groupby(
        TRAIN_LEVEL
    )
]

# One summary row per group; an empty frame when there was nothing to train on.
pd.concat(train_results, ignore_index=True) if train_results else pd.DataFrame()

Unnamed: 0,warehouse,train_level,model_path,error
0,Brno_1,warehouse,model_registry\model_warehouse_Brno_1.pkl,
1,Budapest_1,warehouse,model_registry\model_warehouse_Budapest_1.pkl,
2,Frankfurt_1,warehouse,model_registry\model_warehouse_Frankfurt_1.pkl,
3,Munich_1,warehouse,model_registry\model_warehouse_Munich_1.pkl,
4,Prague_1,warehouse,model_registry\model_warehouse_Prague_1.pkl,
5,Prague_2,warehouse,model_registry\model_warehouse_Prague_2.pkl,
6,Prague_3,warehouse,model_registry\model_warehouse_Prague_3.pkl,
