In [0]:
%pip install mlflow
# NOTE(review): mlflow is installed without a version pin, so the resolved version may differ between runs — consider pinning.
# Load the IPython autoreload extension and switch it to mode 2.
# NOTE(review): the next cell restarts the Python process; presumably that discards this extension — confirm whether it has to be loaded again after the restart.
%load_ext autoreload
%autoreload 2

In [0]:
# Restart the notebook's Python process (presumably so the package installed by %pip in the first cell is picked up — confirm).
dbutils.library.restartPython()

In [0]:
import sys
# Show the raw version tuple, then the dotted "major.minor.micro" form.
print(sys.version_info)
print("Python version: " + ".".join(map(str, sys.version_info[:3])))

In [0]:
# Notebook parameters exposed as Databricks widgets.
dbutils.widgets.dropdown(
    name="env_stage",
    defaultValue="dev",
    choices=["dev", "prod"],
    label="Pipeline stage",
)
dbutils.widgets.dropdown(
    name="exclude_pms",
    defaultValue="False",
    choices=["True", "False"],
    label="Exclude PMS",
)
dbutils.widgets.dropdown(
    name="target_type",
    defaultValue="REVENUE",
    choices=["REVENUE", "ROOMS"],
    label="Target Type",
)
dbutils.widgets.dropdown(
    name="is_usd_currency",
    defaultValue="True",
    choices=["True", "False"],
    label="Use USD currency",
)
# Free-text parameters: hotel selection and the comma-separated lag numbers.
dbutils.widgets.text(name="selected_hotels", defaultValue="", label="Hotels")
dbutils.widgets.text(name="lag_numbers", defaultValue="1,7,14,28", label="Lag Numbers")

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import *
from pyspark.sql.types import *
import os
from autogluon.core.utils.loaders import load_pkl
import logging
import shutil
import mlflow
from mlflow import MlflowException
import mlflow.pyfunc
import time
import warnings

# Suppress every warning category for the rest of the run, then start the run timer.
warnings.filterwarnings(action="ignore")
start_time = time.perf_counter()

In [0]:
# Pipeline stage selected through the "env_stage" widget.
ENV = getArgument("env_stage")

# NOTE(review): hard-coded, user-specific repo checkout — confirm whether this should come from configuration instead.
REPOPATH = "/Workspace/Repos/manik@surge.global/phg-data-mlsys/src"
cluster_name = spark.conf.get("spark.databricks.clusterUsageTags.clusterName")

# In the dev stage on a cluster whose name contains "dev", make the repo checkout importable.
if (ENV == "dev") and ("dev" in cluster_name):
    print(f"Loading phgml package from repo {REPOPATH}")
    # Guard keeps the cell idempotent: re-running it must not add duplicate sys.path entries.
    if os.path.abspath(REPOPATH) not in sys.path:
        sys.path.append(os.path.abspath(REPOPATH))

In [0]:
# from phgml.models.model_wrapper import ModelWrapper
from phgml.models.model_strategy import StrategyLGBM, StrategyAG
from phgml.data.processing_distr_ca import remove_padded_cols
from phgml.reporting.output_metrics import *
from phgml.reporting.report_results import get_output_df, interpolated_fill # , correct_prediction_list
from phgml.data.data_types import inference_output_schema
from phgml.reporting.logging import get_dbx_logger
from phgml.data.config import ForecastingHotelConfigProvider,EnvironmentConfig
from phgml.utilities.task_utilities import str_to_bool, str_to_lst

In [0]:
from abc import ABC, abstractmethod
import pandas as pd
import numpy as np
import numpy.typing as npt
from typing import Optional, Tuple, Union, List, Dict, Any, Callable
from lightgbm import LGBMRegressor
from autogluon.tabular import TabularPredictor

# Export list carried over with the strategy code in this cell.
# NOTE(review): "StrategyLGBMFarField" is listed but not defined in this cell — confirm it exists elsewhere.
__all__ = ["BaseStrategy", "StrategyLGBM", "StrategyAG", "StrategyLGBMFarField"]


class BaseStrategy(ABC):
    """Shared state for per-horizon quantile forecasting strategies.

    Subclasses provide ``_fit`` and ``_predict``; this base only records the
    configuration they work from.
    """

    model_type = "base_strategy"

    def __init__(
        self,
        day_ahead: int,
        quantile_levels: list,
        cd_axis_targets: list,
        path: str,
        is_auto_reg: bool,
    ):
        """Store the strategy configuration.

        Args:
            day_ahead: Horizon this strategy instance is responsible for.
            quantile_levels: Quantile levels to model.
            cd_axis_targets: Target column names; must contain at least one
                entry because the prefix is taken from the first one.
            path: Path value kept for subclasses that persist artefacts.
            is_auto_reg: Stored as ``autoregressive_predictions``.
        """
        # Settings that are the same for every strategy.
        self.objective = "quantile"
        self.sub_predictors: Dict[str, Dict[str, Any]] = {}

        # Caller-supplied configuration.
        self.day_ahead = day_ahead
        self.quantile_levels = quantile_levels
        self.path = path
        self.autoregressive_predictions = is_auto_reg
        self.cd_axis_targets = cd_axis_targets

        # First two characters of the first target name.
        self.target_prefix = cd_axis_targets[0][:2]

    @abstractmethod
    def _fit(self, train_data: pd.DataFrame) -> None:
        """Fit the strategy's sub-predictors on ``train_data``."""

    @abstractmethod
    def _predict(self, test_data: pd.Series) -> Dict[float, List[npt.NDArray]]:
        """Return predictions for ``test_data`` keyed by quantile level."""


class StrategyLGBM(BaseStrategy):
    """Quantile strategy backed by LightGBM.

    One ``LGBMRegressor`` is fitted per target column and per quantile level.
    """

    model_type = "LGBM"

    def __init__(
        self,
        day_ahead: int,
        quantile_levels: list,
        cd_axis_targets: list,
        path: str,
        is_auto_reg: bool,
        n_jobs: int = 1,
    ):
        """Store the configuration.

        Args:
            day_ahead: Forwarded to ``BaseStrategy``.
            quantile_levels: Forwarded to ``BaseStrategy``.
            cd_axis_targets: Forwarded to ``BaseStrategy``.
            path: Forwarded to ``BaseStrategy``.
            is_auto_reg: Forwarded to ``BaseStrategy``.
            n_jobs: ``n_jobs`` value handed to every ``LGBMRegressor``.
        """
        super().__init__(
            day_ahead=day_ahead,
            quantile_levels=quantile_levels,
            cd_axis_targets=cd_axis_targets,
            path=path,
            is_auto_reg=is_auto_reg,
        )
        # ``_fit`` reads ``self.n_jobs``; without this assignment it raises AttributeError.
        self.n_jobs = n_jobs

    def _fit(self, train_data: pd.DataFrame) -> None:
        """Fit one regressor per target column and quantile level.

        Args:
            train_data: Frame containing every column in ``cd_axis_targets``
                plus the feature columns.
        """
        # Remembered so ``_predict`` can cast inference rows to the training dtypes.
        self.target_feature_dtypes = dict(train_data.dtypes)

        for index, target_ in enumerate(self.cd_axis_targets):
            # Autoregressive mode keeps the targets that precede the current one as
            # features; otherwise every target column is dropped from the features.
            if self.autoregressive_predictions:
                drop_cols = self.cd_axis_targets[index:]
            else:
                drop_cols = self.cd_axis_targets

            x_data = train_data.drop(drop_cols, axis=1)
            y_data = train_data[[target_]]

            print(
                "\t\ttarget: ",
                target_,
                " x_data_cols:",
                [col for col in x_data.columns if self.target_prefix in col][:6],
                " y_data_cols:",
                list(y_data.columns),
            )

            reg_objs = {}
            for qtile in self.quantile_levels:
                sub_predictor = LGBMRegressor(
                    objective=self.objective, alpha=qtile, verbose=-1, n_jobs=self.n_jobs
                )
                sub_predictor.fit(x_data, y_data)
                reg_objs[qtile] = sub_predictor

            self.sub_predictors[target_] = {
                "predictors": reg_objs,
                "targets": list(y_data.columns),
                "features": list(x_data.columns),
            }

    def _predict(self, test_data: pd.Series) -> Dict[float, List[npt.NDArray]]:
        """Predict every target for a single row.

        Args:
            test_data: One row of features as a Series.

        Returns:
            Mapping from quantile level to a NumPy array holding the predicted
            ``cd_axis_targets`` columns for that row.
        """
        # Cast to the training dtypes; columns containing "_tgt" are left out of
        # the cast because they are not part of the incoming row.
        needed_dtypes = {
            col: col_dtype
            for col, col_dtype in self.target_feature_dtypes.items()
            if "_tgt" not in col
        }
        test_data_cpy = (
            test_data.to_frame().T.reset_index(drop=True).copy().astype(needed_dtypes)
        )

        # One working frame per quantile; predictions are added as new columns.
        data = {qtile: test_data_cpy.copy() for qtile in self.quantile_levels}
        for index, target_ in enumerate(self.cd_axis_targets):
            feature_cols = self.sub_predictors[target_]["features"]
            other_cols = [col for col in feature_cols if self.target_prefix not in col]

            print(
                "\t\ttarget: ",
                target_,
                " x_data_cols:",
                feature_cols[:6],
                " other_cols :",
                other_cols[:6],
            )
            for qtile in self.quantile_levels:
                # Every quantile is predicted from the 0.5 frame, so earlier targets
                # used as features carry their median predictions. Requires 0.5 to be
                # one of ``quantile_levels``.
                pred = self.sub_predictors[target_]["predictors"][qtile].predict(
                    data[0.5][feature_cols]
                )
                data[qtile][target_] = pred
                data[qtile] = data[qtile].sort_index(axis=1)

        data = {
            qtile: data[qtile][self.cd_axis_targets].to_numpy() for qtile in data.keys()
        }
        return data


class StrategyAG(BaseStrategy):
    """Quantile strategy backed by AutoGluon.

    One ``TabularPredictor`` is trained per target column; each predictor
    covers all configured quantile levels.
    """

    model_type = "AG"
    excluded_models = ["NN_TORCH"]

    def __init__(
        self,
        day_ahead: int,
        quantile_levels: list,
        cd_axis_targets: list,
        path: str,
        is_auto_reg: bool,
    ):
        """Forward the configuration to ``BaseStrategy`` and record the included model types."""
        super().__init__(
            day_ahead=day_ahead,
            quantile_levels=quantile_levels,
            cd_axis_targets=cd_axis_targets,
            path=path,
            is_auto_reg=is_auto_reg,
        )
        self.included_model_types = ["GBM"]

    def _fit(self, train_data: pd.DataFrame, **kwargs) -> None:
        """Train one predictor per target column.

        Args:
            train_data: Frame containing every column in ``cd_axis_targets``
                plus the feature columns.
            **kwargs: Passed through to ``TabularPredictor.fit``.
        """
        # Kept so ``_predict`` can cast inference rows to the training dtypes.
        self.target_feature_dtypes = dict(train_data.dtypes)

        for position, target_ in enumerate(self.cd_axis_targets):
            # Autoregressive mode drops the current and later targets only, so the
            # earlier targets stay available as features.
            if self.autoregressive_predictions:
                excluded = self.cd_axis_targets[position:]
            else:
                excluded = self.cd_axis_targets

            x_data = train_data.drop(excluded, axis=1)
            y_data = train_data[[target_]]
            feature_names = list(x_data.columns)
            target_names = list(y_data.columns)

            print(
                "\t\ttarget: ",
                target_,
                " x_data_cols:",
                [col for col in x_data.columns if self.target_prefix in col][:6],
                " y_data_cols:",
                target_names,
            )

            # Each target gets its own output location under ``path``.
            path_i = self.path + f"day{self.day_ahead}_{target_}"
            sub_predictor = TabularPredictor(
                label=target_,
                problem_type=self.objective,
                path=path_i,
                quantile_levels=self.quantile_levels,
                verbosity=1,
            )

            cols_to_consider = [target_] + feature_names
            sub_predictor.fit(
                train_data=train_data[cols_to_consider],
                tuning_data=None,
                excluded_model_types=self.excluded_models,
                **kwargs,
            )

            self.sub_predictors[target_] = {
                "predictors": sub_predictor,
                "targets": target_names,
                "features": feature_names,
            }

    def _predict(self, test_data: pd.Series) -> Dict[float, List[npt.NDArray]]:
        """Predict every target for a single row.

        Args:
            test_data: One row of features as a Series.

        Returns:
            Mapping from quantile level to a NumPy array holding the predicted
            ``cd_axis_targets`` columns for that row.
        """
        # Columns containing "_tgt" are excluded from the dtype cast because the
        # incoming row does not carry them.
        needed_dtypes = {
            name: dtype
            for name, dtype in self.target_feature_dtypes.items()
            if "_tgt" not in name
        }
        row_frame = test_data.to_frame().T.reset_index(drop=True).copy()
        test_data_cpy = row_frame.astype(needed_dtypes)

        # One working frame per quantile level.
        data = {qtile: test_data_cpy.copy() for qtile in self.quantile_levels}
        for target_ in self.cd_axis_targets:
            feature_cols = self.sub_predictors[target_]["features"]
            other_cols = [col for col in feature_cols if self.target_prefix not in col]

            print(
                "\t\ttarget: ",
                target_,
                " x_data_cols:",
                feature_cols[:6],
                " other_cols :",
                other_cols[:6],
            )
            # Features always come from the 0.5 frame; the result is indexed by
            # position, one column per entry of ``quantile_levels``.
            pred = self.sub_predictors[target_]["predictors"].predict(
                data[0.5][feature_cols]
            )
            for q_position, qtile in enumerate(self.quantile_levels):
                data[qtile][target_] = pred.iloc[:, q_position]
                data[qtile] = data[qtile].sort_index(axis=1)

        data = {
            qtile: frame[self.cd_axis_targets].to_numpy()
            for qtile, frame in data.items()
        }
        return data


In [0]:
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed

from phgml.models.base_model import BaseModel
# from phgml.models.model_strategy import BaseStrategy
# from phgml.models.model_strategy import StrategyLGBM, StrategyAG, StrategyLGBMFarField
from mlflow import MlflowClient
import mlflow
import pandas as pd
import numpy as np
import cloudpickle
from sys import version_info
import pickle
import os
import shutil
from typing import Optional, Tuple, Union, List, Dict, Any, Callable
import numpy.typing as npt
import re

class InnerException(Exception):
    """Plain ``Exception`` subclass with no added behaviour."""

# NOTE(review): "ModelWrapperMlflowModel" and "ModelWrapperFarField" are listed here but
# not defined in this cell — confirm they are provided elsewhere.
__all__ = ["ModelWrapper", "ModelWrapperMlflowModel", "ModelWrapperFarField"]

# Running interpreter version as "major.minor.micro"; pinned in the environment below.
PYTHON_VERSION = ".".join(str(part) for part in version_info[:3])

# Environment specification handed to mlflow when the model is logged.
conda_env = {
    "channels": ["defaults"],
    "dependencies": [
        f"python={PYTHON_VERSION}",
        "pip",
        {
            "pip": [
                "mlflow",
                "lightgbm",
                f"cloudpickle=={cloudpickle.__version__}",
            ],
        },
    ],
    "name": "model_wrapper_env",
}


class ModelWrapper(BaseModel):
    """Custom class which wraps a model type to generate
    predictions in a timeseries format.
    """

    def __init__(
        self,
        cd_axis_max_lags: int,
        static_cols: List[str],
        model_strategy: BaseStrategy,
        is_auto_reg: bool = False,
        is_ca3_training: bool = True,
        prediction_horizon: int = 28,
        lag_numbers: List[int] = [1, 7, 14, 28],
        quantiles: List[float] = [0.5],
        mlflow_run_id: Optional[str] = None,
        hotel_id: Optional[str] = None,
        version: Optional[Union[str, int]] = None,
        stage: Optional[str] = None,
        target_type: str = "REVENUE",
        exclude_pms: bool = False,
        save_models: bool = True,
        local_root_dir: Optional[str] = None,
        model_type: str = "MODELWRAPPER",
        model_name_prefix: Optional[str] = None,
        meta_data: Dict[str, Any] = {},
        n_cd_lags: Optional[int] = None,
        **kwargs,
    ):
        """Configure the wrapper and its per-horizon bookkeeping.

        Args:
            cd_axis_max_lags: Highest lag index used to build ``all_cd_cols``.
            static_cols: Feature columns appended to every horizon's feature list.
            model_strategy: Strategy used by ``train_inner``; it is called there
                to build one instance per horizon, and its ``model_type`` becomes
                this wrapper's model type.
            is_auto_reg: Passed to each strategy instance.
            is_ca3_training: When true, ``get_filtered_data`` filters rows by
                ``forecast_index``.
            prediction_horizon: Number of day-ahead horizons.
            lag_numbers: Stay-date lag numbers.
            quantiles: Quantile levels; must include 0.5.
            n_cd_lags: Optional cap on the number of lag columns used as features.
            model_type: Accepted for compatibility; the strategy's ``model_type``
                is what is actually used.
            mlflow_run_id, hotel_id, version, stage, target_type, exclude_pms,
            save_models, local_root_dir, model_name_prefix, meta_data, **kwargs:
                Forwarded to the base-class initialiser.

        Raises:
            ValueError: If 0.5 is not among the quantile levels.
        """
        # The signature keeps its mutable defaults for compatibility, so copy the
        # containers before handing them on. The base class presumably stores them
        # (``quantile_levels`` is sorted in place below); without copies the shared
        # defaults or the caller's own objects could be mutated.
        lag_numbers = None if lag_numbers is None else list(lag_numbers)
        quantiles = None if quantiles is None else list(quantiles)
        meta_data = None if meta_data is None else dict(meta_data)

        super().__init__(
            model_type=model_strategy.model_type,
            prediction_horizon=prediction_horizon,
            lag_numbers=lag_numbers,
            quantiles=quantiles,
            mlflow_run_id=mlflow_run_id,
            hotel_id=hotel_id,
            version=version,
            stage=stage,
            target_type=target_type,
            exclude_pms=exclude_pms,
            save_models=save_models,
            local_root_dir=local_root_dir,
            model_name_prefix=model_name_prefix,
            meta_data=meta_data,
            **kwargs,
        )
        # NOTE(review): ``quantile_levels`` and ``target_prefix`` are not assigned in
        # this class; presumably the base initialiser sets them — confirm.
        self.quantile_levels.sort()
        self.cd_axis_max_lags = cd_axis_max_lags
        self.sd_axis_lag_prefix = "lag"
        self.static_cols = static_cols
        self.n_cd_lags = n_cd_lags
        self.target_suffix = "_tgt"
        self.is_auto_reg = is_auto_reg

        self.model_strategy = model_strategy
        self.model_type = self.model_strategy.model_type
        # Column names for lag 0 .. cd_axis_max_lags inclusive.
        self.all_cd_cols = [
            f"{self.target_prefix}{i}" for i in range(self.cd_axis_max_lags + 1)
        ]
        self.is_ca3_training = is_ca3_training

        # Per-horizon target and feature column lists, filled by get_filtered_data.
        self.target_cols: Dict[int, List[str]] = {}
        self.feature_cols: Dict[int, List[str]] = {}

        self.envs = ["dev", "qa", "prod"]

        # The strategies read their features from the 0.5 frame, so it must exist.
        if 0.5 not in self.quantile_levels:
            raise ValueError(
                "median quantile (0.5) is not included in the quantile_levels. please ensure that its included"
            )

    def save_model(self) -> None:
        """Pickle this wrapper into a freshly created local directory.

        An existing ``local_root`` tree is removed first; ``local_dir`` is then
        created and the whole object is written to ``local_path``.
        """
        stale_output_present = os.path.exists(self.local_root)
        if stale_output_present:
            self.clean()

        os.makedirs(self.local_dir)
        with open(self.local_path, mode="wb") as sink:
            pickle.dump(self, sink, protocol=pickle.HIGHEST_PROTOCOL)

    def change_current_env_tags(self, incoming_tags: Dict[str, str]):
        """Reset stage tags on registered versions that the incoming tags claim.

        For every ``model_stage_<env>`` key in ``incoming_tags`` whose value is
        "yes", each registered version of this model currently tagged "yes" for
        that key is re-tagged "no".

        Args:
            incoming_tags: Tags about to be applied to a new model version.
        """
        stage_tag_keys = {f"model_stage_{env}" for env in self.envs}
        tags_detected = stage_tag_keys.intersection(set(incoming_tags.keys()))

        # Nothing stage-related is coming in, so the registry is left alone.
        if not tags_detected:
            return

        client = MlflowClient()
        registered_versions = client.search_model_versions(
            f"name ='{self.get_model_name()}'"
        )
        # Newest version first.
        newest_first = sorted(
            registered_versions, key=lambda meta: int(meta.version), reverse=True
        )

        for version_meta in newest_first:
            for env_tag in tags_detected:
                if incoming_tags[env_tag] != "yes":
                    continue
                if version_meta.tags.get(env_tag) != "yes":
                    continue
                client.set_model_version_tag(
                    name=self.get_model_name(),
                    version=str(version_meta.version),
                    key=env_tag,
                    value="no",
                )

    def log_models(self) -> None:
        """Save, log and register the model in MLflow.

        Steps: pickle the wrapper locally, log it as a pyfunc model, normalise
        the environment-related metadata to lower case, reset clashing stage
        tags on older versions, then register the new version with the metadata
        as tags.
        """
        print("Starting model logging")
        self.save_model()

        print("Logging model")
        # NOTE(review): ModelWrapperMlflowModel is not defined in the visible part of
        # this notebook — confirm it is in scope when this runs.
        mlflow.pyfunc.log_model(
            artifact_path=self.get_model_log_path(),
            python_model=ModelWrapperMlflowModel(),
            artifacts=self.artifacts,
            conda_env=conda_env,
        )

        # Enforce lower case for keys that mention an environment, and for their
        # string values. The match ignores case so that keys such as
        # "MODEL_STAGE_DEV" are normalised too; a case-sensitive match would leave
        # exactly the keys that need lowering untouched.
        env_pattern = re.compile(
            "|".join(re.escape(env) for env in self.envs), flags=re.IGNORECASE
        )
        decap_meta_data = {}
        for key, value in self.meta_data.items():
            if env_pattern.search(key):
                decap_meta_data[key.lower()] = (
                    value.lower() if isinstance(value, str) else value
                )
            else:
                decap_meta_data[key] = value

        self.meta_data = decap_meta_data

        # Older versions holding a stage tag we are about to claim are reset first.
        self.change_current_env_tags(self.meta_data)

        print("Registering model")
        mlflow.register_model(
            self.get_model_register_path(),
            self.get_model_name(),
            tags=self.meta_data,
        )

    def clean(self) -> None:
        """Delete the ``local_root`` directory tree if it exists; otherwise do nothing."""
        if not os.path.exists(self.local_root):
            return
        shutil.rmtree(self.local_root)

    def load_pyfunc_model(
        self, dst_path: Optional[str] = None, tag: Optional[Union[str, int]] = None
    ) -> mlflow.pyfunc.PyFuncModel:
        """Load and return the pyfunc model from the MLflow model repository.

        Args:
            dst_path (str, optional): Destination directory for the downloaded
                model files. When given it replaces ``local_root``. Defaults to None.
            tag (str | int, optional): Passed to ``get_model_uri`` to select the
                version or stage to load. Defaults to None.

        Returns:
            mlflow.pyfunc.PyFuncModel: The loaded model. ``run_id`` is updated
            from the loaded model's metadata as a side effect.
        """
        # Resolve the URI once so the log line shows exactly what is loaded,
        # including the requested tag.
        model_uri = self.get_model_uri(tag=tag)
        print(f"Loading model {model_uri}")

        if dst_path is not None:
            self.local_root = dst_path

        # Always start from an empty destination directory.
        if os.path.exists(self.local_root):
            self.clean()

        # makedirs (as in save_model) so that a nested destination also works.
        os.makedirs(self.local_root)

        model = mlflow.pyfunc.load_model(model_uri, dst_path=self.local_root)

        self.run_id = model._model_meta.run_id

        # TODO make sure to remove the run id from the local_dir and include either or both task_type/exclude_pms
        # TODO make sure to set the local_dir consistently for both training and inference tasks

        return model

    def get_filtered_data(
        self, data: pd.DataFrame, day_ahead: int
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Select the rows and columns used for one day-ahead horizon.

        Also records the horizon's target and feature column lists in
        ``target_cols`` / ``feature_cols`` so they can be read back later.

        Args:
            data: Source frame; needs a ``forecast_index`` column when
                ``is_ca3_training`` is true.
            day_ahead: Horizon being prepared.

        Returns:
            ``(x_data, y_data, xy_data)``: features only, targets only (stored,
            reversed order), and targets in their original order followed by
            the features.
        """
        # Targets are the first ``day_ahead`` columns, each with the target suffix.
        targets_in_order = [
            cd_col + self.target_suffix for cd_col in self.all_cd_cols[:day_ahead]
        ]

        # Remaining columns act as lag features, optionally capped.
        cd_lag_features = self.all_cd_cols[day_ahead:]
        if self.n_cd_lags is not None:
            cd_lag_features = cd_lag_features[: self.n_cd_lags]

        # Only lags strictly beyond the horizon are usable.
        sd_lag_features = [
            f"{self.sd_axis_lag_prefix}{lag}"
            for lag in self.lag_numbers
            if lag > day_ahead
        ]

        # Stored per horizon; the target list is kept in reversed order.
        self.target_cols[day_ahead] = targets_in_order[::-1]
        self.feature_cols[day_ahead] = (
            cd_lag_features + sd_lag_features + self.static_cols
        )

        if self.is_ca3_training:
            # Keep only the rows whose forecast_index matches this horizon.
            rows = data[data["forecast_index"] == (day_ahead - 1)].copy()
        else:
            rows = data.copy()

        horizon_features = self.feature_cols[day_ahead]
        return (
            rows[horizon_features],
            rows[self.target_cols[day_ahead]],
            rows[targets_in_order + horizon_features],
        )

    def train_inner(self, train_data: pd.DataFrame, day_ahead: int):
        """Fit the strategy for a single horizon.

        Args:
            train_data: Full training frame.
            day_ahead: Horizon to fit.

        Returns:
            ``(day_ahead, fitted_strategy)``.
        """
        # Only the combined targets-plus-features frame is needed for fitting.
        _, _, xy_train = self.get_filtered_data(data=train_data, day_ahead=day_ahead)

        fitted_strategy = self.model_strategy(
            day_ahead=day_ahead,
            quantile_levels=self.quantile_levels,
            cd_axis_targets=self.target_cols[day_ahead],
            is_auto_reg=self.is_auto_reg,
            path=self.local_dir,
        )  # type: ignore
        fitted_strategy._fit(xy_train)

        return day_ahead, fitted_strategy
    
    def predict_inner(self, train_data: pd.DataFrame, day_ahead: int):
        """Predict all quantiles for a single horizon.

        Args:
            train_data: The frame to predict from. Despite the parameter name
                (kept so existing callers keep working) this is the test data;
                it needs a ``day_ahead`` column and the horizon's feature columns.
            day_ahead: Horizon to predict.

        Returns:
            The dictionary returned by the horizon's strategy ``_predict``.

        Raises:
            ValueError: If the frame has no row for ``day_ahead``.
        """
        # Use the argument itself; the body used to read an undefined ``test_data``
        # name, which fails or silently picks up a notebook global.
        test_data = train_data

        try:
            needed_test_data = test_data[test_data.day_ahead == day_ahead].iloc[0]
        except IndexError as e:
            days_str = "days" if day_ahead > 1 else "day"

            error_msg = f"Error when predicting {day_ahead} {days_str} ahead: {e}"
            raise ValueError(error_msg) from e

        x_test = needed_test_data[self.feature_cols[day_ahead]]

        predictor = self.models[day_ahead]
        return predictor._predict(x_test)

    def train(self, train_data: pd.DataFrame, n_threads: int) -> None:
        """Train one strategy per day-ahead horizon using a thread pool.

        Horizons 1..``prediction_horizon`` are fitted concurrently through
        ``train_inner``; each fitted strategy is stored in ``self.models``.
        A horizon that fails is reported and skipped so the others still
        train, which leaves no entry for it in ``self.models``.

        Args:
            train_data: Training data with the booking pace lags, stay date
                lags and any other features.
            n_threads: Maximum number of worker threads.

        Returns: None
        """
        with ThreadPoolExecutor(max_workers=n_threads) as executor:
            future_to_day = {
                executor.submit(self.train_inner, train_data, day_ahead): day_ahead
                for day_ahead in range(1, self.prediction_horizon + 1)
            }

            for future in as_completed(future_to_day):
                submitted_day = future_to_day[future]
                try:
                    day_ahead, reg_obj = future.result()
                except Exception as exc:
                    # Best effort: keep going, but say which horizon failed and why.
                    print(f"Training failed for day ahead {submitted_day}: {exc!r}")
                else:
                    self.models[day_ahead] = reg_obj

    def predict(
        self, test_data: pd.DataFrame, n_threads: Optional[int] = None
    ) -> Dict[Union[str, float], Union[List[npt.NDArray], List[pd.Series]]]:
        """Generate quantile predictions for every horizon present in ``test_data``.

        Horizons 1..``prediction_horizon`` are predicted concurrently through
        ``predict_inner`` and then assembled in ascending horizon order. A
        horizon that cannot be predicted (for example because ``test_data`` has
        no row for it) is reported and skipped.

        Args:
            test_data: Frame with a ``day_ahead`` column, the feature columns
                and the target columns of each horizon.
            n_threads: Maximum number of worker threads; ``None`` lets
                ``ThreadPoolExecutor`` choose its default.

        Returns:
            Dictionary whose ``"y_test"`` entry lists the actual target values
            per horizon and whose quantile-level entries list the matching
            predictions, all in the same horizon order.
        """
        output_pred: Dict[
            Union[str, float], Union[List[npt.NDArray], List[pd.Series]]
        ] = {}
        horizons = range(1, self.prediction_horizon + 1)

        # Leaving the ``with`` block waits for every submitted task to finish.
        with ThreadPoolExecutor(max_workers=n_threads) as executor:
            future_by_day = {
                day_ahead: executor.submit(self.predict_inner, test_data, day_ahead)
                for day_ahead in horizons
            }

        # Collect in horizon order (not completion order) so that position i of
        # every output list refers to the same horizon.
        for day_ahead in horizons:
            try:
                y_pred_dct = future_by_day[day_ahead].result()
                horizon_row = test_data[test_data.day_ahead == day_ahead].iloc[0]
                y_test = horizon_row[self.target_cols[day_ahead]]
            except Exception as exc:
                print(f"Skipping day ahead {day_ahead}: {exc}")
                continue

            output_pred.setdefault("y_test", []).append(y_test)
            for qtile in self.quantile_levels:
                output_pred.setdefault(qtile, []).append(y_pred_dct[qtile][0])

        return output_pred

In [0]:
# Disable adaptive query execution (AQE).
# AQE groups smaller tasks together into larger tasks.
# This may limit parallelism if the query optimizer deems the parallel inference tasks too small.
# We disable AQE here to circumvent this limitation on parallelism.
spark.conf.set("spark.sql.adaptive.enabled", "false")
               

In [0]:
# Column suffixes and pipeline identity.
REVENUE_COL = "_reservationRevenuePerRoomUSD"
ROOMS_COL = "_rooms"
PIPELINE = "INFERENCE"

# Run parameters read from the notebook widgets.
WITHOUT_PMS = str_to_bool(getArgument("exclude_pms"))
IS_USD_CURRENCY = str_to_bool(getArgument("is_usd_currency"))
TARGET_TYPE = getArgument("target_type")
selected_hotels = str_to_lst(getArgument("selected_hotels"))
LAG_NUMBERS = [int(lag) for lag in str_to_lst(getArgument('lag_numbers'))]

# Start of the model data and the COVID window boundaries.
MODEL_START_DATE = pd.Timestamp("2018-10-01")
COVID_START_DATE = pd.Timestamp("2020-03-01")
COVID_END_DATE = pd.Timestamp("2021-08-01")

CALC_UNCERTAINTY = False
# Set MODEL_TYPE to "XGB" to use the XGB run instead of the AutoGluon one.
MODEL_TYPE = "AG"

LEAD_WINDOW = 60

ML_EXPERIMENT_ID = 1079527465953184

# Run to load: the XGB run, the AG uncertainty run, or the default run.
RUN_ID = "19dee6420aed45f29e956016c5ea6e8a"
if MODEL_TYPE == "XGB":
    RUN_ID = "92907cac187f4c8cadb63ff60a05d72e"  # XGB Run
elif (MODEL_TYPE == "AG") and CALC_UNCERTAINTY:
    RUN_ID = "9549361574484dc58fcf1b7d130541a0"


lead_window_start_days = 14
lead_window_end_days = 60
prediction_horizon = 14

In [0]:
# Environment-specific settings for the selected stage, target and currency.
env_config = EnvironmentConfig(
    env=ENV,
    target=TARGET_TYPE,
    spark=spark,
    is_usd_currency=IS_USD_CURRENCY,
)
# Provider of the per-hotel forecasting configuration.
forecasting_config_provider = ForecastingHotelConfigProvider(
    spark=spark,
    env=ENV,
)
target_column = env_config.target_column
schema = inference_output_schema

In [0]:
# As a workaround for the bug PHG-2157: use the latest confirmation date in the
# source table as the partition date.
PARTITION_DATE = spark.sql(
    f"select max(confirmationDate) from {env_config.source_data_table}"
).collect()[0][0]
# max() over an empty table yields NULL; fail here with a clear message instead of
# an obscure TypeError in the date arithmetic below.
if PARTITION_DATE is None:
    raise ValueError(
        f"No confirmationDate found in {env_config.source_data_table}; cannot derive the partition date"
    )
print(PARTITION_DATE)

# The test partition ends the longest configured inference length after the partition date.
max_inference_length = spark.sql(
    f'select max(inference_prediction_length) from {forecasting_config_provider.config_table_name}'
).collect()[0][0]
if max_inference_length is None:
    raise ValueError(
        f"No inference_prediction_length found in {forecasting_config_provider.config_table_name}"
    )
TEST_PARTIITON_END = PARTITION_DATE + pd.Timedelta(max_inference_length, "D")
print(TEST_PARTIITON_END)

In [0]:
# Manual override: uncomment both lines to pin the partition window to fixed dates.
# PARTITION_DATE = pd.to_datetime("2024-10-07")
# TEST_PARTIITON_END = pd.to_datetime("2024-11-04")

In [0]:
# Pipeline logger labelled with the pipeline name, task type and PMS mode.
logger = get_dbx_logger(
    pipeline=PIPELINE,
    task_type=TARGET_TYPE,
    exclude_pms=WITHOUT_PMS,
)
# Emit INFO and above.
logger.setLevel(logging.INFO)

In [0]:
def pyfunc_load_model_retry(model_uri, max_tries):
    """Load a pyfunc model from the MLflow model registry, retrying on failure.

    Args:
        model_uri: Passed unchanged to ``mlflow.pyfunc.load_model``.
        max_tries: Number of retries after the first attempt. Values below
            zero are treated as zero, so at least one attempt is always made
            and the function never returns ``None`` without trying.

    Returns:
        The loaded model.

    Raises:
        Exception: Whatever the final attempt raised, with its original traceback.
    """
    total_attempts = max(max_tries, 0) + 1
    for attempt in range(1, total_attempts + 1):
        try:
            return mlflow.pyfunc.load_model(model_uri)
        except Exception as e:
            if attempt == total_attempts:
                # Out of retries: re-raise the active exception unchanged.
                raise
            print(e)
            print(f'Retrying: attempt {attempt}')

In [0]:
def correct_prediction_list(y_med, y_test, y_upper, y_lower, target, available_rooms):
    """Post-process each (median, actual, upper, lower) prediction set in turn.

    The four inputs are iterated in parallel and every set is passed through
    ``post_process_prediction``.

    Args:
        y_med: Iterable of median predictions.
        y_test: Iterable of actual values.
        y_upper: Iterable of upper quantile predictions.
        y_lower: Iterable of lower quantile predictions.
        target: Target type forwarded to ``post_process_prediction``.
        available_rooms: Forwarded to ``post_process_prediction``.

    Returns:
        Three lists: corrected median, upper and lower predictions.
    """
    y_med_lst, y_upper_lst, y_lower_lst = [], [], []

    for med, actual, upper, lower in zip(y_med, y_test, y_upper, y_lower):
        # The corrected actuals are not part of this function's result.
        med_fixed, _, upper_fixed, lower_fixed = post_process_prediction(
            y_med=med,
            y_test=actual,
            y_upper=upper,
            y_lower=lower,
            target=target,
            available_rooms=available_rooms,
        )
        y_med_lst.append(med_fixed)
        y_upper_lst.append(upper_fixed)
        y_lower_lst.append(lower_fixed)

    return y_med_lst, y_upper_lst, y_lower_lst


def post_process_prediction(y_med, y_test, y_upper, y_lower, target, available_rooms):
    """Apply the dipping correction and, for ROOMS targets, the capping correction.

    Args:
        y_med: Median predictions.
        y_test: Actual values; returned exactly as passed in.
        y_upper: Upper quantile predictions.
        y_lower: Lower quantile predictions.
        target: Target type; capping is applied only when it equals "ROOMS".
        available_rooms: Room capacity; required when ``target`` is "ROOMS".

    Returns:
        ``(corrected median, y_test, corrected upper, corrected lower)``.

    Raises:
        ValueError: If ``target`` is "ROOMS" and ``available_rooms`` is None.
    """
    # Step 1: the dipping correction applies to every target type.
    med, actual, upper, lower = correct_dipping(
        y_med=y_med, y_test=y_test, y_upper=y_upper, y_lower=y_lower
    )

    if target != "ROOMS":
        return med, y_test, upper, lower

    # Step 2: room counts can additionally be capped at the available rooms.
    if available_rooms is None:
        raise ValueError("The argument available_rooms must be provided")

    med, actual, upper, lower = correct_capping(
        y_med=med,
        y_test=actual,
        y_upper=upper,
        y_lower=lower,
        available_rooms=available_rooms,
    )
    return med, y_test, upper, lower


def correct_capping(y_med, y_test, y_upper, y_lower, available_rooms):
    """Cap forecast predictions so that none exceeds ``available_rooms``.

    Args:
        y_med : raw mean/median predictions
        y_test: actual values (returned untouched)
        y_upper : raw upper quantile predictions
        y_lower : raw lower quantile predictions
        available_rooms: available number of rooms for the specific hotel

    Returns:
        Tuple ``(y_med_corrected, y_test, y_upper_corrected, y_lower_corrected)``
        where every prediction above ``available_rooms`` has been replaced by
        ``available_rooms``.
    """

    def _cap(values):
        # Values above the capacity are replaced by the capacity itself.
        return np.where(values > available_rooms, available_rooms, values)

    return _cap(y_med), y_test, _cap(y_upper), _cap(y_lower)


def correct_dipping(y_med, y_test, y_upper, y_lower):
    """Adjust predictions so the median never dips and the quantiles bracket it.

    The returned values are adjusted such that,
    1) median/mean predictions are non-decreasing along the series and never
       fall below the first median/mean value,
    2) upper and lower quantile predictions are restricted to be on either
       side of the median/mean predictions.

    None of the input sequences is modified; corrected copies are returned.

    Args:
        y_med : raw mean/median predictions
        y_test: actual values (returned untouched)
        y_upper : raw upper quantile predictions
        y_lower : raw lower quantile predictions

    Returns:
        Tuple ``(y_med_corrected, y_test, y_upper_corrected, y_lower_corrected)``.
    """
    import copy

    # The first median/mean value is used as the floor for the whole series.
    last_val = y_med[0]

    # correcting predictions if they are lower than the floor value
    y_med_corrected = np.where(y_med < last_val, last_val, y_med)

    # Work on a copy: the loop below assigns into this sequence, and writing
    # through to the caller's object would silently alter the raw lower
    # quantiles that the caller still holds.
    y_lower_corrected = copy.copy(y_lower)

    # Shift the upper quantile by the amount the median was lifted.
    # NOTE(review): the shift is measured against np.abs(y_med), so negative
    # raw medians produce a smaller shift — confirm this is intended.
    delta = y_med_corrected - np.abs(y_med)

    y_upper_corrected = y_upper + delta

    # further correction of predictions along the booking axis to have the cumulative nature
    for index in range(len(y_med)):
        # adjusting median/mean predictions (the first element has no predecessor)
        if index > 0 and y_med_corrected[index] < y_med_corrected[index - 1]:
            y_med_corrected[index] = y_med_corrected[index - 1]

        # adjusting upper quantile predictions
        if y_med_corrected[index] > y_upper_corrected[index]:
            y_upper_corrected[index] = y_med_corrected[index]

        # adjusting lower quantile predictions
        if y_med_corrected[index] < y_lower_corrected[index]:
            y_lower_corrected[index] = y_med_corrected[index]

    return y_med_corrected, y_test, y_upper_corrected, y_lower_corrected

In [0]:
def prediction_wrapper(
    target_type, run_id, exclude_pms, hotel_config_provider, model_cache_dir, environment
):
    """Build the per-hotel inference function used with ``applyInPandas``.

    The arguments are captured by the returned closure so that they are
    available when the function is invoked once per ``HotelID`` group.

    Args:
        target_type: ``"REVENUE"`` or ``"ROOMS"``; selects the column prefix
            (``"RV"`` / ``"RM"``) and whether capacity capping applies.
        run_id: identifier written into the output rows.
        exclude_pms: whether PMS data is excluded; selects the ``PMS`` /
            ``NOPMS`` cached model directory and is passed to the model wrapper.
        hotel_config_provider: object whose ``get_config(hotel_id)`` returns
            the hotel's forecasting configuration.
        model_cache_dir: path prefix of the cached AUTOGLUON model directories.
        environment: passed as ``model_stage`` when resolving the latest
            LIGHTGBM model version.

    Returns:
        The ``predict_distributed`` function.
    """

    def predict_distributed(data):
        """Run inference for the single hotel contained in ``data``.

        Args:
            data: pandas DataFrame for one hotel; the hotel id is read from
                the first row of its ``HotelID`` column.

        Returns:
            The DataFrame produced by ``get_output_df`` with ``status`` and
            ``message`` columns added, or a one-row placeholder DataFrame with
            status ``"incomplete"`` when an ``MlflowException`` occurs.

        Raises:
            ValueError: for a REVENUE target without a forecast currency, or
                an unsupported ``target_type`` / model type.
            Exception: any non-MLflow error is re-raised after being reported.
        """
        static_cols_ = ['year', 'quarter_of_year', 'month_of_year', 'week_of_year',
                        'day_of_year', 'month_of_quarter', 'week_of_quarter', 'day_of_quarter',
                        'week_of_month', 'day_of_month', 'holiday',
                        'day_of_week_0', 'day_of_week_1', 'day_of_week_2',
                        'day_of_week_3', 'day_of_week_4', 'day_of_week_5', 'day_of_week_6']

        logger = get_dbx_logger("PHGML")

        max_lead_window = 100

        hotel_id = data["HotelID"].iloc[0]
        hotel_config = hotel_config_provider.get_config(hotel_id)
        model_type = hotel_config.inference_model_name

        print(f"Processing Hotel {hotel_id}")

        if target_type == "REVENUE":
            col_prefix = "RV"

            if hotel_config.forecast_currency is None:
                # If the target type is REVENUE, we should have a defined forecast_currency
                raise ValueError(f"Forecast currency cannot be None for target_type {target_type}")

        elif target_type == "ROOMS":
            col_prefix = "RM"

        else:
            # Fail with a clear message instead of a NameError on col_prefix below.
            raise ValueError(f"Unsupported target_type {target_type}")

        data = remove_padded_cols(data, hotel_config.lead_window, max_lead_window, col_prefix)

        # Bound up front so the except/finally handlers can tell whether the
        # model wrapper was ever constructed.
        model_obj = None

        try:

            if model_type == "LIGHTGBM":

                model_obj = ModelWrapper(
                    model_strategy=StrategyLGBM,
                    prediction_horizon=hotel_config.inference_length,
                    hotel_id=hotel_id,
                    target_type=target_type,
                    exclude_pms=exclude_pms,
                    cd_axis_max_lags=99,
                    static_cols=static_cols_,
                )

                model_obj.set_latest_model_version(model_stage=environment)

                loaded_model = pyfunc_load_model_retry(model_obj.get_model_uri(), 6)

                loaded_model.unwrap_python_model().model_wrapper_model.prediction_horizon = hotel_config.inference_length
                # During training the target variables are suffixed with '_tgt' to tell target
                # booking pace values apart from feature booking pace values. For daily inference
                # that distinction does not matter since the true values are not available, so the
                # target columns are overridden below to avoid columns not being detected.
                loaded_model.unwrap_python_model().model_wrapper_model.target_cols = {
                    day_ahead: [f"{col_prefix}{j}" for j in range(day_ahead)]
                    for day_ahead in range(1, hotel_config.inference_length + 1)
                }

            elif model_type == "AUTOGLUON":

                model_obj = ModelWrapper(
                    model_strategy=StrategyAG,
                    is_auto_reg=True,
                    prediction_horizon=hotel_config.inference_length,
                    hotel_id=hotel_id,
                    target_type=target_type,
                    exclude_pms=exclude_pms,
                    cd_axis_max_lags=99,
                    static_cols=static_cols_,
                )

                # NOTE(review): unlike the LIGHTGBM branch, no model_stage is
                # passed here — confirm the default stage is the intended one.
                model_obj.set_latest_model_version()

                pms = "PMS"
                if exclude_pms:
                    pms = "NOPMS"

                dbfs_dir = f"{model_cache_dir}{hotel_id}_{target_type}_{pms}"
                local_dir = model_obj.local_root

                # Start from a clean local directory before copying the cached model.
                if os.path.exists(local_dir):
                    shutil.rmtree(local_dir)

                # Copy cached model from blob storage to local dir
                shutil.copytree(dbfs_dir, local_dir)

                # load model
                loaded_model = load_pkl.load(path=model_obj.local_path)
                loaded_model.prediction_horizon = model_obj.prediction_horizon

            else:
                # Fail with a clear message instead of an unbound model object below.
                raise ValueError(f"Unsupported model type {model_type} for hotel {hotel_id}")

            model_version = int(model_obj.version)
            # One model name entry per step of the inference horizon.
            model_name = [
                model_obj.get_model_name()
                for _ in range(1, hotel_config.inference_length + 1)
            ]
            model_metadata = model_obj.get_remote_model_metadata()
            logger.info(f"Using model version {model_version}")

            logger.info(f"Inference length of model: {model_metadata.get('inference_length','NOT_FOUND')}")
            logger.info(f"Last trained date: {model_metadata.get('last_trained_date','NOT_FOUND')}")

            # The prediction output is keyed by quantile (0.5 / 0.9 / 0.1) plus 'y_test'.
            output_dct = loaded_model.predict(data)
            y_pred_raw, y_test, y_upper_raw, y_lower_raw = (
                output_dct[0.5],
                output_dct['y_test'],
                output_dct[0.9],
                output_dct[0.1],
            )

            y_pred_interpolated = [interpolated_fill(day_ahead_array) for day_ahead_array in y_pred_raw]

            y_pred, y_upper, y_lower = correct_prediction_list(
                y_pred_interpolated,
                y_test,
                y_upper_raw,
                y_lower_raw,
                target_type,
                available_rooms=hotel_config.available_rooms,
            )

            data["status"] = "complete"
            data["message"] = f"Successfully processed {hotel_id}"

            output_df = get_output_df(
                y_pred=y_pred,
                y_true=y_test,
                run_id=run_id,
                hotel_id=hotel_id,
                data=data.sort_values('day_ahead'),
                model_name=model_name,
                model_version=model_version,
                pms_sync_off=exclude_pms,
                forecast_currency=hotel_config.forecast_currency,
                prediction_horizon=hotel_config.inference_length,
                y_upper=y_upper,
                y_lower=y_lower,
                y_med_raw=y_pred_raw,
                y_upper_raw=y_upper_raw,
                y_lower_raw=y_lower_raw,
            )

            output_df["status"] = "complete"
            output_df["message"] = f"Successfully processed {hotel_id}"

        except MlflowException as e:
            if "RESOURCE_DOES_NOT_EXIST" in e.message:
                # model_obj may still be None if the failure happened during construction.
                missing_model = model_obj.get_model_name() if model_obj is not None else "<unknown>"
                print(
                    f"Model {missing_model} was not  found in the model registry. Skipping this model..."
                )
            else:
                print("An MLFlowException occured")
                print(e)

            # Placeholder row so the hotel still appears in the results with
            # an "incomplete" status and the MLflow error message.
            empty = pd.DataFrame(
                {
                    "HotelID": [hotel_id],
                    "run_id": [run_id],
                    "stay_date": [pd.Timestamp("1900-01-01")],
                    "booking_date": [pd.Timestamp("1900-01-01")],
                    "model_version": [0],
                    "timestamp": [pd.Timestamp("1900-01-01")],
                    "pms_sync_off": [exclude_pms],
                    "forecast_currency": [hotel_config.forecast_currency],
                    "day_index": [0],
                    "y_med": [0],
                    "model_name": [""],
                    "y_upper": [0],
                    "y_lower": [0],
                    "y_med_raw": [0],
                    "y_upper_raw": [0],
                    "y_lower_raw": [0],
                    "status": "incomplete",
                    "message": e.message,
                }
            )

            return empty

        except Exception:
            print(f"Hotel {hotel_id} encountered an error ")
            raise
        finally:
            # Only clean up when the AUTOGLUON wrapper was actually created, so a
            # construction failure is not masked by an error raised here.
            if model_type == "AUTOGLUON" and model_obj is not None:
                model_obj.clean()

        return output_df

    return predict_distributed

In [0]:
# Points the preprocess input table at a hard-coded table name.
# NOTE(review): the "test_" prefix suggests a testing override — confirm this
# is intended before running against production data.
env_config.preprocess_intermediate_table = "test_preprocess_intermediate_table"

In [0]:
logger.info("Read preprocessing data")

# Load the preprocessed rows, mark every row as not yet processed, parse the
# stay date, sort by hotel and stay date, then derive how many days each stay
# date lies ahead of the partition date.
df = (
    spark.sql(f"select * from {env_config.preprocess_intermediate_table}")
    .withColumn("status", lit("incomplete"))
    .withColumn("_StayDates", to_timestamp("_StayDates", "yyyy-MM-dd"))
    .orderBy(["HotelID", "_StayDates"])
    .withColumn("partition_date", lit(str(PARTITION_DATE)))
    .withColumn(
        "day_ahead",
        datediff(col("_StayDates"), to_timestamp("partition_date", "yyyy-MM-dd")),
    )
)

In [0]:
# DEBUG
# output = debug_prediction(df,MODEL_TYPE, TARGET_TYPE, ML_EXPERIMENT_ID, RUN_ID, WITHOUT_PMS, CALC_UNCERTAINTY,forecasting_config_provider,model_cache_dir=env_config.model_cache_dir)

In [0]:
# Run the inferences in parallel: the rows are grouped by hotel id and each
# group is handed to the function built by prediction_wrapper.
logger.info("Starting parallell processing")
output_df = df.groupBy("HotelID").applyInPandas(
    func=prediction_wrapper(
        target_type=TARGET_TYPE,
        run_id=RUN_ID,
        exclude_pms=WITHOUT_PMS,
        hotel_config_provider=forecasting_config_provider,
        model_cache_dir=env_config.model_cache_dir,
        environment=ENV,
    ),
    schema=schema,
)

In [0]:
# logger.info("Drop intermediate results table if it exists")
# spark.sql(f"DROP TABLE IF EXISTS {env_config.inference_intermediate_table}")

In [0]:
# Points the intermediate inference results table at a hard-coded table name.
# NOTE(review): the "test_" prefix suggests a testing override — confirm this
# is intended before running against production data.
env_config.inference_intermediate_table = "test_inference_intermediate_table"

In [0]:
logger.info(
    "Writing inference results to temporary table "
    f"{env_config.inference_intermediate_table}"
)

# Time the write of the intermediate table; the table is overwritten,
# including its schema.
start_time_temp = time.perf_counter()
output_df.write.saveAsTable(
    env_config.inference_intermediate_table,
    mode="overwrite",
    overwriteSchema="true",
)
elapsed_time_temp = time.perf_counter() - start_time_temp

logger.info(f"Time elapsed {elapsed_time_temp}")
logger.info(f"Time elapsed in minutes {elapsed_time_temp/60}")

In [0]:
# Read the intermediate results back and pull only the bookkeeping columns
# into pandas to summarise how many hotels completed.
meta_columns = ["HotelID", "run_id", "timestamp", "pms_sync_off", "status", "message"]
results_table = spark.sql(f"select * from {env_config.inference_intermediate_table}")
output_meta = results_table.select(*meta_columns).toPandas()

# Count distinct hotels, not rows: a hotel contributes many result rows.
num_completed = output_meta.loc[output_meta["status"] == "complete", "HotelID"].nunique()
total = output_meta["HotelID"].nunique()
logger.info(f"{num_completed} out of {total} hotels processed succussfully")

In [0]:
# Every row whose status is anything other than "complete" is reported as an error.
incomplete = output_meta[output_meta["status"] != "complete"]

for row in incomplete.itertuples(index=False):
    logger.error(
        f"Error encountered when processing hotel {row.HotelID}: {row.message}"
    )

In [0]:
# Keep only the successfully processed rows; the bookkeeping columns are
# removed before the results are written out.
completed_only = results_table["status"] == "complete"
output_df = results_table.where(completed_only).drop("status", "message")

# Drop forecast currency if TARGET_TYPE is ROOMS
if TARGET_TYPE == "ROOMS":
    output_df = output_df.drop("forecast_currency")

In [0]:
# Points the final inference output table at a hard-coded table name.
# NOTE(review): the "test_" prefix suggests a testing override — confirm this
# is intended before running against production data.
env_config.inference_output_table = "test_inference_output_table"

In [0]:
logger.info("Writing completed results to table")
file_format = "delta"

# Append the completed results to the output table, partitioned by hotel.
# NOTE(review): overwriteSchema is set together with append mode — confirm
# this option is the one intended here.
output_df.write.saveAsTable(
    env_config.inference_output_table,
    format=file_format,
    mode="append",
    partitionBy="HotelID",
    overwriteSchema="true",
)

In [0]:
# Report the wall-clock time since start_time (set with time.perf_counter()
# in the imports cell), first in seconds and then in minutes.
elapsed_time = time.perf_counter() - start_time
logger.info(f"Time elapsed {elapsed_time}")
logger.info(f"Time elapsed in minutes {elapsed_time/60}")