In [0]:
%pip install /dbfs/FileStore/python-wheels/dev/phgml-1.4.0-py3-none-any.whl -q

In [0]:
# Notebook widgets. "model_tags" is free text in the form key1:value1,key2:value2 and is
# parsed by get_model_tags when the params are assembled further down.
# NOTE(review): the "save_model" widget value is not read anywhere in the visible part of
# this notebook (params["SAVE_MODEL"] is hard-coded) — confirm whether it should be wired in.
dbutils.widgets.dropdown("save_model", "True", ["True", "False"], "Save Model")
dbutils.widgets.text("model_tags","model_stage:CA3_develop", "Model Tags")

In [0]:
import logging
import os
import pickle
import re
import shutil
import sys
from datetime import datetime, timedelta
from typing import List

import pandas as pd
from mlflow import MlflowException
from mlflow.client import MlflowClient

In [0]:
def str_to_bool(value):
    """Convert a textual flag such as a widget value into a bool.

    The value is stringified, stripped of surrounding whitespace and compared
    case-insensitively against the accepted spellings.

    Args:
        value: Anything whose ``str()`` form is one of
            'true'/'yes'/'1' or 'false'/'no'/'0'.

    Returns:
        bool: True or False according to the matched spelling.

    Raises:
        ValueError: If the value is not one of the accepted spellings.
            (ValueError is a subclass of Exception, so callers catching
            Exception keep working.)
    """
    FALSE_VALUES = ['false', 'no', '0']
    TRUE_VALUES = ['true', 'yes', '1']
    # strip() so that values like " True " coming from free-text inputs are accepted
    lvalue = str(value).strip().lower()
    if lvalue in FALSE_VALUES:
        return False
    if lvalue in TRUE_VALUES:
        return True
    raise ValueError("String value should be one of {}, but got '{}'.".format(FALSE_VALUES + TRUE_VALUES, value))

def extract_param_values(value: str) -> List[str]:
    """Split a comma-separated string into a list of values.

    Whitespace around each item is removed and empty items (for example from
    a trailing comma or from "a, ,b") are dropped, so "a, b" yields
    ["a", "b"] rather than ["a", " b"].

    Args:
        value (str): Comma-separated values; may be empty.

    Returns:
        List[str]: The individual values in their original order; an empty
        list when the input contains no values.
    """
    return [item.strip() for item in value.split(",") if item.strip()]
    
def get_model_tags(model_tags_str):
    """Validate and parse the model-tag text entered through the Databricks widget.

    The expected format is ``key1:value1,key2:value2`` where keys and values
    consist of word characters. Whitespace anywhere in the text is ignored and
    empty segments (e.g. a trailing comma) are tolerated.

    Args:
        model_tags_str (str): Raw tag text.

    Returns:
        dict: Mapping of tag key to tag value; empty when the text holds no pairs.
        When a key is repeated the last value wins.

    Raises:
        ValueError: If the text contains characters other than colon, comma,
            word characters and whitespace, or if any comma-separated segment
            is not exactly one ``key:value`` pair.
    """
    # Whitespace is allowed by the format, so remove all of it (not only plain spaces).
    compact = re.sub(r'\s+', '', model_tags_str)

    if re.search(r'[^:,\w]', compact):
        raise ValueError('''Unwanted characters detected. Allowed characters are colon(:), comma(,), word characters and white space characters
                            Please specify key values pairs as key1:value1,key2:value2
                        ''')

    tags = {}
    # Validate every segment on its own. The previous approach removed matched
    # pairs from the text with str.replace, which also removed them from inside
    # longer pairs ("a:b" inside "aa:bb") and rejected valid input.
    for pair in compact.split(','):
        if not pair:
            continue
        if re.fullmatch(r'\w+:\w+', pair) is None:
            raise ValueError('''unmatched string components detected. please check the specified string
                             Please specify key values pairs as key1:value1,key2:value2
                             ''')
        key, tag_value = pair.split(':')
        tags[key] = tag_value

    return tags

In [0]:
# Static configuration for an interactive (non-job) run of this notebook.
# The job-driven equivalent, which reads the same keys from upstream task
# values, is kept commented out in the next cell.
params = dict(
    WITHOUT_PMS=False,
    SELECTED_HOTELS=[
        '63662', '26834', '1406', '10443', '71999',
        '64942', '26532', '55810', '56217', '76630',
        '27226', '79051', '79908', '36063', '27275',
    ],
    ENV='dev',
    CACHE_MODELS=False,
    TARGET_TYPE='REVENUE',
    MODEL_TYPE='FARFIELD',
    REPOPATH='/Workspace/Repos/manik@surge.global/phg-data-mlsys/src',
    FORECAST_POINTS=[91, 84, 77, 70, 63, 56, 49, 42, 35],
    CA_AWARE=True,
    IS_USD_CURRENCY=True,
    DAYS_AHEAD=7,
    MAX_FORECAST_POINT=91,
    MIN_FORECAST_POINT=35,
    MAX_TARGET_LEAD=100,
    MAX_LEAD=151,
    # weekly lags 7, 14, ..., 147
    LAG_NUMBERS=list(range(7, 148, 7)),
    MODEL_START_DATE=pd.to_datetime('2018-10-01 00:00:00'),
    COVID_START_DATE=pd.to_datetime('2020-03-01 00:00:00'),
    COVID_END_DATE=pd.to_datetime('2021-08-01 00:00:00'),
    REVENUE_COL='_reservationRevenuePerRoomUSD',
    ROOMS_COL='_rooms',
    TARGET_COLUMN='_reservationRevenuePerRoomUSD',
    PREPROCESSED_TABLE='testing_data.pp_ff_preprocess_rv',
    PARTITION_DATE=datetime(2024, 2, 1, 3, 59, 6),
)
# Same hotel ids as SELECTED_HOTELS, held as an independent list (added last to keep key order).
params['CORRECTED_HOTEL_IDS'] = list(params['SELECTED_HOTELS'])

In [0]:
# params = {}
# params['WITHOUT_PMS'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "WITHOUT_PMS")
# params['SELECTED_HOTELS'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "SELECTED_HOTELS")
# params['ENV'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "ENV")
# params['CACHE_MODELS'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "CACHE_MODELS")
# params['TARGET_TYPE'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "TARGET_TYPE")
# params['MODEL_TYPE'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "MODEL_TYPE")
# params['REPOPATH'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "REPOPATH")
# params["FORECAST_POINTS"] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "FORECAST_POINTS")
# params['CA_AWARE'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "CA_AWARE")
# params['IS_USD_CURRENCY'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "IS_USD_CURRENCY")
# params['DAYS_AHEAD'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "DAYS_AHEAD")
# params['MAX_FORECAST_POINT'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "MAX_FORECAST_POINT")
# params['MIN_FORECAST_POINT'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "MIN_FORECAST_POINT")
# params['MAX_TARGET_LEAD'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "MAX_TARGET_LEAD")
# params['MAX_LEAD'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "MAX_LEAD")
# params['LAG_NUMBERS'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "LAG_NUMBERS")
# params['MODEL_START_DATE'] = pd.to_datetime(dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "MODEL_START_DATE"))
# params['COVID_START_DATE'] = pd.to_datetime(dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "COVID_START_DATE"))
# params['COVID_END_DATE'] = pd.to_datetime(dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "COVID_END_DATE"))
# params['REVENUE_COL'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "REVENUE_COL")
# params['ROOMS_COL'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "ROOMS_COL")
# params['TARGET_COLUMN'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "TARGET_COLUMN")
# params['PREPROCESSED_TABLE'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "PREPROCESSED_TABLE")
# params['PARTITION_DATE'] = pd.to_datetime(dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "PARTITION_DATE"))
# params['CORRECTED_HOTEL_IDS'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "CORRECTED_HOTEL_IDS")

# Settings specific to this training run; they extend/override the params assembled above.
params["ML_EXPERIMENT_ID"] = 609933091443417
params["CALC_UNCERTAINTY"] = True
# NOTE(review): saving is hard-coded off here and the "save_model" widget is not consulted
# in the visible code — confirm this is intentional before using the notebook for real runs.
params["SAVE_MODEL"] = False
params["SAVE_METRICS"] = False
# Parse the "model_tags" widget text (key1:value1,key2:value2) into a dict.
params["MODEL_TAGS_DCT"] = get_model_tags(getArgument("model_tags"))
# Derive the forecast-point bounds from FORECAST_POINTS, replacing any previously set values.
params["MAX_FORECAST_POINT"] = max(params["FORECAST_POINTS"]) 
params["MIN_FORECAST_POINT"] = min(params["FORECAST_POINTS"]) 
# Add run metadata next to the user-supplied tags.
params["MODEL_TAGS_DCT"]["last_updated_date"] = str(datetime.now())
params["MODEL_TAGS_DCT"]["days_ahead"] = params["DAYS_AHEAD"]
params["MODEL_TAGS_DCT"]["max_forecast_point"] = params["MAX_FORECAST_POINT"]
params["MODEL_TAGS_DCT"]["min_forecast_point"] = params["MIN_FORECAST_POINT"]
params["MODEL_NAME_PREFIX"] = None

In [0]:
# if params["ENV"] == "dev":
#     print(f"Loading phgml package from repo {params['REPOPATH']}")
#     sys.path.append(os.path.abspath(params["REPOPATH"]))

# sys.path.append(os.path.abspath("/Workspace/Repos/yasith.udawatte@henrymwuamica.onmicrosoft.com/phg-data-mlsys"))

In [0]:
from phgml.data.config import FarfieldForecastingHotelConfigProvider,FarfieldEnvironmentConfig
from phgml.data.data_types import (
    training_output_schema,
)
# from phgml.pipeline.training import train_wrapper_farfield
from phgml.reporting.logging import get_dbx_logger
# Short alias for the training output schema imported from phgml.data.data_types.
schema = training_output_schema

In [0]:
# Build the environment configuration and the per-hotel forecasting config provider
# from the run parameters and the active Spark session.
env_config = FarfieldEnvironmentConfig(
    env=params["ENV"], 
    without_pms=params["WITHOUT_PMS"], 
    target=params["TARGET_TYPE"],
    spark=spark,
    is_usd_currency=params["IS_USD_CURRENCY"]
)
forecasting_config_provider = FarfieldForecastingHotelConfigProvider(spark=spark,env=params["ENV"])
# The environment config is the source of truth for the target column; this replaces
# the TARGET_COLUMN value set when params was first assembled.
params["TARGET_COLUMN"] = env_config.target_column

In [0]:
# Create the notebook-level logger and record the main run settings at INFO level.
# NOTE(review): the ENV value is passed as `pipeline`; confirm that is what get_dbx_logger expects.
logger = get_dbx_logger(pipeline=params["ENV"],
                        task_type=params["TARGET_TYPE"],
                        exclude_pms=params["WITHOUT_PMS"])
logger.setLevel(logging.INFO)

logger.info(f"Processing data for target type: {params['TARGET_TYPE']} : {params['TARGET_COLUMN']}")
logger.info(f"Excluding PMS data? {params['WITHOUT_PMS']}")

In [0]:
# Read the whole preprocessed training table by name (equivalent to `select *` on it).
logger.info(f"Loading data from {params['PREPROCESSED_TABLE']}")
df = spark.table(params['PREPROCESSED_TABLE'])

In [0]:
# Fail fast when the preprocessed table has no rows. Only a single row is requested to
# decide emptiness, so Spark does not have to count the entire table.
if df.limit(1).count() == 0:
    logger.error("The loaded training dataset is empty.")
    logger.info("Terminating the pipeline execution")
    raise Exception("The loaded training dataset is empty.")

In [0]:
from sys import version_info
import cloudpickle
from phgml.models.model_strategy import BaseStrategy
from phgml.models.base_model import BaseModel
from typing import Optional, Tuple, Union, List, Dict, Any, Callable
import mlflow
import numpy.typing as npt
# from phgml.models.model_strategy import StrategyLGBM, StrategyAG, StrategyLGBMFarField
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from phgml.reporting.output_metrics import (
    log_metrics_stays,
    report_metrics_stays,
    log_metrics_stays_farfield,
    report_metrics_stays_farfield,
)

# Running interpreter version as "major.minor.micro"; used to pin python in conda_env.
PYTHON_VERSION = ".".join(str(part) for part in version_info[:3])

In [0]:
from lightgbm import LGBMRegressor
from abc import ABC, abstractmethod
import numpy.typing as npt
from typing import Optional, Tuple, Union, List, Dict, Any, Callable

class BaseStrategy(ABC):
    """Abstract base for a quantile-regression strategy trained for one target.

    Subclasses implement ``_fit`` and ``_predict``. The constructor only stores
    the configuration; no training happens here.

    NOTE(review): this definition shadows the ``BaseStrategy`` imported from
    ``phgml.models.model_strategy`` earlier in the notebook.
    """

    model_type = "base_strategy"

    def __init__(
        self,
        day_ahead: int,
        quantile_levels: list,
        cd_axis_targets: Union[str, List[str]],
        path: str,
        is_auto_reg: bool,
        # verbose: int,
    ):
        """Store the strategy configuration.

        Args:
            day_ahead: Stored as ``self.day_ahead``.
            quantile_levels: Quantile levels the strategy should model.
            cd_axis_targets: Target column name, or a list of target column names.
            path: Stored as ``self.path``.
            is_auto_reg: Stored as ``self.autoregressive_predictions``.
        """
        self.quantile_levels = quantile_levels
        self.day_ahead = day_ahead
        self.cd_axis_targets = cd_axis_targets
        # The prefix is the first two characters of the (first) target column name.
        # A bare string must not be indexed with [0] first, otherwise only its
        # first character would be kept.
        first_target = (
            cd_axis_targets if isinstance(cd_axis_targets, str) else cd_axis_targets[0]
        )
        self.target_prefix = first_target[:2]
        self.path = path
        self.autoregressive_predictions = is_auto_reg
        # self.verbose = verbose

        self.objective = "quantile"
        self.sub_predictors: Dict[str, Dict[str, Any]] = {}

    @abstractmethod
    def _fit(self, train_data: pd.DataFrame) -> None:
        """Fit the strategy on ``train_data`` (features plus target column(s))."""
        pass

    @abstractmethod
    def _predict(self, test_data: pd.Series) -> Dict[float, List[npt.NDArray]]:
        """Return predictions keyed by quantile level for one row of features."""
        pass


class StrategyLGBMFarField(BaseStrategy):
    """LightGBM strategy that fits one quantile ``LGBMRegressor`` per quantile level."""

    model_type = "LGBM_FARFIELD"

    def __init__(
        self,
        day_ahead: int,
        quantile_levels: list,
        cd_axis_targets: list,
        path: str,
        is_auto_reg: bool,
        # verbose: int = -1,
    ):
        """Forward the configuration unchanged to ``BaseStrategy``."""
        super().__init__(
            day_ahead=day_ahead,
            quantile_levels=quantile_levels,
            cd_axis_targets=cd_axis_targets,
            path=path,
            is_auto_reg=is_auto_reg,
            # verbose=verbose,
        )

    def _fit(self, train_data: pd.DataFrame) -> None:
        """Fit one regressor per quantile level.

        Args:
            train_data: Frame holding the target column(s) named by
                ``self.cd_axis_targets`` plus the feature columns; every
                non-target column is used as a feature.

        Side effects:
            Sets ``self.target_feature_dtypes`` (feature dtypes seen in training)
            and ``self.sub_predictor`` (fitted regressors keyed by quantile,
            the targets and the feature column list).
        """
        # cd_axis_targets may be a single column name or a list of names; always
        # hand a flat list to drop() instead of wrapping a list in another list.
        targets = self.cd_axis_targets
        target_cols = [targets] if isinstance(targets, str) else list(targets)

        x_data = train_data.drop(columns=target_cols)
        self.target_feature_dtypes = dict(x_data.dtypes)
        y_data = train_data[targets]

        # Only column names are reported; the training frame itself is not printed.
        # if self.verbose != -1:
        print(
            "\t\ttarget: ",
            self.cd_axis_targets,
            " x_data_cols:",
            x_data.columns.tolist(),
            " y_data_cols:",
            self.cd_axis_targets,
        )

        reg_objs = {}
        for qtile in self.quantile_levels:
            sub_predictor = LGBMRegressor(
                objective=self.objective, alpha=qtile, verbose=-1, n_jobs=-1
            )
            sub_predictor.fit(x_data, y_data)
            reg_objs[qtile] = sub_predictor

        self.sub_predictor = {
            "predictors": reg_objs,
            "targets": self.cd_axis_targets,
            "features": x_data.columns.tolist(),
        }

    def _predict(self, test_data: pd.Series) -> Dict[float, List[npt.NDArray]]:
        """Predict every quantile level for a single row of features.

        Args:
            test_data: One row of feature values as a Series.

        Returns:
            Dict mapping each quantile level to the regressor's prediction array.
        """
        # Cast to the dtypes seen during training; entries for columns containing
        # "_tgt" are skipped because target columns are not part of the test row.
        needed_dtypes = {
            col: col_dtype
            for col, col_dtype in self.target_feature_dtypes.items()
            if "_tgt" not in col
        }

        # Series -> single-row frame with the training dtypes.
        test_data_cpy = (
            test_data.to_frame().T.reset_index(drop=True).copy().astype(needed_dtypes)
        )

        data = {}

        for qtile in self.quantile_levels:
            pred = self.sub_predictor["predictors"][qtile].predict(test_data_cpy)
            data[qtile] = pred

        return data


In [0]:
# Conda environment recorded with the logged pyfunc model: the running Python version,
# plus mlflow, lightgbm and the cloudpickle version currently installed.
conda_env = {
    "channels": ["defaults"],
    "dependencies": [
        f"python={PYTHON_VERSION}",
        "pip",
        {"pip": ["mlflow", "lightgbm", f"cloudpickle=={cloudpickle.__version__}"]},
    ],
    "name": "model_wrapper_env",
}

class ModelWrapperMlflowModel(mlflow.pyfunc.PythonModel):
    """Custom pyfunc model: the wrapped model is not a native MLflow flavor, so it is
    stored as a pickle artifact and loaded back here.
    """

    def load_context(self, context):
        """Load the pickled wrapper from ``<artifacts['model_dir']>/model.pkl``.

        Args:
            context: MLflow context whose ``artifacts`` mapping contains ``model_dir``.
        """
        # Imported locally so this method is self-contained wherever MLflow
        # restores the class.
        import pickle

        # SECURITY: pickle.load can execute arbitrary code; only load artifacts
        # that were produced by this pipeline.
        with open(f"{context.artifacts['model_dir']}/model.pkl", "rb") as pkl_file:
            self.model_wrapper_model = pickle.load(pkl_file)

    def predict(self, context, model_input):
        """Delegate prediction to the loaded wrapper's ``predict``."""
        return self.model_wrapper_model.predict(model_input)
    
class ModelWrapperFarField(BaseModel):
    """Far-field wrapper that trains one strategy model per forecast point.

    For every value in ``forecast_points`` the wrapper selects the matching
    target/feature columns from the training frame, instantiates
    ``model_strategy`` and fits it. Fitted strategies are kept in
    ``self.models`` keyed by forecast point and are used by ``predict``.
    """

    def __init__(
        self,
        cd_axis_max_lags: int,
        static_cols: List[str],
        model_strategy: BaseStrategy,
        is_auto_reg: bool = False,
        is_ca3_training: bool = True,
        prediction_horizon: int = 7,
        lag_numbers: List[int] = [],
        quantiles: List[float] = [0.5],
        mlflow_run_id: Optional[str] = None,
        hotel_id: Optional[str] = None,
        version: Optional[Union[str, int]] = None,
        stage: Optional[str] = None,
        target_type: str = "REVENUE",
        exclude_pms: bool = False,
        save_models: bool = True,
        local_root_dir: Optional[str] = None,
        model_type: str = "MODELWRAPPER_FARFIELD",
        model_name_prefix: Optional[str] = None,
        meta_data: Dict[str, Any] = {},
        n_cd_lags: Optional[int] = None,
        forecast_points=[91, 84, 77, 70, 63, 56, 49, 42, 35],
        is_parallel=True,
        executor = "processpool",
        **kwargs,
    ):
        """Configure the wrapper.

        Args:
            cd_axis_max_lags: Largest lag index on the target-prefixed columns;
                columns ``<prefix>0 .. <prefix>cd_axis_max_lags`` are expected.
            static_cols: Feature columns used for every forecast point.
            model_strategy: Strategy class (it is called with keyword arguments
                to build one strategy per forecast point).
            is_auto_reg: Forwarded to the strategy.
            is_ca3_training: When True, training rows are filtered by
                ``forecast_index`` and the target column carries the ``_tgt`` suffix.
            prediction_horizon: Offset between forecast point and target column index.
            lag_numbers: Forwarded to ``BaseModel``; ``self.lag_numbers`` is then
                recomputed from ``cd_axis_max_lags`` and ``prediction_horizon``.
            quantiles: Quantile levels; must include 0.5.
            model_type: Accepted for interface compatibility; the strategy's
                ``model_type`` is what gets used.
            forecast_points: Forecast points to train a model for.
            is_parallel: Train forecast points concurrently when True.
            executor: "processpool" or "threadpool".
            Remaining arguments are forwarded unchanged to ``BaseModel``.

        Raises:
            ValueError: If 0.5 is not among the quantile levels.
        """
        # Work on copies so the shared default objects in the signature (and the
        # caller's own lists/dicts) are never sorted or mutated in place.
        lag_numbers = None if lag_numbers is None else list(lag_numbers)
        quantiles = None if quantiles is None else list(quantiles)
        meta_data = None if meta_data is None else dict(meta_data)
        forecast_points = None if forecast_points is None else list(forecast_points)

        super().__init__(
            model_type=model_strategy.model_type,
            prediction_horizon=prediction_horizon,
            lag_numbers=lag_numbers,
            quantiles=quantiles,
            mlflow_run_id=mlflow_run_id,
            hotel_id=hotel_id,
            version=version,
            stage=stage,
            target_type=target_type,
            exclude_pms=exclude_pms,
            save_models=save_models,
            local_root_dir=local_root_dir,
            model_name_prefix=model_name_prefix,
            meta_data=meta_data,
            **kwargs,
        )
        self.quantile_levels.sort()
        self.cd_axis_max_lags = cd_axis_max_lags
        self.sd_axis_lag_prefix = "lag"
        self.static_cols = static_cols
        self.n_cd_lags = n_cd_lags
        self.target_suffix = "_tgt"
        self.is_auto_reg = is_auto_reg
        self.forecast_points = forecast_points
        self.model_strategy = model_strategy
        self.model_type = self.model_strategy.model_type
        self.all_cd_cols = [
            f"{self.target_prefix}{i}" for i in range(self.cd_axis_max_lags + 1)
        ]
        # Lags in steps of the prediction horizon, from 0 up to cd_axis_max_lags.
        self.lag_numbers = [
            x for x in range(0, self.cd_axis_max_lags + 1, self.prediction_horizon)
        ]
        self.is_ca3_training = is_ca3_training
        self.executor = executor

        # initializing targets variables
        self.target_cols: Dict[int, str] = {}

        # initializing feature variables
        self.feature_cols: Dict[int, List[str]] = {}

        self.is_parallel = is_parallel

        if 0.5 not in self.quantile_levels:
            raise ValueError(
                "median quantile (0.5) is not included in the quantile_levels. please ensure that its included"
            )

    def save_model(self) -> None:
        """Pickle this wrapper to ``self.local_path`` so it can be logged as an MLflow artifact."""
        # exist_ok: saving again into an existing directory must not fail.
        os.makedirs(self.local_dir, exist_ok=True)
        with open(self.local_path, "wb") as pkl_file:
            pickle.dump(obj=self, file=pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

    def log_models(self) -> None:
        """Save the wrapper locally, log it as a pyfunc model and register it in MLflow."""
        print("Starting model logging")
        self.save_model()

        print("Logging model")
        mlflow.pyfunc.log_model(
            artifact_path=self.get_model_log_path(),
            python_model=ModelWrapperMlflowModel(),
            artifacts=self.artifacts,
            conda_env=conda_env,
        )

        print("Registering model")
        mlflow.register_model(
            self.get_model_register_path(),
            self.get_model_name(),
            tags=self.meta_data,
        )

    def clean(self) -> None:
        """Delete ``self.local_root`` and everything below it, if it exists."""
        if os.path.exists(self.local_root):
            shutil.rmtree(self.local_root)

    def load_pyfunc_model(
        self, dst_path: Optional[str] = None, tag: Optional[Union[str, int]] = None
    ) -> mlflow.pyfunc.PyFuncModel:
        """Load and return the pyfunc model from the MLFlow model repository

        Args:
            dst_path (str, optional): Destination path to save the loaded model.
                                      If not provided the files will be saved in the local_root path.
                                      Defaults to None.
            tag (str, optional): Tag to specify the version or model stage to be loaded.
                                 If not provided the latest model version will be loaded.
                                Defaults to None.

        Returns:
            mlflow.pyfunc.model : pyfunc model
        """
        print(f"Loading model {self.get_model_uri()}")

        if dst_path is not None:
            self.local_root = dst_path

        # Start from an empty destination directory.
        self.clean()
        # makedirs also creates missing parent directories of the destination.
        os.makedirs(self.local_root, exist_ok=True)

        model = mlflow.pyfunc.load_model(
            self.get_model_uri(tag=tag), dst_path=self.local_root
        )

        self.run_id = model._model_meta.run_id

        # TODO make sure to remove the run id from the local_dir and include either or both task_type/exclude_pms
        # TODO make sure to set the local_dir consistently for both training and inference tasks

        return model

    def get_filtered_data(
        self, data: pd.DataFrame, forecast_point: int
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Select the target and feature columns for one forecast point.

        The target is the column at index ``forecast_point - prediction_horizon``
        (with a ``_tgt`` suffix in CA3 training, where rows are also restricted to
        ``forecast_index == forecast_point``). Features are the target-prefixed
        columns from ``forecast_point`` to ``cd_axis_max_lags``, the ``lag<n>``
        columns with ``n > forecast_point`` and the static columns.

        Side effects:
            Records the chosen columns in ``self.target_cols`` and
            ``self.feature_cols`` under ``forecast_point``.

        Returns:
            ``(x_data, y_data, xy_data)`` where ``xy_data`` holds the target
            column followed by the feature columns.
        """
        if self.is_ca3_training:
            # Condition helps us get the specific entry for the cancellation day index
            target_column = (
                f"{self.target_prefix}{forecast_point-self.prediction_horizon}_tgt"
            )
            condition = data["forecast_index"] == forecast_point
            filt_data = data[condition].copy()
        else:
            target_column = (
                f"{self.target_prefix}{forecast_point-self.prediction_horizon}"
            )
            filt_data = data.copy()

        cd_axis_lag_columns = [
            f"{self.target_prefix}{x}"
            for x in range(forecast_point, self.cd_axis_max_lags + 1)
        ]

        sd_axis_lag_columns = [
            f"{self.sd_axis_lag_prefix}{SD_lag}"
            for SD_lag in self.lag_numbers
            if SD_lag > forecast_point
        ]

        # assigning target and feature variables corresponding to the particular day ahead. This will be retrieved through the attributes in the inference phase.
        self.target_cols[forecast_point] = target_column
        self.feature_cols[forecast_point] = (
            cd_axis_lag_columns + sd_axis_lag_columns + self.static_cols
        )

        x_data = filt_data[self.feature_cols[forecast_point]]
        y_data = filt_data[self.target_cols[forecast_point]]

        return (
            x_data,
            y_data,
            filt_data[[target_column] + self.feature_cols[forecast_point]],
        )

    def train_inner_model(self, train_data: pd.DataFrame, forecast_point: int) -> BaseStrategy:
        """Build and fit the strategy for a single forecast point and return it."""
        print("\tForecast point: ", forecast_point)
        x_train, y_train, xy_train = self.get_filtered_data(
            data=train_data, forecast_point=forecast_point
        )

        reg_obj = self.model_strategy(
            quantile_levels=self.quantile_levels,
            day_ahead=self.prediction_horizon,
            cd_axis_targets=self.target_cols[forecast_point],
            path=self.local_dir,
            is_auto_reg=self.is_auto_reg,
        )  # type: ignore
        reg_obj._fit(xy_train)

        return reg_obj

    def get_pool_executer(self):
        """Return the executor class selected by ``self.executor``.

        Raises:
            ValueError: If ``self.executor`` is neither "threadpool" nor "processpool".
        """
        if self.executor == "threadpool":
            return ThreadPoolExecutor
        if self.executor == "processpool":
            return ProcessPoolExecutor
        raise ValueError(
            f"Unknown executor '{self.executor}'. Expected 'threadpool' or 'processpool'."
        )

    def train(self, train_data: pd.DataFrame, n_threads=3):
        """
        trains models for each day ahead quantile predictions and  relevant
        to the specified prediction_horizon value and the specific quantile
        levels.

        parameters:
            train_data = training data with the booking pace lags, stay date lags or
                    other features such as date features.
            n_threads = maximum number of pool workers when is_parallel is set.

        Returns: dict of fitted strategy objects keyed by forecast point
                 (also stored in self.models). Forecast points whose training
                 raised in a worker are reported and left out of the dict.
        """
        models: Dict[int, Any] = {}

        if self.is_parallel:
            # NOTE: with a process pool each task runs on a copy of this object, so
            # target_cols / feature_cols filled inside train_inner_model stay in the
            # worker; only the returned strategy object comes back to this process.
            with self.get_pool_executer()(max_workers=n_threads) as pool:
                future_to_point = {
                    pool.submit(
                        self.train_inner_model, train_data, forecast_point
                    ): forecast_point
                    for forecast_point in self.forecast_points
                }

                for future in as_completed(future_to_point):
                    forecast_point = future_to_point[future]
                    try:
                        models[forecast_point] = future.result()
                    except Exception as exc:
                        # Best effort: keep training the remaining forecast points,
                        # but say which one failed.
                        print(
                            f"\tTraining failed for forecast point {forecast_point}: {exc!r}"
                        )
        else:
            for forecast_point in self.forecast_points:
                models[forecast_point] = self.train_inner_model(
                    train_data, forecast_point
                )

        self.models = models

        # Must run before the return; it was previously placed after it and never executed.
        if self.do_save_models:
            self.log_models()

        return self.models

    def predict(
        self,
        test_data: pd.DataFrame,
    ) -> Tuple[
        List[npt.NDArray], List[pd.Series], List[npt.NDArray], List[npt.NDArray]
    ]:
        """Generate quantile predictions for the first row of ``test_data``.

        The forecast point is read from the first value of the
        ``forecast_point`` column and selects the fitted model to use; the
        actual value is read from the column at index
        ``forecast_point - prediction_horizon``.

        parameters:
            test_data = frame holding a ``forecast_point`` column, the feature
                        columns the model was trained on and the actual target column.

        Returns: ``(y_pred_lst, y_test_lst, y_upper_lst, y_lower_lst)``, each a
          one-element list. The lower/upper values come from the 0.1/0.9
          quantiles and fall back to the median when those levels are not modelled.
        """
        y_test_lst = []
        y_pred_lst = []
        y_upper_lst = []
        y_lower_lst = []
        forecast_point = test_data["forecast_point"].iloc[0]

        target_column = f"{self.target_prefix}{forecast_point-self.prediction_horizon}"
        predictor = self.models[forecast_point]

        x_test = test_data[predictor.sub_predictor["features"]]
        y_test_lst.append(test_data[target_column].iloc[0])

        # squeeze() turns the selected features into a Series for the strategy.
        y_pred_dct = predictor._predict(x_test.squeeze())

        if 0.1 in self.quantile_levels:
            y_lower_lst.append(y_pred_dct[0.1][0])
        else:
            y_lower_lst.append(y_pred_dct[0.5][0])

        if 0.5 in self.quantile_levels:
            y_pred_lst.append(y_pred_dct[0.5][0])

        if 0.9 in self.quantile_levels:
            y_upper_lst.append(y_pred_dct[0.9][0])
        else:
            y_upper_lst.append(y_pred_dct[0.5][0])

        return y_pred_lst, y_test_lst, y_upper_lst, y_lower_lst

In [0]:
from typing import Callable, List, Optional
from phgml.models.model_wrapper import ModelWrapper #, ModelWrapperFarField
# from phgml.models.model_strategy import StrategyLGBM, StrategyAG, StrategyLGBMFarField

def train_wrapper_farfield(
    target_type: str,
    ml_experiment_id: str,
    exclude_pms: bool,
    calc_uncertainty: bool,
    hotel_config_provider: FarfieldForecastingHotelConfigProvider,
    processing_timestamp: datetime,
    save_models: bool,
    save_metrics: bool,
    lag_numbers: List[int],
    forecast_points: List[int],
    model_name_prefix: Optional[str] = None,
    model_tags: Optional[dict] = None,
    is_parallel: bool = True,
    executor: str = "processpool"
) -> Callable:
    """
    Build the per-hotel training function used with a grouped `applyInPandas`.

    The returned function receives the rows of a single hotel, trains the
    far-field model on the older stay dates, predicts one row per forecast
    point on the held-out stay dates, reports/logs metrics and returns a
    one-row status DataFrame.

    input params:
        target_type (str) : target identifier, forwarded to the model wrapper
                            and used in the MLflow run name
        ml_experiment_id (str) : MLflow experiment the run is created in
        exclude_pms (bool) : forwarded to the model wrapper; also selects the
                             "PMS"/"NOPMS" part of the run name
        calc_uncertainty (bool) : accepted for interface compatibility; not
                                  used by this function
        hotel_config_provider : provider whose `get_config(hotel_id)` returns
                                the hotel configuration
        processing_timestamp (datetime) : timestamp written to the status row
                                          of a successful training
        save_models (bool) : forwarded to the model wrapper
        save_metrics (bool) : when True, metrics are logged through
                              `log_metrics_stays_farfield`
        lag_numbers (List[int]) : accepted for interface compatibility; not
                                  used by this function
        forecast_points (List[int]) : lead times (in days) to train/predict for
        model_name_prefix (Optional[str]) : forwarded to the model wrapper
        model_tags (Optional[dict]) : extra entries merged into the model
                                      metadata; None is treated as no tags
        is_parallel (bool) : forwarded to the model wrapper
        executor (str) : forwarded to the model wrapper

    output:
        (Callable) : function mapping one hotel's pandas DataFrame to a
                     one-row DataFrame with the columns HotelID, run_id,
                     model_version, timestamp, pms_sync_off, model_name,
                     status and message
    """

    def train_data_models(df):
        """Train, evaluate and report a single hotel; returns the status row."""
        # Built per call so the wrapper always receives a fresh list.
        static_cols_ = [
            "year",
            "quarter_of_year",
            "month_of_year",
            "week_of_year",
            "day_of_year",
            "month_of_quarter",
            "week_of_quarter",
            "day_of_quarter",
            "week_of_month",
            "day_of_month",
            "holiday",
            "day_of_week_0",
            "day_of_week_1",
            "day_of_week_2",
            "day_of_week_3",
            "day_of_week_4",
            "day_of_week_5",
            "day_of_week_6",
        ]

        logger = get_dbx_logger("PHGML")

        trainer = None
        hotel_id = df["HotelID"].iloc[0]
        hotel_config = hotel_config_provider.get_config(hotel_id)
        model_type = hotel_config.training_model_name

        max_lead = max(forecast_points)

        # The last `max_lead` days of stay dates are held out for evaluation.
        test_partition_end = df["_StayDates"].max()
        test_partition_start = test_partition_end - pd.Timedelta(max_lead, "D")

        metadata_dict = {
            "last_trained_date": str(test_partition_start),
            "booking_pace_start": hotel_config.booking_pace_start,
            "booking_pace_end": hotel_config.booking_pace_end,
        }
        # model_tags defaults to None; dict.update(None) would raise TypeError.
        metadata_dict.update(model_tags or {})

        logger.debug(f"{hotel_id}:Filter test data")
        dftest = df[
            (df.HotelID == hotel_id)
            & (df["_StayDates"] >= test_partition_start)
            & (df["_StayDates"] <= test_partition_end)
        ]

        logger.debug(f"{hotel_id}:Filter train data")
        dftrain = df[
            (df.HotelID == hotel_id) & (df["_StayDates"] < test_partition_start)
        ]

        model_version = 1
        model_name = None

        pms = "NOPMS" if exclude_pms else "PMS"

        with mlflow.start_run(
            experiment_id=ml_experiment_id,
            run_name=f"farfield-{model_type}-{target_type}-{pms}-{hotel_id}-{hotel_config.hotel_name}",
        ) as run:
            run_id = run.info.run_id

            if model_type == "LIGHTGBM":
                trainer = ModelWrapperFarField(
                    model_strategy=StrategyLGBMFarField,
                    is_auto_reg=False,
                    prediction_horizon=hotel_config.prediction_horizon,
                    hotel_id=hotel_config.hotel_id,
                    target_type=target_type,
                    exclude_pms=exclude_pms,
                    cd_axis_max_lags=hotel_config.booking_pace_end,
                    static_cols=static_cols_,
                    forecast_points=forecast_points,
                    save_models=save_models,
                    quantiles=[0.1, 0.5, 0.9],
                    mlflow_run_id=run_id,
                    model_name_prefix=model_name_prefix,
                    meta_data=metadata_dict,
                    is_parallel=is_parallel,
                    executor = executor
                )

            output_df = pd.DataFrame()
            try:
                # An unsupported model type must surface as an "incomplete"
                # status row for this hotel rather than an AttributeError on
                # None that aborts the whole grouped job.
                if trainer is None:
                    raise ValueError(
                        f"Unsupported training model '{model_type}' for hotel {hotel_id}"
                    )
                model_name = trainer.get_model_name()

                logger.info(f"{hotel_id}:Training model")
                trainer.train(dftrain)
                logger.info(f"{hotel_id}:Completed training")

                logger.info(f"{hotel_id}:Start prediction")

                result_df_list = []
                for forecast_point in forecast_points:
                    logger.debug(
                        f"{hotel_id}:Predicting for forecast point {forecast_point}"
                    )
                    stay_date_ = test_partition_start + timedelta(days=forecast_point)

                    # .assign returns a new frame, so the extra column is not
                    # written into a slice of dftest (SettingWithCopyWarning).
                    temp_df = dftest[
                        (dftest["HotelID"] == hotel_id)
                        & (dftest["_StayDates"] == stay_date_)
                        & (dftest["forecast_index"] == forecast_point)
                    ].assign(forecast_point=forecast_point)

                    if temp_df.empty:
                        # No test row for this forecast point: keep a zero
                        # placeholder so every forecast point has a result row.
                        y_pred_lst, y_test_lst, y_upper_lst, y_lower_lst = (
                            [0],
                            [0],
                            [0],
                            [0],
                        )
                    else:
                        y_pred_lst, y_test_lst, y_upper_lst, y_lower_lst = (
                            trainer.predict(temp_df)
                        )

                    temp_result = pd.DataFrame(
                        {
                            "prediction_points": [forecast_point],
                            "y_true": y_test_lst,
                            "y_pred": y_pred_lst,
                            "y_lower": y_lower_lst,
                            "y_upper": y_upper_lst,
                        }
                    )
                    temp_result["_StayDates"] = stay_date_
                    temp_result["HotelID"] = hotel_id

                    result_df_list.append(temp_result)

                if save_metrics:
                    log_metrics_stays_farfield(result_df_list, trainer)

                report_metrics_stays_farfield(result_df_list)

                # Drop the references to the partitions before building output.
                del dftrain
                del dftest

                output_df = pd.DataFrame(
                    {
                        "HotelID": [hotel_id],
                        "run_id": [run_id],
                        "model_version": [model_version],
                        "timestamp": [processing_timestamp],
                        "pms_sync_off": [exclude_pms],
                        "model_name": [model_name],
                        "status": "complete",
                        "message": f"Successfully trained {hotel_id}",
                    }
                )
            except Exception as e:
                # Failures are reported per hotel through the status row; the
                # sentinel timestamp marks the row as not trained.
                failed_df = pd.DataFrame(
                    {
                        "HotelID": [hotel_id],
                        "run_id": [run_id],
                        "model_version": [model_version],
                        "timestamp": [pd.Timestamp("1900-01-01")],
                        "pms_sync_off": [exclude_pms],
                        "model_name": [model_name],
                        "status": "incomplete",
                        "message": str(e),
                    }
                )
                return failed_df

            finally:
                if (model_type == "AUTOGLUON") and (trainer is not None):
                    trainer.clean()

        return output_df

    return train_data_models

In [0]:
# # Group the data by hotel id and execute the trainings in parallel
# logger.info("Starting parallel training old way")

# output_df_old = df.groupby("HotelID").applyInPandas(
#     train_wrapper_farfield(
#         target_type=params["TARGET_TYPE"],
#         ml_experiment_id=params["ML_EXPERIMENT_ID"],
#         exclude_pms=params["WITHOUT_PMS"],
#         calc_uncertainty=params["CALC_UNCERTAINTY"],
#         hotel_config_provider=forecasting_config_provider,
#         processing_timestamp=datetime.now(),
#         save_models=params["SAVE_MODEL"],
#         save_metrics=params["SAVE_METRICS"],
#         lag_numbers=[],
#         forecast_points=params["FORECAST_POINTS"],
#         # model_name_prefix=params["MODEL_NAME_PREFIX"],
#         model_tags=params["MODEL_TAGS_DCT"],
#         is_parallel=False,        
#     ),
#     schema,
# )

In [0]:
# display(output_df_old)

In [0]:
# Per-hotel far-field training: one grouped-map call per HotelID.
logger.info("Starting parallel training new")

# Group first, then build the per-hotel training function, then apply it.
_hotels_grouped = df.groupby("HotelID")

_train_one_hotel = train_wrapper_farfield(
    target_type=params["TARGET_TYPE"],
    ml_experiment_id=params["ML_EXPERIMENT_ID"],
    exclude_pms=params["WITHOUT_PMS"],
    calc_uncertainty=params["CALC_UNCERTAINTY"],
    hotel_config_provider=forecasting_config_provider,
    processing_timestamp=datetime.now(),
    save_models=params["SAVE_MODEL"],
    save_metrics=params["SAVE_METRICS"],
    lag_numbers=[],
    forecast_points=params["FORECAST_POINTS"],
    # model_name_prefix=params["MODEL_NAME_PREFIX"],
    model_tags=params["MODEL_TAGS_DCT"],
    is_parallel=True,
    executor="threadpool",
)

# `schema` describes the status row returned for every hotel.
output_df_new = _hotels_grouped.applyInPandas(_train_one_hotel, schema)

In [0]:
# Show the per-hotel training status rows (HotelID, run_id, model_name, status, message, ...).
# NOTE(review): if `df` is a Spark DataFrame, applyInPandas is lazy and this
# display call is what actually executes the training — confirm.
display(output_df_new)

HotelID,run_id,model_version,timestamp,pms_sync_off,model_name,status,message
71999,ad2fe266fd7e4541817567802589f2d7,1,2024-09-12T16:10:27.882+0000,False,71999_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 71999
55810,e46e123585aa44b8a236754f0f795998,1,2024-09-12T16:10:27.882+0000,False,55810_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 55810
10443,8784a59c09364c498e6c91eaaa970c2c,1,2024-09-12T16:10:27.882+0000,False,10443_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 10443
63662,ec3b890e8da64899b7213002be5abac4,1,2024-09-12T16:10:27.882+0000,False,63662_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 63662
1406,fb6a1d380e3347c180027240d6a8ca52,1,2024-09-12T16:10:27.882+0000,False,1406_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 1406
64942,9fce8b41db2e44e0bb48bf1967fc1226,1,2024-09-12T16:10:27.882+0000,False,64942_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 64942
26532,f56b5f8cd5724aa485f07f7081754849,1,2024-09-12T16:10:27.882+0000,False,26532_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 26532
26834,da624252db7c44d3a2fd4ba94c1f9ca1,1,2024-09-12T16:10:27.882+0000,False,26834_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 26834


In [0]:
# output_df = output_df.toPandas()

In [0]:
# for index, row in output_df.iterrows():
#     if row.status == "complete":
#         logger.info(f"{row.message}")
#     else:
#         logger.error(
#             f"Error encountered when training hotel {row.HotelID}: {row.message}"
#         )

In [0]:
# client = MlflowClient()
# completed = output_df[output_df["status"]=="complete"]

# outputs_list = []
# for n,g in completed.groupby(["HotelID","model_name"]):
#     hotel_id = n[0]
#     model_name = n[1]
#     hotel_config = forecasting_config_provider.get_config(hotel_id)

#     mv = client.get_latest_versions(name=model_name)[0]
#     print(mv)
#     arts = client.list_artifacts(mv.run_id,path=f"forecasting/{hotel_id}/models/{model_name}/artifacts")
    
#     outputs_list.append({"hotel_id":hotel_id,
#                          "model_name":model_name,
#                          "creation_time":datetime.fromtimestamp(mv.creation_timestamp/1e3),
#                          "last_update":datetime.fromtimestamp(mv.last_updated_timestamp/1e3),
#                          "version":mv.version,
#                          "target":params["TARGET_TYPE"],
#                          "exclude_pms":params["WITHOUT_PMS"],
#                          "config_train_length":hotel_config.booking_pace_start,
#                          "config_infer_length":hotel_config.booking_pace_end,
#                          "num_model_steps":len(arts)-1})
    
#     print(f"Hotel: {hotel_id} target_type:{params['TARGET_TYPE']} exclude_pms:{params['WITHOUT_PMS']} : {len(arts)-1}")

# completed_df = pd.DataFrame(outputs_list)

In [0]:
# with ProcessPoolExecutor(max_workers=5) as executor:
#     futures = [executor.submit(
#         testfunction,
#         i) for i in [1,2,3,4,5,6,7,8,9,10]]

#     # {future: forecast_point}
#     for future in futures:
#         print(future.result()[0],future.result()[1])
#         # self.models[future.result().day_ahead] = future.result()

In [0]:
# import time

# def testfunction(number:int):
#     result_ = number*10
#     time.sleep(6)
#     return (number, result_)

# with ProcessPoolExecutor(max_workers=5) as executor:
#     future_to_target = {
#         executor.submit(
#             testfunction,
#             i): i for i in [1,2,3,4,5,6,7,8,9,10]
#     }

#     # print(future_to_target)

#     models = {}
#     for task in as_completed(future_to_target): 
#         i = future_to_target[task]
#         try:
#             index, resutl = task.result()
#             # resutl = task.result()[1]
#         except Exception as exc:
#             print(exc)
#         else:
#             models[index] = resutl

#     print(models)