In [0]:
# Notebook parameters exposed as Databricks widgets.
# Their values are read back further down with getArgument().
dbutils.widgets.dropdown("env_stage", "dev", ["dev", "prod"], "Pipeline stage")
dbutils.widgets.dropdown("exclude_pms", "False", ["True", "False"], "Exclude PMS")
dbutils.widgets.dropdown("target_type", "REVENUE", ["REVENUE", "ROOMS"], "Target Type")
dbutils.widgets.dropdown("is_usd_currency", "True", ["True", "False"], "Use USD currency")
# Comma-separated integers; parsed into LAG_NUMBERS in the constants cell.
dbutils.widgets.text("lag_numbers","1,7,14,28", "Lag Numbers")
# Optional "key1:value1,key2:value2" pairs; validated by get_model_tags().
dbutils.widgets.text("model_tags","", "Model Tags")

In [0]:
%pip install mlflow==2.2.2

In [0]:
%load_ext autoreload

%autoreload 2

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import *
import datetime
from pathlib import Path
import pickle
import os
from sys import version_info
import cloudpickle
import mlflow
import mlflow.pyfunc
import logging
import warnings
from mlflow import MlflowException
from mlflow.client import MlflowClient
import time
import datetime
import re
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed

In [0]:
# sys.path.append(os.path.abspath('/Workspace/Repos/manik@surge.global/phg-data-mlsys/src'))
# Silence every Python warning for the remainder of the notebook run.
warnings.filterwarnings("ignore")
# start_time = time.perf_counter()

In [0]:
# The imports cell only does `from sys import version_info`, so the `sys`
# module itself must be imported before `sys.path` can be touched below.
import sys

# Pipeline stage selected through the "env_stage" widget.
ENV = getArgument("env_stage")

# Workspace checkout of the phgml sources that development runs import from.
REPOPATH = "/Workspace/Repos/manik@surge.global/phg-data-mlsys/src"

cluster_name = spark.conf.get("spark.databricks.clusterUsageTags.clusterName")

# Only a dev run on a cluster whose name contains "dev" loads phgml from the
# repo checkout; every other combination uses whatever phgml is installed.
if (ENV == "dev") and ("dev" in cluster_name):
    print(f"Loading phgml package from repo {REPOPATH}")
    repo_src_path = os.path.abspath(REPOPATH)
    # Keep the cell idempotent: re-running it must not stack duplicate entries.
    if repo_src_path not in sys.path:
        sys.path.append(repo_src_path)

In [0]:
from phgml.models.xgboost_model import XGBMultiStepPredictor
from phgml.models.autogluon_model import AutoGluonModel, AGMlflowModel
from phgml.models.lightgbm_model import LightGBMModel, LGBMMlflowModel
# from phgml.pipeline.training import train_wrapper
from phgml.data.processing_distr_ca import (
    filter_train_data,
    filter_test_data,
    remove_padded_cols,
)
from phgml.reporting.output_metrics import *
from phgml.data.data_types import (
    revenue_preprocessed_schema,
    rooms_preprocessed_schema,
    training_output_schema,
)
from phgml.reporting.logging import get_logging_path, get_logging_filename, get_dbx_logger
from phgml.reporting.report_results import get_output_df, correct_prediction_list
from phgml.data.config import EnvironmentConfig, ForecastingHotelConfigProvider

In [0]:
# Disable adaptive query execution (AQE).
# AQE groups smaller tasks together into larger tasks.
# This may limit parallelism if the query optimizer deems the parallel tasks to be too small.
# We are disabling AQE here to circumvent this limitation on parallelism.
spark.conf.set("spark.sql.adaptive.enabled", "false")

In [0]:
def str_to_lst(value):
    """Turn a comma-separated string into a list of its raw pieces.

    An empty string gives an empty list, a string without a comma gives a
    one-element list, and anything else is split on ``","``. The pieces are
    returned untouched (no whitespace stripping, no type conversion).
    """
    if value == "":
        return []

    return value.split(",") if "," in value else [value]


def str_to_bool(value):
    """Interpret a textual flag as a boolean.

    The value is converted with ``str()`` and compared case-insensitively
    against the accepted spellings. "false"/"no"/"0" give ``False`` and
    "true"/"yes"/"1" give ``True``.

    Raises:
        Exception: if the value is none of the accepted spellings.
    """
    falsy_spellings = ["false", "no", "0"]
    truthy_spellings = ["true", "yes", "1"]
    normalised = str(value).lower()

    for spellings, outcome in ((falsy_spellings, False), (truthy_spellings, True)):
        if normalised in spellings:
            return outcome

    accepted = falsy_spellings + truthy_spellings
    raise Exception(
        "String value should be one of {}, but got '{}'.".format(accepted, value)
    )

def get_model_tags(model_tags_str):
    """Validate the model-tag widget text and convert it into a dict.

    The expected format is ``key1:value1,key2:value2``, where keys and values
    consist of word characters only. Whitespace is ignored, and empty items
    (for example a trailing comma) are skipped.

    Each comma-separated item is validated on its own. Earlier code removed
    matched pairs from the whole string with ``str.replace``, which corrupted
    later pairs that contained an earlier one as a substring (``a:b,ca:bd``)
    and rejected valid input.

    Args:
        model_tags_str: Raw text entered in the "model_tags" widget.

    Returns:
        dict mapping each key to its value; empty when no pairs are given.
        If a key is repeated, the last value wins.

    Raises:
        ValueError: if the text contains characters other than colon, comma,
            word characters and whitespace, or if any item is not exactly one
            ``key:value`` pair.
    """
    valid_pattern = r'\w+:\w+'
    invalid_pattern = r'[^:,\w\s]'
    # The error message promises that whitespace is allowed, so drop every
    # whitespace character (not just plain spaces) before validating.
    str_cpy = re.sub(r'\s+', '', model_tags_str)

    if re.search(pattern=invalid_pattern, string=str_cpy) is not None:
        raise ValueError('''Unwanted characters detected. Allowed characters are colon(:), comma(,), word characters and white space characters
                            Please specify key values pairs as key1:value1,key2:value2
                        ''')

    model_tags = {}
    for pair in str_cpy.split(','):
        if pair == '':
            # Tolerate empty items such as "a:b," or "a:b,,c:d".
            continue

        if re.fullmatch(pattern=valid_pattern, string=pair) is None:
            raise ValueError('''unmatched string components detected. please check the specified string
                             Please specify key values pairs as key1:value1,key2:value2
                             ''')

        key, value = pair.split(':')
        model_tags[key] = value

    return model_tags

In [0]:
# Root folder for extraction logs (not referenced again in the visible part of this notebook).
log_root = "/dbfs/mnt/extractionlogs/synxis"
# Captured once here and later passed to train_wrapper() as the timestamp of successful result rows.
processing_timestamp = datetime.datetime.now()

In [0]:
# NOTE(review): REVENUE_COL, ROOMS_COL, MODEL_START_DATE, the COVID dates,
# MODEL_TYPE, LEAD_WINDOW, PREDICTION_HORIZON, lead_window_start_days,
# lead_window_end_days and prediction_horizon are not referenced anywhere
# else in the visible part of this notebook; confirm before relying on them.
REVENUE_COL = "_reservationRevenuePerRoomUSD"
ROOMS_COL = "rooms"
# Pipeline name passed to get_dbx_logger() when the notebook logger is created.
PIPELINE = "TRAINING"

# Widget values converted from their string form.
WITHOUT_PMS = str_to_bool(getArgument("exclude_pms"))
IS_USD_CURRENCY = str_to_bool(getArgument("is_usd_currency"))
TARGET_TYPE = getArgument("target_type")

# Extra tags supplied through the "model_tags" widget; merged into each model's metadata by train_wrapper().
MODEL_TAGS_DCT = get_model_tags(getArgument("model_tags"))
print('model tags dict: ',MODEL_TAGS_DCT)

### The start of the model data
MODEL_START_DATE = pd.to_datetime("2018-10-01")

COVID_START_DATE = pd.to_datetime("2020-03-01")
COVID_END_DATE = pd.to_datetime("2021-08-01")

# Forwarded to train_wrapper() as `calc_uncertainty`.
CALC_UNCERTAINTY = True
# MODEL_TYPE = "XGB"  # Use "AG" to try out the auto gloun approach
# NOTE(review): train_wrapper() takes the model type from the per-hotel config, not from this constant.
MODEL_TYPE = "AG"

LEAD_WINDOW = 60
PREDICTION_HORIZON = 30

# MLflow experiment that the per-hotel training runs are created under.
ML_EXPERIMENT_ID = 609933091443417

lead_window_start_days = 14
lead_window_end_days = 60
prediction_horizon = 14
# "lag_numbers" widget text ("1,7,14,28" by default) parsed into a list of ints.
LAG_NUMBERS = list(map(int,str_to_lst(getArgument('lag_numbers'))))


# Both flags are forwarded to train_wrapper(); with False the run neither
# registers models nor logs metrics.
SAVE_MODEL = False
SAVE_METRICS = False

In [0]:
# Config data relevant to this pipeline
# Config data relevant to this pipeline
env_config = EnvironmentConfig(env=ENV, target=TARGET_TYPE, spark=spark, is_usd_currency=IS_USD_CURRENCY)
# Provides the per-hotel configuration that train_wrapper() looks up by hotel id.
forecasting_config_provider = ForecastingHotelConfigProvider(spark=spark,env=ENV)
# Only used in the log line below within the visible notebook.
target_column = env_config.target_column
# Output schema handed to applyInPandas() for the per-hotel result rows.
schema = training_output_schema

In [0]:
# Notebook-level logger, parameterised with the pipeline name, target type and PMS flag.
logger= get_dbx_logger(
    pipeline=PIPELINE,
    task_type=TARGET_TYPE,
    exclude_pms=WITHOUT_PMS,
)

In [0]:
# Record the run parameters at the start of the log.
logger.info(f"Processing data for target type: {TARGET_TYPE} : {target_column}")
logger.info(f"Excluding PMS data? {WITHOUT_PMS}")

In [0]:
#Removed forecast training wrapper function

In [0]:
# Load the preprocessed training table as a Spark DataFrame.
# NOTE(review): the table name is hard-coded and does not depend on ENV, TARGET_TYPE or WITHOUT_PMS — confirm this is intended.
logger.info(f"Loading data from testing_data.pp_ff_preprocess_rv")
df = spark.sql(f"select * from testing_data.pp_ff_preprocess_rv")

In [0]:
# Abort early when there is nothing to train on.
# head(1) fetches at most one row, whereas count() scans the whole table
# just to answer "is it empty?".
if not df.head(1):
    logger.error("The loaded training dataset is empty.")
    logger.info("Terminting the pipeline execution")
    raise Exception("The loaded training dataset is empty.")

In [0]:
from phgml.models.base_model import BaseModel
from phgml.models.model_strategy import BaseStrategy
from phgml.models.model_strategy import StrategyLGBM, StrategyAG, StrategyLGBMFarField
from mlflow import MlflowClient
import mlflow
import pandas as pd
import numpy as np
import cloudpickle
from sys import version_info
import pickle
import os
import shutil
from typing import Optional, Tuple, Union, List, Dict, Any, Callable
import numpy.typing as npt
import re

# NOTE(review): only ModelWrapper is defined in the visible part of this notebook;
# ModelWrapperMlflowModel and ModelWrapperFarField must come from elsewhere — confirm.
__all__ = ["ModelWrapper", "ModelWrapperMlflowModel", "ModelWrapperFarField"]

# Version of the interpreter running this notebook, formatted as "major.minor.micro".
PYTHON_VERSION = "{major}.{minor}.{micro}".format(
    major=version_info.major, minor=version_info.minor, micro=version_info.micro
)

# Conda environment specification passed to mlflow.pyfunc.log_model() in
# ModelWrapper.log_models(). Python and cloudpickle are pinned to the versions
# in use here; mlflow and lightgbm are left unpinned.
conda_env = {
    "channels": ["defaults"],
    "dependencies": [
        "python={}".format(PYTHON_VERSION),
        "pip",
        {
            "pip": [
                "mlflow",
                "lightgbm",
                "cloudpickle=={}".format(cloudpickle.__version__),
            ],
        },
    ],
    "name": "model_wrapper_env",
}


class ModelWrapper(BaseModel):
    """Wraps a model strategy to generate predictions in a time-series format.

    One strategy object is fitted for every ``day_ahead`` in
    ``1..prediction_horizon`` and stored in ``self.models``. The feature and
    target columns chosen for each horizon are cached in ``self.feature_cols``
    and ``self.target_cols`` so that ``predict`` uses the same columns that
    training did.

    NOTE(review): attributes such as ``models``, ``quantile_levels``,
    ``lag_numbers``, ``target_prefix``, ``local_root``, ``local_dir``,
    ``local_path``, ``artifacts`` and ``meta_data``, and the ``get_model_*``
    helpers, are not defined in this class; they are presumably provided by
    ``BaseModel`` — confirm.
    """

    def __init__(
        self,
        cd_axis_max_lags: int,
        static_cols: List[str],
        model_strategy: BaseStrategy,
        is_auto_reg: bool = False,
        is_ca3_training: bool = True,
        prediction_horizon: int = 28,
        lag_numbers: Optional[List[int]] = None,
        quantiles: Optional[List[float]] = None,
        mlflow_run_id: Optional[str] = None,
        hotel_id: Optional[str] = None,
        version: Optional[Union[str, int]] = None,
        stage: Optional[str] = None,
        target_type: str = "REVENUE",
        exclude_pms: bool = False,
        save_models: bool = True,
        local_root_dir: Optional[str] = None,
        model_type: str = "MODELWRAPPER",
        model_name_prefix: Optional[str] = None,
        meta_data: Optional[Dict[str, Any]] = None,
        n_cd_lags: Optional[int] = None,
        **kwargs,
    ):
        """Configure the wrapper; no training happens here.

        Args:
            cd_axis_max_lags: Highest index of the ``{target_prefix}{i}``
                columns; indices ``0..cd_axis_max_lags`` are used.
            static_cols: Feature columns that are added for every horizon.
            model_strategy: Strategy class that is instantiated once per
                ``day_ahead`` during training.
            is_auto_reg: Forwarded to the strategy constructor.
            is_ca3_training: When True, the training rows for a horizon are
                restricted to ``forecast_index == day_ahead - 1``.
            prediction_horizon: Number of day-ahead horizons to train/predict.
            lag_numbers: Stay-date lag numbers; defaults to ``[1, 7, 14, 28]``.
            quantiles: Quantile levels; defaults to ``[0.5]`` and must
                contain 0.5.
            meta_data: Tags attached to the registered model; defaults to an
                empty dict.
            n_cd_lags: Optional cap on the number of ``cd`` lag columns used
                as features.
            model_type: Accepted for interface compatibility; the effective
                model type is always taken from ``model_strategy``.
            mlflow_run_id, hotel_id, version, stage, target_type, exclude_pms,
            save_models, local_root_dir, model_name_prefix, **kwargs:
                Forwarded unchanged to ``BaseModel.__init__``.

        Raises:
            ValueError: if 0.5 is not among the quantile levels.
        """
        # Copy the list/dict arguments. This avoids the shared-mutable-default
        # pitfall and keeps the in-place sort below from reordering a list
        # owned by the caller.
        lag_numbers = [1, 7, 14, 28] if lag_numbers is None else list(lag_numbers)
        quantiles = [0.5] if quantiles is None else list(quantiles)
        meta_data = {} if meta_data is None else dict(meta_data)

        super().__init__(
            model_type=model_strategy.model_type,
            prediction_horizon=prediction_horizon,
            lag_numbers=lag_numbers,
            quantiles=quantiles,
            mlflow_run_id=mlflow_run_id,
            hotel_id=hotel_id,
            version=version,
            stage=stage,
            target_type=target_type,
            exclude_pms=exclude_pms,
            save_models=save_models,
            local_root_dir=local_root_dir,
            model_name_prefix=model_name_prefix,
            meta_data=meta_data,
            **kwargs,
        )
        self.quantile_levels.sort()
        self.cd_axis_max_lags = cd_axis_max_lags
        self.sd_axis_lag_prefix = "lag"
        self.static_cols = static_cols
        self.n_cd_lags = n_cd_lags
        self.target_suffix = "_tgt"
        self.is_auto_reg = is_auto_reg

        self.model_strategy = model_strategy
        self.model_type = self.model_strategy.model_type
        self.all_cd_cols = [
            f"{self.target_prefix}{i}" for i in range(self.cd_axis_max_lags + 1)
        ]
        self.is_ca3_training = is_ca3_training

        # Target columns per day_ahead, filled in by get_filtered_data().
        self.target_cols: Dict[int, List[str]] = {}

        # Feature columns per day_ahead, filled in by get_filtered_data().
        self.feature_cols: Dict[int, List[str]] = {}

        self.envs = ["dev", "qa", "prod"]

        if 0.5 not in self.quantile_levels:
            raise ValueError(
                "median quantile (0.5) is not included in the quantile_levels. please ensure that its included"
            )

    def save_model(self) -> None:
        """Pickle this wrapper into the local directory.

        Any existing ``local_root`` is removed first; the pickle is written to
        ``local_path`` and is later logged to MLflow as an artifact.
        """
        if os.path.exists(self.local_root):
            self.clean()

        os.makedirs(self.local_dir)
        with open(self.local_path, "wb") as pkl_file:
            pickle.dump(obj=self, file=pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

    def change_current_env_tags(self, incoming_tags: Dict[str, str]):
        """Clear ``model_stage_<env>`` tags that the incoming model takes over.

        For every ``model_stage_<env>`` key present in ``incoming_tags`` with
        the value "yes", each already registered version of this model that
        carries the same tag with the value "yes" has the tag set to "no".

        Args:
            incoming_tags: Tags about to be attached to the new model version.
        """
        env_model_tag_keys = {f"model_stage_{env}" for env in self.envs}
        tags_detected = env_model_tag_keys.intersection(incoming_tags.keys())

        if len(tags_detected) == 0:
            return

        client = MlflowClient()
        model_name = self.get_model_name()
        all_registered_models_info = client.search_model_versions(
            f"name ='{model_name}'"
        )
        # Visit the registered versions from the newest to the oldest.
        sorted_model_versions = sorted(
            all_registered_models_info, key=lambda x: int(x.version), reverse=True
        )

        for version_meta in sorted_model_versions:
            for env_tag in tags_detected:
                if (incoming_tags[env_tag] == "yes") and (
                    version_meta.tags.get(env_tag) == "yes"
                ):
                    client.set_model_version_tag(
                        name=model_name,
                        version=str(version_meta.version),
                        key=env_tag,
                        value="no",
                    )

    def log_models(self) -> None:
        """Carries out the mlflow model registry procedures.

        Saves the wrapper locally, logs it as a pyfunc model, lower-cases the
        environment-related metadata, resets conflicting stage tags on older
        versions and finally registers the model with the metadata as tags.
        """
        print("Starting model logging")
        self.save_model()

        print("Logging model")
        # NOTE(review): ModelWrapperMlflowModel is not defined in the visible
        # part of this notebook; it must be in scope before this is called.
        mlflow.pyfunc.log_model(
            artifact_path=self.get_model_log_path(),
            python_model=ModelWrapperMlflowModel(),
            artifacts=self.artifacts,
            conda_env=conda_env,
        )

        # Enforce lower case for keys (and string values) whose key mentions
        # one of the environment names.
        env_pattern = f"({'|'.join(self.envs)})"
        decap_meta_data = {}
        for key, value in self.meta_data.items():
            if re.search(pattern=env_pattern, string=key) is not None:
                decap_meta_data[key.lower()] = (
                    value.lower() if isinstance(value, str) else value
                )
            else:
                decap_meta_data[key] = value

        self.meta_data = decap_meta_data

        self.change_current_env_tags(self.meta_data)

        print("Registering model")
        mlflow.register_model(
            self.get_model_register_path(),
            self.get_model_name(),
            tags=self.meta_data,
        )

    def clean(self) -> None:
        """Delete the local root directory and everything below it, if present."""
        if os.path.exists(self.local_root):
            shutil.rmtree(self.local_root)

    def load_pyfunc_model(
        self, dst_path: Optional[str] = None, tag: Optional[Union[str, int]] = None
    ) -> mlflow.pyfunc.PyFuncModel:
        """Load and return the pyfunc model from the MLFlow model repository

        Args:
            dst_path (str, optional): Destination path to save the loaded model.
                                      If not provided the files will be saved in the local_root path.
                                      Defaults to None.
            tag (str, optional): Tag to specify the version or model stage to be loaded.
                                 If not provided the latest model version will be loaded.
                                Defaults to None.

        Returns:
            mlflow.pyfunc.model : pyfunc model
        """
        print(f"Loading model {self.get_model_uri()}")

        if dst_path is not None:
            self.local_root = dst_path

        # Always start from an empty destination directory.
        if os.path.exists(self.local_root):
            self.clean()

        # makedirs (rather than mkdir) also works when the parent of
        # local_root does not exist yet.
        os.makedirs(self.local_root, exist_ok=True)

        model = mlflow.pyfunc.load_model(
            self.get_model_uri(tag=tag), dst_path=self.local_root
        )

        self.run_id = model._model_meta.run_id

        # TODO make sure to remove the run id from the local_dir and include either or both task_type/exclude_pms
        # TODO make sure to set the local_dir consistently for both training and inference tasks

        return model

    def get_filtered_data(
        self, data: pd.DataFrame, day_ahead: int
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Select the rows and columns used for one ``day_ahead`` horizon.

        As a side effect, the chosen target and feature columns are stored in
        ``self.target_cols[day_ahead]`` and ``self.feature_cols[day_ahead]`` so
        that they can be retrieved in the inference phase.

        Args:
            data: Frame holding the ``cd`` lag columns, their ``_tgt``
                counterparts, the ``lag<n>`` columns, the static columns and,
                when ``is_ca3_training`` is set, ``forecast_index``.
            day_ahead: Horizon (1-based) to build the data for.

        Returns:
            Tuple ``(x_data, y_data, xy_data)``: the features, the targets in
            reversed column order, and the targets in original order followed
            by the features.
        """
        # Targets are the first `day_ahead` cd columns with the target suffix.
        target_columns_orig = [
            target + self.target_suffix for target in self.all_cd_cols[:day_ahead]
        ]
        target_columns = target_columns_orig[::-1]

        cd_axis_lag_columns = self.all_cd_cols[day_ahead:]
        if self.n_cd_lags is not None:
            cd_axis_lag_columns = cd_axis_lag_columns[: self.n_cd_lags]

        # Only stay-date lags strictly larger than the horizon are used.
        sd_axis_lag_columns = [
            f"{self.sd_axis_lag_prefix}{SD_lag}"
            for SD_lag in self.lag_numbers
            if SD_lag > day_ahead
        ]

        self.target_cols[day_ahead] = target_columns
        self.feature_cols[day_ahead] = (
            cd_axis_lag_columns + sd_axis_lag_columns + self.static_cols
        )

        if self.is_ca3_training:
            # Keep only the entry for this cancellation day index.
            condition = data["forecast_index"] == (day_ahead - 1)
            filt_data = data[condition].copy()
        else:
            filt_data = data.copy()

        x_data = filt_data[self.feature_cols[day_ahead]]
        y_data = filt_data[self.target_cols[day_ahead]]

        return (
            x_data,
            y_data,
            filt_data[target_columns_orig + self.feature_cols[day_ahead]],
        )

    def train_inner(self, train_data: pd.DataFrame, day_ahead: int):
        """Fit one strategy object for a single horizon.

        Args:
            train_data: Full training frame; filtered via get_filtered_data().
            day_ahead: Horizon (1-based) to train.

        Returns:
            Tuple ``(day_ahead, fitted_strategy_object)``.
        """
        _, _, xy_train = self.get_filtered_data(data=train_data, day_ahead=day_ahead)

        reg_obj = self.model_strategy(
            quantile_levels=self.quantile_levels,
            day_ahead=day_ahead,
            cd_axis_targets=self.target_cols[day_ahead],
            path=self.local_dir,
            is_auto_reg=self.is_auto_reg,
        )  # type: ignore
        reg_obj._fit(xy_train)

        return day_ahead, reg_obj

    def train(self, train_data: pd.DataFrame, n_threads: int) -> None:
        """Train one model per day-ahead horizon using a thread pool.

        Training is best-effort: if one horizon fails, the failure is reported
        together with the horizon it belongs to and the remaining horizons are
        still trained. A failed horizon has no entry in ``self.models``.

        parameters:
            train_data = training data with the booking pace lags, stay date lags or
                    other features such as date features.
            n_threads = maximum number of worker threads.

        Returns: None
        """
        with ThreadPoolExecutor(max_workers=n_threads) as executor:
            future_to_day = {
                executor.submit(self.train_inner, train_data, day_ahead): day_ahead
                for day_ahead in range(1, self.prediction_horizon + 1)
            }

            for future in as_completed(future_to_day):
                submitted_day = future_to_day[future]
                try:
                    day_ahead, reg_obj = future.result()
                except Exception as exc:
                    # Report which horizon failed; a bare exception text gives
                    # no way to tell the horizons apart.
                    print(f"Training failed for day ahead {submitted_day}: {exc!r}")
                else:
                    self.models[day_ahead] = reg_obj

    def predict(
        self, test_data: pd.DataFrame
    ) -> Dict[Union[str, float], Union[List[npt.NDArray], List[pd.Series]]]:
        """Generate quantile predictions for the test data provided.

        For every ``day_ahead`` in ``1..prediction_horizon``, the first row of
        ``test_data`` with that ``day_ahead`` value is used. Horizons without
        such a row are reported and skipped.

        parameters:
            test_data = test data with a ``day_ahead`` column plus the feature
                        and target columns recorded during training.

        Returns: dict with the key "y_test" (list of observed target rows) and
          one key per quantile level (list of the corresponding predictions),
          each list holding one entry per predicted horizon.

        Raises:
            KeyError: if a horizon has test data but no trained model.
        """
        output_pred: Dict[
            Union[str, float], Union[List[npt.NDArray], List[pd.Series]]
        ] = {}

        for day_ahead in range(1, self.prediction_horizon + 1):
            try:
                needed_test_data = test_data[test_data.day_ahead == day_ahead].iloc[0]
            except IndexError as e:
                days_str = "days" if day_ahead > 1 else "day"

                print(f"Error when predicting {day_ahead} {days_str} ahead")
                print(f"Encountered error {e}")
                print("Skipping this row")
                continue

            if day_ahead not in self.models:
                # Still a KeyError, but with a message that explains the cause.
                raise KeyError(
                    f"No trained model available for day ahead {day_ahead}; "
                    "training for this horizon failed or was never run"
                )

            x_test = needed_test_data[self.feature_cols[day_ahead]]
            y_test = needed_test_data[self.target_cols[day_ahead]]

            y_pred_dct = self.models[day_ahead]._predict(x_test)

            output_pred.setdefault("y_test", []).append(y_test)
            for qtile in self.quantile_levels:
                output_pred.setdefault(qtile, []).append(y_pred_dct[qtile][0])

        return output_pred

In [0]:
def train_wrapper(
    target_type: str,
    ml_experiment_id: Union[str, int],
    exclude_pms: bool,
    calc_uncertainty: bool,
    hotel_config_provider: ForecastingHotelConfigProvider,
    processing_timestamp: datetime.datetime,
    save_models: bool,
    save_metrics: bool,
    lag_numbers: List[int],
    model_tags: Optional[dict] = None,
    n_threads: int = 4,
) -> Callable:
    """Build the per-hotel training function used with ``applyInPandas``.

    Args:
        target_type: "REVENUE" or "ROOMS"; selects the column prefix handed to
            ``remove_padded_cols``.
        ml_experiment_id: MLflow experiment the per-hotel runs are created in.
        exclude_pms: Recorded in the run name, the model and the result row.
        calc_uncertainty: Accepted for interface compatibility; the returned
            result row does not depend on it.
        hotel_config_provider: Source of the per-hotel configuration.
        processing_timestamp: Timestamp written to successful result rows.
        save_models: Forwarded to ``ModelWrapper``.
        save_metrics: When True, ``log_metrics_stays`` is called after
            prediction.
        lag_numbers: Stay-date lag numbers forwarded to ``ModelWrapper``. An
            empty list leaves the ``ModelWrapper`` default in place.
        model_tags: Extra metadata merged into the model tags; ``None`` is
            treated as no extra tags.
        n_threads: Number of threads used by ``ModelWrapper.train``.

    Returns:
        A function that takes the pandas frame of one hotel and returns a
        one-row frame describing the training outcome.

    Raises:
        ValueError: if ``target_type`` is not supported (raised here, before
            any Spark work starts) or, inside the returned function, if the
            hotel is configured with an unsupported model type.
    """
    col_prefixes = {"REVENUE": "RV", "ROOMS": "RM"}
    if target_type not in col_prefixes:
        # Fail fast rather than hitting an unbound `col_prefix` on a worker.
        raise ValueError(
            f"Unsupported target_type '{target_type}'; expected one of {list(col_prefixes)}"
        )
    col_prefix = col_prefixes[target_type]

    # `model_tags` defaults to None, which dict.update() cannot consume.
    extra_tags = dict(model_tags) if model_tags else {}

    def train_data_models(df):
        static_cols_ = [
            "year",
            "quarter_of_year",
            "month_of_year",
            "week_of_year",
            "day_of_year",
            "month_of_quarter",
            "week_of_quarter",
            "day_of_quarter",
            "week_of_month",
            "day_of_month",
            "holiday",
        ] + [f"day_of_week_{weekday}" for weekday in range(7)]

        # Strategy class and strategy-specific ModelWrapper arguments for each
        # supported model type.
        strategies = {
            "AUTOGLUON": (StrategyAG, {"is_auto_reg": True}),
            "LIGHTGBM": (StrategyLGBM, {}),
        }

        logger = get_dbx_logger("PHGML")

        trainer = None
        hotel_id = df["HotelID"].iloc[0]
        hotel_config = hotel_config_provider.get_config(hotel_id)
        model_type = hotel_config.training_model_name

        if model_type not in strategies:
            # Otherwise `trainer` stays None and the failure shows up as an
            # unrelated AttributeError.
            raise ValueError(
                f"{hotel_id}: unsupported training model '{model_type}'; "
                f"expected one of {list(strategies)}"
            )
        model_strategy, strategy_kwargs = strategies[model_type]

        max_lead_window = 100

        df = remove_padded_cols(
            df, hotel_config.lead_window, max_lead_window, col_prefix
        )

        # The last `training_length` days of stay dates form the test partition.
        test_partition_end = df["_StayDates"].max()
        test_partition_start = test_partition_end - pd.Timedelta(
            hotel_config.training_length, "D"
        )
        # NOTE(review): "last_trained_date" subtracts training_length a second
        # time from the test partition start — confirm this is intended.
        metadata_dict = {
            "last_trained_date": str(
                test_partition_start - pd.Timedelta(hotel_config.training_length, "D")
            ),
            "training_length": hotel_config.training_length,
            "inference_length": hotel_config.inference_length,
        }
        metadata_dict.update(extra_tags)

        logger.debug(f"{hotel_id}:Filter train data")
        dftrain = filter_train_data(df, test_partition_start)

        logger.debug(f"{hotel_id}:Filter test data")
        dftest = filter_test_data(
            df,
            test_partition_start=test_partition_start,
            test_partition_end=test_partition_end,
        )
        dftest = dftest.assign(
            day_ahead=(dftest["_StayDates"] - test_partition_start).dt.days
        )
        # Keep, for each stay date, only the row whose forecast index matches
        # its horizon.
        dftest = dftest[dftest.forecast_index == (dftest.day_ahead - 1)]

        model_version = 1

        pms = "NOPMS" if exclude_pms else "PMS"

        def result_frame(run_id, model_name, timestamp, status, message):
            # One-row frame describing the outcome for this hotel.
            return pd.DataFrame(
                {
                    "HotelID": [hotel_id],
                    "run_id": [run_id],
                    "model_version": [model_version],
                    "timestamp": [timestamp],
                    "pms_sync_off": [exclude_pms],
                    "model_name": [model_name],
                    "status": [status],
                    "message": [message],
                }
            )

        with mlflow.start_run(
            experiment_id=ml_experiment_id,
            run_name=f"{model_type}-{target_type}-{pms}-{hotel_id}-{hotel_config.hotel_name}",
        ) as run:
            run_id = run.info.run_id

            wrapper_kwargs = dict(
                model_strategy=model_strategy,
                prediction_horizon=hotel_config.training_length,
                mlflow_run_id=run_id,
                hotel_id=hotel_id,
                save_models=save_models,
                target_type=target_type,
                exclude_pms=exclude_pms,
                meta_data=metadata_dict,
                cd_axis_max_lags=99,
                static_cols=static_cols_,
                quantiles=[0.1, 0.5, 0.9],
            )
            wrapper_kwargs.update(strategy_kwargs)
            if lag_numbers:
                # Honour the configured lags; previously this argument was
                # accepted but never reached the model.
                wrapper_kwargs["lag_numbers"] = list(lag_numbers)

            trainer = ModelWrapper(**wrapper_kwargs)
            model_name = trainer.get_model_name()

            try:
                logger.info(f"{hotel_id}:Training model")
                trainer.train(dftrain, n_threads)

                logger.info(f"{hotel_id}:Completed training")
                logger.info(f"{hotel_id}:Start prediction")

                output_dct = trainer.predict(dftest)
                if "y_test" not in output_dct:
                    # predict() skipped every horizon; say so rather than
                    # failing below with "KeyError: 0.5".
                    raise ValueError(
                        f"{hotel_id}: no test rows matched any forecast horizon"
                    )

                y_pred_lst = output_dct[0.5]
                y_test_lst = output_dct["y_test"]

                # The last element along the booking axis is the value at the
                # stay date itself.
                predicted_stays = [i[-1] for i in y_pred_lst]
                observed_stays = [i.values[-1] for i in y_test_lst]

                if save_metrics:
                    log_metrics_stays(observed_stays, predicted_stays, trainer)

                report_metrics_stays(observed_stays, predicted_stays)

                # Release the partitions before building the result row.
                del dftrain
                del dftest

                return result_frame(
                    run_id,
                    model_name,
                    processing_timestamp,
                    "complete",
                    f"Successfully trained {hotel_id}",
                )
            except Exception as e:
                # Report the failure as a row so that one hotel cannot abort
                # the whole parallel job.
                return result_frame(
                    run_id,
                    model_name,
                    pd.Timestamp("1900-01-01"),
                    "incomplete",
                    str(e),
                )
            finally:
                if (model_type == "AUTOGLUON") and (trainer is not None):
                    trainer.clean()

    return train_data_models

In [0]:
# Display the save flags so the run output records whether models/metrics are persisted.
SAVE_MODEL, SAVE_METRICS

In [0]:
# Group the data by hotel id and execute the trainings in parallel
logger.info("Starting parallel training")

# applyInPandas is lazy: this only defines the job. The training itself runs
# when the result is collected in the next cell.
# `n_threads` is not passed, so train_wrapper's default (4) applies.
output_df = df.groupby("HotelID").applyInPandas(
    train_wrapper(
        target_type=TARGET_TYPE,
        ml_experiment_id=ML_EXPERIMENT_ID,
        exclude_pms=WITHOUT_PMS,
        calc_uncertainty=CALC_UNCERTAINTY,
        hotel_config_provider=forecasting_config_provider,
        processing_timestamp=processing_timestamp,
        save_models=SAVE_MODEL,
        save_metrics=SAVE_METRICS,
        lag_numbers=LAG_NUMBERS,
        model_tags=MODEL_TAGS_DCT
    ),
    schema,
)

In [0]:
# toPandas() triggers the lazily defined applyInPandas job, so the elapsed
# time measured here covers the per-hotel training plus collecting the results.
start_time = time.perf_counter()
# From here on output_df is a pandas DataFrame (it was a Spark DataFrame before).
output_df = output_df.toPandas()
elapsed_time = time.perf_counter() - start_time

print(f"Model training time: {elapsed_time}")

HotelID,run_id,model_version,timestamp,pms_sync_off,model_name,status,message
71999,3c5b82d9b6874242b8bb6cc0d148de9e,1,2024-10-01T18:28:26.994+0000,False,71999_REVENUE_PMS_LGBM_model,complete,Successfully trained 71999
10443,eb0231c82a7743a08ac2b8afa658ad5b,1,2024-10-01T18:28:26.994+0000,False,10443_REVENUE_PMS_LGBM_model,complete,Successfully trained 10443
63662,15dc3c1ae0584f56bbf9155a7a20692f,1,2024-10-01T18:28:26.994+0000,False,63662_REVENUE_PMS_LGBM_model,complete,Successfully trained 63662
1406,586e0310bc0444e7b4722eda9957e3a5,1,2024-10-01T18:28:26.994+0000,False,1406_REVENUE_PMS_LGBM_model,complete,Successfully trained 1406
64942,ce468a942e32478a89654b6f7cb0cdc9,1,2024-10-01T18:28:26.994+0000,False,64942_REVENUE_PMS_LGBM_model,complete,Successfully trained 64942


In [0]:
# Write one log line per hotel: info for completed trainings, error otherwise.
for _, result in output_df.iterrows():
    if result.status != "complete":
        logger.error(
            f"Error encountered when training hotel {result.HotelID}: {result.message}"
        )
        continue

    logger.info(f"{result.message}")

In [0]:
# Summarise, for every successfully trained hotel, what the MLflow registry
# currently holds for its model.
client = MlflowClient()
completed = output_df[output_df["status"]=="complete"]

outputs_list = []
for (hotel_id, model_name), _ in completed.groupby(["HotelID","model_name"]):
    hotel_config = forecasting_config_provider.get_config(hotel_id)

    # First entry returned by the registry for this model name.
    mv = client.get_latest_versions(name=model_name)[0]
    print(mv)
    arts = client.list_artifacts(mv.run_id,path=f"forecasting/{hotel_id}/models/{model_name}/artifacts")
    # One listed artifact is not counted as a model step.
    num_model_steps = len(arts)-1

    summary_row = {
        "hotel_id":hotel_id,
        "model_name":model_name,
        "creation_time":datetime.datetime.fromtimestamp(mv.creation_timestamp/1e3),
        "last_update":datetime.datetime.fromtimestamp(mv.last_updated_timestamp/1e3),
        "version":mv.version,
        "target":TARGET_TYPE,
        "exclude_pms":WITHOUT_PMS,
        "config_train_length":hotel_config.training_length,
        "config_infer_length":hotel_config.inference_length,
        "num_model_steps":num_model_steps,
    }
    outputs_list.append(summary_row)

    print(f"Hotel: {hotel_id} target_type:{TARGET_TYPE} exclude_pms:{WITHOUT_PMS} : {num_model_steps}")

completed_df = pd.DataFrame(outputs_list)

In [0]:
completed_df

In [0]:
logger.info("Model training completed.")

# elapsed_time = time.perf_counter() - start_time
# `elapsed_time` is the value measured around toPandas() above; it is in seconds (time.perf_counter).
logger.info(f"Time elapsed {elapsed_time}")
logger.info(f"Time elapsed in minutes {elapsed_time/60}")