In [0]:
# Notebook parameters, exposed as Databricks widgets. The values are read
# further down with getArgument(<widget name>) when `params` is populated.
# Deployment stage; the "*_synxis_2_0" stages also switch the log root directory.
dbutils.widgets.dropdown("env_stage", "dev", ["dev", "prod", "qa","dev_synxis_2_0", "prod_synxis_2_0", "qa_synxis_2_0"], "Pipeline stage")
# "True"/"False" strings; converted with str_to_bool when read.
dbutils.widgets.dropdown("exclude_pms", "False", ["True", "False"], "Exclude PMS")
dbutils.widgets.dropdown("target_type", "REVENUE", ["REVENUE", "ROOMS"], "Target Type")
dbutils.widgets.dropdown("is_usd_currency", "True", ["True", "False"], "Use USD currency")
# Comma-separated integers; each element is converted with int() when read.
dbutils.widgets.text("lag_numbers","1,7,14,28", "Lag Numbers")
# Free text handed to get_model_tags(); the expected format is defined there.
dbutils.widgets.text("model_tags","", "Model Tags")
# Converted with int() when read.
dbutils.widgets.text("thread_numbers","1", "Number of Threads")

In [0]:
%pip install mlflow==2.2.2
%load_ext autoreload
%autoreload 2

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import *
import datetime
from pathlib import Path
import pickle
import os
from sys import version_info
import cloudpickle
import mlflow
import mlflow.pyfunc
import logging
import warnings
from mlflow import MlflowException
from mlflow.client import MlflowClient
import time
import datetime
import re
import logging

In [0]:
# Suppress every Python warning from this point on.
warnings.filterwarnings("ignore")
# Reference point for the elapsed-time log lines emitted in the final cell.
start_time = time.perf_counter()

In [0]:
# from phgml.models.xgboost_model import XGBMultiStepPredictor
# from phgml.models.autogluon_model import AutoGluonModel, AGMlflowModel
# from phgml.models.lightgbm_model import LightGBMModel, LGBMMlflowModel
# from phgml.pipeline.training import train_wrapper
from phgml.data.processing_distr_ca import (
    filter_train_data,
    filter_test_data,
    remove_padded_cols,
)
from phgml.reporting.output_metrics import *
from phgml.data.data_types import (
    revenue_preprocessed_schema,
    rooms_preprocessed_schema,
    training_output_schema,
)
from phgml.reporting.logging import get_logging_path, get_logging_filename, get_dbx_logger
from phgml.reporting.report_results import get_output_df, correct_prediction_list
from phgml.data.config import EnvironmentConfig, ForecastingHotelConfigProvider 
from phgml.utilities.task_utilities import str_to_lst, str_to_bool, get_model_tags

from phgml.models.model_wrapper import ModelWrapper
from phgml.models.model_strategy import StrategyLGBM, StrategyAG
from pyspark.sql.types import  TimestampType, StringType, StructField, StructType, DoubleType, LongType, BooleanType, FloatType, IntegerType, DateType

In [0]:
# Run configuration for this notebook. Widget-driven values are read with
# getArgument(); everything else is a fixed constant of the training pipeline.
params = {}
params["ENV"] = getArgument("env_stage")
params["CLUSTER_NAME"] = spark.conf.get("spark.databricks.clusterUsageTags.clusterName")
params["REVENUE_COL"] = "_reservationRevenuePerRoomUSD"
params["ROOMS_COL"] = "rooms"
params["PIPELINE"] = "TRAINING"
params["WITHOUT_PMS"] = str_to_bool(getArgument("exclude_pms"))
params["IS_USD_CURRENCY"] = str_to_bool(getArgument("is_usd_currency"))
params["TARGET_TYPE"] = getArgument("target_type")
params["MODEL_TAGS_DCT"] = get_model_tags(getArgument("model_tags"))
print('model tags dict: ',params["MODEL_TAGS_DCT"])

# The start of the model data
params["MODEL_START_DATE"] = pd.to_datetime("2018-10-01")
params["COVID_START_DATE"] = pd.to_datetime("2020-03-01")
params["COVID_END_DATE"] = pd.to_datetime("2021-08-01")

params["CALC_UNCERTAINTY"] = True
params["LEAD_WINDOW"] = 60
params["PREDICTION_HORIZON"] = 30
params["ML_EXPERIMENT_ID"] = 2169257822521486
params["LAG_NUMBERS"] = list(map(int, str_to_lst(getArgument("lag_numbers"))))
params["SAVE_MODEL"] = True
params["SAVE_METRICS"] = True

# Parse the thread count once. An empty widget value falls back to 1 (the
# widget's own default) instead of raising ValueError from int("").
thread_numbers_arg = getArgument("thread_numbers").strip()
params["THREAD_NUMBERS"] = int(thread_numbers_arg) if thread_numbers_arg else 1

# The "*_synxis_2_0" stages write their logs to a separate directory.
params["LOG_ROOT"] = '/dbfs/mnt/extractionlogs/synxis'
if "synxis_2_0" in params["ENV"]:
    params["LOG_ROOT"] = '/dbfs/mnt/extractionlogs/synxis_2_0'

In [0]:
# Disable adaptive query execution (AQE).
# AQE groups smaller tasks together into larger tasks.
# This may result in limited parallelism if the parallel tasks are deemed to be too small by the query optimizer.
# We disable AQE here to circumvent this limitation on parallelism.
spark.conf.set("spark.sql.adaptive.enabled", "false")
# NOTE: this timestamp is assigned again in the logging-setup cell that follows.
processing_timestamp = datetime.datetime.now()

In [0]:
# Config data relevant to this pipeline
env_config = EnvironmentConfig(env=params["ENV"], target=params["TARGET_TYPE"], spark=spark, is_usd_currency=params["IS_USD_CURRENCY"])
forecasting_config_provider = ForecastingHotelConfigProvider(spark=spark,env=params["ENV"])
params["TARGET_COLUMN"] = env_config.target_column

# Start from a clean root logger so handlers installed by the runtime (or by an
# earlier run of this cell) do not interfere with the configuration below.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logging.root.setLevel(logging.INFO)

processing_timestamp = datetime.datetime.now()

# exist_ok avoids the check-then-create race when several jobs start at once.
logfile_path = get_logging_path(params["LOG_ROOT"], processing_timestamp)
os.makedirs(logfile_path, exist_ok=True)

pms = "NOPMS" if params["WITHOUT_PMS"] else "PMS"

log_file_name = get_logging_filename(
    logfile_path,
    params["PIPELINE"],
    params["TARGET_TYPE"],
    pms,
    processing_timestamp)

# NOTE(review): the logger name says "preprocess" although params["PIPELINE"] is
# "TRAINING" - looks like a copy/paste leftover; left unchanged because the name
# is written into every log line. Confirm before renaming.
logger = logging.getLogger(f"preprocess-{params['TARGET_TYPE']}-{pms}")

# logging.getLogger returns the same object for the same name, so re-running this
# cell would otherwise stack one more FileHandler per run and duplicate log lines.
for stale_handler in logger.handlers[:]:
    logger.removeHandler(stale_handler)
    stale_handler.close()

file_handler = logging.FileHandler(log_file_name)
file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_format)

logger.addHandler(file_handler)

In [0]:
# Same value that was stored in params["TARGET_COLUMN"] above.
target_column = env_config.target_column
# At this point `training_output_schema` is the object imported from
# phgml.data.data_types; the notebook defines its own schema under the same
# name in a later cell. `schema` is not referenced again in the cells shown here.
schema = training_output_schema

In [0]:
# Record the target and PMS settings of this run in the pipeline log.
logger.info(f"Processing data for target type: {params['TARGET_TYPE']} : {params['TARGET_COLUMN']}")
logger.info(f"Excluding PMS data? {params['WITHOUT_PMS']}")

In [0]:
# Load the preprocessed data as a Spark DataFrame. The table name comes from
# the environment config built earlier, not from user input.
print(f"Reading preprocessed data from table {env_config.preprocessed_data_table}")
logger.info(f"Loading data from {env_config.preprocessed_data_table}")
df = spark.sql(f"select * from {env_config.preprocessed_data_table}")

In [0]:
# Fail fast when there is nothing to train on. A single row is enough to decide
# emptiness, so avoid a full count of the table.
if df.limit(1).count() == 0:
    logger.error("The loaded training dataset is empty.")
    logger.info("Terminating the pipeline execution")
    raise Exception("The loaded training dataset is empty.")

In [0]:
# # # For testing purposes
# # # # d = d[d["HotelID"]=="71999"]

# # # CALC_UNCERTAINTY = True
# # # SAVE_MODEL = True

# fn = train_wrapper(
#         target_type=params["TARGET_TYPE"],
#         ml_experiment_id=params["ML_EXPERIMENT_ID"],
#         exclude_pms=params["WITHOUT_PMS"],
#         calc_uncertainty=params["CALC_UNCERTAINTY"],
#         hotel_config_provider=forecasting_config_provider,
#         processing_timestamp=processing_timestamp,
#         save_models=False,
#         save_metrics=False,
#         lag_numbers=params["LAG_NUMBERS"],
#         model_tags=params["MODEL_TAGS_DCT"],
#         n_threads=params["THREAD_NUMBERS"],
#     )

# df_pd = df.filter(df.HotelID=='27398').toPandas()
# output = fn(df_pd)

In [0]:
def train_wrapper(
    model_type,
    target_type,
    exclude_pms,
    calc_uncertainty,
    prediction_horizon,
    save_models,
    save_metrics,
    lag_numbers: List[int],
    n_threads: int = 1,
    meta_data=None,
) -> Callable:
    """Build the per-hotel training function that is handed to ``applyInPandas``.

    The returned function receives the pandas frame of a single hotel, holds out
    the last ``prediction_horizon`` days of stay dates as a test partition,
    trains a ``ModelWrapper`` inside an MLflow run and returns the held-out
    predictions next to the true values.

    Args:
        model_type: ``"LIGHTGBM"`` or ``"AUTOGLUON"``.
        target_type: ``"REVENUE"`` or ``"ROOMS"``; selects the column prefix
            passed to ``remove_padded_cols``.
        exclude_pms: Forwarded to ``ModelWrapper`` and written to the
            ``pms_sync_off`` output column.
        calc_uncertainty: Accepted for interface compatibility; not used here.
        prediction_horizon: Length of the test partition in days; also forwarded
            to ``ModelWrapper``.
        save_models: Forwarded to ``ModelWrapper``.
        save_metrics: When true, SMAPE and MAE of the held-out predictions are
            logged to the MLflow run.
        lag_numbers: Accepted for interface compatibility; not used here.
        n_threads: Thread count forwarded to ``ModelWrapper`` (AUTOGLUON always
            gets 1).
        meta_data: Optional dict forwarded to ``ModelWrapper``; a fresh empty
            dict is used when omitted.

    Returns:
        A function ``train_data_models(df) -> pandas.DataFrame`` whose result
        has the columns ``index, HotelID, pms_sync_off, status, message,
        _StayDates, y_pred, y_true``.

    Raises:
        ValueError: If ``model_type`` or ``target_type`` is not supported. This
            is raised when the wrapper is built, before any work is distributed.
    """
    # NOTE(review): ``List`` and ``Callable`` are not imported explicitly in this
    # notebook; presumably they arrive through one of the star imports - confirm.
    col_prefix_by_target = {"REVENUE": "RV", "ROOMS": "RM"}
    supported_model_types = ("AUTOGLUON", "LIGHTGBM")

    # Validate up front: previously an unknown value only surfaced inside the
    # grouped function as an unbound ``col_prefix`` or a ``None`` trainer.
    if model_type not in supported_model_types:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_model_types}"
        )
    if target_type not in col_prefix_by_target:
        raise ValueError(
            f"Unsupported target_type {target_type!r}; expected one of {tuple(col_prefix_by_target)}"
        )

    col_prefix = col_prefix_by_target[target_type]
    # Avoid a mutable default argument shared between calls of train_wrapper.
    metadata_dict = {} if meta_data is None else meta_data
    pms = "NOPMS" if exclude_pms else "PMS"

    static_cols_ = (
        "year",
        "quarter_of_year",
        "month_of_year",
        "week_of_year",
        "day_of_year",
        "month_of_quarter",
        "week_of_quarter",
        "day_of_quarter",
        "week_of_month",
        "day_of_month",
        "holiday",
        "day_of_week_0",
        "day_of_week_1",
        "day_of_week_2",
        "day_of_week_3",
        "day_of_week_4",
        "day_of_week_5",
        "day_of_week_6",
    )
    output_columns = ["HotelID", "pms_sync_off", "status", "message", "_StayDates", "y_pred", "y_true"]
    max_lead_window = 100
    # Same value as params["ML_EXPERIMENT_ID"] in the configuration cell.
    experiment_id = 2169257822521486

    def train_data_models(df):
        """Train and evaluate the model for the single hotel contained in ``df``."""
        logger = get_dbx_logger("PHGML")
        hotel_id = df["HotelID"].iloc[0]

        df = remove_padded_cols(df, max_lead_window, max_lead_window, col_prefix)

        # The last ``prediction_horizon`` days of stay dates form the test partition.
        test_partition_end = df["_StayDates"].max()
        test_partition_start = test_partition_end - pd.Timedelta(prediction_horizon, "D")

        logger.debug(f"{hotel_id}:Filter train data")
        dftrain = filter_train_data(df, test_partition_start)

        logger.debug(f"{hotel_id}:Filter test data")
        dftest = filter_test_data(
            df,
            test_partition_start=test_partition_start,
            test_partition_end=test_partition_end,
        )
        # assign() returns a new frame, so the frame returned by filter_test_data
        # is not mutated.
        dftest = dftest.assign(
            day_ahead=(dftest["_StayDates"] - test_partition_start).dt.days
        )
        # Keep only rows whose forecast index equals "days ahead of partition start - 1".
        dftest = dftest[dftest["forecast_index"] == (dftest["day_ahead"] - 1)]

        run_name = f"RUN-{model_type}_{prediction_horizon}days-{hotel_id}-{target_type}-{pms}"
        with mlflow.start_run(experiment_id=experiment_id, run_name=run_name) as run:
            # Arguments shared by both strategies.
            wrapper_kwargs = dict(
                prediction_horizon=prediction_horizon,
                mlflow_run_id=run.info.run_id,
                hotel_id=hotel_id,
                save_models=save_models,
                target_type=target_type,
                exclude_pms=exclude_pms,
                model_name_prefix=f"BACKUP_TEST_{prediction_horizon}DAY",
                meta_data=metadata_dict,
                cd_axis_max_lags=99,
                static_cols=list(static_cols_),
                quantiles=[0.1, 0.5, 0.9],
            )
            if model_type == "AUTOGLUON":
                # AUTOGLUON is always run single-threaded and auto-regressive.
                trainer = ModelWrapper(
                    model_strategy=StrategyAG,
                    is_auto_reg=True,
                    n_threads=1,
                    **wrapper_kwargs,
                )
            else:  # "LIGHTGBM" (validated when the wrapper was built)
                trainer = ModelWrapper(
                    model_strategy=StrategyLGBM,
                    n_threads=n_threads,
                    **wrapper_kwargs,
                )

            model_name = trainer.get_model_name()
            logger.debug(f"{hotel_id}:Training {model_name}")
            trainer.train(dftrain)

            output_dct = trainer.predict(dftest)
            # 0.5 is one of the quantiles requested above; it is used as the
            # point prediction.
            y_pred, y_test = output_dct[0.5], output_dct["y_test"]

            if save_metrics:
                y_test_flat = [val for ar in y_test for val in ar]
                y_pred_flat = [val for ar in y_pred for val in ar]

                SMAPE = mean_absolute_percentage_error(y_test_flat, y_pred_flat, symmetric=True)
                MAE = mean_absolute_error(y_test_flat, y_pred_flat)

                mlflow.log_metric(f"SMAPE-{prediction_horizon}", SMAPE)
                mlflow.log_metric(f"MAE-{prediction_horizon}", MAE)

            # Assumes y_pred[i] / y_test[i] belong to the i-th distinct stay date
            # of the test partition - TODO confirm against ModelWrapper.predict.
            parts = [
                pd.DataFrame(
                    {
                        "_StayDates": [stay_date] * len(y_pred[i]),
                        "y_pred": y_pred[i],
                        "y_true": y_test[i],
                    }
                )
                for i, stay_date in enumerate(dftest["_StayDates"].unique())
            ]

        if parts:
            # Concatenate once instead of rebuilding the frame on every iteration.
            output_df = pd.concat(parts, axis=0)
            output_df["HotelID"] = hotel_id
            output_df["pms_sync_off"] = exclude_pms
            output_df["status"] = "complete"
            output_df["message"] = f"Successfully trained {hotel_id}"
        else:
            # No test partition: return an empty, correctly shaped frame rather
            # than failing with a KeyError on the column selection below.
            logger.warning(f"{hotel_id}:No test rows available, returning an empty result")
            output_df = pd.DataFrame(columns=output_columns)

        output_df = output_df[output_columns].reset_index(drop=False)
        # The output schema declares "index" as a string column.
        output_df["index"] = output_df["index"].astype(str)
        return output_df

    return train_data_models

In [0]:
# Spark schema of the pandas frame each hotel group returns from the training
# function. Every field is nullable. This rebinds the name that was imported
# from phgml.data.data_types earlier in the notebook.
training_output_schema = (
    StructType()
    .add("index", StringType(), True)
    .add("HotelID", StringType(), True)
    .add("pms_sync_off", BooleanType(), True)
    .add("status", StringType(), True)
    .add("message", StringType(), True)
    .add("_StayDates", DateType(), True)
    .add("y_pred", FloatType(), True)
    .add("y_true", FloatType(), True)
)

In [0]:
# Train one model per hotel: the data is grouped by HotelID and every group is
# passed, as a pandas frame, to the function built by train_wrapper.
logger.info("Starting parallel training")

# NOTE(review): the horizon is fixed to 28 here although
# params["PREDICTION_HORIZON"] is 30 - confirm which value is intended.
per_hotel_trainer = train_wrapper(
    model_type="LIGHTGBM",
    target_type=params["TARGET_TYPE"],
    exclude_pms=params["WITHOUT_PMS"],
    calc_uncertainty=params["CALC_UNCERTAINTY"],
    prediction_horizon=28,
    save_models=params["SAVE_MODEL"],
    save_metrics=params["SAVE_METRICS"],
    lag_numbers=params["LAG_NUMBERS"],
    n_threads=params["THREAD_NUMBERS"],
    meta_data={},
)

hotel_groups = df.groupby("HotelID")
output_df = hotel_groups.applyInPandas(per_hotel_trainer, training_output_schema)


In [0]:
# Collect the per-hotel results to the driver as a pandas DataFrame.
# NOTE: the name is rebound from a Spark DataFrame to a pandas DataFrame, so this
# cell cannot be re-run on its own (a pandas DataFrame has no toPandas()).
output_df = output_df.toPandas()

In [0]:
# Mirror the outcome of every returned row into the pipeline log: rows that are
# not marked "complete" are reported as errors, all others at info level.
for result in output_df.itertuples(index=False):
    if result.status != "complete":
        logger.error(
            f"Error encountered when training hotel {result.HotelID}: {result.message}"
        )
        continue
    logger.info(f"{result.message}")

In [0]:
# client = MlflowClient()
# completed = output_df[output_df["status"]=="complete"]

# outputs_list = []
# for n,g in completed.groupby(["HotelID","model_name"]):
#     hotel_id = n[0]
#     model_name = n[1]
#     hotel_config = forecasting_config_provider.get_config(hotel_id)

#     mv = client.get_latest_versions(name=model_name)[0]
#     print(mv)
#     arts = client.list_artifacts(mv.run_id,path=f"forecasting/{hotel_id}/models/{model_name}/artifacts")
    
#     outputs_list.append({"hotel_id":hotel_id,
#                          "model_name":model_name,
#                          "creation_time":datetime.datetime.fromtimestamp(mv.creation_timestamp/1e3),
#                          "last_update":datetime.datetime.fromtimestamp(mv.last_updated_timestamp/1e3),
#                          "version":mv.version,
#                          "target":params["TARGET_TYPE"],
#                          "exclude_pms":params["WITHOUT_PMS"],
#                          "config_train_length":hotel_config.training_length,
#                          "config_infer_length":hotel_config.inference_length,
#                          "num_model_steps":len(arts)-1})
    
#     print(f"Hotel: {hotel_id} target_type:{params['TARGET_TYPE']} exclude_pms:{params['WITHOUT_PMS']} : {len(arts)-1}")

# completed_df = pd.DataFrame(outputs_list)

In [0]:
# display(completed_df)

In [0]:
logger.info("Model training completed.")

# start_time was taken with time.perf_counter() near the top of the notebook,
# so elapsed_time is expressed in seconds.
elapsed_time = time.perf_counter() - start_time
logger.info(f"Time elapsed {elapsed_time}")
logger.info(f"Time elapsed in minutes {elapsed_time/60}")