In [0]:
%pip install mlflow
%load_ext autoreload
%autoreload 2

In [0]:
# Job parameters, exposed as Databricks notebook widgets so a job run or an
# interactive user can override them. They are read back with getArgument(...)
# in later cells.
dbutils.widgets.dropdown(
    "env_stage",
    "dev",
    ["dev", "prod", "qa", "dev_synxis_2_0", "prod_synxis_2_0", "qa_synxis_2_0"],
    "Pipeline stage",
)
dbutils.widgets.dropdown(
    "exclude_pms",
    "False",
    ["True", "False"],
    "Exclude PMS",
)
dbutils.widgets.dropdown(
    "target_type",
    "REVENUE",
    ["REVENUE", "ROOMS"],
    "Target Type",
)
dbutils.widgets.dropdown(
    "is_usd_currency",
    "True",
    ["True", "False"],
    "Use USD currency",
)
# Free-text widgets: both values are parsed with str_to_lst later on.
dbutils.widgets.text("selected_hotels", "", "Hotels")
dbutils.widgets.text("lag_numbers", "1,7,14,28", "Lag Numbers")

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import *
from pyspark.sql.types import *
import os
from autogluon.core.utils.loaders import load_pkl
import logging
import shutil
import mlflow
from mlflow import MlflowException
import mlflow.pyfunc
import time
import warnings
from datetime import datetime, timezone

# Reference point for the total-runtime measurement logged by the last cell.
start_time = time.perf_counter()
# Silence every Python warning for the remainder of the run.
warnings.filterwarnings("ignore")

In [0]:
# Run-wide settings dictionary; later cells keep adding keys to it.
params = {
    # Pipeline stage selected through the "env_stage" widget.
    "ENV": getArgument("env_stage"),
    # Name of the cluster executing this notebook, read from the Spark conf.
    "CLUSTER_NAME": spark.conf.get("spark.databricks.clusterUsageTags.clusterName"),
}

In [0]:
from phgml.models.model_wrapper import ModelWrapper
from phgml.models.model_strategy import StrategyLGBM, StrategyAG
from phgml.data.processing_distr_ca import remove_padded_cols
from phgml.reporting.output_metrics import *
from phgml.reporting.report_results import get_output_df, correct_prediction_list, interpolated_fill
from phgml.data.data_types import inference_output_schema
from phgml.reporting.logging import get_dbx_logger
from phgml.data.config import ForecastingHotelConfigProvider,EnvironmentConfig
from phgml.utilities.task_utilities import str_to_lst, str_to_bool, model_wrapper_attr_sync
from phgml.reporting.logging import get_logging_path, get_logging_filename, get_dbx_logger

In [0]:
# Disable adaptive query execution (AQE).
# AQE groups smaller tasks together into larger tasks. If the optimizer deems
# the parallel inference tasks too small, that grouping limits parallelism.
# AQE is disabled here to circumvent this limitation on parallelism.
spark.conf.set("spark.sql.adaptive.enabled", "false")

In [0]:
# --- Fixed pipeline settings -------------------------------------------------
params.update(
    REVENUE_COL="_reservationRevenuePerRoomUSD",
    ROOMS_COL="_rooms",
    PIPELINE="INFERENCE",
)

# --- Settings driven by the notebook widgets ---------------------------------
params["WITHOUT_PMS"] = str_to_bool(getArgument("exclude_pms"))
params["IS_USD_CURRENCY"] = str_to_bool(getArgument("is_usd_currency"))
params["TARGET_TYPE"] = getArgument("target_type")
selected_hotels = str_to_lst(getArgument("selected_hotels"))
# The widget holds a comma-separated string such as "1,7,14,28".
params["LAG_NUMBERS"] = [int(lag) for lag in str_to_lst(getArgument("lag_numbers"))]

# --- Date boundaries ----------------------------------------------------------
params.update(
    # The start of the model data
    MODEL_START_DATE=pd.to_datetime("2018-10-01"),
    COVID_START_DATE=pd.to_datetime("2020-03-01"),
    COVID_END_DATE=pd.to_datetime("2021-08-01"),
)

# --- Model selection ----------------------------------------------------------
params.update(
    CALC_UNCERTAINTY=False,
    # "AG" selects the AutoGluon approach; "XGB" is the alternative handled below.
    MODEL_TYPE="AG",
    LEAD_WINDOW=60,
    ML_EXPERIMENT_ID=1079527465953184,
)

# Environments whose name contains "synxis_2_0" log under their own root folder.
params["LOG_ROOT"] = (
    '/dbfs/mnt/extractionlogs/synxis_2_0'
    if "synxis_2_0" in params["ENV"]
    else '/dbfs/mnt/extractionlogs/synxis'
)

# Run id written to the output rows; it depends on the model flavour above.
if params["MODEL_TYPE"] == "XGB":
    params["RUN_ID"] = "92907cac187f4c8cadb63ff60a05d72e"  # XGB Run
elif params["MODEL_TYPE"] == "AG" and params["CALC_UNCERTAINTY"]:
    params["RUN_ID"] = "9549361574484dc58fcf1b7d130541a0"
else:
    params["RUN_ID"] = "19dee6420aed45f29e956016c5ea6e8a"

In [0]:
# Config data relevant to this pipeline
env_config = EnvironmentConfig(env=params["ENV"], target=params["TARGET_TYPE"], spark=spark, is_usd_currency=params["IS_USD_CURRENCY"])
forecasting_config_provider = ForecastingHotelConfigProvider(spark=spark,env=params["ENV"])
params["TARGET_COLUMN"] = env_config.target_column

# Detach whatever handlers are already installed on the root logger so that
# only the file handler configured below receives this notebook's records.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logging.root.setLevel(logging.INFO)

# One UTC timestamp identifies this run: it names the log file and is written
# to every inference output row (as TIMESTAMP).
processing_timestamp = datetime.now(timezone.utc)
TIMESTAMP = pd.to_datetime(processing_timestamp)

logfile_path = get_logging_path(params["LOG_ROOT"],processing_timestamp)
# exist_ok=True avoids the check-then-create race when several pipeline runs
# try to create the same log directory at the same time.
os.makedirs(logfile_path, exist_ok=True)

pms = "NOPMS" if params["WITHOUT_PMS"] else "PMS"

log_file_name = get_logging_filename(
    logfile_path,
    params["PIPELINE"],
    params["TARGET_TYPE"],
    pms,
    processing_timestamp)

# NOTE(review): the logger name starts with "preprocess-" although PIPELINE is
# "INFERENCE" -- possibly copied from the preprocessing notebook; confirm
# before renaming, since the name appears in every log line.
logger = logging.getLogger(f"preprocess-{params['TARGET_TYPE']}-{pms}")

# logging.getLogger returns the same object on every call, so re-running this
# cell would stack another FileHandler and duplicate every log line. Remove and
# close handlers left over from a previous execution first.
for handler in logger.handlers[:]:
    logger.removeHandler(handler)
    handler.close()

file_handler = logging.FileHandler(log_file_name)
file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_format)

logger.addHandler(file_handler)

In [0]:
# Output schema handed to applyInPandas for the per-hotel inference results.
schema = inference_output_schema

In [0]:
# As a workaround for the bug PHG-2157
# The partition date is taken as the latest confirmationDate in the source table.
params["PARTITION_DATE"] = spark.sql(
    f"select max(confirmationDate) from {env_config.source_data_table}"
).collect()[0][0]

# max() over an empty table yields NULL; stop here with a clear message instead
# of failing later inside the date arithmetic.
if params["PARTITION_DATE"] is None:
    raise ValueError(
        f"No confirmationDate found in {env_config.source_data_table}; cannot derive the partition date"
    )

print(f"Partition date: {params['PARTITION_DATE']}")

max_inference_length = spark.sql(f'select max(inference_prediction_length) from {forecasting_config_provider.config_table_name}').collect()[0][0]

if max_inference_length is None:
    raise ValueError(
        f"No inference_prediction_length found in {forecasting_config_provider.config_table_name}; "
        "cannot derive the partition end date"
    )

# The key spelling ("PARTIITON") is kept as-is because it is a runtime key.
params["TEST_PARTIITON_END"] = params["PARTITION_DATE"] + pd.Timedelta(max_inference_length, "D")
print(f"Partition end date: {params['TEST_PARTIITON_END']}")

In [0]:
def pyfunc_load_model_retry(model_uri, max_tries):
    '''Load a model through ``mlflow.pyfunc.load_model``, retrying on failure.

    The first call is always made; ``max_tries`` is the number of additional
    retries, so at most ``max_tries + 1`` load attempts are performed.

    Args:
        model_uri: URI passed unchanged to ``mlflow.pyfunc.load_model``.
        max_tries: Number of retries after the initial attempt (>= 0).

    Returns:
        The object returned by ``mlflow.pyfunc.load_model``.

    Raises:
        ValueError: If ``max_tries`` is negative. Without this check the loop
            would not run at all and ``None`` would be returned silently.
        Exception: The error raised by the last attempt once every attempt
            has failed.
    '''
    if max_tries < 0:
        raise ValueError(f"max_tries must be >= 0, got {max_tries}")

    total_attempts = max_tries + 1
    for attempt in range(1, total_attempts + 1):
        try:
            return mlflow.pyfunc.load_model(model_uri)
        except Exception as e:
            if attempt == total_attempts:
                # Out of attempts: propagate the last error with its traceback.
                raise
            print(e)
            print(f'Retrying: attempt {attempt}')

In [0]:
def prediction_wrapper(
    target_type, run_id, exclude_pms,hotel_config_provider,model_cache_dir,environment,infer_timestamp
):
    """Build the per-hotel inference function used with ``groupby(...).applyInPandas``.

    Args:
        target_type: "REVENUE" or "ROOMS"; selects the column prefix ("RV" / "RM").
        run_id: Run identifier written to the output rows.
        exclude_pms: Forwarded to ``ModelWrapper`` as ``exclude_pms`` and to the
            output as ``pms_sync_off``; also selects the "NOPMS"/"PMS" suffix of
            the cached AutoGluon model directory.
        hotel_config_provider: Object exposing ``get_config(hotel_id)``.
        model_cache_dir: Path prefix of the cached AutoGluon models; the model
            for a hotel is read from
            ``f"{model_cache_dir}{hotel_id}_{target_type}_{pms}"``.
        environment: Passed as ``model_stage`` when resolving the latest
            LightGBM model version.
        infer_timestamp: Timestamp written to the output rows.

    Returns:
        A function ``predict_distributed(data)`` taking the pandas DataFrame of
        one hotel and returning that hotel's output DataFrame.
    """
    def predict_distributed(data):
        """Run inference for the single hotel contained in ``data``.

        Returns the frame produced by ``get_output_df`` with ``status`` set to
        "complete". When a ``ValueError`` or ``MlflowException`` occurs, a
        one-row placeholder frame with ``status`` "incomplete" and the error
        text in ``message`` is returned instead. Any other exception is
        re-raised.

        Raises:
            ValueError: Before any model work, if ``target_type`` is not
                supported or a REVENUE hotel has no forecast currency.
        """
        static_cols_ = [
            'year', 'quarter_of_year', 'month_of_year', 'week_of_year', 'day_of_year',
            'month_of_quarter', 'week_of_quarter', 'day_of_quarter', 'week_of_month',
            'day_of_month', 'holiday',
            'day_of_week_0', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3',
            'day_of_week_4', 'day_of_week_5', 'day_of_week_6',
        ]

        logger = get_dbx_logger("PHGML")

        # Upper bound handed to remove_padded_cols together with the hotel's own lead window.
        max_lead_window = 100

        # The frame holds exactly one hotel because the caller groups by HotelID.
        hotel_id = data["HotelID"].iloc[0]
        hotel_config = hotel_config_provider.get_config(hotel_id)
        model_type = hotel_config.inference_model_name

        print(f"Processing Hotel {hotel_id}")

        if target_type == "REVENUE":
            col_prefix = "RV"

            if hotel_config.forecast_currency is None:
                # If the target type is REVENUE, we should have a defined forecast_currency
                raise ValueError(f"Forecast currency cannot be None for target_type {target_type}")

        elif target_type == "ROOMS":
            col_prefix = "RM"

        else:
            # Previously this fell through and failed later with an unbound col_prefix.
            raise ValueError(f"Unsupported target_type {target_type!r}; expected 'REVENUE' or 'ROOMS'")

        data = remove_padded_cols(data,hotel_config.lead_window,max_lead_window,col_prefix)

        # Bound up front so the except/finally blocks can tell whether a model
        # wrapper was ever created.
        model_obj = None

        try:
            if data['_StayDates'].isna().any():
                # Raise an exception for empty data
                raise ValueError("The input data is empty and cannot proceed with prediction")

            if model_type == "LIGHTGBM":

                model_obj = ModelWrapper(
                                model_strategy=StrategyLGBM,
                                prediction_horizon=hotel_config.inference_length,
                                hotel_id=hotel_id,
                                target_type=target_type,
                                exclude_pms=exclude_pms,
                                cd_axis_max_lags=99,
                                static_cols =static_cols_,)

                model_obj.set_latest_model_version(model_stage = environment)

                loaded_model = pyfunc_load_model_retry(model_obj.get_model_uri(), 6)

                wrapped_model = loaded_model.unwrap_python_model().model_wrapper_model
                wrapped_model.prediction_horizon = hotel_config.inference_length
                # During training the target variables are suffixed with '_tgt' to tell target
                # booking-pace values apart from feature booking-pace values. For daily inference
                # that distinction does not matter because the true values are not available, so
                # the target columns are overridden here to avoid columns not being detected.
                wrapped_model.target_cols = {
                    day_ahead: [f"{col_prefix}{j}" for j in range(day_ahead)]
                    for day_ahead in range(1, hotel_config.inference_length + 1)
                }

                # syncing attributes if the class implementation has extra attributes added
                model_wrapper_attr_sync(model_wrapper_instance=wrapped_model)

            elif model_type == "AUTOGLUON":

                model_obj = ModelWrapper(
                                model_strategy=StrategyAG,
                                is_auto_reg=True,
                                prediction_horizon=hotel_config.inference_length,
                                hotel_id=hotel_id,
                                target_type=target_type,
                                exclude_pms=exclude_pms,
                                cd_axis_max_lags=99,
                                static_cols =static_cols_,)

                model_obj.set_latest_model_version()

                pms = "NOPMS" if exclude_pms else "PMS"

                dbfs_dir = f"{model_cache_dir}{hotel_id}_{target_type}_{pms}"
                local_dir = model_obj.local_root

                # Start from a clean local directory before copying the cached model.
                if os.path.exists(local_dir):
                    shutil.rmtree(local_dir)

                # Copy cached model from blob storage to local dir
                shutil.copytree(dbfs_dir, local_dir)

                # load model
                loaded_model = load_pkl.load(path=model_obj.local_path)
                loaded_model.prediction_horizon = model_obj.prediction_horizon

            else:
                # Not a ValueError on purpose: like the unbound-name error this
                # replaces, it must fail the job rather than be recorded as an
                # "incomplete" hotel.
                raise NotImplementedError(
                    f"Unsupported inference model type {model_type!r} for hotel {hotel_id}"
                )

            model_version = int(model_obj.version)
            # One model-name entry per forecast step.
            model_name = [
                model_obj.get_model_name()
                for _ in range(1, hotel_config.inference_length + 1)
            ]
            model_metadata = model_obj.get_remote_model_metadata()
            logger.info(f"Using model version {model_version}")

            logger.info(f"Inference length of model: {model_metadata.get('inference_length','NOT_FOUND')}")
            logger.info(f"Last trained date: {model_metadata.get('last_trained_date','NOT_FOUND')}")

            output_dct = loaded_model.predict(data)

            # The prediction dict is keyed by 'y_test' and by the 0.1 / 0.5 / 0.9 entries.
            y_test, y_lower_raw, y_pred_raw, y_upper_raw = output_dct['y_test'], output_dct[0.1], output_dct[0.5], output_dct[0.9]

            y_pred_interpolated = [interpolated_fill(day_ahead_array) for day_ahead_array in y_pred_raw]

            y_pred, y_upper, y_lower = correct_prediction_list(
                y_pred_interpolated, y_test, y_upper_raw, y_lower_raw,target_type,available_rooms = hotel_config.available_rooms
            )

            data["status"] = "complete"
            data["message"] = f"Successfully processed {hotel_id}"

            output_df = get_output_df(
                y_pred=y_pred,
                y_true=y_test,
                run_id=run_id,
                hotel_id=hotel_id,
                data=data.sort_values('day_ahead'),
                model_name=model_name,
                model_version=model_version,
                pms_sync_off=exclude_pms,
                forecast_currency=hotel_config.forecast_currency,
                prediction_horizon=hotel_config.inference_length,
                y_upper=y_upper,
                y_lower=y_lower,
                y_med_raw=y_pred_raw,
                y_upper_raw=y_upper_raw,
                y_lower_raw=y_lower_raw,
                timestamp=infer_timestamp
            )

            output_df["status"] = "complete"
            output_df["message"] = f"Successfully processed {hotel_id}"

        except (ValueError, MlflowException) as e:
            # Check for specific error cases
            if isinstance(e, ValueError):
                message = str(e)
                print(f"ValueError occurred: {message}")

            elif isinstance(e, MlflowException):
                if "RESOURCE_DOES_NOT_EXIST" in e.message:
                    missing_name = model_obj.get_model_name() if model_obj is not None else "<unknown>"
                    print(f"Model {missing_name} was not found in the model registry. Skipping this model...")
                else:
                    print("An MLFlowException occurred")
                    print(e)
                message = e.message

            # Placeholder row so the hotel still shows up in the results with
            # status "incomplete" and the reason in "message".
            empty = pd.DataFrame(
                {
                    "HotelID": [hotel_id],
                    "run_id": [run_id],
                    "stay_date": [pd.Timestamp("1900-01-01")],
                    "booking_date": [pd.Timestamp("1900-01-01")],
                    "model_version": [0],
                    "timestamp": [pd.Timestamp("1900-01-01")],
                    "pms_sync_off": [exclude_pms],
                    "forecast_currency":[hotel_config.forecast_currency],
                    "day_index": [0],
                    "y_med": [0],
                    "model_name": [""],
                    "y_upper": [0],
                    "y_lower": [0],
                    "y_med_raw": [0],
                    "y_upper_raw": [0],
                    "y_lower_raw": [0],
                    "status": "incomplete",
                    "message": message,
                }
            )

            return empty

        except Exception:
            print(f"Hotel {hotel_id} encountered an error ")
            raise
        finally:
            # Clean up only when a wrapper was actually created. Without the
            # None check, an error raised before ModelWrapper(...) (e.g. the
            # empty-data ValueError) made this block raise an unbound-name
            # error, which replaced the placeholder result and failed the job.
            if model_type == "AUTOGLUON" and model_obj is not None:
                model_obj.clean()

        return output_df

    return predict_distributed

In [0]:
logger.info("Read preprocessing data")
# Load the preprocessed rows and derive the helper columns in a single chain:
#   status         - starts as "incomplete" for every row
#   _StayDates     - parsed to a timestamp
#   partition_date - the partition date as a string literal
#   day_ahead      - days between the stay date and the partition date
# NOTE(review): to_timestamp(..., "yyyy-MM-dd") expects str(PARTITION_DATE) to be
# a plain date string -- confirm the type of confirmationDate.
df = (
    spark.sql(f"select * from {env_config.preprocess_intermediate_table}")
    .withColumn("status", lit("incomplete"))
    .withColumn("_StayDates", to_timestamp("_StayDates", "yyyy-MM-dd"))
    .orderBy(["HotelID", "_StayDates"])
    .withColumn('partition_date', lit(str(params["PARTITION_DATE"])))
    .withColumn(
        "day_ahead",
        datediff(col("_StayDates"), to_timestamp('partition_date', "yyyy-MM-dd")),
    )
)

In [0]:
# One group per hotel: Spark hands each hotel's rows to the function built by
# prediction_wrapper, so the hotels are processed in parallel.
logger.info("Starting parallell processing")
output_df = df.groupBy("HotelID").applyInPandas(
    prediction_wrapper(
        target_type=params["TARGET_TYPE"],
        run_id=params["RUN_ID"],
        exclude_pms=params["WITHOUT_PMS"],
        hotel_config_provider=forecasting_config_provider,
        model_cache_dir=env_config.model_cache_dir,
        environment=params["ENV"],
        infer_timestamp=TIMESTAMP,
    ),
    schema=schema,
)

# Persist every row (complete and incomplete) to the intermediate table,
# replacing its previous content and schema.
logger.info(
    f"Writing inference results to temporary table {env_config.inference_intermediate_table}"
)
(
    output_df.write
    .option("overwriteSchema", "true")
    .mode("overwrite")
    .saveAsTable(env_config.inference_intermediate_table)
)

In [0]:
# Pull only the bookkeeping columns to the driver and count how many distinct
# hotels ended with status "complete".
meta_columns = ["HotelID", "run_id", "timestamp", "pms_sync_off", "status", "message"]
results_table = spark.sql(f"select * from {env_config.inference_intermediate_table}")
output_meta = results_table.select(*meta_columns).toPandas()

num_completed = output_meta.loc[output_meta["status"].eq("complete"), "HotelID"].nunique()
total = output_meta["HotelID"].nunique()
logger.info(f"{num_completed} out of {total} hotels processed succussfully")
print(f"{num_completed} out of {total} hotels processed succussfully")

In [0]:
# Every row whose status is anything other than "complete" is reported, both
# in the log file and in the notebook output.
incomplete = output_meta.loc[output_meta["status"].ne("complete")]

for row in incomplete.itertuples(index=False):
    logger.error(f"Error encountered when processing hotel {row.HotelID}: {row.message}")
    print(f"Error encountered when processing hotel {row.HotelID}: {row.message}")

In [0]:
# Keep the successfully processed rows only; the bookkeeping columns are not
# part of the final output.
output_df = (
    results_table
    .where(results_table["status"] == "complete")
    .drop("status", "message")
)

# Drop forecast currency if TARGET_TYPE is ROOMS
if params["TARGET_TYPE"] == "ROOMS":
    output_df = output_df.drop("forecast_currency")

In [0]:
logger.info("Writing completed results to table")
file_format = "delta"

# Append the completed rows to the output table, partitioned by hotel and
# stored at the configured blob path.
(
    output_df.write
    .format(file_format)
    .partitionBy("HotelID")
    .mode("append")
    .option("overwriteSchema", "true")
    .option("path", env_config.inference_output_table_blob)
    .saveAsTable(env_config.inference_output_table)
)

In [0]:
# Total runtime in seconds, measured from the start_time taken at the top of the notebook.
elapsed_time = time.perf_counter() - start_time
logger.info(f"Time elapsed {elapsed_time}")
logger.info(f"Time elapsed in minutes {elapsed_time/60}")