In [0]:
# Notebook parameters, exposed as Databricks widgets.

# Pipeline stage the notebook runs in.
dbutils.widgets.dropdown(
    "env_stage",
    "dev",
    ["dev", "prod"],
    "Pipeline stage",
)

# Whether PMS data is excluded (parsed with str_to_bool further down).
dbutils.widgets.dropdown(
    "exclude_pms",
    "False",
    ["True", "False"],
    "Exclude PMS",
)

# Forecast target.
dbutils.widgets.dropdown(
    "target_type",
    "REVENUE",
    ["REVENUE", "ROOMS"],
    "Target Type",
)

# Comma-separated hotel ids; the default is an empty string.
dbutils.widgets.text(
    "selected_hotels",
    "",
    "Hotels",
)

# Comma-separated lag numbers, converted to integers further down.
dbutils.widgets.text(
    "lag_numbers",
    "1,7,14,28",
    "Lag Numbers",
)

In [0]:
%pip install mlflow

In [0]:
%load_ext autoreload
%autoreload 2

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import *
from pyspark.sql.types import *
from typing import List
import holidays
import matplotlib
import seaborn as sns
import datetime
from pathlib import Path
import pickle
import os
from sys import version_info
import cloudpickle
from autogluon.core.utils.loaders import load_pkl
import datetime
import logging
import shutil
import mlflow
from mlflow import MlflowException
import mlflow.pyfunc
import time

# Reference point for the elapsed-time report logged in the last cell.
start_time = time.perf_counter()

In [0]:
import warnings

# Suppress all Python warnings for the remainder of the notebook run.
warnings.filterwarnings("ignore")

In [0]:
# `sys.path` is modified below, but the notebook's import cell only does
# `from sys import version_info`; import the module explicitly so this cell
# does not depend on `sys` leaking in through a wildcard import.
import sys

# Pipeline stage selected through the "env_stage" widget.
ENV = getArgument("env_stage")

REPOPATH = "/Workspace/Repos/manik@surge.global/phg-data-mlsys/src"
cluster_name = spark.conf.get("spark.databricks.clusterUsageTags.clusterName")

# In the dev stage on a cluster whose name contains "dev", make the phgml
# sources from the repo checkout importable.
if (ENV == "dev") and ("dev" in cluster_name):
    print(f"Loading phgml package from repo {REPOPATH}")
    repo_src = os.path.abspath(REPOPATH)
    # Guard against duplicate entries when the cell is re-run.
    if repo_src not in sys.path:
        sys.path.append(repo_src)

In [0]:
from phgml.models.xgboost_model import XGBMultiStepPredictor
from phgml.models.autogluon_model import AutoGluonModel, AGMlflowModel
from phgml.models.lightgbm_model import LightGBMModel, LGBMMlflowModel
from phgml.data.processing import get_lags
from phgml.data.processing_distr import (
    calc_date_features,
    add_date_features,
    preprocess_data,
    filter_test_partition,
    aggregate_target,
    create_rows,
    compile_test_table,
    compile_hotel_tables,
    remove_padded_cols,
    filter_hotels
)
from phgml.reporting.output_metrics import *
from phgml.reporting.report_results import get_output_df, correct_prediction_list
from phgml.data.data_types import inference_output_schema
from phgml.reporting.logging import get_logging_path, get_logging_filename, get_dbx_logger
from phgml.data.config import EnvironmentConfig,ForecastingHotelConfigProvider

In [0]:
def str_to_lst(value):
    """Split a comma-separated string into a list of items.

    Surrounding whitespace is stripped from every item and empty items
    (for example from a trailing comma) are dropped, so ``"a, b,"`` yields
    ``["a", "b"]``.

    Args:
        value: Comma-separated string, e.g. a widget value. An empty string
            or ``None`` yields an empty list.

    Returns:
        List of non-empty, trimmed string items in their original order.
    """
    if not value:
        return []

    trimmed = (item.strip() for item in value.split(","))
    return [item for item in trimmed if item]

In [0]:
# Disable Adaptive Query Execution (AQE).
# AQE groups smaller tasks together into larger tasks.
# This may limit parallelism if the query optimizer deems the parallel inference tasks too small.
# We disable AQE here to circumvent this limitation on parallelism.
spark.conf.set("spark.sql.adaptive.enabled", "false")

In [0]:
# Distinct hotel id / name pairs from the consumption table, collected to
# pandas and ordered by HotelID.
hotel_ids = (
    spark.sql("select distinct HotelID,HotelName from phg_data.consumption_deaggrecords")
    .toPandas()
    .sort_values("HotelID")
)

In [0]:
def str_to_bool(value):
    """Interpret a textual flag as a boolean.

    The value is converted with ``str()`` and lower-cased before matching.

    Args:
        value: One of "false"/"no"/"0" or "true"/"yes"/"1" (case-insensitive).

    Returns:
        ``False`` for the first group, ``True`` for the second.

    Raises:
        Exception: If the value matches neither group.
    """
    falsy = ["false", "no", "0"]
    truthy = ["true", "yes", "1"]
    normalized = str(value).lower()

    for outcome, accepted in ((False, falsy), (True, truthy)):
        if normalized in accepted:
            return outcome

    message = "String value should be one of {}, but got '{}'.".format(
        falsy + truthy, value
    )
    raise Exception(message)

In [0]:
# As a workaround for the bug PHG-2157:
# use the latest confirmationDate in the consumption table as the partition date.
PARTITION_DATE = spark.sql(
    "select max(confirmationDate) from phg_data.consumption_deaggrecords"
).collect()[0][0]

In [0]:
# log_root = "/dbfs/mnt/extractionlogs/synxis"
# processing_timestamp = datetime.datetime.now()

In [0]:
# Source column names for the two supported targets.
REVENUE_COL = "_reservationRevenuePerRoomUSD"
ROOMS_COL = "_rooms"
# Pipeline label handed to the logger factory.
PIPELINE = "INFERENCE"

# Widget values parsed into typed settings.
WITHOUT_PMS = str_to_bool(getArgument("exclude_pms"))
TARGET_TYPE = getArgument("target_type")
selected_hotels = str_to_lst(getArgument("selected_hotels"))
LAG_NUMBERS = list(map(int,str_to_lst(getArgument('lag_numbers'))))

# The start of the model data
MODEL_START_DATE = pd.to_datetime("2018-10-01")

# COVID period boundaries passed to preprocess_data.
COVID_START_DATE = pd.to_datetime("2020-03-01")
COVID_END_DATE = pd.to_datetime("2021-08-01")

CALC_UNCERTAINTY = False
# Alternative setting: MODEL_TYPE = "XGB" (use "AG" to try out the AutoGluon approach).
# MODEL_TYPE only selects RUN_ID below; the per-hotel model is chosen from the
# hotel configuration inside prediction_wrapper.
MODEL_TYPE = "AG"

# NOTE(review): not referenced in the cells visible here.
LEAD_WINDOW = 60

ML_EXPERIMENT_ID = 1079527465953184

# MLflow run id, chosen from the model type and the uncertainty flag.
if MODEL_TYPE == "XGB":
    RUN_ID = "92907cac187f4c8cadb63ff60a05d72e"  # XGB Run
elif CALC_UNCERTAINTY and (MODEL_TYPE == "AG"):
    RUN_ID = "9549361574484dc58fcf1b7d130541a0"
else:
    RUN_ID = "19dee6420aed45f29e956016c5ea6e8a"


# NOTE(review): the three values below are not referenced in the cells visible here.
lead_window_start_days = 14
lead_window_end_days = 60
prediction_horizon = 14

In [0]:
# Environment configuration; it supplies the target column, table names and model cache dir used below.
env_config = EnvironmentConfig(env=ENV, target=TARGET_TYPE, spark=spark)
# Per-hotel forecasting configuration provider.
forecasting_config_provider = ForecastingHotelConfigProvider(spark=spark,env=ENV)
target_column = env_config.target_column
# Output schema given to applyInPandas.
schema = inference_output_schema

In [0]:
# Largest inference_prediction_length found in the forecasting config table.
max_inference_length = spark.sql(f'select max(inference_prediction_length) from {forecasting_config_provider.config_table_name}').collect()[0][0]
# The test partition ends that many days after PARTITION_DATE.
TEST_PARTIITON_END = PARTITION_DATE + pd.Timedelta(max_inference_length, "D")

In [0]:
# Driver-side logger for this pipeline run; INFO and above are emitted.
logger = get_dbx_logger(pipeline=PIPELINE,
                        task_type=TARGET_TYPE,
                        exclude_pms=WITHOUT_PMS)
logger.setLevel(logging.INFO)

In [0]:
# Echo the run configuration to the notebook output (the next cell logs the same lines).
print(f"Executing pipeline stage: {ENV}")
print(f"Processing data for target type: {TARGET_TYPE} : {target_column}")
print(f"Intermediate inference results table name: {env_config.inference_intermediate_table }")
print(f"Writing inference results to table: {env_config.inference_output_table } with blob {env_config.inference_output_table_blob}")
print(f"Excluding PMS data? {WITHOUT_PMS}")

In [0]:
# Record the same run configuration through the pipeline logger.
logger.info(f"Executing pipeline stage: {ENV}")
logger.info(f"Processing data for target type: {TARGET_TYPE} : {target_column}")
logger.info(f"Intermediate inference results table name: {env_config.inference_intermediate_table }")
logger.info(f"Writing inference results to table: {env_config.inference_output_table } with blob {env_config.inference_output_table_blob}")
logger.info(f"Excluding PMS data? {WITHOUT_PMS}")

In [0]:
logger.info("Selecting hotels.")

# Hotel id, name and PMS start date for every hotel in the dimension table.
# NOTE(review): the table name ends with an underscore ("dim_hotels_") — confirm this is intended.
hotel_details = spark.sql(
    "select HotelID,HotelName,PMSStartDate from phg_data.dim_hotels_"
).toPandas()

# Reduce the hotel list using the widget selection, the PMS flag and the forecasting config.
correct_hotel_ids = filter_hotels(hotel_details,selected_hotels,WITHOUT_PMS,forecasting_config_provider)

In [0]:
# Columns read from the consumption table.
columns = [
    "HotelID",
    "_StayDates",
    "confirmationDate",
    "departureDate",
    "channel",
    "status",
    REVENUE_COL,
    ROOMS_COL,
]
# Only records with status 'Confirmed' are loaded.
dfsp = spark.sql(
    f"select {','.join(columns)} from phg_data.consumption_deaggrecords where status='Confirmed'"
)

# Restrict to the selected hotels. A falsy (e.g. empty) selection leaves dfsp
# unfiltered, so every hotel in the table is processed.
# NOTE(review): confirm that an empty result from filter_hotels should mean "all hotels".
if correct_hotel_ids:
    print(f"Filtering data for the selected hotels: {correct_hotel_ids}")
    dfsp = dfsp.filter(dfsp.HotelID.isin(correct_hotel_ids))

In [0]:
def debug_prediction(df, target_type, ml_experiment_id, run_id, exclude_pms, calc_uncertainty,hotel_config_provider,model_cache_dir):
    """Run the per-hotel prediction function directly on ``df``.

    Builds the function with ``prediction_wrapper`` from the given settings
    and applies it to ``df`` in the current process, returning its result.
    """
    return prediction_wrapper(
        target_type,
        ml_experiment_id,
        run_id,
        exclude_pms,
        calc_uncertainty,
        hotel_config_provider,
        model_cache_dir,
    )(df)

In [0]:
def pyfunc_load_model_retry(model_uri, max_tries):
    '''Load a pyfunc model with ``mlflow.pyfunc.load_model``, retrying on failure.

    One initial attempt is made, followed by up to ``max_tries`` retries.
    Each failure is printed together with a retry notice; the exception of
    the final attempt is re-raised to the caller.
    '''
    final_attempt = max_tries
    for attempt in range(max_tries + 1):
        try:
            loaded = mlflow.pyfunc.load_model(model_uri)
        except Exception as err:
            if attempt == final_attempt:
                raise err
            print(err)
            print(f'Retrying: attempt {attempt+1}')
        else:
            return loaded

In [0]:
def prediction_wrapper(
    target_type, ml_experiment_id, run_id, exclude_pms, calc_uncertainty,hotel_config_provider,model_cache_dir
):
    """Build the per-hotel inference function used with ``applyInPandas``.

    Args:
        target_type: "REVENUE" or "ROOMS". Selects the padded-column prefix
            ("RV" / "RM") and is forwarded to the model wrappers.
        ml_experiment_id: Not used by this function; kept so existing callers
            keep working.
        run_id: MLflow run id forwarded to ``AutoGluonModel`` and written to
            the output rows.
        exclude_pms: Forwarded to the model wrappers, selects the
            "NOPMS"/"PMS" model cache directory and is written to the output
            as ``pms_sync_off``.
        calc_uncertainty: Forwarded to ``AutoGluonModel``.
        hotel_config_provider: Object whose ``get_config(hotel_id)`` returns
            the hotel configuration (``model_name``, ``lead_window`` and
            ``inference_length`` are read from it).
        model_cache_dir: Prefix of the cached AutoGluon model directories.
            The hotel directory name is appended without a separator, so the
            prefix is expected to end with one.

    Returns:
        A function taking the pandas DataFrame of one hotel and returning a
        pandas DataFrame of predictions.
    """
    def predict_distributed(data):
        """Run inference for the single hotel contained in ``data``.

        Returns the frame produced by ``get_output_df`` with ``status`` and
        ``message`` columns added. If an ``MlflowException`` occurs, a
        one-row placeholder frame with ``status == "incomplete"`` is returned
        instead. Any other exception is re-raised.

        Raises:
            ValueError: If ``target_type`` is neither "REVENUE" nor "ROOMS",
                or the hotel is configured with an unsupported model type.
        """
        static_cols_ = ['year', 'quarter_of_year', 'month_of_year', 'week_of_year',
                         'day_of_year', 'month_of_quarter', 'week_of_quarter', 'day_of_quarter',
                           'week_of_month', 'day_of_month', 'holiday',
                             'day_of_week_0', 'day_of_week_1', 'day_of_week_2', 
                             'day_of_week_3', 'day_of_week_4', 'day_of_week_5', 'day_of_week_6']

        # A logger is created inside the function rather than captured from the notebook scope.
        logger = get_dbx_logger("PHGML")

        max_lead_window = 100

        # Every row of the group belongs to the same hotel, so the first row identifies it.
        hotel_id = data["HotelID"].iloc[0]
        hotel_config = hotel_config_provider.get_config(hotel_id)
        model_type = hotel_config.model_name

        print(f"Processing Hotel {hotel_id}")

        if target_type == "REVENUE":
            col_prefix = "RV"
        elif target_type == "ROOMS":
            col_prefix = "RM"
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on col_prefix.
            raise ValueError(
                f"Unsupported target type '{target_type}'; expected 'REVENUE' or 'ROOMS'."
            )

        data = remove_padded_cols(data,hotel_config.lead_window,max_lead_window,col_prefix)

        model_version = 1
        model_stage = "Staging"
        model_name = None
        # Defined before the try block so the except/finally clauses can
        # safely tell whether a model wrapper was ever created.
        model_obj = None

        try:

            if model_type == "LIGHTGBM":
                model_obj = LightGBMModel(prediction_horizon=hotel_config.inference_length,
                                          hotel_id=hotel_id,
                                          target_type=target_type,
                                          exclude_pms=exclude_pms,
                                          cd_axis_max_lags=99, 
                                          static_cols =static_cols_)
                model_obj.set_latest_model_version(model_stage = 'Production')

                loaded_model = pyfunc_load_model_retry(model_obj.get_model_uri(), 6)

            elif model_type == "AUTOGLUON":

                model_obj = AutoGluonModel(
                    prediction_horizon=hotel_config.inference_length,
                    calc_uncertainty=calc_uncertainty,
                    mlflow_run_id=run_id,
                    hotel_id=hotel_id,
                    target_type=target_type,
                    exclude_pms=exclude_pms,
                )

                model_obj.set_latest_model_version()

                pms = "PMS"
                if exclude_pms:
                    pms = "NOPMS"

                # Cached model directory: <model_cache_dir><hotel>_<target>_<PMS|NOPMS>
                dbfs_dir = f"{model_cache_dir}{hotel_id}_{target_type}_{pms}"
                local_dir = model_obj.local_root

                # Start from a clean local directory.
                if os.path.exists(local_dir):
                    shutil.rmtree(local_dir)

                # Copy cached model from blob storage to local dir
                shutil.copytree(dbfs_dir, local_dir)

                # load model
                loaded_model = load_pkl.load(path=model_obj.local_path)
                loaded_model.prediction_horizon = model_obj.prediction_horizon

            else:
                # Previously this fell through and failed with an
                # UnboundLocalError on model_obj.
                raise ValueError(
                    f"Unsupported model type '{model_type}' configured for hotel {hotel_id}."
                )

            model_version = int(model_obj.version)
            # One model-name entry per forecast step.
            model_name = [
                model_obj.get_model_name()
                for step in range(1, hotel_config.inference_length + 1)
            ]
            model_metadata = model_obj.get_remote_model_metadata()
            # Fixed: the f-prefix was missing, so the placeholder was logged literally.
            logger.info(f"Using model version {model_version}")

            logger.info(f"Inference length of model: {model_metadata.get('inference_length','NOT_FOUND')}")
            logger.info(f"Last trained date: {model_metadata.get('last_trained_date','NOT_FOUND')}")

            y_pred_raw, y_test, y_upper_raw, y_lower_raw = loaded_model.predict(data)
            y_pred, y_upper, y_lower = correct_prediction_list(
                y_pred_raw, y_test, y_upper_raw, y_lower_raw
            )

            data["status"] = "complete"
            data["message"] = f"Successfully processed {hotel_id}"

            output_df = get_output_df(
                y_pred=y_pred,
                y_true=y_test,
                run_id=run_id,
                hotel_id=hotel_id,
                data=data,
                model_name=model_name,
                model_version=model_version,
                pms_sync_off=exclude_pms,
                prediction_horizon=hotel_config.inference_length,
                y_upper=y_upper,
                y_lower=y_lower,
                y_med_raw=y_pred_raw,
                y_upper_raw=y_upper_raw,
                y_lower_raw=y_lower_raw,
            )

            output_df["status"] = "complete"
            output_df["message"] = f"Successfully processed {hotel_id}"

        except MlflowException as e:
            if "RESOURCE_DOES_NOT_EXIST" in e.message:
                # The wrapper may not exist if its constructor raised; the old
                # code also referenced an undefined `model` name on this path.
                if model_obj is not None:
                    missing_model = model_obj.get_model_name()
                else:
                    missing_model = f"<unknown model for hotel {hotel_id}>"
                print(
                    f"Model {missing_model} was not  found in the model registry. Skipping this model..."
                )
            else:
                print("An MLFlowException occured")
                print(e)

            # Placeholder row so the hotel appears in the output as
            # "incomplete" together with the error message.
            empty = pd.DataFrame(
                {
                    "HotelID": [hotel_id],
                    "run_id": [run_id],
                    "stay_date": [pd.Timestamp("1900-01-01")],
                    "booking_date": [pd.Timestamp("1900-01-01")],
                    "model_version": [0],
                    "timestamp": [pd.Timestamp("1900-01-01")],
                    "pms_sync_off": [exclude_pms],
                    "day_index": [0],
                    "y_med": [0],
                    "model_name": [""],
                    "y_upper": [0],
                    "y_lower": [0],
                    "y_med_raw": [0],
                    "y_upper_raw": [0],
                    "y_lower_raw": [0],
                    "status": "incomplete",
                    "message": e.message,
                }
            )

            return empty

        except Exception:
            print(f"Hotel {hotel_id} encountered an error ")
            raise
        finally:
            # Only clean up when a wrapper was actually created; otherwise the
            # cleanup itself would raise and hide the original error.
            if model_type == "AUTOGLUON" and model_obj is not None:
                model_obj.clean()

        return output_df

    return predict_distributed

In [0]:
logger.info("Preprocessing data")
# From here on `df` is the preprocessed frame derived from the raw Spark frame `dfsp`.
df = preprocess_data(
    dfsp,
    WITHOUT_PMS,
    REVENUE_COL,
    ROOMS_COL,
    MODEL_START_DATE,
    COVID_START_DATE,
    COVID_END_DATE,
)

logger.info("Calculating date features")
dates = calc_date_features(df)

logger.info("Calculating lag features")
# Lags are computed on a pandas copy of the preprocessed data, before the
# test-partition filter below is applied.
df_lags = get_lags(df.toPandas(),lag_numbers=LAG_NUMBERS, target_col=target_column)


logger.info("Filtering test partition")
# Keep the window from PARTITION_DATE to TEST_PARTIITON_END; `df` is rebound to the filtered result.
df = filter_test_partition(
    df=df,
    partition_start=PARTITION_DATE,
    partition_end=TEST_PARTIITON_END,
    revenue_col=REVENUE_COL,
    rooms_col=ROOMS_COL,
)

In [0]:
logger.info("Compiling test data set")
# Combine the filtered data, lag features and date features into the inference
# input, using compile_test_table as the compile function.
df = compile_hotel_tables(
    df, df_lags, dates,target_column=target_column,config_provider=forecasting_config_provider,compile_fn=compile_test_table
)

In [0]:
# DEBUG
# output = debug_prediction(df, TARGET_TYPE, ML_EXPERIMENT_ID, RUN_ID, WITHOUT_PMS, CALC_UNCERTAINTY,forecasting_config_provider,model_cache_dir=env_config.model_cache_dir)

In [0]:
# Convert the data frame to a Spark data frame and add a status column for reporting purposes.
# Every row starts as "incomplete".
df = spark.createDataFrame(df)
df = df.withColumn("status", lit("incomplete"))

In [0]:
# Group the data by hotel id and execute the inferences in parallel.
# Each hotel's rows are handed to predict_distributed as one pandas DataFrame;
# the result must conform to `schema`.
# NOTE(review): Spark evaluates this lazily — the inference presumably runs when
# output_df is written in a later cell.
logger.info("Starting parallell processing")
output_df = df.groupby("HotelID").applyInPandas(
    prediction_wrapper(
        TARGET_TYPE, ML_EXPERIMENT_ID, RUN_ID, WITHOUT_PMS, CALC_UNCERTAINTY,forecasting_config_provider,model_cache_dir=env_config.model_cache_dir
    ),
    schema,
)

In [0]:
# Remove any intermediate results table left over from a previous run.
logger.info("Drop intermediate results table if it exists")
spark.sql(f"DROP TABLE IF EXISTS {env_config.inference_intermediate_table}")

In [0]:
logger.info(
    f"Writing inference results to temporary table {env_config.inference_intermediate_table}"
)
# Write all inference output, including rows marked "incomplete", to the
# intermediate table, replacing both its content and its schema.
(
    output_df.write.mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(env_config.inference_intermediate_table)
)

In [0]:
# Read the intermediate table back and collect only the metadata columns to the driver.
meta_columns = ["HotelID", "run_id", "timestamp", "pms_sync_off", "status", "message"]
results_table = spark.sql(f"select * from {env_config.inference_intermediate_table}")
output_meta = results_table.select(meta_columns).toPandas()

# Count distinct hotels with at least one "complete" row against all distinct hotels.
num_completed = output_meta[output_meta["status"] == "complete"]["HotelID"].nunique()
total = output_meta["HotelID"].nunique()
logger.info(f"{num_completed} out of {total} hotels processed succussfully")

In [0]:
# Rows whose status is anything other than "complete".
incomplete = output_meta[~(output_meta["status"] == "complete")]

# Log one error per such row with the hotel id and the recorded message.
for row in incomplete.itertuples():
    logger.error(
        f"Error encountered when processing hotel {row.HotelID}: {row.message}"
    )

In [0]:
# Keep only successful rows and drop the reporting columns before the final write.
# `output_df` is rebound here: it now comes from the intermediate table.
output_df = results_table.filter(results_table.status == "complete").drop(
    "status", "message"
)

In [0]:
logger.info("Writing completed results to table")
# NOTE(review): file_format is assigned but not used; the write below passes the literal "delta".
file_format = "delta"


# Append the completed results to the output table, partitioned by HotelID and
# stored at the configured blob path.
# NOTE(review): overwriteSchema is set together with mode("append") — confirm it is needed.
(
    output_df.write.format("delta")
    .mode("append")
    .partitionBy("HotelID")
    .option("path", env_config.inference_output_table_blob)
    .option("overwriteSchema", "true")
    .saveAsTable(env_config.inference_output_table)
)

In [0]:
# Total run time since start_time was taken, logged in seconds and in minutes.
elapsed_time = time.perf_counter() - start_time
logger.info(f"Time elapsed {elapsed_time}")
logger.info(f"Time elapsed in minutes {elapsed_time/60}")