In [0]:
%pip install /dbfs/FileStore/python-wheels/dev/phgml-1.4.0-py3-none-any.whl -q

In [0]:
# Notebook widgets. All values are read back as strings further down and
# parsed into params["CA_AWARE"], params["IS_USD_CURRENCY"] and
# params["FORECAST_POINTS"].
# Boolean flags are exposed as "True"/"False" dropdowns (default "True").
dbutils.widgets.dropdown("ca_aware", "True", ["True", "False"], "Cancellation Aware")
dbutils.widgets.dropdown("is_usd_currency", "True", ["True", "False"], "Use USD currency")
# Comma-separated list of forecast points.
dbutils.widgets.text("forecast_points", "91,84,77,70,63,56,49,42,35", "Forecast points")

In [0]:
from typing import List
import pandas as pd
import pyspark.sql.functions as F
import logging
import datetime
import os
import sys

In [0]:
def str_to_bool(value):
    """
    Convert a textual flag (e.g. a widget value) into a bool.

    The comparison is case-insensitive and ignores surrounding whitespace.
    Non-string inputs are converted with str() first, so real booleans and
    the integers 0/1 are accepted as well.

    input params:
        value : value to interpret; one of 'true', 'yes', '1', 'false', 'no', '0'

    output:
        (bool) : the parsed boolean

    raises:
        ValueError : if the value is not one of the recognised spellings
    """
    FALSE_VALUES = ['false', 'no', '0']
    TRUE_VALUES = ['true', 'yes', '1']
    lvalue = str(value).strip().lower()
    if lvalue in FALSE_VALUES:
        return False
    if lvalue in TRUE_VALUES:
        return True
    # ValueError is a subclass of Exception, so callers that catch Exception
    # keep working while new callers can catch the narrower type.
    raise ValueError(
        "String value should be one of {}, but got '{}'.".format(FALSE_VALUES + TRUE_VALUES, value)
    )

def extract_param_values(value: str) -> List[str]:
    """
    Split a comma-separated string into a list of strings.

    Each item is stripped of surrounding whitespace and empty items
    (e.g. from a trailing comma or a doubled comma) are dropped, so
    "91, 84," yields ["91", "84"].

    input params:
        value (str) : comma-separated strings; "" or None gives an empty list

    output:
        (List) : list of non-empty, trimmed strings in their original order
    """
    if not value:
        return []
    # str.split already returns a single-element list when there is no comma,
    # so no separate branch is needed for that case.
    return [item.strip() for item in value.split(",") if item.strip()]

In [0]:
# Hard-coded development defaults. The commented-out block below shows how the
# same keys are read from the upstream "capture_params" task when this notebook
# runs inside a job.
params = {'WITHOUT_PMS': False, 'SELECTED_HOTELS': ['56217','76630','27226','79051','79908','36063','27275'], 'ENV': 'dev', 'CACHE_MODELS': False, 'TARGET_TYPE': 'REVENUE', 'MODEL_TYPE': 'FARFIELD', 'REPOPATH': '/Workspace/Repos/yasith.udawatte@henrymwuamica.onmicrosoft.com/phg-data-mlsys'}
# params = {} 
# params['WITHOUT_PMS'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "WITHOUT_PMS")
# params['SELECTED_HOTELS'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "SELECTED_HOTELS")
# params['ENV'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "ENV")
# params['CACHE_MODELS'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "CACHE_MODELS")
# params['TARGET_TYPE'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "TARGET_TYPE")
# params['MODEL_TYPE'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "MODEL_TYPE")
# params['REPOPATH'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "REPOPATH")

# Widget text is user input: parse it with int() instead of eval() so that it
# can never be executed as code. Blank items (e.g. from a trailing comma) are
# skipped rather than failing the parse.
params["FORECAST_POINTS"] = [
    int(point)
    for point in extract_param_values(dbutils.widgets.get("forecast_points"))
    if point.strip()
]
if not params["FORECAST_POINTS"]:
    # Fail early with a clear message instead of the opaque
    # "max() arg is an empty sequence" raised below.
    raise ValueError("Widget 'forecast_points' must contain at least one integer.")

params["CA_AWARE"] = str_to_bool(dbutils.widgets.get("ca_aware"))
params["IS_USD_CURRENCY"] = str_to_bool(dbutils.widgets.get("is_usd_currency"))
params["DAYS_AHEAD"] = 7
params["MAX_FORECAST_POINT"] = max(params["FORECAST_POINTS"])
params["MIN_FORECAST_POINT"] = min(params["FORECAST_POINTS"])
params["MAX_TARGET_LEAD"] = 100
params["MAX_LEAD"] = 151
# Lags start at 7 and advance by DAYS_AHEAD, stopping before MAX_LEAD.
params["LAG_NUMBERS"] = list(range(7, params["MAX_LEAD"], params["DAYS_AHEAD"]))
params["MODEL_START_DATE"] = pd.to_datetime("2018-10-01")
params["COVID_START_DATE"] = pd.to_datetime("2020-03-01")
params["COVID_END_DATE"] = pd.to_datetime("2021-08-01")
params["REVENUE_COL"] = "_reservationRevenuePerRoomUSD"
params["ROOMS_COL"] = "_rooms"

In [0]:
# if params["ENV"] == "dev":
#     print(f"Loading phgml package from repo {params['REPOPATH']}")
#     sys.path.append(os.path.abspath(params["REPOPATH"]))

# sys.path.append(os.path.abspath("/Workspace/Repos/yasith.udawatte@henrymwuamica.onmicrosoft.com/phg-data-mlsys"))

In [0]:
from phgml.data.config import FarfieldForecastingHotelConfigProvider,FarfieldEnvironmentConfig
from phgml.reporting.logging import get_dbx_logger
from phgml.data.processing_distr import (
    filter_hotels
)
from phgml.data.processing_distr_spark import (
    filter_data,
    preprocess_data,
    calc_date_features,
    get_lags,
    compile_hotel_tables,    
)

In [0]:
# Environment-level configuration for this pipeline run.
env_config = FarfieldEnvironmentConfig(
    env=params["ENV"],
    without_pms=params["WITHOUT_PMS"],
    target=params["TARGET_TYPE"],
    spark=spark,
    is_usd_currency=params["IS_USD_CURRENCY"],
)
# Per-hotel forecasting configuration, resolved for the same environment.
forecasting_config_provider = FarfieldForecastingHotelConfigProvider(
    spark=spark,
    env=params["ENV"],
)
params["TARGET_COLUMN"] = env_config.target_column

# Output table for the preprocessed data. The env_config tables
# (prep_train_revenue_table / prep_train_rooms_table) are deliberately not
# used here; the notebook writes to the testing_data schema instead.
params['PREPROCESSED_TABLE'] = (
    "testing_data.pp_ff_preprocess_rv"
    if params["TARGET_TYPE"] == 'REVENUE'
    else "testing_data.pp_ff_preprocess_rm"
)

In [0]:
# Pipeline logger, tagged with the environment, target type and PMS mode.
logger = get_dbx_logger(
    pipeline=params["ENV"],
    task_type=params["TARGET_TYPE"],
    exclude_pms=params["WITHOUT_PMS"],
)
logger.setLevel(logging.INFO)

# The most recent transaction timestamp in the source table is used as the
# partition date for this run.
params["PARTITION_DATE"] = (
    spark.sql(f"select max(transactionTimeStamp) from {env_config.source_data_table}")
    .collect()[0][0]
)

In [0]:
logger.info("Selecting hotels.")
# Hotel dimension data is pulled to the driver as a pandas DataFrame because
# filter_hotels is called with it directly.
# NOTE(review): the table name ends with a trailing underscore
# ("dim_hotels_") - confirm this is the intended table and not a truncated name.
hotel_details = spark.sql(
    "select HotelID,HotelName,PMSStartDate from phg_data.dim_hotels_"
).toPandas()

# Resolve the final list of hotel IDs for this run from the requested hotels,
# the PMS flag and the per-hotel forecasting configuration.
params["CORRECTED_HOTEL_IDS"] = filter_hotels(
    hotel_details,
    params["SELECTED_HOTELS"],
    params["WITHOUT_PMS"],
    forecasting_config_provider,
    mode="TRAINING"
)

In [0]:
logger.info("Loading data")
columns = [
    "HotelID",
    "_StayDates",
    "confirmationDate",
    "departureDate",
    "channel",
    "status",
    params["REVENUE_COL"],
    params["ROOMS_COL"],
]


def _prepare_hotel_frame(sdf, hotel_ids):
    """
    Apply the shared post-load steps to a raw Spark DataFrame.

    Casts `_StayDates` and `confirmationDate` to dates, keeps only the rows of
    `hotel_ids` when the list is non-empty, and repartitions by `HotelID`.

    input params:
        sdf : Spark DataFrame with HotelID, _StayDates and confirmationDate columns
        hotel_ids (list) : hotel IDs to keep; an empty list keeps every hotel

    output:
        Spark DataFrame repartitioned on HotelID
    """
    sdf = (
        sdf.withColumn('_StayDates', F.to_date('_StayDates'))
        .withColumn('confirmationDate', F.to_date('confirmationDate'))
    )
    if hotel_ids:
        # One partition per selected hotel.
        return (
            sdf.filter(F.col("HotelID").isin(hotel_ids))
            .repartition(len(hotel_ids), "HotelID")
        )
    # With no explicit hotel list the count would be 0, and Spark rejects a
    # non-positive partition count; fall back to Spark's default number of
    # partitions while still partitioning on HotelID.
    return sdf.repartition("HotelID")


# Confirmed reservations only (used for the non-cancellation-aware path).
dfsp = spark.sql(
    f"select {','.join(columns)} from {env_config.source_data_table} where status='Confirmed' "
)

# Select transaction data along with the cancellation data from the raw dataset.
# Revenue/rooms column names come from params so both queries stay in sync.
ca_columns = [
    "a.TransactionID",
    "a.HotelID",
    "a._StayDates",
    "a.confirmationDate",
    "a.departureDate",
    "a.channel",
    "a.status",
    "a.cancellationNumber",
    f"a.{params['REVENUE_COL']}",
    f"a.{params['ROOMS_COL']}",
    "b.cancellationDate",
]
dfsp_ca = spark.sql(
    f"select {','.join(ca_columns)} from {env_config.source_data_table} as a left join {env_config.transaction_data_table} as b on a.TransactionID=b.TransactionID"
)

params['NO_OF_HOTELS'] = len(params["CORRECTED_HOTEL_IDS"])
dfsp = _prepare_hotel_frame(dfsp, params["CORRECTED_HOTEL_IDS"])
dfsp_ca = _prepare_hotel_frame(dfsp_ca, params["CORRECTED_HOTEL_IDS"])

In [0]:
logger.info("Preprocessing data")
# Preprocess the confirmed-only frame (cancel_aware=False) ...
df = preprocess_data(
    dfsp,
    params["WITHOUT_PMS"],
    params["REVENUE_COL"],
    params["ROOMS_COL"],
    params["MODEL_START_DATE"],
    cancel_aware=False
)

# ... and the frame joined with cancellation dates (cancel_aware=True).
df_ca = preprocess_data(
    dfsp_ca,
    params["WITHOUT_PMS"],
    params["REVENUE_COL"],
    params["ROOMS_COL"],
    params["MODEL_START_DATE"],
    cancel_aware = True
)

# Date features are returned indexed by date (presumably a pandas DataFrame -
# TODO confirm); the index is moved into a "_StayDates" column and the result
# is converted to a Spark DataFrame with "_StayDates" cast to a date.
dates = spark.createDataFrame(calc_date_features(df).reset_index().rename(columns={"index":"_StayDates"}))
dates = dates.withColumn('_StayDates', F.to_date('_StayDates'))

# Lag features are computed from `df` BEFORE filter_data is applied below, so
# they see the preprocessed data prior to the partition-date filtering.
df_lags = get_lags(
    df,
    lag_numbers=params["LAG_NUMBERS"], 
    target_col=params["TARGET_COLUMN"])

logger.info(f"Stay dates filtering upto : {params['PARTITION_DATE']}")

# Apply the partition-date filter to both variants; from here on `df` and
# `df_ca` refer to the filtered frames.
df = filter_data(
    df=df, 
    partition_date=params["PARTITION_DATE"], 
    revenue_col=params["REVENUE_COL"], 
    rooms_col=params["ROOMS_COL"], 
    cancel_aware=False
)

df_ca = filter_data(
    df=df_ca, 
    partition_date=params["PARTITION_DATE"], 
    revenue_col=params["REVENUE_COL"], 
    rooms_col=params["ROOMS_COL"], 
    cancel_aware=True
)

In [0]:
# Target table built from the non-cancellation-aware frame. It uses the
# shorter MAX_TARGET_LEAD window and a "_tgt" suffix, and is later joined with
# output_df_ca on HotelID and _StayDates.
output_df_targets = compile_hotel_tables(
    df=df,
    target_type=params["TARGET_TYPE"],
    target_column=params["TARGET_COLUMN"],
    prediction_horizon=params["MAX_FORECAST_POINT"]+1,
    lead_window=params["MAX_TARGET_LEAD"],
    selected_hotels=params["CORRECTED_HOTEL_IDS"],
    spark=spark,
    suffix="_tgt"
)

In [0]:
# Table built from the cancellation-aware frame, with the date features and
# lag features passed in. Unlike the target table it uses MAX_LEAD + 1 as the
# lead window. cancel_aware=True is passed unconditionally, i.e. independent
# of params["CA_AWARE"].
output_df_ca = compile_hotel_tables(
    df=df_ca, 
    target_type=params["TARGET_TYPE"],
    target_column=params["TARGET_COLUMN"],
    prediction_horizon=params["MAX_FORECAST_POINT"]+1,
    lead_window=params["MAX_LEAD"]+1,
    selected_hotels=params["CORRECTED_HOTEL_IDS"],
    spark=spark,
    dates=dates,
    df_lags=df_lags,
    cancel_aware=True 
)

In [0]:
# Ordering columns: the cancellation-aware output is additionally ordered by
# forecast_index ahead of the hotel / stay-date keys.
group_cols = (
    ["forecast_index", "HotelID", "_StayDates"]
    if params["CA_AWARE"]
    else ["HotelID", "_StayDates"]
)

# Keep only hotel / stay-date pairs present in both tables, then order them.
output_df = (
    output_df_targets
    .join(output_df_ca, on=['HotelID', '_StayDates'], how='inner')
    .orderBy(group_cols)
)

In [0]:
# Storage format of the output table; used by the writer below.
file_format = "delta"

logger.info(f"Writing preprocessed data to table {params['PREPROCESSED_TABLE']}")
(
    output_df.write.format(file_format)
    # Rows are appended, so re-running this cell adds the same data again.
    .mode("append") # append, overwrite
    .partitionBy("HotelID")
    # NOTE(review): per the Delta Lake docs "overwriteSchema" is only honoured
    # together with mode("overwrite"); with "append" it presumably has no
    # effect ("mergeSchema" is the append-time option) - confirm intent.
    .option("overwriteSchema", "true")
    .saveAsTable(params['PREPROCESSED_TABLE'])
)

In [0]:
# Dump the resolved run parameters into the cell output for traceability.
print(params)

In [0]:
# Publish the resolved parameters as job task values so downstream tasks can
# read them. Keys are published in this exact order.
published_keys = [
    'FORECAST_POINTS',
    'CA_AWARE',
    'IS_USD_CURRENCY',
    'DAYS_AHEAD',
    'MAX_FORECAST_POINT',
    'MIN_FORECAST_POINT',
    'MAX_TARGET_LEAD',
    'MAX_LEAD',
    'LAG_NUMBERS',
    'MODEL_START_DATE',
    'COVID_START_DATE',
    'COVID_END_DATE',
    'REVENUE_COL',
    'ROOMS_COL',
    'TARGET_COLUMN',
    'PREPROCESSED_TABLE',
    'PARTITION_DATE',
    'CORRECTED_HOTEL_IDS',
]
# Date/timestamp parameters are passed through str() before being published.
stringified_keys = {
    'MODEL_START_DATE',
    'COVID_START_DATE',
    'COVID_END_DATE',
    'PARTITION_DATE',
}

for task_key in published_keys:
    task_value = params[task_key]
    if task_key in stringified_keys:
        task_value = str(task_value)
    dbutils.jobs.taskValues.set(key=task_key, value=task_value)
