In [0]:
# Notebook parameters, exposed as Databricks widgets so they can be overridden
# per run. Each dropdown entry is passed positionally as
# (name, default value, choices, label).
_DROPDOWN_WIDGETS = [
    ("env_stage", "dev", ["dev", "prod"], "Pipeline stage"),
    ("exclude_pms", "False", ["True", "False"], "Exclude PMS"),
    ("target_type", "REVENUE", ["REVENUE", "ROOMS"], "Target Type"),
    ("is_usd_currency", "True", ["True", "False"], "Use USD currency"),
]
for _dropdown_args in _DROPDOWN_WIDGETS:
    dbutils.widgets.dropdown(*_dropdown_args)

# Free-text parameters, passed as (name, default value, label). Both values are
# later split on commas by str_to_lst.
_TEXT_WIDGETS = [
    ("selected_hotels", "", "Hotels"),
    ("lag_numbers", "1,7,14,28", "Lag Numbers"),
]
for _text_args in _TEXT_WIDGETS:
    dbutils.widgets.text(*_text_args)

In [0]:
%load_ext autoreload

# Mode 2: reload all imported modules before each cell runs, so source edits
# are picked up without restarting the kernel.
%autoreload 2

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import *
from pyspark.sql.types import (
    StringType,
    DateType,
    IntegerType,
    StructField,
    StructType,
    DoubleType,
    LongType,
)
from sktime.transformations.series.date import DateTimeFeatures
from sktime.performance_metrics.forecasting import (
    mean_absolute_percentage_error,
    MeanAbsolutePercentageError,
    mean_absolute_error,
)
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
import holidays
import datetime
from pathlib import Path
import pickle
import os
from sys import version_info
import cloudpickle
from pyspark.sql.functions import pandas_udf, PandasUDFType
import logging
import warnings
import time

In [0]:
warnings.filterwarnings("ignore")
start_time = time.perf_counter()

In [0]:
# `sys` is needed for sys.path below. The notebook's import cell only does
# `from sys import version_info`, which does not bind the name `sys`, so import
# it explicitly instead of relying on a wildcard import leaking it.
import sys

# Pipeline stage selected through the "env_stage" widget.
ENV = getArgument("env_stage")

# NOTE(review): this points at a user-specific Repos checkout; consider making
# it configurable so other developers can run the dev stage.
REPOPATH = "/Workspace/Repos/manik@surge.global/phg-data-mlsys/src"

if ENV == "dev":
    print(f"Loading phgml package from repo {REPOPATH}")
    repo_src_path = os.path.abspath(REPOPATH)
    # Re-running the cell must not pile up duplicate sys.path entries.
    if repo_src_path not in sys.path:
        sys.path.append(repo_src_path)

In [0]:
from phgml.data.processing_distr_ca import (
    calc_date_features,
    add_date_features,
    preprocess_data,
    filter_data,
    aggregate_target,
    create_rows,
    compile_train_table,
    compile_hotel_tables,
    filter_hotels,
    get_booking_pace_targets,
    get_lags
)
from phgml.data.data_types import revenue_preprocessed_schema
from phgml.data.config import EnvironmentConfig, ForecastingHotelConfigProvider
from phgml.reporting.logging import get_logging_path, get_logging_filename, get_dbx_logger

In [0]:
def str_to_lst(value):
    """Split a comma-separated string into a list of trimmed, non-empty items.

    Args:
        value: Raw string such as ``"1,7,14,28"``. ``None`` and ``""`` both
            mean "nothing supplied".

    Returns:
        list[str]: The items in input order. Whitespace around each item is
        removed and empty entries (for example from a trailing comma) are
        dropped, so ``"H1, H2,"`` yields ``["H1", "H2"]``.
    """
    if not value:
        return []
    # str.split already returns [value] when there is no comma, so a single
    # code path covers both the one-item and the many-item case.
    return [item.strip() for item in value.split(",") if item.strip()]

In [0]:
def str_to_bool(value):
    """Interpret a textual flag as a boolean.

    Args:
        value: Any object; its ``str()`` form is compared case-insensitively,
            ignoring surrounding whitespace.

    Returns:
        bool: ``False`` for "false"/"no"/"0", ``True`` for "true"/"yes"/"1".

    Raises:
        ValueError: If the value matches none of the accepted spellings.
            ``ValueError`` is an ``Exception`` subclass, so callers that catch
            ``Exception`` keep working.
    """
    FALSE_VALUES = ["false", "no", "0"]
    TRUE_VALUES = ["true", "yes", "1"]
    normalized = str(value).strip().lower()
    if normalized in FALSE_VALUES:
        return False
    if normalized in TRUE_VALUES:
        return True
    raise ValueError(
        "String value should be one of {}, but got '{}'.".format(
            FALSE_VALUES + TRUE_VALUES, value
        )
    )

In [0]:
# log_root = "/dbfs/mnt/extractionlogs/synxis"
# processing_timestamp = datetime.datetime.now()

In [0]:
# Source columns that hold the two possible targets.
REVENUE_COL = "_reservationRevenuePerRoomUSD"
ROOMS_COL = "_rooms"
# Pipeline name passed to the logger factory.
PIPELINE = "PREPROCESS"

# Run parameters read from the notebook widgets and converted to Python types.
WITHOUT_PMS = str_to_bool(getArgument("exclude_pms"))
IS_USD_CURRENCY = str_to_bool(getArgument("is_usd_currency"))
TARGET_TYPE = getArgument("target_type")
selected_hotels = str_to_lst(getArgument("selected_hotels"))
# PARTITION_DATE = pd.to_datetime('2022-08-01')

# The start of the model data.
MODEL_START_DATE = pd.to_datetime("2018-10-01")

COVID_START_DATE = pd.to_datetime("2020-03-01")
COVID_END_DATE = pd.to_datetime("2021-08-01")

CALC_UNCERTAINTY = True
MODEL_TYPE = "AG"

# Lag offsets parsed from the comma-separated "lag_numbers" widget value.
LAG_NUMBERS = [int(lag) for lag in str_to_lst(getArgument("lag_numbers"))]


In [0]:
# Configuration objects used by this pipeline.
env_config = EnvironmentConfig(
    env=ENV,
    target=TARGET_TYPE,
    spark=spark,
    is_usd_currency=IS_USD_CURRENCY,
)
forecasting_config_provider = ForecastingHotelConfigProvider(spark=spark, env=ENV)
# Target column name as resolved by the environment configuration.
target_column = env_config.target_column

In [0]:
# Always read from the production source tables, regardless of ENV, because the
# dev environment maintains less data. This overrides whatever table names
# EnvironmentConfig resolved for the selected stage.
env_config.source_data_table = 'phg_data.consumption_deaggrecords'
env_config.transaction_data_table = 'phg_data.consumption_mrt'

In [0]:
# Latest transaction timestamp in the source table; used further down as the
# partition cut-off passed to filter_data.
max_timestamp_query = f"select max(transactionTimeStamp) from {env_config.source_data_table}"
PARTITION_DATE = spark.sql(max_timestamp_query).collect()[0][0]

# Pipeline logger, emitting INFO and above.
logger = get_dbx_logger(pipeline=PIPELINE, task_type=TARGET_TYPE, exclude_pms=WITHOUT_PMS)
logger.setLevel(logging.INFO)

In [0]:
logger.info("Selecting hotels.")
# NOTE(review): the table name ends with an underscore ("dim_hotels_") —
# confirm this is the intended table and not a typo.
hotel_details_query = "select HotelID,HotelName,PMSStartDate from phg_data.dim_hotels_"
hotel_details = spark.sql(hotel_details_query).toPandas()

# Narrow the hotel list to the requested hotels for training mode.
correct_hotel_ids = filter_hotels(
    hotel_details,
    selected_hotels,
    WITHOUT_PMS,
    forecasting_config_provider,
    mode="TRAINING",
)

In [0]:
logger.info("Loading data")
# Columns kept for the confirmed-only dataset that is derived from the query
# result below.
columns = [
    "HotelID",
    "_StayDates",
    "confirmationDate",
    "departureDate",
    "channel",
    "status",
    REVENUE_COL,
    ROOMS_COL,
]

# Select transaction data along with the cancellation data from the raw
# dataset. The revenue and rooms columns are interpolated from REVENUE_COL and
# ROOMS_COL so the query cannot drift from `columns`; with the current constant
# values the generated SQL text is unchanged.
dfsp_ca = spark.sql(
    "select a.TransactionID,a.HotelID,a._StayDates,a.confirmationDate,a.departureDate,"
    f"a.channel,a.status,a.cancellationNumber,a.{REVENUE_COL},a.{ROOMS_COL},b.cancellationDate"
    f" from {env_config.source_data_table} as a"
    f" left join {env_config.transaction_data_table} as b on a.TransactionID=b.TransactionID"
)

# Drop the detected anomalies from the joined data:
#   * Confirmed rows that carry a cancellation date,
#   * Cancelled rows without a cancellation date,
#   * Cancelled rows whose cancellation date is after the stay date.
# Both predicates reference the unfiltered join result.
confirmed_rows = (col("status") == "Confirmed") & dfsp_ca.cancellationDate.isNull()
valid_cancelled_rows = (
    (col("status") == "Cancelled")
    & dfsp_ca.cancellationDate.isNotNull()
    & (dfsp_ca.cancellationDate <= dfsp_ca._StayDates)
)

# Confirmed-only view, restricted to the modelling columns.
dfsp = dfsp_ca.filter(confirmed_rows).select(columns)

# Cancel-aware view: clean confirmed rows plus valid cancellations.
dfsp_ca = dfsp_ca.filter(confirmed_rows | valid_cancelled_rows)

# When correct_hotel_ids is empty no hotel restriction is applied.
if correct_hotel_ids:
    dfsp = dfsp.filter(dfsp.HotelID.isin(correct_hotel_ids))
    dfsp_ca = dfsp_ca.filter(dfsp_ca.HotelID.isin(correct_hotel_ids))

logger.info("Preprocessing data")
# Positional arguments shared by both preprocessing passes.
preprocess_args = (WITHOUT_PMS, REVENUE_COL, ROOMS_COL, MODEL_START_DATE)
df = preprocess_data(dfsp, *preprocess_args, cancel_aware=False)
df_ca = preprocess_data(dfsp_ca, *preprocess_args, cancel_aware=True)

# Date features and lags are derived from the confirmed-only data before the
# partition filter below is applied.
dates = calc_date_features(df)

df_lags = get_lags(df.toPandas(), lag_numbers=LAG_NUMBERS, target_col=target_column)

logger.info(f"Stay dates filtering upto : {PARTITION_DATE}")

# Keyword arguments shared by both filtering passes.
filter_kwargs = {
    "partition": PARTITION_DATE,
    "revenue_col": REVENUE_COL,
    "rooms_col": ROOMS_COL,
}
df = filter_data(df=df, cancel_aware=False, **filter_kwargs)
df_ca = filter_data(df=df_ca, cancel_aware=True, **filter_kwargs)

In [0]:
# Record the run parameters (stage, target type/column, PMS exclusion) in the
# pipeline log.
logger.info(f"Executing pipeline stage: {ENV}")
logger.info(f"Processing data for target type: {TARGET_TYPE} : {target_column}")
logger.info(f"Excluding PMS data? {WITHOUT_PMS}")

In [0]:
logger.info("Compiling train data set")

# Targets are created from the confirmed-only data, i.e. with cancellations
# nulled out entirely.
distinct_hotel_ids = df.select(col("HotelID")).distinct().toPandas()
hotels = list(distinct_hotel_ids["HotelID"])
all_inf_lens = [
    forecasting_config_provider.get_config(hotel_id).training_length
    for hotel_id in hotels
]

# The longest configured training length across hotels is used as the forecast
# length. NOTE(review): keep np.max here — the wildcard import from
# pyspark.sql.functions most likely rebinds the builtin `max`.
output_df_targets = get_booking_pace_targets(
    data=df,
    forecast_length=np.max(all_inf_lens),
    target_column=target_column,
)


In [0]:
# Compile the cancel-aware training table from the cancel-aware data, the lag
# features and the date features.
output_df_ca = compile_hotel_tables(
    df_ca,
    df_lags,
    dates,
    target_column=target_column,
    config_provider=forecasting_config_provider,
    compile_fn=compile_train_table,
    cancel_aware=True,
)

In [0]:
# Inner join: only (HotelID, _StayDates) pairs present in both the targets and
# the cancel-aware table are kept.
output_df = output_df_targets.merge(
    output_df_ca,
    on=["HotelID", "_StayDates"],
    how="inner",
)

In [0]:
# ENV_NAME = 'phg' if ENV=='prod' else ENV
# DATE_REMOVAL_TB_NAME = f"{ENV_NAME}_data.config_date_range_removal"
# # removing date ranges if any
# output_df = spark.createDataFrame(output_df)

# date_removal_df_retrieved=None
# try:
#     date_removal_df_retrieved = spark.sql(f"select * from {DATE_REMOVAL_TB_NAME}")
#     date_removal_df_retrieved = date_removal_df_retrieved.filter(date_removal_df_retrieved.pipeline=='main_training_pipeline')
#     date_removal_df_retrieved = date_removal_df_retrieved.filter(date_removal_df_retrieved.HotelID.isin(selected_hotels))
#     date_removal_df_retrieved = date_removal_df_retrieved.toPandas()
#     date_removal_df_retrieved = date_removal_df_retrieved.groupby('HotelID').apply(lambda x: x[x.timestamp==x.timestamp.max()]).reset_index(drop=True)
# except Exception as e:
#     print(e)

# if type(date_removal_df_retrieved)!=type(None):
#     filtered_data_sp = None
#     for hid in selected_hotels:
#         print(hid)
#         train_data_cpy_sp = output_df.filter(output_df.HotelID==hid)
#         st_dates_lst = date_removal_df_retrieved[date_removal_df_retrieved.HotelID==hid].date_range_start.values
#         end_dates_lst = date_removal_df_retrieved[date_removal_df_retrieved.HotelID==hid].date_range_end.values
#         if len(st_dates_lst)>0:
#             for st_date, end_date in zip(st_dates_lst,end_dates_lst):
#                 print(st_date,end_date)
#                 st_date = pd.to_datetime(st_date)
#                 end_date = pd.to_datetime(end_date)
#                 train_data_cpy_sp= train_data_cpy_sp.filter(~((train_data_cpy_sp._StayDates>=st_date)&(train_data_cpy_sp._StayDates<=end_date)))
#         else:
#             print("No removal date ranges present")
        
#         if type(filtered_data_sp)!=type(None):
#             filtered_data_sp = filtered_data_sp.union(train_data_cpy_sp)
#         else:
#             filtered_data_sp=train_data_cpy_sp
    
#     output_df =filtered_data_sp

In [0]:
# NOTE(review): the output goes to fixed tables in the testing_data schema
# rather than a table resolved from env_config — confirm before promoting.
# params['PREPROCESSED_TABLE'] = env_config.prep_train_revenue_table
PREPROCESSED_TABLE = (
    "testing_data.pp_ff_preprocess_rv"
    if TARGET_TYPE == "REVENUE"
    else "testing_data.pp_ff_preprocess_rm"
)

In [0]:
# Convert the merged pandas result to a Spark DataFrame for the table write.
# The isinstance guard keeps this cell idempotent: after a first run
# `output_df` is already a Spark DataFrame, and spark.createDataFrame rejects
# that as input, so a plain re-run of the cell would fail.
if isinstance(output_df, pd.DataFrame):
    output_df = spark.createDataFrame(output_df)

In [0]:
file_format = "delta"

logger.info(f"Writing preprocessed data to table {PREPROCESSED_TABLE}")
# Rows are appended, partitioned by hotel.
# NOTE(review): "overwriteSchema" is presumably a leftover from an overwrite
# mode (see the trailing comment on .mode) — confirm whether it has any effect
# together with append, and whether re-runs appending duplicate rows is intended.
table_writer = (
    output_df.write.format(file_format)
    .mode("append")  # overwrite
    .partitionBy("HotelID")
    .option("overwriteSchema", "true")
)
table_writer.saveAsTable(PREPROCESSED_TABLE)

In [0]:
# Wall-clock duration since start_time was captured near the top of the notebook.
elapsed_time = time.perf_counter() - start_time
logger.info(f"Time elapsed {elapsed_time}")
logger.info(f"Time elapsed in minutes {elapsed_time/60}")
print(f"Time elapsed in minutes {elapsed_time/60}")
logger.info("Preprocessing completed.")

# Render HH:MM:SS.ffffff with integer arithmetic. Slicing str(elapsed % 1)
# breaks when the fraction prints in scientific notation and yields a variable
# number of digits, and time.gmtime wraps the hour field after 24 hours.
elapsed_whole_seconds, elapsed_microseconds = divmod(int(elapsed_time * 1_000_000), 1_000_000)
elapsed_hours, elapsed_remainder = divmod(elapsed_whole_seconds, 3600)
elapsed_minutes, elapsed_seconds = divmod(elapsed_remainder, 60)
print(
    "Elapsed time: "
    f"{elapsed_hours:02d}:{elapsed_minutes:02d}:{elapsed_seconds:02d}.{elapsed_microseconds:06d}"
)