In [0]:
# Notebook parameters. Each entry is (widget name, default value, allowed choices, label)
# for dropdowns and (widget name, default value, label) for free-text widgets.
_DROPDOWN_WIDGETS = [
    ("env_stage", "dev", ["dev", "prod", "qa", "dev_synxis_2_0", "prod_synxis_2_0", "qa_synxis_2_0"], "Pipeline stage"),
    ("source_catalog", "phg_data", ["dev_data", "qa_data", "phg_data"], "Source Catalog"),
    ("exclude_pms", "False", ["True", "False"], "Exclude PMS"),
    ("target_type", "REVENUE", ["REVENUE", "ROOMS"], "Target Type"),
    ("is_usd_currency", "True", ["True", "False"], "Use USD currency"),
]
_TEXT_WIDGETS = [
    ("selected_hotels", "", "Hotels"),
    ("lag_numbers", "1,7,14,28", "Lag Numbers"),
    ("model_start_date", "2018-10-01", "Model Start Date"),
]

# Dropdowns are registered first, then text widgets, in the listed order.
for _name, _default, _choices, _label in _DROPDOWN_WIDGETS:
    dbutils.widgets.dropdown(_name, _default, _choices, _label)

for _name, _default, _label in _TEXT_WIDGETS:
    dbutils.widgets.text(_name, _default, _label)

In [0]:
%load_ext autoreload
%autoreload 2
# The two magics above enable IPython's autoreload extension in mode 2, so imported
# modules are reloaded before each cell runs and edits to the project package are
# picked up without restarting the kernel.

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql.types import (
    StringType,
    DateType,
    IntegerType,
    StructField,
    StructType,
    DoubleType,
    LongType,
)
from sktime.transformations.series.date import DateTimeFeatures
from sktime.performance_metrics.forecasting import (
    mean_absolute_percentage_error,
    MeanAbsolutePercentageError,
    mean_absolute_error,
)
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
import holidays
import datetime
from pathlib import Path
import pickle
import os
from sys import version_info
import cloudpickle
from pyspark.sql.functions import pandas_udf, PandasUDFType
import logging
import warnings
import time
import pyspark.sql.functions as F
import sys

# Silence every warning category for the rest of the run.
warnings.simplefilter("ignore")

# Reference point for the total-runtime report at the end of the notebook.
start_time = time.perf_counter()

In [0]:
from phgml.data.processing_distr_spark import (
    filter_data,
    preprocess_data,
    get_lags,
    compile_hotel_tables,    
)
from phgml.data.processing_distr_ca import filter_hotels
from phgml.data.data_types import revenue_preprocessed_schema
from phgml.data.config import EnvironmentConfig, ForecastingHotelConfigProvider 
from phgml.reporting.logging import get_logging_path, get_logging_filename, get_dbx_logger
from phgml.utilities.task_utilities import str_to_lst, str_to_bool

In [0]:
# Read params
# Widget values are read through dbutils.widgets.get, the documented accessor;
# the bare getArgument helper is a deprecated legacy alias.
_widget = dbutils.widgets.get

params = {
    # --- values supplied through the notebook widgets ---
    "ENV": _widget("env_stage"),
    "SOURCE_CATALOG": _widget("source_catalog"),
    "WITHOUT_PMS": str_to_bool(_widget("exclude_pms")),
    "IS_USD_CURRENCY": str_to_bool(_widget("is_usd_currency")),
    "TARGET_TYPE": _widget("target_type"),
    "SELECTED_HOTELS": str_to_lst(_widget("selected_hotels")),
    "LAG_NUMBERS": [int(lag) for lag in str_to_lst(_widget("lag_numbers"))],
    # --- fixed pipeline settings ---
    "REVENUE_COL": "_reservationRevenuePerRoomUSD",
    "ROOMS_COL": "_rooms",
    "PIPELINE": "PREPROCESS",
    # NOTE(review): user-specific Repos path; not referenced elsewhere in this notebook -
    # confirm whether it is still needed or should be made configurable.
    "REPOPATH": "/Workspace/Repos/manik@surge.global/phg-data-mlsys/src",
    "MAXLEAD": 100,
    "PREDICTION_HORIZON": 28,
    "CA_AWARE": True,
    "MODEL_START_DATE": pd.to_datetime(_widget("model_start_date")),
    "COVID_START_DATE": pd.to_datetime("2020-03-01"),
    "COVID_END_DATE": pd.to_datetime("2021-08-01"),
    "CALC_UNCERTAINTY": True,
    "LOG_ROOT": '/dbfs/mnt/extractionlogs/synxis',
}

# The synxis 2.0 stages write their logs under a separate root.
if "synxis_2_0" in params["ENV"]:
    params["LOG_ROOT"] = '/dbfs/mnt/extractionlogs/synxis_2_0'

In [0]:
# Stage/target specific configuration (table names, target column) and the
# per-hotel forecasting configuration provider.
env_config = EnvironmentConfig(
    env=params["ENV"],
    target=params["TARGET_TYPE"],
    spark=spark,
    is_usd_currency=params["IS_USD_CURRENCY"],
)
forecasting_config_provider = ForecastingHotelConfigProvider(
    spark=spark,
    env=params["ENV"],
)

# The target column is taken from the environment config.
params["TARGET_COLUMN"] = env_config.target_column

# Configure file logging for this run.
# Start from a clean root logger so no previously installed handlers remain on it.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logging.root.setLevel(logging.INFO)

processing_timestamp = datetime.datetime.now()

# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
logfile_path = get_logging_path(params["LOG_ROOT"], processing_timestamp)
os.makedirs(logfile_path, exist_ok=True)

pms = "NOPMS" if params['WITHOUT_PMS'] else "PMS"

log_file_name = get_logging_filename(
    logfile_path,
    "PREPROCESS",
    params['TARGET_TYPE'],
    pms,
    processing_timestamp)

logger = logging.getLogger(f"preprocess-{params['TARGET_TYPE']}-{pms}")

# logging.getLogger returns the same object for the same name, so handlers from an
# earlier execution would still be attached; detach and close them first, otherwise
# every message is emitted once per accumulated handler.
for stale_handler in logger.handlers[:]:
    logger.removeHandler(stale_handler)
    stale_handler.close()

file_handler = logging.FileHandler(log_file_name)
file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_format)

logger.addHandler(file_handler)

In [0]:

# This cell repeats the logger configuration of the previous cell. Because both cells
# resolve the same named logger, handlers that are already attached are removed before
# a new one is added; otherwise each message would be written by two file handlers.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logging.root.setLevel(logging.INFO)

processing_timestamp = datetime.datetime.now()

# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
logfile_path = get_logging_path(params['LOG_ROOT'], processing_timestamp)
os.makedirs(logfile_path, exist_ok=True)

pms = "NOPMS" if params['WITHOUT_PMS'] else "PMS"

log_file_name = get_logging_filename(
    logfile_path,
    "PREPROCESS",
    params['TARGET_TYPE'],
    pms,
    processing_timestamp)

logger = logging.getLogger(f"preprocess-{params['TARGET_TYPE']}-{pms}")

# Detach and close whatever is already attached to this logger so exactly one
# file handler is active after the cell has run.
for stale_handler in logger.handlers[:]:
    logger.removeHandler(stale_handler)
    stale_handler.close()

file_handler = logging.FileHandler(log_file_name)
file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_format)

logger.addHandler(file_handler)

In [0]:
logger.info("Selecting hotels.")

# Hotel master data is pulled into pandas from the selected source catalog.
hotel_query = f"select HotelID,HotelName,PMSStartDate,Country,State from {params['SOURCE_CATALOG']}.dim_hotels_data"
hotel_details = spark.sql(hotel_query).toPandas()

# State information is only kept for hotels in the US and Canada; for every other
# country the state is replaced by the "N/A" placeholder used for date features.
outside_us_ca = ~hotel_details["Country"].isin(['US', 'CA'])
hotel_details.loc[outside_us_ca, "State"] = "N/A"

# Rows without a hotel id are dropped.
hotel_details = hotel_details[hotel_details["HotelID"].notna()]

correct_hotel_ids = filter_hotels(
    hotel_details,
    params["SELECTED_HOTELS"],
    params["WITHOUT_PMS"],
    forecasting_config_provider,
    mode="TRAINING",
)

print(f"Corrected hotel ids: {correct_hotel_ids}")

In [0]:
logger.info("Loading data")

# Select transaction data together with the cancellation date from the raw dataset.
# The source tables are taken from the catalog chosen in the source_catalog widget and
# override the env_config defaults (original note: dev holds less data than production).
uses_synxis_v2 = "synxis_2_0" in params['ENV']

if uses_synxis_v2:
    env_config.source_data_table = f'{params["SOURCE_CATALOG"]}.consumption_deaggrecords_v2'

    dfsp_src = spark.sql(f"select * from {env_config.source_data_table}")

    # No-show records without a cancellation date get the stay date as cancellation date.
    noshow_without_cancel_date = (F.col('status') == 'No-show') & (F.col('cancellationDate').isNull())
    dfsp_src = dfsp_src.withColumn(
        'cancellationDate',
        F.when(noshow_without_cancel_date, F.col('_StayDates')).otherwise(F.col('cancellationDate')),
    )
else:
    env_config.source_data_table = f'{params["SOURCE_CATALOG"]}.consumption_deaggrecords'
    env_config.transaction_data_table = f'{params["SOURCE_CATALOG"]}.consumption_mrt'

    # The cancellation date comes from the transaction table, joined on TransactionID.
    source_query = (
        "select a.TransactionID,a.HotelID,a._StayDates,a.confirmationDate,a.departureDate,"
        "a.channel,a.status,a.cancellationNumber,a._reservationRevenuePerRoomUSD,a._rooms,"
        "b.cancellationDate "
        f"from {env_config.source_data_table} as a "
        f"left join {env_config.transaction_data_table} as b "
        "on a.TransactionID=b.TransactionID"
    )
    dfsp_src = spark.sql(source_query)

# Keep only the hotels selected for this run.
dfsp_src = dfsp_src.filter(F.col("HotelID").isin(correct_hotel_ids))

In [0]:
# The most recent confirmation date in the source table marks the end of the training data.
latest_confirmation_row = spark.sql(
    f"select max(confirmationDate) from {env_config.source_data_table}"
).first()
params['PARTITION_DATE'] = latest_confirmation_row[0]

print(f"Using training data up to {params['PARTITION_DATE']}")

In [0]:
# Load the booking status lookup and split it by scenario.
result = spark.sql("SELECT * FROM phg_data.bookings_status")


def _statuses_for(scenario):
    """Return the status values of the lookup rows whose scenario equals `scenario`."""
    matching_rows = result.filter(result.scenario == scenario).collect()
    return [row['status'] for row in matching_rows]


confirmed_status_list = _statuses_for('confirmed')
cancelled_status_list = _statuses_for('cancelled')

# Show both lists in the cell output.
print(f"Confirmed Booking Status List: {confirmed_status_list}")
print(f"Cancelled Booking Status List: {cancelled_status_list}")

In [0]:
# Columns kept for the confirmed-bookings data set.
columns = [
    "HotelID",
    "_StayDates",
    "confirmationDate",
    "channel",
    "status",
    params["REVENUE_COL"],
    params["ROOMS_COL"],
]

# Confirmed booking: confirmed status and no cancellation date.
is_confirmed = F.col('status').isin(confirmed_status_list) & F.col('cancellationDate').isNull()

# Cancelled booking: cancelled status with a cancellation date on or before the stay date.
is_cancelled = (
    F.col('status').isin(cancelled_status_list)
    & F.col('cancellationDate').isNotNull()
    & (F.col('cancellationDate') <= F.col('_StayDates'))
)

dfsp = dfsp_src.filter(is_confirmed).select(columns)

# The cancel-aware set keeps all source columns and both kinds of bookings.
dfsp_ca = dfsp_src.filter(is_confirmed | is_cancelled)

In [0]:
# Both data sets are preprocessed with the same settings; only the input frame and
# the cancel_aware flag differ.
_preprocess_args = (
    params["WITHOUT_PMS"],
    params["REVENUE_COL"],
    params["ROOMS_COL"],
    params["MODEL_START_DATE"],
)

df = preprocess_data(dfsp, *_preprocess_args, cancel_aware=False)

df_ca = preprocess_data(dfsp_ca, *_preprocess_args, cancel_aware=True)

In [0]:
logger.info("Loading data features")
model_start_date = params['MODEL_START_DATE']

# MODEL_START_DATE is a pandas Timestamp; interpolating it directly renders
# 'YYYY-MM-DD 00:00:00'. Formatting it as a plain date keeps the comparison correct
# regardless of whether the `date` column is stored as a date or as a 'YYYY-MM-DD' string
# (a string column would otherwise exclude the start date itself).
model_start_date_sql = model_start_date.strftime("%Y-%m-%d")

# Date features from the start date onwards, keyed by stay date and attached to every
# hotel through its Country/State.
hotel_locations = spark.createDataFrame(hotel_details[["HotelID", "Country", "State"]])
dates = (
    spark.sql(f"select * from phg_data.date_features where date >= '{model_start_date_sql}'")
    .withColumn('date', F.to_date('date'))
    .withColumnRenamed("date", "_StayDates")
    .join(hotel_locations, on=['Country', 'State'], how="inner")
)

logger.info("Calculate lags")
df_lags = get_lags(
    df,
    lag_numbers=params["LAG_NUMBERS"],
    target_col=params["TARGET_COLUMN"]
)

In [0]:
logger.info(f"Stay dates filtering upto : {params['PARTITION_DATE']}")

# Settings shared by both filter_data calls; only the frame and cancel_aware differ.
_filter_kwargs = {
    "partition_date": params["PARTITION_DATE"],
    "revenue_col": params["REVENUE_COL"],
    "rooms_col": params["ROOMS_COL"],
}

df = filter_data(df=df, cancel_aware=False, **_filter_kwargs)

df_ca = filter_data(df=df_ca, cancel_aware=True, **_filter_kwargs)

In [0]:
# Record the run configuration in the log file.
for _message in (
    f"Executing pipeline stage: {params['ENV']}",
    f"Processing data for target type: {params['TARGET_TYPE']} : {params['TARGET_COLUMN']}",
    f"Excluding PMS data? {params['WITHOUT_PMS']}",
):
    logger.info(_message)

In [0]:
logger.info("Compiling train data set")

# Arguments common to the target table and the cancel-aware feature table.
_compile_kwargs = {
    "target_type": params["TARGET_TYPE"],
    "target_column": params["TARGET_COLUMN"],
    "prediction_horizon": params["PREDICTION_HORIZON"],
    "selected_hotels": correct_hotel_ids,
    "spark": spark,
}

# Target table: lead window equal to the prediction horizon, columns suffixed "_tgt".
output_df_targets = compile_hotel_tables(
    df=df,
    lead_window=params["PREDICTION_HORIZON"],
    suffix="_tgt",
    **_compile_kwargs,
)

# Cancel-aware table: lead window of MAXLEAD, with date features and lags passed in.
output_df_ca = compile_hotel_tables(
    df=df_ca,
    lead_window=params["MAXLEAD"],
    dates=dates,
    df_lags=df_lags,
    cancel_aware=True,
    **_compile_kwargs,
)

In [0]:
# Combine targets and cancel-aware features per hotel and stay date; only keys present
# in both tables are kept.
_join_keys = ['HotelID', '_StayDates']
output_df = output_df_targets.join(output_df_ca, on=_join_keys, how='inner')

In [0]:
# Remove configured date ranges from the training data.
#
# The removal table lists, per hotel, stay-date ranges to exclude. Only entries for the
# 'main_training_pipeline' pipeline are used, and per hotel only the rows carrying the
# latest timestamp. Reading the table is best effort: if it fails, the error is reported
# and the data is written unfiltered.
#
# NOTE(review): for the *_synxis_2_0 stages the table name resolves to
# "<stage>_data.config_date_range_removal" - confirm that catalog exists, otherwise the
# removal step is skipped for those stages.
params["ENV_NAME"] = 'phg' if params["ENV"]=='prod' else params["ENV"]
params["DATE_REMOVAL_TB_NAME"] = f"{params['ENV_NAME']}_data.config_date_range_removal"

# output_df only holds hotels from correct_hotel_ids, so removals are looked up for that
# list. Using params["SELECTED_HOTELS"] (empty by default) would lose every hotel that
# is not explicitly listed in the widget.
hotels_in_output = list(correct_hotel_ids)

date_removal_df_retrieved = None
try:
    removal_sp = spark.sql(f"select * from {params['DATE_REMOVAL_TB_NAME']}").filter(
        (F.col("pipeline") == 'main_training_pipeline')
        & F.col("HotelID").isin(hotels_in_output)
    )
    removal_pdf = removal_sp.toPandas()
    # Keep, per hotel, only the rows with the most recent timestamp.
    latest_timestamp = removal_pdf.groupby('HotelID')['timestamp'].transform('max')
    date_removal_df_retrieved = removal_pdf[removal_pdf['timestamp'] == latest_timestamp].reset_index(drop=True)
except Exception as e:
    # Best effort: report the problem and continue without date removal.
    print(e)
    logger.warning(f"Date range removal skipped: {e}")
    date_removal_df_retrieved = None

if date_removal_df_retrieved is not None:
    if date_removal_df_retrieved.empty:
        print("No removal date ranges present")

    removal_ranges = zip(
        date_removal_df_retrieved['HotelID'],
        date_removal_df_retrieved['date_range_start'],
        date_removal_df_retrieved['date_range_end'],
    )
    for hid, st_date, end_date in removal_ranges:
        print(hid, st_date, end_date)
        st_date = pd.to_datetime(st_date)
        end_date = pd.to_datetime(end_date)
        if pd.isna(st_date) or pd.isna(end_date):
            # An open-ended range cannot be turned into a filter; leave the data untouched.
            logger.warning(f"Ignoring incomplete removal range for hotel {hid}")
            continue
        # Drop this hotel's stay dates inside the inclusive range; rows of every other
        # hotel pass through unchanged.
        output_df = output_df.filter(
            ~(
                (F.col('HotelID') == hid)
                & (F.col('_StayDates') >= st_date)
                & (F.col('_StayDates') <= end_date)
            )
        )

In [0]:
# Persist the preprocessed data, replacing table contents and schema, partitioned by hotel.
file_format = "delta"
_target_table = env_config.preprocessed_data_table

_write_message = f"Writing preprocessed data to table {_target_table}"
print(_write_message)
logger.info(_write_message)

_writer = (
    output_df.write.format(file_format)
    .mode("overwrite")
    .partitionBy("HotelID")
    .option("overwriteSchema", "true")
)
_writer.saveAsTable(_target_table)

In [0]:
# Report the total runtime, measured from the timer started after the imports.
elapsed_time = time.perf_counter() - start_time
_elapsed_minutes = elapsed_time / 60

logger.info(f"Time elapsed {elapsed_time}")
logger.info(f"Time elapsed in minutes {_elapsed_minutes}")
print(f"Time elapsed in minutes {_elapsed_minutes}")
logger.info("Preprocessing completed.")