In [0]:
# Notebook widgets: the run-time parameters of this training pipeline.
# Each call passes (name, default value, [choices,] label); the values are read
# back further down with getArgument(<name>) and always arrive as strings.

# Pipeline stage; read into ENV.
dbutils.widgets.dropdown("env_stage", "dev", ["dev", "prod"], "Pipeline stage")

# "True"/"False" flag; parsed with str_to_bool into WITHOUT_PMS.
dbutils.widgets.dropdown("exclude_pms", "False", ["True", "False"], "Exclude PMS")

# Forecasting target; read into TARGET_TYPE.
dbutils.widgets.dropdown("target_type", "REVENUE", ["REVENUE", "ROOMS"], "Target Type")

# "True"/"False" flag; parsed with str_to_bool into IS_USD_CURRENCY.
dbutils.widgets.dropdown("is_usd_currency", "True", ["True", "False"], "Use USD currency")

# Comma-separated integers; converted into LAG_NUMBERS.
dbutils.widgets.text("lag_numbers","1,7,14,28", "Lag Numbers")

# Optional "key1:value1,key2:value2" text; validated by get_model_tags.
dbutils.widgets.text("model_tags","", "Model Tags")

In [0]:
%pip install mlflow==2.2.2

In [0]:
%load_ext autoreload

%autoreload 2

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import *
import datetime
from pathlib import Path
import pickle
import os
from sys import version_info
import cloudpickle
import mlflow
import mlflow.pyfunc
import logging
import warnings
from mlflow import MlflowException
from mlflow.client import MlflowClient
import time
import datetime
import re

In [0]:
# sys.path.append(os.path.abspath('/Workspace/Repos/manik@surge.global/phg-data-mlsys/src'))
# Silence all Python warnings for the rest of the notebook run.
warnings.filterwarnings("ignore")
# High-resolution start timestamp for the run; its only visible consumer is the
# commented-out elapsed-time logging further down.
start_time = time.perf_counter()

In [0]:
# The import cell only does `from sys import version_info`, which does not bind
# the name `sys`; import the module explicitly so `sys.path` below is defined.
import sys

# Pipeline stage selected through the "env_stage" widget ("dev" or "prod").
ENV = getArgument("env_stage")

# Workspace checkout of the phgml sources used for dev runs.
REPOPATH = "/Workspace/Repos/manik@surge.global/phg-data-mlsys/src"

# Name of the cluster this notebook is attached to.
cluster_name = spark.conf.get("spark.databricks.clusterUsageTags.clusterName")

# On a dev run attached to a dev cluster, make the repo checkout importable so
# the `phgml` imports in the next cell can be resolved from it.
if (ENV == "dev") and ("dev" in cluster_name):
    print(f"Loading phgml package from repo {REPOPATH}")
    repo_src_path = os.path.abspath(REPOPATH)
    # Re-running the cell must not pile up duplicate entries on sys.path.
    if repo_src_path not in sys.path:
        sys.path.append(repo_src_path)

In [0]:
from phgml.models.xgboost_model import XGBMultiStepPredictor
from phgml.models.autogluon_model import AutoGluonModel, AGMlflowModel
from phgml.models.lightgbm_model import LightGBMModel, LGBMMlflowModel
from phgml.pipeline.training import train_wrapper
from phgml.data.processing_distr_ca import (
    filter_train_data,
    filter_test_data,
    remove_padded_cols,
)
from phgml.reporting.output_metrics import *
from phgml.data.data_types import (
    revenue_preprocessed_schema,
    rooms_preprocessed_schema,
    training_output_schema,
)
from phgml.reporting.logging import get_logging_path, get_logging_filename, get_dbx_logger
from phgml.reporting.report_results import get_output_df, correct_prediction_list
from phgml.data.config import EnvironmentConfig, ForecastingHotelConfigProvider

In [0]:
# Disable adaptive query execution (AQE).
# Adaptive query optimization groups smaller tasks together into larger tasks.
# This may result in limited parallelism if the parallel tasks are deemed to be
# too small by the query optimizer.
# We are disabling AQE here to circumvent this limitation on parallelism.
spark.conf.set("spark.sql.adaptive.enabled", "false")

In [0]:
def str_to_lst(value):
    """Turn a comma-separated string into a list of its parts.

    An empty string gives an empty list, a value without a comma gives a
    one-element list holding the value itself, and anything else is split on
    every comma. Parts are returned as-is (no whitespace stripping).
    """
    if value == "":
        return []
    return value.split(",") if "," in value else [value]


def str_to_bool(value):
    """Interpret a textual flag as a boolean.

    The value is converted with ``str`` and lower-cased before the lookup.
    "false", "no" and "0" map to False; "true", "yes" and "1" map to True.

    Raises:
        Exception: if the value is not one of the recognised spellings.
    """
    falsy_spellings = ["false", "no", "0"]
    truthy_spellings = ["true", "yes", "1"]
    normalized = str(value).lower()

    for spellings, result in ((falsy_spellings, False), (truthy_spellings, True)):
        if normalized in spellings:
            return result

    accepted = falsy_spellings + truthy_spellings
    raise Exception(
        "String value should be one of {}, but got '{}'.".format(accepted, value)
    )

def get_model_tags(model_tags_str):
    '''Validate the "model_tags" widget text and parse it into a dict.

    The expected format is ``key1:value1,key2:value2``. Whitespace is ignored,
    empty comma-separated segments (e.g. a trailing comma) are skipped, and an
    empty string yields an empty dict.

    Args:
        model_tags_str: Raw tag text as typed into the widget.

    Returns:
        dict mapping every key to its value (both ``str``); when a key is
        repeated, the last occurrence wins.

    Raises:
        ValueError: if the text contains a character other than ``:``, ``,``,
            word characters or whitespace, or if any non-empty segment is not
            exactly one ``key:value`` pair.
    '''
    pair_pattern = r'\w+:\w+'
    invalid_pattern = r'[^:,\w\s]'
    # The error message advertises whitespace as allowed, so drop all of it
    # (not only plain spaces) before validating.
    str_cpy = re.sub(r'\s+', '', model_tags_str)

    if re.search(invalid_pattern, str_cpy) is not None:
        raise ValueError('''Unwanted characters detected. Allowed characters are colon(:), comma(,), word characters and white space characters
                            Please specify key values pairs as key1:value1,key2:value2
                        ''')

    model_tags = {}
    # Validate segment by segment. Stripping matched pairs out of the whole
    # string with str.replace would also cut them out of longer pairs
    # ("b:c" inside "ab:c") and reject perfectly valid input.
    for segment in str_cpy.split(','):
        if segment == '':
            continue
        if re.fullmatch(pair_pattern, segment) is None:
            raise ValueError('''unmatched string components detected. please check the specified string
                             Please specify key values pairs as key1:value1,key2:value2
                             ''')
        key, value = segment.split(':')
        model_tags[key] = value

    return model_tags

In [0]:
# Path under the DBFS mount, presumably the root for extraction logs
# (not referenced in the visible cells — TODO confirm it is still needed).
log_root = "/dbfs/mnt/extractionlogs/synxis"
# Timestamp taken when this cell runs; forwarded to train_wrapper as processing_timestamp.
processing_timestamp = datetime.datetime.now()

In [0]:
# Column-name constants, presumably the candidate target columns
# (neither is referenced in the visible cells — TODO confirm).
REVENUE_COL = "_reservationRevenuePerRoomUSD"
ROOMS_COL = "rooms"
# Pipeline name handed to get_dbx_logger.
PIPELINE = "TRAINING"

# Widget values arrive as strings; the boolean flags are parsed with str_to_bool.
WITHOUT_PMS = str_to_bool(getArgument("exclude_pms"))
IS_USD_CURRENCY = str_to_bool(getArgument("is_usd_currency"))
TARGET_TYPE = getArgument("target_type")

# Validated key/value tags from the "model_tags" widget; forwarded to train_wrapper.
MODEL_TAGS_DCT = get_model_tags(getArgument("model_tags"))
print('model tags dict: ',MODEL_TAGS_DCT)

### The start of the model data
MODEL_START_DATE = pd.to_datetime("2018-10-01")

# COVID period boundaries (not referenced in the visible cells).
COVID_START_DATE = pd.to_datetime("2020-03-01")
COVID_END_DATE = pd.to_datetime("2021-08-01")

# Forwarded to train_wrapper as calc_uncertainty.
CALC_UNCERTAINTY = True
# MODEL_TYPE = "XGB"  # Use "AG" to try out the AutoGluon approach
MODEL_TYPE = "AG"

# NOTE(review): window/horizon settings exist twice, in upper case (60 / 30) and
# in lower case (14 / 60 / 14), with different values, and none of them is
# referenced in the visible cells — confirm which ones are still needed.
LEAD_WINDOW = 60
PREDICTION_HORIZON = 30

# MLflow experiment id forwarded to train_wrapper as ml_experiment_id.
ML_EXPERIMENT_ID = 609933091443417

lead_window_start_days = 14
lead_window_end_days = 60
prediction_horizon = 14
# "lag_numbers" widget text split on commas and converted to ints.
# NOTE(review): an empty token (e.g. from a trailing comma) makes int() raise ValueError.
LAG_NUMBERS = list(map(int,str_to_lst(getArgument('lag_numbers'))))


# Forwarded to train_wrapper as save_models / save_metrics.
SAVE_MODEL = False
SAVE_METRICS = False

In [0]:
# Config data relevant to this pipeline
# Environment-level configuration built from the selected stage, target type and currency flag.
env_config = EnvironmentConfig(env=ENV, target=TARGET_TYPE, spark=spark, is_usd_currency=IS_USD_CURRENCY)
# Per-hotel config provider: passed to train_wrapper and queried via get_config(hotel_id) in the summary cell.
forecasting_config_provider = ForecastingHotelConfigProvider(spark=spark,env=ENV)
# Target column as exposed by the environment config; used in the log line below.
target_column = env_config.target_column
# Result schema passed to applyInPandas for the training output.
schema = training_output_schema

In [0]:
# Pipeline logger from phgml's get_dbx_logger, parameterised with the pipeline
# name, the target type and the PMS-exclusion flag.
logger= get_dbx_logger(
    pipeline=PIPELINE,
    task_type=TARGET_TYPE,
    exclude_pms=WITHOUT_PMS,
)

In [0]:
# Record the main run parameters at the top of the pipeline log.
logger.info(f"Processing data for target type: {TARGET_TYPE} : {target_column}")
logger.info(f"Excluding PMS data? {WITHOUT_PMS}")

In [0]:
#Removed forecast training wrapper function

In [0]:
# Table holding the preprocessed training data; named once so the log line and
# the query cannot drift apart.
SOURCE_TABLE = "testing_data.pp_ff_preprocess_rv"

logger.info(f"Loading data from {SOURCE_TABLE}")
df = spark.sql(f"select * from {SOURCE_TABLE}")

In [0]:
# Abort early when there is nothing to train on. Fetching at most one row is
# enough to detect an empty dataset and avoids counting the entire table.
if len(df.take(1)) == 0:
    logger.error("The loaded training dataset is empty.")
    logger.info("Terminting the pipeline execution")
    raise Exception("The loaded training dataset is empty.")

In [0]:
# Group the data by hotel id and execute the trainings in parallel
logger.info("Starting parallel training")

# train_wrapper(...) is called once here with the run configuration; whatever it
# returns is the function applied to each HotelID group. `schema`
# (training_output_schema) describes the rows that function produces.
# NOTE(review): this presumably only defines the computation; the work appears
# to run when the result is collected with toPandas() in the next cell.
output_df = df.groupby("HotelID").applyInPandas(
    train_wrapper(
        target_type=TARGET_TYPE,
        ml_experiment_id=ML_EXPERIMENT_ID,
        exclude_pms=WITHOUT_PMS,
        calc_uncertainty=CALC_UNCERTAINTY,
        hotel_config_provider=forecasting_config_provider,
        processing_timestamp=processing_timestamp,
        save_models=SAVE_MODEL,
        save_metrics=SAVE_METRICS,
        lag_numbers=LAG_NUMBERS,
        model_tags=MODEL_TAGS_DCT
    ),
    schema,
)

In [0]:
# Collect the training results to the driver as a pandas DataFrame and time the
# collection, which is reported below as the model training time.
# NOTE(review): `output_df` is rebound from the applyInPandas result to a pandas
# DataFrame, so re-running this cell without re-running the previous one fails
# on `.toPandas()`.
start = time.time()
output_df = output_df.toPandas()

end = time.time()

print(f"Model training time: {end - start}")

In [0]:
#display(output_df)

In [0]:
# Report each row of the training output: completed rows are logged at INFO
# level, every other status is logged as an error for the hotel.
for _row_label, outcome in output_df.iterrows():
    if outcome.status != "complete":
        logger.error(
            f"Error encountered when training hotel {outcome.HotelID}: {outcome.message}"
        )
        continue
    logger.info(f"{outcome.message}")

In [0]:
# Build a summary row for every successfully trained (hotel, model) pair from
# the MLflow model registry and the per-hotel forecasting config.
client = MlflowClient()
completed = output_df[output_df["status"] == "complete"]

outputs_list = []
for (hotel_id, model_name), _group in completed.groupby(["HotelID", "model_name"]):
    hotel_config = forecasting_config_provider.get_config(hotel_id)

    # First entry returned by the registry for this model name.
    mv = client.get_latest_versions(name=model_name)[0]
    print(mv)
    arts = client.list_artifacts(
        mv.run_id,
        path=f"forecasting/{hotel_id}/models/{model_name}/artifacts",
    )
    # presumably one listed artifact is not a model step — TODO confirm the "- 1"
    num_model_steps = len(arts) - 1

    outputs_list.append(
        dict(
            hotel_id=hotel_id,
            model_name=model_name,
            # Registry timestamps are in milliseconds.
            creation_time=datetime.datetime.fromtimestamp(mv.creation_timestamp / 1e3),
            last_update=datetime.datetime.fromtimestamp(mv.last_updated_timestamp / 1e3),
            version=mv.version,
            target=TARGET_TYPE,
            exclude_pms=WITHOUT_PMS,
            config_train_length=hotel_config.training_length,
            config_infer_length=hotel_config.inference_length,
            num_model_steps=num_model_steps,
        )
    )

    print(f"Hotel: {hotel_id} target_type:{TARGET_TYPE} exclude_pms:{WITHOUT_PMS} : {num_model_steps}")

completed_df = pd.DataFrame(outputs_list)

In [0]:
# Show the per-model summary table as the cell output.
completed_df

In [0]:
# Final marker in the pipeline log.
logger.info("Model training completed.")

# Elapsed-time reporting based on `start_time` is currently disabled:
# elapsed_time = time.perf_counter() - start_time
# logger.info(f"Time elapsed {elapsed_time}")
# logger.info(f"Time elapsed in minutes {elapsed_time/60}")