In [0]:
%pip install /dbfs/FileStore/python-wheels/dev/phgml-1.4.0-py3-none-any.whl -q

In [0]:
# Declare the notebook's input widgets.
# "model_tags" is read further down and parsed into the model tag dictionary.
dbutils.widgets.dropdown(
    name="save_model",
    defaultValue="True",
    choices=["True", "False"],
    label="Save Model",
)
dbutils.widgets.text(
    name="model_tags",
    defaultValue="model_stage:CA3_develop",
    label="Model Tags",
)

In [0]:
import pandas as pd
from datetime import datetime, timedelta
from typing import List
import re
import logging
from mlflow import MlflowException
from mlflow.client import MlflowClient
import os
import sys

In [0]:
def str_to_bool(value):
  """Convert a textual flag (for example a widget value) to a ``bool``.

  The value is passed through ``str()``, stripped of surrounding whitespace
  and compared case-insensitively.

  input params:
      value : any object whose string form is one of
              'false', 'no', '0', 'true', 'yes', '1'

  output:
      (bool) : False for 'false'/'no'/'0', True for 'true'/'yes'/'1'

  raises:
      ValueError : if the value is not one of the recognised spellings
  """
  FALSE_VALUES = ['false', 'no', '0']
  TRUE_VALUES = ['true', 'yes', '1']
  # Strip so that values such as " True " (copy/paste, job parameters) are accepted.
  lvalue = str(value).strip().lower()
  if lvalue in FALSE_VALUES:
    return False
  if lvalue in TRUE_VALUES:
    return True
  # ValueError is a subclass of Exception, so existing `except Exception` handlers still work.
  raise ValueError("String value should be one of {}, but got '{}'.".format(FALSE_VALUES + TRUE_VALUES, value))

def extract_param_values(value: str)-> List[str]:
    """
    Split a comma separated string into its individual items.

    Items are returned exactly as they appear between the commas: no
    whitespace is trimmed and empty items are kept.

    input params:
        value (str) : comma separated string, possibly empty

    output:
        (List) : the items in their original order; an empty list when
                 `value` is the empty string
    """
    # "".split(",") would give [""], hence the explicit empty-string case.
    return value.split(",") if value != "" else []
    
def get_model_tags(model_tags_str):
    '''Validate and parse the model tag text supplied through the Databricks widget.

    The text is a comma separated list of ``key:value`` pairs, for example
    ``key1:value1,key2:value2``. Spaces are ignored and empty components
    (stray or trailing commas) are skipped.

    input params:
        model_tags_str (str) : raw tag text

    output:
        (dict) : tag key -> tag value, both strings. When a key is repeated
                 the last occurrence wins.

    raises:
        ValueError : if the text contains a character other than colon, comma,
                     word characters or whitespace, or if a component is not
                     exactly one ``key:value`` pair.
    '''
    valid_pattern = r'\w+:\w+'
    invalid_pattern = r'[^:,\w\s]'
    str_cpy = model_tags_str.replace(" ", '')

    not_allowed_symbols = re.findall(pattern=invalid_pattern, string=str_cpy)
    if not_allowed_symbols:
        raise ValueError('''Unwanted characters detected. Allowed characters are colon(:), comma(,), word characters and white space characters
                            Please specify key values pairs as key1:value1,key2:value2
                        ''')

    # Check every comma separated component on its own. Removing matched pairs
    # from the whole string with str.replace() damaged other pairs that contain
    # an earlier pair as a substring (e.g. "stage:dev,model_stage:dev") and
    # rejected valid input.
    components = [component for component in str_cpy.split(',') if component]
    for component in components:
        if re.fullmatch(valid_pattern, component) is None:
            raise ValueError('''unmatched string components detected. please check the specified string
                             Please specify key values pairs as key1:value1,key2:value2
                             ''')

    # fullmatch guarantees exactly one colon, so split(':') yields (key, value).
    return dict(component.split(':') for component in components)

In [0]:
# Hard-coded run parameters used when the notebook is executed interactively.
# Key order is kept as-is because the dictionary is displayed further down.
params = dict(
    WITHOUT_PMS=False,
    SELECTED_HOTELS=['63662', '26834', '1406', '10443', '71999', '64942', '26532', '55810'],
    ENV='dev',
    CACHE_MODELS=False,
    TARGET_TYPE='REVENUE',
    MODEL_TYPE='FARFIELD',
    REPOPATH='/Workspace/Repos/manik@surge.global/phg-data-mlsys/src',
    # 91, 84, ..., 35 (steps of 7, descending)
    FORECAST_POINTS=list(range(91, 34, -7)),
    CA_AWARE=True,
    IS_USD_CURRENCY=True,
    DAYS_AHEAD=7,
    MAX_FORECAST_POINT=91,
    MIN_FORECAST_POINT=35,
    MAX_TARGET_LEAD=100,
    MAX_LEAD=151,
    # 7, 14, ..., 147 (steps of 7, ascending)
    LAG_NUMBERS=list(range(7, 148, 7)),
    MODEL_START_DATE=pd.to_datetime('2018-10-01 00:00:00'),
    COVID_START_DATE=pd.to_datetime('2020-03-01 00:00:00'),
    COVID_END_DATE=pd.to_datetime('2021-08-01 00:00:00'),
    REVENUE_COL='_reservationRevenuePerRoomUSD',
    ROOMS_COL='_rooms',
    TARGET_COLUMN='_reservationRevenuePerRoomUSD',
    PREPROCESSED_TABLE='testing_data.pp_ff_preprocess_rv',
    PARTITION_DATE=datetime(2024, 7, 10, 3, 59, 6),
    CORRECTED_HOTEL_IDS=['63662', '26834', '1406', '10443', '71999', '64942', '26532', '55810'],
)

In [0]:
# params = {}
# params['WITHOUT_PMS'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "WITHOUT_PMS")
# params['SELECTED_HOTELS'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "SELECTED_HOTELS")
# params['ENV'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "ENV")
# params['CACHE_MODELS'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "CACHE_MODELS")
# params['TARGET_TYPE'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "TARGET_TYPE")
# params['MODEL_TYPE'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "MODEL_TYPE")
# params['REPOPATH'] = dbutils.jobs.taskValues.get(taskKey = "capture_params", key = "REPOPATH")
# params["FORECAST_POINTS"] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "FORECAST_POINTS")
# params['CA_AWARE'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "CA_AWARE")
# params['IS_USD_CURRENCY'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "IS_USD_CURRENCY")
# params['DAYS_AHEAD'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "DAYS_AHEAD")
# params['MAX_FORECAST_POINT'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "MAX_FORECAST_POINT")
# params['MIN_FORECAST_POINT'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "MIN_FORECAST_POINT")
# params['MAX_TARGET_LEAD'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "MAX_TARGET_LEAD")
# params['MAX_LEAD'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "MAX_LEAD")
# params['LAG_NUMBERS'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "LAG_NUMBERS")
# params['MODEL_START_DATE'] = pd.to_datetime(dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "MODEL_START_DATE"))
# params['COVID_START_DATE'] = pd.to_datetime(dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "COVID_START_DATE"))
# params['COVID_END_DATE'] = pd.to_datetime(dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "COVID_END_DATE"))
# params['REVENUE_COL'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "REVENUE_COL")
# params['ROOMS_COL'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "ROOMS_COL")
# params['TARGET_COLUMN'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "TARGET_COLUMN")
# params['PREPROCESSED_TABLE'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "PREPROCESSED_TABLE")
# params['PARTITION_DATE'] = pd.to_datetime(dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "PARTITION_DATE"))
# params['CORRECTED_HOTEL_IDS'] = dbutils.jobs.taskValues.get(taskKey = "dev-ff-forecast-data-processing", key = "CORRECTED_HOTEL_IDS")

# Run-level settings layered on top of the parameters defined above.
params["ML_EXPERIMENT_ID"] = 609933091443417
params["CALC_UNCERTAINTY"] = True
# NOTE(review): SAVE_MODEL is hard-coded here; the "save_model" widget declared
# at the top of the notebook is not consulted — confirm this is intentional.
params["SAVE_MODEL"] = False
params["SAVE_METRICS"] = False
# Read the widget through dbutils.widgets.get (getArgument is deprecated) and
# validate/parse it into a {key: value} dictionary.
params["MODEL_TAGS_DCT"] = get_model_tags(dbutils.widgets.get("model_tags"))
# Derive the forecast-point bounds from FORECAST_POINTS; this overwrites any
# MAX_FORECAST_POINT / MIN_FORECAST_POINT values already present in params.
params["MAX_FORECAST_POINT"] = max(params["FORECAST_POINTS"])
params["MIN_FORECAST_POINT"] = min(params["FORECAST_POINTS"])
# Extra tags describing this run, added alongside the user-supplied ones.
params["MODEL_TAGS_DCT"]["last_updated_date"] = str(datetime.now())
params["MODEL_TAGS_DCT"]["days_ahead"] = params["DAYS_AHEAD"]
params["MODEL_TAGS_DCT"]["max_forecast_point"] = params["MAX_FORECAST_POINT"]
params["MODEL_TAGS_DCT"]["min_forecast_point"] = params["MIN_FORECAST_POINT"]
params["MODEL_NAME_PREFIX"] = None

In [0]:
# if params["ENV"] == "dev":
#     print(f"Loading phgml package from repo {params['REPOPATH']}")
#     sys.path.append(os.path.abspath(params["REPOPATH"]))

# sys.path.append(os.path.abspath("/Workspace/Repos/yasith.udawatte@henrymwuamica.onmicrosoft.com/phg-data-mlsys"))

In [0]:
from phgml.data.config import FarfieldForecastingHotelConfigProvider,FarfieldEnvironmentConfig
from phgml.data.data_types import (
    training_output_schema,
)
from phgml.pipeline.training import train_wrapper_farfield
from phgml.reporting.logging import get_dbx_logger
# Short alias for the imported output schema; it is passed as the schema
# argument of applyInPandas in the training cell.
schema = training_output_schema

In [0]:
# Build the environment configuration from the run parameters and the active
# Spark session.
env_config = FarfieldEnvironmentConfig(
    env=params["ENV"], 
    without_pms=params["WITHOUT_PMS"], 
    target=params["TARGET_TYPE"],
    spark=spark,
    is_usd_currency=params["IS_USD_CURRENCY"]
)
# Provider used later to look up per-hotel configuration (get_config(hotel_id))
# and handed to the training wrapper.
forecasting_config_provider = FarfieldForecastingHotelConfigProvider(spark=spark,env=params["ENV"])
# Replace the TARGET_COLUMN value held in params with the one exposed by the
# environment configuration.
params["TARGET_COLUMN"] = env_config.target_column

In [0]:
# Create the pipeline logger and emit INFO-level messages and above.
# NOTE(review): `pipeline` receives params["ENV"] — confirm that the environment
# name is what get_dbx_logger expects for this argument.
logger = get_dbx_logger(pipeline=params["ENV"],
                        task_type=params["TARGET_TYPE"],
                        exclude_pms=params["WITHOUT_PMS"])
logger.setLevel(logging.INFO)

# Record which target is being processed and whether PMS data is excluded.
logger.info(f"Processing data for target type: {params['TARGET_TYPE']} : {params['TARGET_COLUMN']}")
logger.info(f"Excluding PMS data? {params['WITHOUT_PMS']}")

In [0]:
# Load the preprocessed training data as a Spark DataFrame.
logger.info(f"Loading data from {params['PREPROCESSED_TABLE']}")
# spark.table() reads the whole table by name, equivalent to "select * from <table>",
# without assembling a SQL string from a parameter value.
df = spark.table(params['PREPROCESSED_TABLE'])

In [0]:
# Fail fast when there is nothing to train on.
# head(1) returns a list with at most one Row, so emptiness is detected without
# counting every row of the table as df.count() would.
if not df.head(1):
    logger.error("The loaded training dataset is empty.")
    logger.info("Terminating the pipeline execution")
    # RuntimeError is a subclass of Exception, so broad handlers still catch it.
    raise RuntimeError("The loaded training dataset is empty.")

In [0]:
# Display the resolved run parameters (bare expression -> rendered as cell output).
params

In [0]:
# Group the data by hotel id and execute the trainings in parallel
logger.info("Starting parallel training")

# train_wrapper_farfield(...) is called once here and its return value is the
# per-group function that applyInPandas runs for every HotelID group; `schema`
# describes the rows that function returns.
# NOTE(review): lag_numbers is passed as an empty list although
# params["LAG_NUMBERS"] is defined — confirm this is intentional.
output_df = (
    df.groupby("HotelID")
    .applyInPandas(
        train_wrapper_farfield(
            target_type=params["TARGET_TYPE"],
            ml_experiment_id=params["ML_EXPERIMENT_ID"],
            exclude_pms=params["WITHOUT_PMS"],
            calc_uncertainty=params["CALC_UNCERTAINTY"],
            hotel_config_provider=forecasting_config_provider,
            processing_timestamp=datetime.now(),
            save_models=params["SAVE_MODEL"],
            save_metrics=params["SAVE_METRICS"],
            lag_numbers=[],
            forecast_points=params["FORECAST_POINTS"],
            # model_name_prefix=params["MODEL_NAME_PREFIX"],
            model_tags=params["MODEL_TAGS_DCT"]
        ),
        schema,
    )
    # output_df is consumed by more than one Spark action further down
    # (display(...) and toPandas()). Spark DataFrames are lazy, so without
    # persisting the result each action would re-run the whole training.
    # NOTE(review): cache() may be unavailable on serverless compute — confirm
    # the cluster type this notebook runs on.
    .cache()
)

In [0]:
# Render the per-hotel training results. output_df is a lazily evaluated Spark
# DataFrame at this point, so displaying it triggers the training computation.
display(output_df)

HotelID,run_id,model_version,timestamp,pms_sync_off,model_name,status,message
71999,e7570a1d7b83464e88a34db7b7554075,1,2024-09-10T17:39:20.332+0000,False,71999_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 71999
55810,302b92d3504e41798b732499bd9e87d4,1,2024-09-10T17:39:20.332+0000,False,55810_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 55810
10443,141ba72f630342a79f37bc3d1d21a1a1,1,2024-09-10T17:39:20.332+0000,False,10443_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 10443
63662,3ad910b78d4447cc83e147b5ab744354,1,2024-09-10T17:39:20.332+0000,False,63662_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 63662
1406,ca8f87ca617b4d5386b8eacbf29f3763,1,2024-09-10T17:39:20.332+0000,False,1406_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 1406
64942,75ea4b7117334f4c8f8dd027239a738b,1,2024-09-10T17:39:20.332+0000,False,64942_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 64942
26532,877fd18811174c7aa86424af0af909b3,1,2024-09-10T17:39:20.332+0000,False,26532_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 26532
26834,dd0a254c2b2645248fe9a724c9cd535b,1,2024-09-10T17:39:20.332+0000,False,26834_REVENUE_PMS_LGBM_FARFIELD_model,complete,Successfully trained 26834


In [0]:
# Collect the per-hotel training results to the driver as a pandas DataFrame.
# The name `output_df` is rebound from a Spark DataFrame to a pandas DataFrame
# (the cells below rely on the pandas form), so the conversion is guarded to
# keep this cell safe to re-run.
if not isinstance(output_df, pd.DataFrame):
    output_df = output_df.toPandas()

In [0]:
# Log the outcome of every hotel: failures at ERROR level with the hotel id,
# successful trainings at INFO level with the message returned by the trainer.
for result in output_df.itertuples(index=False):
    if result.status != "complete":
        logger.error(
            f"Error encountered when training hotel {result.HotelID}: {result.message}"
        )
        continue
    logger.info(f"{result.message}")

In [0]:
# Build one summary record per successfully trained (hotel, model) pair from the
# MLflow model registry and the hotel's forecasting configuration.
client = MlflowClient()
completed = output_df[output_df["status"]=="complete"]

outputs_list = []
for (hotel_id, model_name), _group in completed.groupby(["HotelID","model_name"]):
    hotel_config = forecasting_config_provider.get_config(hotel_id)

    # First entry returned by get_latest_versions for this registered model.
    mv = client.get_latest_versions(name=model_name)[0]
    print(mv)
    arts = client.list_artifacts(mv.run_id,path=f"forecasting/{hotel_id}/models/{model_name}/artifacts")
    # presumably one of the listed artifacts is not a model step — TODO confirm
    num_model_steps = len(arts)-1

    summary = {}
    summary["hotel_id"] = hotel_id
    summary["model_name"] = model_name
    # MLflow timestamps are divided by 1e3 before conversion to datetime.
    summary["creation_time"] = datetime.fromtimestamp(mv.creation_timestamp/1e3)
    summary["last_update"] = datetime.fromtimestamp(mv.last_updated_timestamp/1e3)
    summary["version"] = mv.version
    summary["target"] = params["TARGET_TYPE"]
    summary["exclude_pms"] = params["WITHOUT_PMS"]
    summary["config_train_length"] = hotel_config.booking_pace_start
    summary["config_infer_length"] = hotel_config.booking_pace_end
    summary["num_model_steps"] = num_model_steps
    outputs_list.append(summary)

    print(f"Hotel: {hotel_id} target_type:{params['TARGET_TYPE']} exclude_pms:{params['WITHOUT_PMS']} : {num_model_steps}")

completed_df = pd.DataFrame(outputs_list)