In [0]:
# Notebook parameters, exposed as Databricks widgets.
# Widget values are read back as strings; the configuration cell further down
# converts them with str_to_bool / str_to_lst / int.

# Choice widgets: (name, default, allowed values, label).
dbutils.widgets.dropdown(
    "env_stage", "dev", ["dev", "prod"], "Pipeline stage"
)
dbutils.widgets.dropdown(
    "exclude_pms", "False", ["True", "False"], "Exclude PMS"
)
dbutils.widgets.dropdown(
    "target_type", "REVENUE", ["REVENUE", "ROOMS"], "Target Type"
)
dbutils.widgets.dropdown(
    "is_usd_currency", "True", ["True", "False"], "Use USD currency"
)

# Free-text widgets: (name, default, label).
dbutils.widgets.text("lag_numbers", "1,7,14,28", "Lag Numbers")  # comma-separated integers
dbutils.widgets.text("model_tags", "", "Model Tags")  # parsed by get_model_tags
dbutils.widgets.text("thread_numbers", "1", "Number of Threads")

In [0]:
%pip install mlflow==2.2.2
%load_ext autoreload
%autoreload 2

In [0]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import *
import datetime
from pathlib import Path
import pickle
import os
from sys import version_info
import cloudpickle
import mlflow
import mlflow.pyfunc
import logging
import warnings
from mlflow import MlflowException
from mlflow.client import MlflowClient
import time
import datetime
import re
import logging

In [0]:
# Reference point for the elapsed-time figures logged by the last cell.
start_time = time.perf_counter()

# Install a blanket "ignore" filter for Python warnings.
# Leftover manual package-path override, kept commented out for reference:
# sys.path.append(os.path.abspath('/Workspace/Repos/manik@surge.global/phg-data-mlsys/src'))
warnings.filterwarnings("ignore")

In [0]:
from phgml.models.xgboost_model import XGBMultiStepPredictor
from phgml.models.autogluon_model import AutoGluonModel, AGMlflowModel
from phgml.models.lightgbm_model import LightGBMModel, LGBMMlflowModel
from phgml.pipeline.training import train_wrapper
from phgml.data.processing_distr_ca import (
    filter_train_data,
    filter_test_data,
    remove_padded_cols,
)
from phgml.reporting.output_metrics import *
from phgml.data.data_types import (
    revenue_preprocessed_schema,
    rooms_preprocessed_schema,
    training_output_schema,
)
from phgml.reporting.logging import get_logging_path, get_logging_filename, get_dbx_logger
from phgml.reporting.report_results import get_output_df, correct_prediction_list
from phgml.data.config import EnvironmentConfig, ForecastingHotelConfigProvider
from phgml.utilities.task_utilities import str_to_lst, str_to_bool, get_model_tags

In [0]:
# Run configuration for the training pipeline, collected in a single dict.
# Values come from the notebook widgets (via getArgument), from the Spark
# cluster configuration, or are fixed constants.

# `sys` is needed for the sys.path tweak at the end of this cell. The import
# cell only does `from sys import version_info`, so import the module here
# explicitly instead of relying on a wildcard import leaking the name.
import sys

params = {}

# --- Environment -------------------------------------------------------------
params["ENV"] = getArgument("env_stage")
# NOTE(review): user-specific absolute repo path; consider making it configurable.
params["REPOPATH"] = "/Workspace/Repos/manik@surge.global/phg-data-mlsys/src"
params["CLUSTER_NAME"] = spark.conf.get("spark.databricks.clusterUsageTags.clusterName")

# --- Target / data selection ---------------------------------------------------
params["REVENUE_COL"] = "_reservationRevenuePerRoomUSD"
params["ROOMS_COL"] = "rooms"
params["PIPELINE"] = "TRAINING"
params["WITHOUT_PMS"] = str_to_bool(getArgument("exclude_pms"))
params["IS_USD_CURRENCY"] = str_to_bool(getArgument("is_usd_currency"))
params["TARGET_TYPE"] = getArgument("target_type")
params["MODEL_TAGS_DCT"] = get_model_tags(getArgument("model_tags"))
print('model tags dict: ',params["MODEL_TAGS_DCT"])

# --- Date boundaries -----------------------------------------------------------
# Start of the model data, followed by the COVID window boundaries.
params["MODEL_START_DATE"] = pd.to_datetime("2018-10-01")
params["COVID_START_DATE"] = pd.to_datetime("2020-03-01")
params["COVID_END_DATE"] = pd.to_datetime("2021-08-01")

# --- Model / training options --------------------------------------------------
params["CALC_UNCERTAINTY"] = True
params["MODEL_TYPE"] = "AG"
params["LEAD_WINDOW"] = 60
params["PREDICTION_HORIZON"] = 30
# NOTE(review): hard-coded MLflow experiment id, shared by the dev and prod stages.
params["ML_EXPERIMENT_ID"] = 609933091443417
params["LOG_ROOT"] = "/dbfs/mnt/extractionlogs/synxis"
# The widget holds a comma-separated string; str_to_lst splits it and every
# item is converted to int.
params["LAG_NUMBERS"] = list(map(int, str_to_lst(getArgument("lag_numbers"))))
params["SAVE_MODEL"] = True
params["SAVE_METRICS"] = True

# Thread count: fall back to 1 when the widget is empty or whitespace-only.
thread_numbers_arg = (getArgument("thread_numbers") or "").strip()
params["THREAD_NUMBERS"] = int(thread_numbers_arg) if thread_numbers_arg else 1

# On a dev cluster in the dev stage, make the repo checkout importable.
# NOTE(review): the phgml imports in the earlier cell have already executed by
# the time this runs, so the path only affects imports/reloads that happen later.
if (params["ENV"] == "dev") and ("dev" in params["CLUSTER_NAME"]):
    print(f"Loading phgml package from repo {params['REPOPATH']}")
    repo_src_path = os.path.abspath(params["REPOPATH"])
    # Skip the append when the cell is re-run so sys.path does not accumulate duplicates.
    if repo_src_path not in sys.path:
        sys.path.append(repo_src_path)

In [0]:
# Capture the current time for this run.
# NOTE(review): processing_timestamp is assigned again in the logging-setup
# cell before it is first read, so this value is overwritten.
processing_timestamp = datetime.datetime.now()

# Disable Adaptive Query Execution (AQE).
# AQE groups smaller tasks together into larger ones. When the optimizer deems
# the parallel per-hotel tasks too small, that grouping can limit parallelism.
# AQE is switched off here to circumvent that limitation.
spark.conf.set("spark.sql.adaptive.enabled", "false")

In [0]:
# --- Pipeline configuration ----------------------------------------------------
# Environment-level config plus the provider of per-hotel forecasting configs.
env_config = EnvironmentConfig(
    env=params["ENV"],
    target=params["TARGET_TYPE"],
    spark=spark,
    is_usd_currency=params["IS_USD_CURRENCY"],
)
forecasting_config_provider = ForecastingHotelConfigProvider(spark=spark, env=params["ENV"])
params["TARGET_COLUMN"] = env_config.target_column

# --- Logging -------------------------------------------------------------------
# Detach every handler currently installed on the root logger and set the root
# level to INFO; the named logger below inherits that level.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logging.root.setLevel(logging.INFO)

# Timestamp used for the log location and passed on to the training step.
processing_timestamp = datetime.datetime.now()

# exist_ok=True replaces the separate exists() check, which could race with a
# concurrent run creating the same directory.
logfile_path = get_logging_path(params["LOG_ROOT"], processing_timestamp)
os.makedirs(logfile_path, exist_ok=True)

pms = "NOPMS" if params["WITHOUT_PMS"] else "PMS"

log_file_name = get_logging_filename(
    logfile_path,
    params["PIPELINE"],
    params["TARGET_TYPE"],
    pms,
    processing_timestamp,
)

# NOTE(review): the logger name says "preprocess" although PIPELINE is
# "TRAINING" -- looks like a copy-paste leftover; left unchanged because the
# name appears in every log line.
logger = logging.getLogger(f"preprocess-{params['TARGET_TYPE']}-{pms}")

# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack a second FileHandler and duplicate each log line.
for stale_handler in logger.handlers[:]:
    logger.removeHandler(stale_handler)
    stale_handler.close()

file_handler = logging.FileHandler(log_file_name)
file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_format)

logger.addHandler(file_handler)

In [0]:
# Output schema handed to applyInPandas by the training cell, and a shorthand
# for the configured target column.
schema = training_output_schema
target_column = env_config.target_column

In [0]:
# Record the target type, target column and PMS setting of this run in the log file.
logger.info(f"Processing data for target type: {params['TARGET_TYPE']} : {params['TARGET_COLUMN']}")
logger.info(f"Excluding PMS data? {params['WITHOUT_PMS']}")

In [0]:
# Read the full preprocessed table named by the environment config into a
# Spark DataFrame.
source_table = env_config.preprocessed_data_table
logger.info(f"Loading data from {source_table}")
df = spark.sql(f"select * from {source_table}")

In [0]:
# Abort the run when the loaded table has no rows.
# take(1) fetches at most one row and returns a list, so an empty list means
# an empty dataset; count() would scan the whole table just to compare with 0.
if not df.take(1):
    logger.error("The loaded training dataset is empty.")
    logger.info("Terminting the pipeline execution")
    raise Exception("The loaded training dataset is empty.")

In [0]:
# For testing purposes
# # d = d[d["HotelID"]=="71999"]

# CALC_UNCERTAINTY = True
# SAVE_MODEL = True

# fn = train_wrapper(
#         MODEL_TYPE, TARGET_TYPE, ML_EXPERIMENT_ID, WITHOUT_PMS, CALC_UNCERTAINTY,forecasting_config_provider
#     )

# df = df.toPandas()
# output = fn(df)

In [0]:
# Group the data by hotel id and execute the trainings in parallel
logger.info("Starting parallel training")

output_df = df.groupby("HotelID").applyInPandas(
    train_wrapper(
        target_type=params["TARGET_TYPE"],
        ml_experiment_id=params["ML_EXPERIMENT_ID"],
        exclude_pms=params["WITHOUT_PMS"],
        calc_uncertainty=params["CALC_UNCERTAINTY"],
        hotel_config_provider=forecasting_config_provider,
        processing_timestamp=processing_timestamp,
        save_models=params["SAVE_MODEL"],
        save_metrics=params["SAVE_METRICS"],
        lag_numbers=params["LAG_NUMBERS"],
        model_tags=params["MODEL_TAGS_DCT"],
        n_threads=params["THREAD_NUMBERS"],
    ),
    schema,
)

In [0]:
# Collect the per-hotel training results onto the driver as a pandas DataFrame.
# The name output_df is rebound from a Spark DataFrame to a pandas one, so the
# isinstance guard keeps a re-run of this cell from failing on the missing
# .toPandas attribute.
if not isinstance(output_df, pd.DataFrame):
    output_df = output_df.toPandas()

In [0]:
#display(output_df)

In [0]:
# Write one log line per result row: failures at ERROR level with the hotel id,
# completed trainings at INFO level with their message.
for result in output_df.itertuples(index=False):
    if result.status != "complete":
        logger.error(
            f"Error encountered when training hotel {result.HotelID}: {result.message}"
        )
        continue
    logger.info(f"{result.message}")

In [0]:
# Build a summary row for every (hotel, model) pair whose training completed:
# the newest registered model version, its timestamps, the hotel's configured
# train/inference lengths and the number of artifacts found for the model.
client = MlflowClient()
completed = output_df[output_df["status"] == "complete"]

outputs_list = []
for (hotel_id, model_name), _ in completed.groupby(["HotelID", "model_name"]):
    hotel_config = forecasting_config_provider.get_config(hotel_id)

    # get_latest_versions returns the latest version *per stage*, so the first
    # element is not necessarily the newest version overall.
    stage_versions = client.get_latest_versions(name=model_name)
    if not stage_versions:
        logger.warning(
            f"No registered versions found for model {model_name} (hotel {hotel_id}); skipping summary row"
        )
        continue

    # Pick the highest version number. sorted() is used on purpose: the
    # wildcard import of pyspark.sql.functions shadows the built-in max().
    mv = sorted(stage_versions, key=lambda version: int(version.version))[-1]
    print(mv)
    arts = client.list_artifacts(mv.run_id, path=f"forecasting/{hotel_id}/models/{model_name}/artifacts")

    # NOTE(review): the "- 1" presumably excludes one non-step artifact in the
    # folder -- confirm against what train_wrapper logs.
    num_model_steps = len(arts) - 1

    outputs_list.append({"hotel_id": hotel_id,
                         "model_name": model_name,
                         # MLflow timestamps are in milliseconds since the epoch.
                         "creation_time": datetime.datetime.fromtimestamp(mv.creation_timestamp / 1e3),
                         "last_update": datetime.datetime.fromtimestamp(mv.last_updated_timestamp / 1e3),
                         "version": mv.version,
                         "target": params["TARGET_TYPE"],
                         "exclude_pms": params["WITHOUT_PMS"],
                         "config_train_length": hotel_config.training_length,
                         "config_infer_length": hotel_config.inference_length,
                         "num_model_steps": num_model_steps})

    print(f"Hotel: {hotel_id} target_type:{params['TARGET_TYPE']} exclude_pms:{params['WITHOUT_PMS']} : {num_model_steps}")

completed_df = pd.DataFrame(outputs_list)

In [0]:
# Render the per-model summary table built in the previous cell.
display(completed_df)

In [0]:
# Wall-clock duration of the run, measured from start_time (set near the top of
# the notebook) and reported in seconds and in minutes.
elapsed_time = time.perf_counter() - start_time

logger.info("Model training completed.")
logger.info(f"Time elapsed {elapsed_time}")
logger.info(f"Time elapsed in minutes {elapsed_time/60}")