In [None]:
# COMMAND ----------
#
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, to_date, datediff, sum, avg, expr
from pyspark.sql.types import DoubleType, StringType

# COMMAND ----------
#
# Initialize logging: configure the root logger at INFO level and create the
# module-level logger that the pipeline functions below write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# COMMAND ----------
#
def load_data():
    """Read the six source tables and hand them back as DataFrames.

    Relies on a ``spark`` session object being available in the enclosing
    scope; this function does not create one.

    Returns:
        A 6-tuple, in this order: central orders, west orders, east orders,
        south (2015) orders, quota, returns.
    """
    logger.info("Loading data from Unity Catalog tables...")
    source_tables = (
        "catalog.orders_central",
        "catalog.orders_west",
        "catalog.orders_east",
        "catalog.orders_south_2015",
        "catalog.quota",
        "catalog.returns",
    )
    # Order matters: callers unpack the result positionally.
    return tuple(spark.table(table_name) for table_name in source_tables)

# COMMAND ----------
#
def standardize_data(orders_central_df):
    """Standardize date columns and column names of the central orders data.

    Adds "Order Date" and "Ship Date", built from the separate
    "<prefix> Day" / "<prefix> Month" / "<prefix> Year" columns, and renames
    "Discounts" to "Discount" and "Product" to "Product Name".

    The dates are assembled with Spark SQL ``make_date`` instead of
    concatenating the parts and parsing them with the fixed-width pattern
    ``ddMMyyyy``: concatenation only yields a parseable string when day and
    month are zero-padded to two digits (1 / 1 / 2015 would become
    "112015"), whereas ``make_date`` works on the numeric values directly.

    Args:
        orders_central_df: DataFrame containing the day/month/year part
            columns for both the order and the ship date.

    Returns:
        The DataFrame with the two date columns added and the two renames
        applied. The original part columns are left in place.
    """
    logger.info("Standardizing data...")

    def _date_from_parts(prefix):
        # Explicit casts let the parts be stored either as integers or as
        # numeric strings (zero-padded or not).
        return expr(
            f"make_date(CAST(`{prefix} Year` AS INT), "
            f"CAST(`{prefix} Month` AS INT), "
            f"CAST(`{prefix} Day` AS INT))"
        )

    return (
        orders_central_df
        .withColumn("Order Date", _date_from_parts("Order"))
        .withColumn("Ship Date", _date_from_parts("Ship"))
        .withColumnRenamed("Discounts", "Discount")
        .withColumnRenamed("Product", "Product Name")
    )

# COMMAND ----------
#
def clean_data(orders_central_df):
    """Drop rows without an order id and give the measures numeric types.

    "Discount" is cast to double (not string): it is a numeric measure that
    is averaged later in the pipeline, so it needs a numeric type like
    "Sales".

    Args:
        orders_central_df: DataFrame with "Order ID", "Sales" and "Discount"
            columns.

    Returns:
        The DataFrame restricted to rows whose "Order ID" is not null, with
        "Sales" and "Discount" cast to ``DoubleType``.
    """
    logger.info("Cleaning data...")
    # NOTE(review): values that are not valid numbers cannot be represented
    # as a double after the cast - confirm whether such placeholders exist in
    # "Discount" and whether they should be mapped to 0 before casting.
    return (
        orders_central_df
        .filter(col("Order ID").isNotNull())
        .withColumn("Sales", col("Sales").cast(DoubleType()))
        .withColumn("Discount", col("Discount").cast(DoubleType()))
    )

# COMMAND ----------
#
def pivot_and_consolidate_data(quota_df, orders_central_df, orders_west_df, orders_east_df, orders_south_df):
    """Unpivot the quota table and combine the regional order tables.

    The quota table is reshaped from one column per year (`2015` .. `2018`)
    into one row per (Region, Year) with the value in "Quota".

    The order tables are combined by column *name* rather than by position.
    The central frame has had columns appended and renamed by earlier steps,
    so a positional ``union`` would pair unrelated columns (or fail when the
    column counts differ). Columns that exist in only some of the inputs are
    filled with nulls for the others.

    Args:
        quota_df: DataFrame with a "Region" column and the year columns
            `2015`, `2016`, `2017`, `2018`.
        orders_central_df: Central orders DataFrame.
        orders_west_df: West orders DataFrame.
        orders_east_df: East orders DataFrame.
        orders_south_df: South orders DataFrame.

    Returns:
        A ``(quota_df, orders_df)`` tuple: the unpivoted quota frame and the
        combined orders frame.
    """
    logger.info("Pivoting and consolidating data...")
    quota_df = quota_df.select(
        col("Region"),
        expr("stack(4, '2015', `2015`, '2016', `2016`, '2017', `2017`, '2018', `2018`)").alias("Year", "Quota")
    )

    # allowMissingColumns requires Spark 3.1 or later.
    orders_df = orders_central_df
    for regional_df in (orders_west_df, orders_east_df, orders_south_df):
        orders_df = orders_df.unionByName(regional_df, allowMissingColumns=True)
    return quota_df, orders_df

# COMMAND ----------
#
def perform_custom_calculations(orders_df, returns_df):
    """Add the derived shipping and return columns to the orders data.

    Steps:
      1. "Days to Ship" = day difference between "Ship Date" and "Order Date".
      2. Left-join the returns data on "Order ID".
      3. "Returned?" = True where "Return Reason" is non-null after the join.

    Args:
        orders_df: Orders DataFrame with "Order ID", "Order Date" and
            "Ship Date" columns.
        returns_df: Returns DataFrame with "Order ID" and "Return Reason"
            columns.

    Returns:
        The joined DataFrame including "Days to Ship" and "Returned?".
    """
    logger.info("Performing custom calculations...")
    with_ship_days = orders_df.withColumn(
        "Days to Ship",
        datediff(col("Ship Date"), col("Order Date")),
    )
    # NOTE(review): if returns_df can hold several rows per "Order ID", this
    # join repeats order rows - confirm the grain of the returns table.
    joined = with_ship_days.join(returns_df, on="Order ID", how="left")
    return joined.withColumn("Returned?", col("Return Reason").isNotNull())

# COMMAND ----------
#
def generate_outputs(orders_df):
    """Aggregate the orders into one row per region and year of sale.

    The grouping key "Year of Sale" is derived here from "Order Date" when
    the incoming frame does not already carry it; without that column the
    ``groupBy`` below cannot be resolved. A pre-existing "Year of Sale"
    column is used unchanged.

    Args:
        orders_df: Orders DataFrame with "Region", "Profit", "Sales",
            "Quantity" and "Discount" columns, plus either "Year of Sale"
            or "Order Date".

    Returns:
        DataFrame with columns "Region", "Year of Sale", "Total Profit",
        "Total Sales", "Total Quantity" and "Average Discount".
    """
    logger.info("Generating outputs...")
    if "Year of Sale" not in orders_df.columns:
        orders_df = orders_df.withColumn("Year of Sale", expr("year(`Order Date`)"))

    # `sum` and `avg` here are the Spark column aggregates imported at the top
    # of the file, not the Python builtins.
    return orders_df.groupBy("Region", "Year of Sale").agg(
        sum("Profit").alias("Total Profit"),
        sum("Sales").alias("Total Sales"),
        sum("Quantity").alias("Total Quantity"),
        avg("Discount").alias("Average Discount"),
    )

# COMMAND ----------
#
def save_outputs(annual_performance_df):
    """Write the aggregated result to ``catalog.annual_regional_performance``.

    The table is replaced with a single Delta overwrite instead of
    ``DROP TABLE`` followed by a write: dropping first leaves a window in
    which the table does not exist, loses the table if the write then fails,
    and discards the table's history. ``overwriteSchema`` lets the overwrite
    also replace the schema, which covers the case the drop was guarding
    against.

    Args:
        annual_performance_df: DataFrame to persist.

    Returns:
        None.
    """
    logger.info("Saving outputs to Unity Catalog tables...")
    # NOTE(review): if the incoming column names contain spaces, Delta may
    # require column mapping to be enabled on the table - confirm.
    (
        annual_performance_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable("catalog.annual_regional_performance")
    )

# COMMAND ----------
#
def main():
    """Run the ETL pipeline end to end.

    Steps, in order: load the source tables, standardize and clean the
    central orders, unpivot the quota and combine the regional orders,
    add the derived columns, aggregate, and save the result.

    Raises:
        Exception: any error raised by a pipeline step is logged with its
            traceback and then re-raised, so the caller (for example a job
            scheduler) sees the run as failed instead of silently succeeded.
    """
    try:
        (
            orders_central_df,
            orders_west_df,
            orders_east_df,
            orders_south_df,
            quota_df,
            returns_df,
        ) = load_data()
        orders_central_df = standardize_data(orders_central_df)
        orders_central_df = clean_data(orders_central_df)
        # The unpivoted quota frame is returned but not used further here.
        quota_df, orders_df = pivot_and_consolidate_data(
            quota_df, orders_central_df, orders_west_df, orders_east_df, orders_south_df
        )
        orders_df = perform_custom_calculations(orders_df, returns_df)
        annual_performance_df = generate_outputs(orders_df)
        save_outputs(annual_performance_df)
        logger.info("ETL process completed successfully.")
    except Exception:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("An error occurred during the ETL process")
        raise

# COMMAND ----------
#
# Run the ETL pipeline when this cell/script is executed.
main()
