In [0]:
# -------------------------------
# Import Libraries
# --------------------------------
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from datetime import datetime
import uuid
import traceback


# --------------------------------
# Spark session for the daily Gold job
# --------------------------------
# getOrCreate() hands back the already-active session when one exists
# (e.g. inside a notebook) instead of building a second one.
spark = (
    SparkSession.builder
    .appName("EnergyConsumptionDailyGold")
    .getOrCreate()
)

# --------------------------------
# Unique identifier of this run (random UUID4 in canonical text form);
# stamped on every record written by log_step
# --------------------------------
job_id = f"{uuid.uuid4()!s}"

# --------------------------------
# Logging function (writes into Delta table)
# --------------------------------
def log_step(step_name, status="INFO", message=""):
    """Append one log record for the current run to the Gold log Delta table.

    The record holds the module-level ``job_id``, the current UTC time as a
    naive ISO-8601 string, and the three arguments below.

    Args:
        step_name: Identifier of the pipeline step being logged.
        status: Status label stored with the record (defaults to "INFO").
        message: Free-text detail. ``None`` is stored as NULL; any other
            non-string value is converted with ``str()``.
    """
    # Function-scope import: the file-level import only brings in ``datetime``.
    from datetime import timezone

    # Same naive-UTC ISO string that datetime.utcnow().isoformat() produced,
    # without calling utcnow(), which is deprecated since Python 3.12.
    logged_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    # An explicit all-string schema keeps the log table's column types stable:
    # schema inference fails when a value is None and would follow whatever
    # Python type the caller happened to pass.
    log_schema = (
        "job_id string, timestamp string, step string, "
        "status string, message string"
    )
    log_record = (
        job_id,
        logged_at,
        step_name,
        status,
        None if message is None else str(message),
    )
    log_df = spark.createDataFrame([log_record], schema=log_schema)
    (
        log_df.write
        .format("delta")
        .mode("append")
        .saveAsTable("project_logs.gold_log.log")   # Gold log table
    )

# --------------------------------
# Config: source (Silver) and target (Gold) table coordinates
# --------------------------------
# Table the job reads from: catalog / schema / table.
source_catalog, source_schema, source_table = (
    "aws_refinement",
    "default",
    "usage_cleaned",
)

# Table the job writes the daily summary to: catalog / schema / table.
target_catalog, target_schema, target_table_daily = (
    "aws_curateddata",
    "default",
    "daily_summary",
)

# Fully qualified three-part names: <catalog>.<schema>.<table>
silver_table_full_name = ".".join((source_catalog, source_schema, source_table))
gold_table_daily_full_name = ".".join(
    (target_catalog, target_schema, target_table_daily)
)

# --------------------------------
# Gold ETL
# --------------------------------
# Builds the daily Gold summary from the Silver table and overwrites the Gold
# Delta table with it. Each stage writes a record through log_step(); on any
# failure the traceback is logged (best-effort) and the error is re-raised.
try:
    log_step("START", "INFO", f"Starting Daily Gold processing from {silver_table_full_name}")

    # 1. Read the cleaned Silver data
    silver_df = spark.read.table(silver_table_full_name)
    # NOTE: count() is a Spark action, so this log line runs a job of its own.
    log_step("LOAD_SILVER", "INFO", f"Loaded Silver table with {silver_df.count()} records")

    # 2. Aggregate daily metrics: one output row per calendar date of "timestamp"
    daily_agg_df = silver_df.groupBy(F.to_date("timestamp").alias("date")) \
        .agg(
            F.sum("Global_active_power").alias("total_active_power_kw"),
            F.avg("Voltage").alias("avg_voltage"),
            F.sum("Sub_metering_1").alias("total_sub_metering_1_wh"),
            F.sum("Sub_metering_2").alias("total_sub_metering_2_wh"),
            F.sum("Sub_metering_3").alias("total_sub_metering_3_wh")
        )
    log_step("DAILY_AGG", "INFO", "Daily aggregation completed")

    # 3. Calculate hourly consumption: one row per (date, hour)
    hourly_consumption = silver_df.groupBy(
        F.to_date("timestamp").alias("date"),
        F.hour("timestamp").alias("hour")
    ).agg(
        F.sum("Global_active_power").alias("hourly_active_power_kw")
    )

    # 4. Find peak consumption hour: rank the hours of each date by summed power
    #    and keep the top one. "hour" ascending is the tie-breaker; without it
    #    row_number() picks an arbitrary row among equal sums, so the reported
    #    peak hour could differ from one run to the next.
    window_spec = Window.partitionBy("date").orderBy(
        F.col("hourly_active_power_kw").desc(),
        F.col("hour").asc()
    )
    peak_hour_df = hourly_consumption.withColumn("rank", F.row_number().over(window_spec)) \
                                     .filter(F.col("rank") == 1) \
                                     .select("date", F.col("hour").alias("peak_consumption_hour"))
    log_step("PEAK_HOUR", "INFO", "Peak consumption hour calculation completed")

    # 5. Join daily summary with peak hour info (left join keeps every daily row)
    final_gold_df = daily_agg_df.join(peak_hour_df, on="date", how="left") \
                                .withColumn("day_name", F.date_format("date", "EEEE"))

    # 6. Add date features
    final_gold_df = final_gold_df.withColumn("day_of_week", F.dayofweek("date")) \
                                 .withColumn("month", F.month("date")) \
                                 .orderBy("date")

    log_step("FINAL_PREP", "INFO", "Final Gold DataFrame prepared")

    # 7. Write to Gold Delta table: full overwrite, schema replacement allowed
    final_gold_df.write.format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(gold_table_daily_full_name)

    log_step("SAVE_GOLD", "SUCCESS", f"Saved Daily Gold table {gold_table_daily_full_name}")
    log_step("END", "SUCCESS", "Daily Gold processing completed")

except Exception:
    # Format the traceback before anything else can fail.
    error_details = traceback.format_exc()
    try:
        log_step("ERROR", "FAIL", error_details)
    except Exception:
        # Error logging is best-effort: if the log table itself is unavailable,
        # report that on stderr and still surface the original pipeline failure.
        traceback.print_exc()
    # Bare raise re-raises the original pipeline exception.
    raise


In [0]:
# Show the Gold DataFrame built by the ETL cell above. `final_gold_df` is
# assigned inside that cell's `try` block, so it is only bound once the cell
# has run at least as far as the join step.
# NOTE(review): `.display()` is presumably the Databricks notebook renderer
# rather than a stock PySpark DataFrame method — confirm the target runtime.
final_gold_df.display()