In [0]:
# Receive the merged-data location from the previous notebook through the
# "merged_json" widget (declared here with an empty default).
dbutils.widgets.text("merged_json", "", "Merged Data Path")
merged_path = dbutils.widgets.get("merged_json")

# The widget default is an empty string. Fail fast with an actionable message
# instead of letting spark.read.json() fail on a blank path.
if not merged_path or not merged_path.strip():
    raise ValueError(
        "Widget 'merged_json' is empty: pass the merged data path from the "
        "previous notebook."
    )

# Read merged data as a Spark DataFrame (schema is inferred from the JSON).
final_merged = spark.read.json(merged_path)

# Resolve the real names of the columns this report needs, ignoring
# capitalization, and fail with a clear message if any of them is absent.
columns = [c.lower() for c in final_merged.columns]

# Lower-cased name -> actual name. setdefault keeps the FIRST match, which is
# what the previous `[... if c.lower() == name][0]` lookups selected.
actual_by_lower = {}
for actual_name in final_merged.columns:
    actual_by_lower.setdefault(actual_name.lower(), actual_name)

# Validate every required column up front. Previously only 'Region' was
# checked, so a missing unit-price column surfaced as a bare IndexError.
for required_label in ("Region", "UnitPrice"):
    if required_label.lower() not in actual_by_lower:
        raise ValueError(f"Column '{required_label}' not found in merged data.")

region_col = actual_by_lower["region"]
unit_price_col = actual_by_lower["unitprice"]

from pyspark.sql.functions import col, sum as spark_sum, countDistinct

# Build the per-region report: one row per region with its summed unit price
# and its number of distinct sales orders, largest total first.
# NOTE(review): TotalSales sums the unit-price column only (no quantity
# factor is applied here) -- confirm that this is the intended definition.
total_sales = spark_sum(unit_price_col).alias("TotalSales")
order_count = countDistinct("SalesOrderNumber").alias("OrderCount")

by_region = final_merged.groupBy(region_col)
aggregated = by_region.agg(total_sales, order_count)
report = aggregated.orderBy(col("TotalSales").desc())

# Blank line before the rendered table, then show the report in the notebook.
print("")
display(report)

# Persist the report as CSV with a header row. coalesce(1) collapses the
# output to a single partition, so Spark emits one part file; note that the
# path below is created as a directory containing that part file, not as a
# plain file, despite the ".csv" suffix.
report_path = "/Volumes/labdaysix/default/data/output/region_sales_report.csv"
single_partition = report.coalesce(1)
csv_writer = single_partition.write.mode("overwrite").option("header", True)
csv_writer.csv(report_path)

# Show the number of report rows (one per region) in the notebook output.
# NOTE(review): display() only renders the value; whether a downstream task
# can read it is not visible here -- confirm how the count is handed over.
row_count = report.count()
display(row_count)
# TODO(review): notebook exit is currently disabled; re-enable if required.
#dbutils.notebook.exit("Report Generated!")
