In [0]:
# gold layer transformations

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Get (or create) the Spark session used throughout this cell.
spark = SparkSession.builder.getOrCreate()

# 0) Read the Silver table
# The catalog and schema are selected first so that the unqualified table
# name below is looked up in spark_catalog.default.
for use_statement in ("USE CATALOG spark_catalog", "USE default"):
    spark.sql(use_statement)

silver = spark.table("silver_covid_nyt")

# ----------------------------------------------------------------------------
# 1) Build the fact: daily totals by region
# ----------------------------------------------------------------------------
# One row per (event_date, region): summed cases and deaths plus the number of
# Silver rows that fed the group.
# NOTE: `sum` here is pyspark.sql.functions.sum (pulled in by the wildcard
# import at the top of the cell), not Python's builtin sum.
daily_totals = silver.groupBy("event_date", "region").agg(
    sum("cases").alias("total_cases"),
    sum("deaths").alias("total_deaths"),
    count("*").alias("event_count"),
)

# Append the calendar parts of event_date after the aggregated columns.
gold_fact = daily_totals.select(
    "*",
    year("event_date").alias("year"),
    month("event_date").alias("month"),
    dayofmonth("event_date").alias("day"),
)

print("=== gold_fact (daily totals) ===")
fact_preview = gold_fact.limit(5)
display(fact_preview)


# ----------------------------------------------------------------------------
# 2) Build a small date dimension
# ----------------------------------------------------------------------------
# One row per distinct event_date present in the fact.
fact_dates = gold_fact.select("event_date").distinct()

dim_date = fact_dates.select(
    "event_date",
    year("event_date").alias("year"),
    month("event_date").alias("month"),
    dayofmonth("event_date").alias("day"),
    # Pattern "E" formats the day of week as text; the weekend test below
    # compares it against 'Sat' / 'Sun'.
    date_format("event_date", "E").alias("weekday"),
).withColumn(
    # Added in a second step because the expression refers to `weekday`.
    "is_weekend", expr("weekday IN ('Sat','Sun')")
)

print("=== dim_date (date dimension) ===")
date_preview = dim_date.limit(5)
display(date_preview)


# ----------------------------------------------------------------------------
# 3) Build the region dimension
# ----------------------------------------------------------------------------
# For demo purposes the dimension is inferred from the regions seen in Silver;
# a real pipeline would read a maintained master dimension table instead.
region_codes = silver.select(col("region").alias("region_code")).distinct()

# region_name is the code with initcap() applied.
dim_region = region_codes.withColumn("region_name", initcap(col("region_code")))

print("=== dim_region (region dimension) ===")
display(dim_region)


# ----------------------------------------------------------------------------
# 4) Join into a classic star: gold_covid_star
# ----------------------------------------------------------------------------
# gold_fact and dim_date both carry `year`, `month` and `day`. Joining the
# full date dimension therefore leaves two columns with each of those names,
# and col("year") / col("month") / col("day") in the select below fail as
# ambiguous references. Only the attributes the star actually takes from the
# date dimension are joined, so the date parts resolve to the fact's columns.
date_attrs = dim_date.select("event_date", "weekday", "is_weekend")

gold_star = (
    gold_fact
      # USING-style join on the shared key keeps a single event_date column
      .join(date_attrs, on="event_date", how="left")
      # left join so fact rows survive even without a matching region row
      .join(dim_region, on=gold_fact.region == dim_region.region_code, how="left")
      .select(
        col("event_date"),
        col("year"      ).alias("fact_year"),
        col("month"     ).alias("fact_month"),
        col("day"       ).alias("fact_day"),
        # expose the fact's own key under the dimension's column name
        col("region"    ).alias("region_code"),
        col("region_name"),
        "total_cases", "total_deaths", "event_count",
        "weekday", "is_weekend"
      )
)

print("=== gold_star (fact + dims) ===")
display(gold_star.limit(5))


# ----------------------------------------------------------------------------
# 5) Final QC: no null keys
# ----------------------------------------------------------------------------
# A star row is invalid when either of its two keys is missing.
key_is_missing = col("event_date").isNull() | col("region_code").isNull()
null_keys = gold_star.where(key_is_missing)

# Stop the cell before anything is written if such rows exist.
assert null_keys.count() == 0, "Found null keys in Gold star schema!"

# ----------------------------------------------------------------------------
# 6) Register in Hive Metastore (spark_catalog) or Unity Catalog
# ----------------------------------------------------------------------------
# Example: Hive Metastore in database `exampledb`
# spark.sql() accepts a single statement per call; passing
# "USE CATALOG ...; USE ..." in one string is rejected by the parser, so the
# catalog and the database are selected with two separate calls.
spark.sql("USE CATALOG spark_catalog")
spark.sql("USE exampledb")

# (DataFrame, external Delta location, metastore table name)
# NOTE(review): the star table is registered as "covid_covid_star_..." while
# its path is ".../covid_star_..." -- looks like a typo, kept unchanged here
# in case something already reads the table by this name; confirm and rename.
gold_outputs = [
    (gold_fact,  "dbfs:/tmp/gold/covid_daily_metrics_Transformation_ex", "covid_daily_metrics_Transformation_ex"),
    (dim_date,   "dbfs:/tmp/gold/dim_date_Transformation_ex",            "dim_date_Transformation_ex"),
    (dim_region, "dbfs:/tmp/gold/dim_region_Transformation_ex",          "dim_region_Transformation_ex"),
    (gold_star,  "dbfs:/tmp/gold/covid_star_Transformation_ex",          "covid_covid_star_Transformation_ex"),
]

# Every output is written the same way: overwrite mode, Delta format, data at
# the explicit path, registered under the given table name.
for frame, location, table_name in gold_outputs:
    (
        frame.write
             .mode("overwrite")
             .format("delta")
             .option("path", location)
             .saveAsTable(table_name)
    )

print("✅ Gold layer tables written and registered.")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, date_format, expr, sum as sum_, count

# Get (or create) the Spark session for this cell.
spark = SparkSession.builder.getOrCreate()

# 0) Point at the right catalog and schema, then read the Silver table.
for use_statement in ("USE CATALOG spark_catalog", "USE default"):
    spark.sql(use_statement)

silver = spark.table("silver_covid_nyt")

# ----------------------------------------------------------------------------
# 1) Build the fact: daily totals by region, with fact_-prefixed date parts
# ----------------------------------------------------------------------------
# One row per (event_date, region): summed cases and deaths plus the number of
# Silver rows in the group.
daily_totals = silver.groupBy("event_date", "region").agg(
    sum_("cases").alias("total_cases"),
    sum_("deaths").alias("total_deaths"),
    count("*").alias("event_count"),
)

# The date parts get a fact_ prefix so they do not share names with the
# date dimension's date_ columns.
gold_fact = daily_totals.select(
    "*",
    year("event_date").alias("fact_year"),
    month("event_date").alias("fact_month"),
    dayofmonth("event_date").alias("fact_day"),
)

print("=== gold_fact (daily totals) ===")
fact_preview = gold_fact.limit(5)
display(fact_preview)


# ----------------------------------------------------------------------------
# 2) Build a small date dimension keyed on event_date, with date_-prefixed parts
# ----------------------------------------------------------------------------
# One row per distinct event_date present in the fact.
fact_dates = gold_fact.select("event_date").distinct()

dim_date = fact_dates.select(
    "event_date",
    year("event_date").alias("date_year"),
    month("event_date").alias("date_month"),
    dayofmonth("event_date").alias("date_day"),
    # Pattern "E" formats the day of week as text; the weekend test below
    # compares it against 'Sat' / 'Sun'.
    date_format("event_date", "E").alias("weekday"),
).withColumn(
    # Added in a second step because the expression refers to `weekday`.
    "is_weekend", expr("weekday IN ('Sat','Sun')")
)

print("=== dim_date (date dimension) ===")
date_preview = dim_date.limit(5)
display(date_preview)


# ----------------------------------------------------------------------------
# 3) Build the region dimension
# ----------------------------------------------------------------------------
# Distinct regions seen in Silver, exposed as region_code.
region_codes = silver.select(col("region").alias("region_code")).distinct()

# region_name is derived from the code with SQL initcap().
dim_region = region_codes.withColumn("region_name", expr("initcap(region_code)"))

print("=== dim_region (region dimension) ===")
region_preview = dim_region.limit(5)
display(region_preview)


# ----------------------------------------------------------------------------
# 4) Join into a classic star: gold_covid_star
# ----------------------------------------------------------------------------
# Step 1: attach the date attributes via the shared event_date column.
fact_with_dates = gold_fact.join(dim_date, on="event_date", how="left")

# Step 2: attach the region attributes. The condition names each side through
# its own DataFrame so the two key columns are referenced explicitly.
region_match = gold_fact["region"] == dim_region["region_code"]
fact_with_dims = fact_with_dates.join(dim_region, region_match, how="left")

# Step 3: keep only the star's columns; the fact's `region` is published
# under the name region_code.
gold_star = fact_with_dims.select(
    "event_date",
    "fact_year",
    "fact_month",
    "fact_day",
    col("region").alias("region_code"),
    "region_name",
    "total_cases",
    "total_deaths",
    "event_count",
    "weekday",
    "is_weekend",
)

print("=== gold_star (fact + dims) ===")
star_preview = gold_star.limit(5)
display(star_preview)


# ----------------------------------------------------------------------------
# 5) Sanity-check: no null keys
# ----------------------------------------------------------------------------
# Rows of the star whose date key or region key is missing.
null_keys = gold_star.filter(
    col("event_date").isNull() |
    col("region_code").isNull()
)

# Count once and reuse the number: each count() is a separate Spark action
# over the un-cached join, so formatting the failure message with a second
# count() would re-run the whole pipeline.
null_key_count = null_keys.count()
assert null_key_count == 0, f"Null keys! {null_key_count} rows"


# ----------------------------------------------------------------------------
# 6) Write & register into Hive metastore
# ----------------------------------------------------------------------------
# Select the target catalog and schema for the saveAsTable calls below.
spark.sql("USE CATALOG spark_catalog")
spark.sql("USE default")

# (DataFrame, external Delta location, metastore table name)
gold_outputs = [
    (gold_fact,  "dbfs:/tmp/gold/covid_daily_metrics_tr_gold", "gold_covid_daily_metrics_tr_gold"),
    (dim_date,   "dbfs:/tmp/gold/dim_date_tr_gold",            "gold_dim_date_tr_gold"),
    (dim_region, "dbfs:/tmp/gold/dim_region_tr_gold",          "gold_dim_region_tr_gold"),
    (gold_star,  "dbfs:/tmp/gold/covid_star_tr_gold",          "gold_covid_star_tr_gold"),
]

# Each output is written in overwrite mode as Delta at its path and
# registered under its table name, in the order listed above.
for frame, location, table_name in gold_outputs:
    writer = frame.write.mode("overwrite").format("delta").option("path", location)
    writer.saveAsTable(table_name)

print("✅ Gold layer tables written and registered.")