In [0]:
%run ./1_config.py

In [0]:
import os, importlib
try:
    # Databricks widgets let a user/job pick the target environment; the chosen
    # values are exported as environment variables for the rest of the notebook.
    dbutils.widgets.dropdown("ENV", "dev", ["dev", "qa"], "Environment")
    dbutils.widgets.dropdown("STORAGE_ACCOUNT", "trafficsa2", ["trafficsa2", "trafficsaqa"], "Storage account")
    dbutils.widgets.text("METASTORE_ACCOUNT", "trafficsa2", "Metastore account")
    os.environ["ENV"] = dbutils.widgets.get("ENV").strip().lower()
    os.environ["STORAGE_ACCOUNT"] = dbutils.widgets.get("STORAGE_ACCOUNT").strip()
    # Strip BEFORE applying the fallback: a blank or whitespace-only widget value
    # must fall back to the storage account instead of exporting an empty string.
    os.environ["METASTORE_ACCOUNT"] = (
        (dbutils.widgets.get("METASTORE_ACCOUNT") or "").strip()
        or os.environ["STORAGE_ACCOUNT"]
    )
except NameError:
    # `dbutils` is not defined (e.g. running outside a Databricks notebook):
    # deliberately keep whatever the process environment already provides.
    pass


In [0]:
from pyspark.sql import functions as F, SparkSession, DataFrame
from pyspark.sql.utils import AnalysisException

class GoldBuilder:
    """
    Gold star schema built from Silver.
      - Assumes Silver has: TimeKey (midnight, BIGINT), DetectorKey (BIGINT), RegionKey (BIGINT),
        V00..V95, NB_SCATS_SITE, NB_DETECTOR, CT_RECORDS, QT_VOLUME_24HOUR, CT_ALARM_24HOUR.
      - Melts V00..V95 to 15-min rows and computes INTERVAL TimeKey for dim_time + 15-min fact.
      - Daily fact is derived from Silver's midnight TimeKey -> ReadingDate.
      - dim_detector is built from Silver keys and enriched by dim_region (region_code/suburb).
      - No SiteKey anywhere.

    Constructing an instance creates the catalog/schema and any MISSING Gold tables;
    existing tables and their data are left untouched. Dimensions are upserted with
    MERGE, facts are fully reloaded (TRUNCATE + append) by `rebuild_all`.
    """

    def __init__(self, conf_obj: "Config" = None):
        """
        Resolve configuration, table names and the Spark session, then run the DDL bootstrap.

        conf_obj: configuration object exposing `catalog`, `db_name`, `silver_table` and
                  `table_fqn(name)`. When falsy, the module-level `conf` is used.
        """
        self.conf = conf_obj or conf
        self.spark: SparkSession = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

        self.catalog = self.conf.catalog
        self.db_name = self.conf.db_name

        # FQNs
        self.silver_fqn       = self.conf.table_fqn(self.conf.silver_table)
        self.dim_time_fqn     = self.conf.table_fqn("dim_time")
        self.dim_detector_fqn = self.conf.table_fqn("dim_detector")
        self.dim_region_fqn   = self.conf.table_fqn("dim_region")
        self.fact_15_fqn      = self.conf.table_fqn("fact_traffic_15min")
        self.fact_daily_fqn   = self.conf.table_fqn("fact_daily_summary")

        self._bootstrap_uc()
        self._ensure_dim_tables()
        self._ensure_fact_tables()

    # ---------------------------- bootstrap / DDL ----------------------------
    def _bootstrap_uc(self):
        """Create the catalog and schema if needed and make them the session defaults."""
        self.spark.sql(f"CREATE CATALOG IF NOT EXISTS {self.catalog}")
        self.spark.sql(f"CREATE SCHEMA  IF NOT EXISTS {self.catalog}.{self.db_name}")
        self.spark.sql(f"USE CATALOG {self.catalog}")
        self.spark.sql(f"USE {self.db_name}")

    def _ensure_dim_tables(self):
        """Create dim_time and dim_detector when they do not exist yet."""
        # IF NOT EXISTS (not OR REPLACE): "ensure" must never wipe a populated dimension,
        # otherwise every GoldBuilder() construction would discard previously merged rows.
        self.spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {self.dim_time_fqn} (
                TimeKey BIGINT,
                Date DATE,
                Hour INT,
                Year INT,
                Month INT,
                DayOfWeek STRING,
                WeekdayFlag BOOLEAN
            ) USING DELTA
        """)
        self.spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {self.dim_detector_fqn} (
                DetectorKey BIGINT,
                NB_SCATS_SITE INT,
                NB_DETECTOR INT,
                NM_REGION STRING,
                RegionKey BIGINT,
                suburb STRING
            ) USING DELTA
        """)

    def _ensure_fact_tables(self):
        """Create the 15-minute and daily fact tables when they do not exist yet."""
        # Same reasoning as the dimensions: creating the builder (e.g. only to call
        # validate()) must not empty the fact tables.
        self.spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {self.fact_15_fqn} (
                TimeKey BIGINT,
                DetectorKey BIGINT,
                RegionKey BIGINT,
                IntervalStartTime TIMESTAMP,
                Volume BIGINT,
                Year INT,
                Month INT
            ) USING DELTA
            PARTITIONED BY (Year, Month)
        """)
        self.spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {self.fact_daily_fqn} (
                DateKey INT,
                DetectorKey BIGINT,
                RegionKey BIGINT,
                ReadingDate DATE,
                TotalVolume BIGINT,
                AlarmCount BIGINT,
                IntervalsWithData INT
            ) USING DELTA
        """)

    # ---------------------------- helpers ----------------------------
    def _available_vcols(self, df: DataFrame):
        """Return the V00..V95 column names present in `df`, in interval order."""
        present = set(df.columns)
        return [name for name in (f"V{i:02d}" for i in range(96)) if name in present]

    @staticmethod
    def _format_partition(row) -> str:
        """Render one SHOW PARTITIONS result row as text for display."""
        # NOTE(review): the result layout appears to differ between catalog
        # implementations (a single `partition` column vs. one column per partition
        # key) -- confirm for this workspace. Both layouts are handled here.
        fields = row.asDict()
        if "partition" in fields:
            return str(fields["partition"])
        return "/".join(f"{key}={value}" for key, value in fields.items())

    # Wide (V00..V95) -> long with IntervalStartTime and interval TimeKey (computed later)
    def _melt_15min(self, df: DataFrame) -> DataFrame:
        """
        Explode the wide V00..V95 columns into one row per interval.

        Returns the key columns found in `df`, plus IntervalIndex (position of the
        V-column among the columns present), Volume and IntervalStartTime.
        Raises ValueError when `df` has none of the V00..V95 columns.
        """
        vcols = self._available_vcols(df)
        if not vcols:
            raise ValueError("No V00..V95 columns found in Silver. Rebuild Silver so it includes the volume columns.")

        base_cols = ["DetectorKey", "NB_SCATS_SITE", "NB_DETECTOR", "RegionKey", "TimeKey"]
        key_cols = [c for c in base_cols if c in df.columns]

        df2 = (df.select(*key_cols, *vcols)
                 .withColumn("volumes", F.array(*[F.col(c).cast("long") for c in vcols])))

        exploded = df2.select(
            *key_cols,
            F.posexplode_outer("volumes").alias("IntervalIndex", "Volume")
        )

        # Silver.TimeKey is midnight for the day (epoch seconds). Build interval start timestamps.
        exploded = (exploded
            .withColumn("MidnightTs", F.to_timestamp(F.from_unixtime(F.col("TimeKey"))))
            .withColumn("IntervalStartTime", F.expr("MidnightTs + make_interval(0,0,0,0,0, IntervalIndex*15, 0)"))
            .drop("MidnightTs"))

        return exploded

    # ---------------------------- dimensions ----------------------------
    # Use INTERVAL time (not Silver midnight) for dim_time PK
    def _upsert_dim_time(self, intervals_df: DataFrame):
        """MERGE one dim_time row per distinct interval start found in `intervals_df`."""
        dim = (intervals_df
               .select(
                   # Timestamp -> long yields epoch seconds, which is the interval TimeKey.
                   F.col("IntervalStartTime").cast("long").alias("TimeKey"),
                   F.to_date("IntervalStartTime").alias("Date"),
                   F.hour("IntervalStartTime").alias("Hour"),
                   F.year("IntervalStartTime").alias("Year"),
                   F.month("IntervalStartTime").alias("Month"),
                   F.date_format("IntervalStartTime", "E").alias("DayOfWeek"),
                   # dayofweek: 1 = Sunday, 7 = Saturday
                   F.when(F.dayofweek("IntervalStartTime").isin(1, 7), F.lit(False))
                    .otherwise(F.lit(True)).alias("WeekdayFlag")
               )
               # One source row per merge key, otherwise the MERGE is ambiguous.
               .dropDuplicates(["TimeKey"])
        )
        dim.createOrReplaceTempView("__dim_time_src")

        self.spark.sql(f"""
            MERGE INTO {self.dim_time_fqn} AS tgt
            USING __dim_time_src AS src
              ON tgt.TimeKey = src.TimeKey
            WHEN MATCHED THEN UPDATE SET
              tgt.Date = src.Date,
              tgt.Hour = src.Hour,
              tgt.Year = src.Year,
              tgt.Month = src.Month,
              tgt.DayOfWeek = src.DayOfWeek,
              tgt.WeekdayFlag = src.WeekdayFlag
            WHEN NOT MATCHED THEN INSERT (
              TimeKey, Date, Hour, Year, Month, DayOfWeek, WeekdayFlag
            ) VALUES (
              src.TimeKey, src.Date, src.Hour, src.Year, src.Month, src.DayOfWeek, src.WeekdayFlag
            )
        """)

    # Build from Silver keys, enrich from dim_region; avoid NM_REGION dependency in Silver
    def _upsert_dim_detector(self, silver_df: DataFrame):
        """MERGE one dim_detector row per DetectorKey, enriched from dim_region when readable."""
        dim = (silver_df
               .select("DetectorKey", "NB_SCATS_SITE", "NB_DETECTOR", "RegionKey")
               .dropna(subset=["DetectorKey", "NB_SCATS_SITE", "NB_DETECTOR"])
               # De-duplicate on the MERGE key itself (as dim_time does): a detector seen
               # with differing attributes must still contribute exactly one source row.
               .dropDuplicates(["DetectorKey"]))

        try:
            region = (self.spark.table(self.dim_region_fqn)
                      .select("RegionKey", "region_code", "suburb")
                      # Guard against join fan-out if dim_region repeats a RegionKey.
                      .dropDuplicates(["RegionKey"]))
            dim = dim.join(region, "RegionKey", "left")
        except AnalysisException:
            # dim_region could not be resolved; fall back to empty region attributes.
            dim = dim.withColumn("region_code", F.lit(None).cast("string")) \
                     .withColumn("suburb", F.lit(None).cast("string"))

        # Keep NM_REGION column for compatibility, populated from region_code
        dim = dim.withColumn("NM_REGION", F.col("region_code")).drop("region_code")

        dim.createOrReplaceTempView("__dim_detector_src")

        self.spark.sql(f"""
            MERGE INTO {self.dim_detector_fqn} AS tgt
            USING __dim_detector_src AS src
              ON tgt.DetectorKey = src.DetectorKey
            WHEN MATCHED THEN UPDATE SET
              tgt.NB_SCATS_SITE = src.NB_SCATS_SITE,
              tgt.NB_DETECTOR   = src.NB_DETECTOR,
              tgt.NM_REGION     = src.NM_REGION,
              tgt.RegionKey     = src.RegionKey,
              tgt.suburb        = src.suburb
            WHEN NOT MATCHED THEN INSERT (
              DetectorKey, NB_SCATS_SITE, NB_DETECTOR, NM_REGION, RegionKey, suburb
            ) VALUES (
              src.DetectorKey, src.NB_SCATS_SITE, src.NB_DETECTOR, src.NM_REGION, src.RegionKey, src.suburb
            )
        """)

    # ---------------------------- facts ----------------------------
    def _build_fact_15min(self, silver_df: DataFrame, melted: DataFrame = None):
        """
        Reload the 15-minute fact table and return the number of rows it then holds.

        silver_df: Silver DataFrame; melted here when `melted` is not supplied.
        melted:    optional, already melted (and ideally cached) output of
                   `_melt_15min`, so a caller that needs it elsewhere computes it once.
        """
        owns_cache = melted is None
        if owns_cache:
            melted = self._melt_15min(silver_df).cache()

        try:
            fact = (melted
                    .withColumn("TimeKey", F.col("IntervalStartTime").cast("long"))
                    .withColumn("Year", F.year("IntervalStartTime"))
                    .withColumn("Month", F.month("IntervalStartTime"))
                    .select("TimeKey", "DetectorKey", "RegionKey", "IntervalStartTime", "Volume", "Year", "Month"))

            self.spark.sql(f"TRUNCATE TABLE {self.fact_15_fqn}")
            (fact.write
                 .mode("append")
                 .format("delta")
                 .partitionBy("Year", "Month")
                 .saveAsTable(self.fact_15_fqn))

            # Count what was written instead of re-executing the melt plan.
            return self.spark.table(self.fact_15_fqn).count()
        finally:
            # Release the cache even when the write fails, but only if it was created here.
            if owns_cache:
                melted.unpersist()

    def _build_fact_daily(self, silver_df: DataFrame):
        """Reload the daily fact table from Silver and return the number of rows it then holds."""
        # Silver.TimeKey is midnight; derive day directly from it
        fact = (silver_df
                .withColumn("ReadingDate", F.to_date(F.from_unixtime(F.col("TimeKey"))))
                .select(
                    F.date_format("ReadingDate", "yyyyMMdd").cast("int").alias("DateKey"),
                    "DetectorKey",
                    "RegionKey",
                    "ReadingDate",
                    F.col("QT_VOLUME_24HOUR").cast("long").alias("TotalVolume"),
                    F.col("CT_ALARM_24HOUR").cast("long").alias("AlarmCount"),
                    F.col("CT_RECORDS").cast("int").alias("IntervalsWithData")
                ))

        self.spark.sql(f"TRUNCATE TABLE {self.fact_daily_fqn}")
        fact.write.mode("append").format("delta").saveAsTable(self.fact_daily_fqn)
        # The table was just truncated and reloaded, so its row count equals the rows
        # written; counting `fact` would scan Silver a second time.
        return self.spark.table(self.fact_daily_fqn).count()

    # ---------------------------- public APIs ----------------------------
    def rebuild_all(self):
        """Upsert both dimensions, fully reload both facts from Silver and print the row counts."""
        # Source
        silver = self.spark.table(self.silver_fqn)

        # Melt once and share it between dim_time and the 15-min fact.
        melted = self._melt_15min(silver).cache()
        try:
            # Build dims
            self._upsert_dim_time(melted)
            self._upsert_dim_detector(silver)

            # Build facts
            rows15 = self._build_fact_15min(silver, melted=melted)
        finally:
            melted.unpersist()
        rowsD = self._build_fact_daily(silver)

        print(f"✅ Full Gold rebuild complete: 15min={rows15}, daily={rowsD}")

    def validate(self):
        """Print row counts for both fact tables and, when available, a sample of partitions."""
        f15 = self.spark.table(self.fact_15_fqn)
        fd  = self.spark.table(self.fact_daily_fqn)
        print(f"🔎 {self.fact_15_fqn}: {f15.count()} rows")
        try:
            parts = self.spark.sql(f"SHOW PARTITIONS {self.fact_15_fqn}").collect()
            if parts:
                shown = [self._format_partition(r) for r in parts[:5]]
                if len(parts) > 5:
                    shown.append("...")
                print("   partitions example:", ", ".join(shown))
        except Exception as exc:
            # Partition listing is best-effort diagnostics: report why it was skipped
            # instead of hiding the failure, and carry on with validation.
            print(f"   partitions unavailable ({type(exc).__name__})")
        print(f"🔎 {self.fact_daily_fqn}: {fd.count()} rows")


# ---------------------------- example run ----------------------------
# Module-level side effect: these statements execute every time this cell/file runs.
# Constructing GoldBuilder runs its catalog/schema/table DDL bootstrap; the full
# rebuild and the validation report then follow immediately.
# `conf` is not defined in this file -- presumably supplied by `%run ./1_config.py`; confirm.
GB = GoldBuilder(conf)
GB.rebuild_all()
GB.validate()