In [0]:
%run ./1_config.py

In [0]:
# --- inline params (dev/qa) — optional; keep if running this as a notebook ---
import os, importlib
# Publish the notebook widget selections as environment variables.
# NOTE(review): this cell runs after `%run ./1_config.py`. If that config reads
# ENV / STORAGE_ACCOUNT / METASTORE_ACCOUNT from os.environ while it executes,
# the values chosen here arrive too late for it — confirm the intended order.
try:
    dbutils.widgets.dropdown("ENV", "dev", ["dev", "qa"], "Environment")
    dbutils.widgets.dropdown("STORAGE_ACCOUNT", "trafficsa2", ["trafficsa2", "trafficsaqa"], "Storage account")
    dbutils.widgets.text("METASTORE_ACCOUNT", "trafficsa2", "Metastore account")
    os.environ["ENV"] = dbutils.widgets.get("ENV").strip().lower()
    os.environ["STORAGE_ACCOUNT"] = dbutils.widgets.get("STORAGE_ACCOUNT").strip()
    # Strip BEFORE applying the fallback: a whitespace-only widget value is truthy,
    # so stripping afterwards would skip the fallback and store an empty string.
    os.environ["METASTORE_ACCOUNT"] = (
        (dbutils.widgets.get("METASTORE_ACCOUNT") or "").strip()
        or os.environ["STORAGE_ACCOUNT"]
    )
except NameError:
    # `dbutils` is not defined (running as a pure module); ENV is already set
    # or the defaults are used.
    pass


In [0]:
import os
import time
import zlib
from pyspark.sql import functions as F

class SetupHelper:
    """Create the catalog, database and Delta tables, then seed ``dim_region``.

    The methods rely on names that are not defined in this cell and must already
    exist in the notebook namespace: ``spark``, ``conf`` (read for ``catalog`` and
    ``db_name``), ``REGION_MAP`` and ``F`` (``pyspark.sql.functions``).
    """

    # Tables that validate() expects to be resolvable once setup() has run.
    EXPECTED_TABLES = (
        "dim_time",
        "dim_detector",
        "dim_region",
        "fact_traffic_15min",
        "fact_daily_summary",
    )

    def __init__(self):
        """Capture the target catalog and database names from the global ``conf``."""
        self.catalog = conf.catalog
        self.db_name = conf.db_name

    # ------------------ catalog & db ------------------
    def create_catalog_and_db(self):
        """Create the catalog and database if missing and make them current."""
        spark.sql(f"CREATE CATALOG IF NOT EXISTS {self.catalog}")
        spark.sql(f"USE CATALOG {self.catalog}")
        spark.sql(f"CREATE DATABASE IF NOT EXISTS {self.db_name}")
        spark.sql(f"USE {self.db_name}")

    # ------------------ tables ------------------
    def create_tables(self):
        """Create the dimension and fact tables.

        ``dim_site`` is dropped and ``dim_region`` is dropped and recreated on every
        call; the remaining tables are only created when they do not exist yet.
        """
        spark.sql(f"USE CATALOG {self.catalog}")
        spark.sql(f"USE {self.db_name}")

        # --- drop old/deprecated ---
        spark.sql(f"DROP TABLE IF EXISTS {self.db_name}.dim_site")    # no longer needed
        spark.sql(f"DROP TABLE IF EXISTS {self.db_name}.dim_region")  # force recreate

        # --- dim_region ---
        spark.sql(f"""
            CREATE TABLE {self.db_name}.dim_region (
                RegionKey   BIGINT NOT NULL,
                region_code STRING NOT NULL,
                suburb      STRING
            )
            USING DELTA
        """)

        # --- dim_time ---
        spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {self.db_name}.dim_time (
                TimeKey     BIGINT  NOT NULL,
                Date        DATE    NOT NULL,
                Hour        INT     NOT NULL,
                Year        INT     NOT NULL,
                Month       INT     NOT NULL,
                DayOfWeek   STRING  NOT NULL,
                WeekdayFlag BOOLEAN NOT NULL,
                PRIMARY KEY (TimeKey)
            )
            USING DELTA
        """)

        # --- dim_detector ---
        spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {self.db_name}.dim_detector (
                DetectorKey   BIGINT  NOT NULL,
                NB_SCATS_SITE INT     NOT NULL,
                NB_DETECTOR   INT     NOT NULL,
                NM_REGION     STRING,
                RegionKey     BIGINT,
                suburb        STRING,
                PRIMARY KEY (DetectorKey)
            )
            USING DELTA
        """)

        # --- fact_traffic_15min ---
        spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {self.db_name}.fact_traffic_15min (
                TimeKey          BIGINT NOT NULL,
                DetectorKey      BIGINT NOT NULL,
                RegionKey        BIGINT NOT NULL,
                IntervalStartTime TIMESTAMP NOT NULL,
                volume_map       MAP<STRING, INT>,
                CT_RECORDS       INT,
                QT_VOLUME_24HOUR INT,
                CT_ALARM_24HOUR  INT
            )
            USING DELTA
        """)

        # --- fact_daily_summary ---
        spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {self.db_name}.fact_daily_summary (
                TimeKey       BIGINT NOT NULL,
                DetectorKey   BIGINT NOT NULL,
                RegionKey     BIGINT NOT NULL,
                ReadingDate   DATE NOT NULL,
                records_24h   INT,
                volume_24h    INT,
                alarms_24h    INT
            )
            USING DELTA
        """)

    # ------------------ seed region ------------------
    @staticmethod
    def _region_rows(region_map):
        """Convert a ``{region_code: suburb}`` mapping into rows, rejecting key clashes.

        ``RegionKey`` is derived as CRC32 of the upper-cased code, so two codes that
        differ only by case (or that share a CRC32) would receive the same surrogate
        key and make joins on ``RegionKey`` fan out. The check mirrors that derivation
        on the driver before anything is written.

        Args:
            region_map: mapping of region code to suburb name.

        Returns:
            List of ``(region_code, suburb)`` tuples in the mapping's iteration order.

        Raises:
            ValueError: if a code is not a string, or two codes map to the same key.
        """
        rows = []
        code_by_key = {}
        for code, suburb in region_map.items():
            if not isinstance(code, str):
                raise ValueError(f"region_code must be a string, got {code!r}")
            # Assumes Spark's upper()/crc32() agree with Python's for these codes
            # (true for ASCII codes; CRC32 is computed over the UTF-8 bytes).
            key = zlib.crc32(code.upper().encode("utf-8"))
            first_code = code_by_key.setdefault(key, code)
            if first_code != code:
                raise ValueError(
                    f"region codes {first_code!r} and {code!r} map to the same RegionKey {key}"
                )
            rows.append((code, suburb))
        return rows

    def seed_dim_region_from_map(self):
        """Replace the contents of ``dim_region`` with the entries of ``REGION_MAP``.

        The table is expected to exist with the schema declared in create_tables().

        Raises:
            ValueError: if ``REGION_MAP`` contains codes that clash on ``RegionKey``.
        """
        rows = self._region_rows(REGION_MAP)
        df = (spark.createDataFrame(rows, "region_code STRING, suburb STRING")
              .withColumn("RegionKey", F.crc32(F.upper(F.col("region_code"))).cast("bigint"))
              .select("RegionKey", "region_code", "suburb"))

        # Overwrite the rows only. `overwriteSchema` is deliberately not set: it would
        # replace the schema declared in create_tables() with the DataFrame's
        # all-nullable schema and drop the NOT NULL columns.
        (df.write
            .format("delta")
            .mode("overwrite")            # always overwrite to keep consistent
            .saveAsTable(f"{self.db_name}.dim_region"))

    # ------------------ utils ------------------
    def assert_table(self, name):
        """Describe ``<db_name>.<name>``; Spark raises if the table cannot be resolved."""
        spark.sql(f"DESCRIBE TABLE {self.db_name}.{name}").collect()

    def setup(self):
        """Run catalog/database creation, table creation and region seeding in order."""
        # perf_counter is monotonic, so the reported duration cannot be skewed by
        # wall-clock adjustments.
        t0 = time.perf_counter()
        self.create_catalog_and_db()
        self.create_tables()
        self.seed_dim_region_from_map()
        print(f"✅ Setup completed in {int(time.perf_counter() - t0)}s")

    def validate(self):
        """Check that every table in ``EXPECTED_TABLES`` can be described."""
        t0 = time.perf_counter()
        for tbl in self.EXPECTED_TABLES:
            self.assert_table(tbl)
        print(f"✅ Validation completed in {int(time.perf_counter() - t0)}s")


# ------------------ run setup ------------------
# Executed every time this cell runs: build the helper (it reads catalog and
# db_name from `conf`), run the full setup, then check the expected tables.
# Note that setup() drops dim_site and drops/recreates dim_region on each run.
setup = SetupHelper()
setup.setup()
# validate() issues DESCRIBE TABLE for each expected table.
setup.validate()