In [0]:
%run ./1_config.py

In [0]:
# --- inline params (dev/qa) — optional; keep if running this as a notebook ---
import os, importlib
try:
    # Declare the notebook widgets (name, default value, choices / label) and
    # export their current values as environment variables.
    # NOTE(review): these variables are set after `%run ./1_config.py` has
    # already executed; if that config reads them at load time the widget
    # values arrive too late — confirm the intended order.
    dbutils.widgets.dropdown("ENV", "dev", ["dev", "qa"], "Environment")
    dbutils.widgets.dropdown("STORAGE_ACCOUNT", "trafficsa2", ["trafficsa2", "trafficsaqa"], "Storage account")
    dbutils.widgets.text("METASTORE_ACCOUNT", "trafficsa2", "Metastore account")
    os.environ["ENV"] = dbutils.widgets.get("ENV").strip().lower()
    os.environ["STORAGE_ACCOUNT"] = dbutils.widgets.get("STORAGE_ACCOUNT").strip()
    # Strip *before* applying the fallback: a whitespace-only text widget is
    # truthy, so the previous `(value or fallback).strip()` exported an empty
    # METASTORE_ACCOUNT instead of falling back to the storage account.
    os.environ["METASTORE_ACCOUNT"] = (
        (dbutils.widgets.get("METASTORE_ACCOUNT") or "").strip()
        or os.environ["STORAGE_ACCOUNT"]
    )
except NameError:
    # `dbutils` is not defined when this runs as a pure module; in that case
    # ENV is already set or defaults are used, so the widgets are skipped.
    pass


In [0]:
class SetupHelper:
    """Create and validate the catalog, schema and Delta tables used by the pipeline.

    Every statement is issued through the notebook-global ``spark`` session and
    uses ``CREATE ... IF NOT EXISTS``, so re-running does not recreate objects
    that already exist.

    Attributes:
        conf: configuration object the values below are read from.
        catalog: catalog name (``conf.catalog``).
        db_name: schema name (``conf.db_name``).
        landing_zone: raw data path (``conf.raw_data_path``).
        checkpoint_base: checkpoint root (``conf.checkpoint_base``).
        initialized: ``True`` once ``create_db`` has run on this instance.
    """

    # Tables managed by this helper, in creation order. Each entry is both the
    # table name and the name of the method that creates it, so setup() and
    # validate() always cover exactly the same set.
    TABLES = (
        "raw_traffic",
        "dim_time",
        "dim_detector",
        "region_lookup",
        "fact_traffic_15min",
        "fact_daily_summary",
    )

    def __init__(self, conf_obj=None):
        """
        Args:
            conf_obj: configuration object exposing ``catalog``, ``db_name``,
                ``raw_data_path`` and ``checkpoint_base``. When omitted, the
                notebook-global ``conf`` is used.
        """
        # Explicit None check: a supplied config object that happens to be
        # falsy must not be silently replaced by the global one.
        self.conf = conf if conf_obj is None else conf_obj
        self.catalog  = self.conf.catalog
        self.db_name  = self.conf.db_name
        self.landing_zone   = self.conf.raw_data_path
        self.checkpoint_base= self.conf.checkpoint_base
        self.initialized = False

    # ---------- internal helpers ----------
    def _require_initialized(self):
        """Raise ``AssertionError`` unless ``create_db`` has run on this instance."""
        # Raised explicitly instead of via `assert` so the guard is not
        # stripped when Python runs with -O.
        if not self.initialized:
            raise AssertionError("Database is not initialized.")

    def _create_table(self, message, ddl):
        """Run one CREATE TABLE statement with the shared guard and progress output.

        Args:
            message: progress text printed before the statement runs.
            ddl: the complete SQL statement to execute.

        Raises:
            AssertionError: if ``create_db`` has not been called yet.
        """
        self._require_initialized()
        print(message, end='')
        spark.sql(ddl)
        print("Done")

    # ---------- catalog & schema ----------
    def create_db(self):
        """Create the catalog and schema if needed and make them the session default."""
        spark.catalog.clearCache()
        print(f"Creating schema {self.catalog}.{self.db_name}...", end='')
        spark.sql(f"CREATE CATALOG IF NOT EXISTS {self.catalog}")
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {self.catalog}.{self.db_name}")
        spark.sql(f"USE CATALOG {self.catalog}")
        spark.sql(f"USE {self.db_name}")
        self.initialized = True
        print("Done")

    # ---------- Bronze landing ----------
    def raw_traffic(self):
        """Create the ``raw_traffic`` landing table, partitioned by ``PartitionDate``."""
        # V00 .. V95: one INT column per 15-minute interval of the day.
        cols96 = ",\n                ".join([f"V{i:02d} INT" for i in range(96)])
        self._create_table("Creating raw_traffic table...", f"""
            CREATE TABLE IF NOT EXISTS {self.catalog}.{self.db_name}.raw_traffic (
                NB_SCATS_SITE INT,
                QT_INTERVAL_COUNT STRING,        -- date string of the reading day
                NB_DETECTOR INT,
                {cols96},                        -- 96 x 15-min volumes
                NM_REGION STRING,
                CT_RECORDS INT,
                QT_VOLUME_24HOUR INT,
                CT_ALARM_24HOUR INT,
                PartitionDate DATE,
                load_time TIMESTAMP,
                source_file STRING
            )
            USING DELTA
            PARTITIONED BY (PartitionDate)
        """)

    # ---------- Dimensions ----------
    def dim_time(self):
        """Create the ``dim_time`` dimension table."""
        self._create_table("Creating dim_time table...", f"""
            CREATE TABLE IF NOT EXISTS {self.catalog}.{self.db_name}.dim_time (
                Date DATE,             -- calendar date
                Hour INT,              -- 0-23
                Year INT,
                Month INT,             -- 1-12
                DayOfWeek STRING,      -- Mon..Sun
                WeekdayFlag BOOLEAN    -- True for Mon-Fri
            )
            USING DELTA
        """)

    def dim_detector(self):
        """
        Create the ``dim_detector`` dimension table
        (NB_DETECTOR, NB_SCATS_SITE, NM_REGION, SUBURB).

        NOTE(review): earlier documentation stated that CT_ALARM_24HOUR was
        included here as a snapshot attribute, but the DDL has no such column.
        Alarm counts are carried by fact_daily_summary.AlarmCount; confirm
        whether the column is still wanted on this dimension.
        """
        self._create_table("Creating dim_detector table...", f"""
            CREATE TABLE IF NOT EXISTS {self.catalog}.{self.db_name}.dim_detector (
                NB_DETECTOR INT,
                NB_SCATS_SITE INT,
                NM_REGION STRING,
                SUBURB STRING
            )
            USING DELTA
        """)

    def region_lookup(self):
        """Create the ``region_lookup`` table mapping NM_REGION to SUBURB."""
        self._create_table("Ensuring region_lookup table...", f"""
            CREATE TABLE IF NOT EXISTS {self.catalog}.{self.db_name}.region_lookup (
                NM_REGION STRING,
                SUBURB   STRING
            )
            USING DELTA
        """)

    # ---------- (Empty) Gold facts: will be populated by your Gold loader ----------
    def fact_traffic_15min(self):
        """
        Fact at 15-minute grain. Populated by 5_gold_loader after reshaping V00..V95.
        Partitioned by Year/Month for pruning.
        """
        self._create_table("Creating fact_traffic_15min table (empty schema)...", f"""
            CREATE TABLE IF NOT EXISTS {self.catalog}.{self.db_name}.fact_traffic_15min (
                TimeKey BIGINT,                 -- yyyymmddHHMM from IntervalStartTime
                DetectorKey STRING,             -- hash or surrogate from (NB_SCATS_SITE, NB_DETECTOR)
                SiteKey STRING,                 -- hash of NB_SCATS_SITE
                RegionKey STRING,               -- hash of NM_REGION
                IntervalStartTime TIMESTAMP,
                Volume BIGINT,
                Year INT,
                Month INT
            )
            USING DELTA
            PARTITIONED BY (Year, Month)
        """)

    def fact_daily_summary(self):
        """
        Optional daily fact for quick daily cards/tiles.
        Populated by 5_gold_loader from QT_VOLUME_24HOUR / CT_ALARM_24HOUR.
        """
        self._create_table("Creating fact_daily_summary table (empty schema)...", f"""
            CREATE TABLE IF NOT EXISTS {self.catalog}.{self.db_name}.fact_daily_summary (
                DateKey INT,                    -- yyyymmdd
                DetectorKey STRING,
                SiteKey STRING,
                RegionKey STRING,
                ReadingDate DATE,
                TotalVolume BIGINT,             -- QT_VOLUME_24HOUR
                AlarmCount BIGINT,              -- CT_ALARM_24HOUR
                IntervalsWithData INT           -- CT_RECORDS
            )
            USING DELTA
        """)

    # ---------- Orchestration ----------
    def setup(self):
        """Create the catalog/schema and then every table listed in ``TABLES``."""
        import time
        t0 = time.time()
        print("\nStarting setup ...")
        self.create_db()
        # The fact tables are pre-created empty so the downstream Gold loader
        # can overwrite/append safely.
        for table in self.TABLES:
            getattr(self, table)()
        print(f"Setup completed in {int(time.time() - t0)} seconds")

    def assert_table(self, table):
        """Check that ``table`` exists in the configured schema.

        Raises:
            AssertionError: if the table is not found.
        """
        # SHOW TABLES also lists session temp views; exclude them so a
        # same-named temp view can neither hide a missing table nor make an
        # existing one look like a duplicate.
        result = (spark.sql(f"SHOW TABLES IN {self.catalog}.{self.db_name}")
                    .filter(f"tableName = '{table}' AND isTemporary = false"))
        if result.count() != 1:
            raise AssertionError(f"The table '{table}' is missing")
        print(f"Found {self.catalog}.{self.db_name}.{table}: Success")

    def validate(self):
        """Verify that the schema and every table in ``TABLES`` exist.

        Raises:
            AssertionError: if the schema or any table is missing.
        """
        import time
        t0 = time.time()
        print("\nStarting setup validation ...")
        # NOTE(review): relies on the SHOW DATABASES result column being named
        # `databaseName` — confirm for the target runtime.
        schemas = (spark.sql(f"SHOW DATABASES IN {self.catalog}")
                     .filter(f"databaseName == '{self.db_name}'"))
        if schemas.count() != 1:
            raise AssertionError(f"The database '{self.catalog}.{self.db_name}' is missing")
        for tbl in self.TABLES:
            self.assert_table(tbl)
        print(f"Setup validation completed in {int(time.time() - t0)} seconds")


# Example interactive run: build the helper from the default configuration,
# create the catalog/schema and all tables, then verify that each one exists.
# NOTE(review): these statements execute every time this cell runs (presumably
# also when the notebook is pulled in via %run) — confirm that is intended.
setup = SetupHelper()
setup.setup()
setup.validate()
