In [0]:
%run ./1_config.py

In [0]:
# --- inline params (dev/qa) — optional; keep if running this as a notebook ---
# Publishes the notebook widget selections as environment variables.
# NOTE(review): this cell executes after `%run ./1_config.py`; if the config reads
# these environment variables at load time, values set here arrive too late —
# confirm the intended cell order.
import os
import importlib
try:
    dbutils.widgets.dropdown("ENV", "dev", ["dev", "qa"], "Environment")
    dbutils.widgets.dropdown("STORAGE_ACCOUNT", "trafficsa2", ["trafficsa2", "trafficsaqa"], "Storage account")
    dbutils.widgets.text("METASTORE_ACCOUNT", "trafficsa2", "Metastore account")
    os.environ["ENV"] = dbutils.widgets.get("ENV").strip().lower()
    os.environ["STORAGE_ACCOUNT"] = dbutils.widgets.get("STORAGE_ACCOUNT").strip()
    # Strip BEFORE applying the fallback: a whitespace-only entry in the free-text
    # widget is truthy, so falling back first would export an empty account name.
    os.environ["METASTORE_ACCOUNT"] = (
        (dbutils.widgets.get("METASTORE_ACCOUNT") or "").strip()
        or os.environ["STORAGE_ACCOUNT"]
    )
except NameError:
    # `dbutils` is not defined, i.e. running as a pure module;
    # ENV already set or defaults used
    pass


In [0]:

class SetupHelper:
    """Create, validate and tear down the Delta schema and tables used by this notebook.

    Every method talks to the notebook-level globals ``spark`` and (for
    ``cleanup``) ``dbutils``; neither is created here. When no configuration
    object is passed to the constructor, a global named ``conf`` is used
    (presumably defined by the config notebook run earlier — confirm).
    """

    # Tables created by setup() and checked by validate(), in creation order.
    _TABLES = ("raw_traffic", "dim_time", "dim_detector", "region_lookup")

    # Number of V00..V95 columns in raw_traffic
    # (presumably one per 15-minute interval of a day — TODO confirm).
    _V_COLUMN_COUNT = 96

    def __init__(self, conf_obj=None):
        """Capture catalog, schema and path settings from a configuration object.

        Args:
            conf_obj: Object exposing ``catalog``, ``db_name``, ``raw_data_path``
                and ``checkpoint_base`` attributes. When ``None``, the global
                ``conf`` is used instead.
        """
        # Explicit None check: a caller-supplied object that merely evaluates as
        # falsy must not be silently swapped for the global configuration.
        self.conf = conf if conf_obj is None else conf_obj
        self.catalog = self.conf.catalog
        self.db_name = self.conf.db_name
        self.landing_zone = self.conf.raw_data_path
        self.checkpoint_base = self.conf.checkpoint_base
        # Flipped to True by create_db() and back to False by cleanup().
        self.initialized = False

    # ------------------------------------------------------------------ helpers

    def _require_initialized(self):
        """Raise AssertionError unless create_db() has run since the last cleanup()."""
        # Raised explicitly (not via `assert`) so the guard survives `python -O`.
        if not self.initialized:
            raise AssertionError("Database is not initialized.")

    def _schema_exists(self):
        """Return True when SHOW DATABASES lists exactly one schema named ``db_name``."""
        matches = spark.sql(f"SHOW DATABASES IN {self.catalog}") \
                       .filter(f"databaseName == '{self.db_name}'")
        return matches.count() == 1

    def _create_table(self, table, columns, partitioned_by=None, verb="Creating"):
        """Run CREATE TABLE IF NOT EXISTS for a Delta table in the managed schema.

        Args:
            table: Unqualified table name.
            columns: Sequence of ``"<name> <type>"`` column definitions, in order.
            partitioned_by: Optional partition column name.
            verb: First word of the progress message.

        Raises:
            AssertionError: If create_db() has not been called.
        """
        self._require_initialized()
        print(f"{verb} {table} table...", end='')
        column_ddl = ",\n                ".join(columns)
        partition_clause = (
            f"\n            PARTITIONED BY ({partitioned_by})" if partitioned_by else ""
        )
        spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {self.catalog}.{self.db_name}.{table} (
                {column_ddl}
            )
            USING DELTA{partition_clause}
        """)
        print("Done")

    # ------------------------------------------------------------- schema setup

    def create_db(self):
        """Create the catalog and schema if absent and make them the session default.

        Also clears Spark's cache first and marks the helper as initialized.
        """
        spark.catalog.clearCache()
        print(f"Creating schema {self.catalog}.{self.db_name}...", end='')
        spark.sql(f"CREATE CATALOG IF NOT EXISTS {self.catalog}")
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {self.catalog}.{self.db_name}")
        spark.sql(f"USE CATALOG {self.catalog}")
        spark.sql(f"USE {self.db_name}")
        self.initialized = True
        print("Done")

    def raw_traffic(self):
        """Create the ``raw_traffic`` Delta table, partitioned by ``PartitionDate``.

        Raises:
            AssertionError: If create_db() has not been called.
        """
        v_columns = [f"V{i:02d} INT" for i in range(self._V_COLUMN_COUNT)]
        self._create_table(
            "raw_traffic",
            [
                "NB_SCATS_SITE INT",
                "QT_INTERVAL_COUNT STRING",
                "NB_DETECTOR INT",
                *v_columns,
                "NM_REGION STRING",
                "CT_RECORDS INT",
                "QT_VOLUME_24HOUR INT",
                "CT_ALARM_24HOUR INT",
                "PartitionDate DATE",
                "load_time TIMESTAMP",
                "source_file STRING",
            ],
            partitioned_by="PartitionDate",
        )

    def dim_time(self):
        """Create the ``dim_time`` Delta table.

        Raises:
            AssertionError: If create_db() has not been called.
        """
        self._create_table(
            "dim_time",
            [
                "Date DATE",
                "Hour INT",
                "Year INT",
                "Month INT",
                "DayOfWeek STRING",
                "WeekdayFlag BOOLEAN",
            ],
        )

    def dim_detector(self):
        """Create the ``dim_detector`` Delta table.

        Raises:
            AssertionError: If create_db() has not been called.
        """
        self._create_table(
            "dim_detector",
            [
                "NB_DETECTOR INT",
                "NB_SCATS_SITE INT",
                "NM_REGION STRING",
                "suburb STRING",
            ],
        )

    def region_lookup(self):
        """Create the ``region_lookup`` Delta table.

        Raises:
            AssertionError: If create_db() has not been called.
        """
        self._create_table(
            "region_lookup",
            [
                "NM_REGION STRING",
                "SUBURB   STRING",
            ],
            verb="Ensuring",
        )

    def setup(self):
        """Create the schema and all tables, printing the elapsed whole seconds."""
        import time
        t0 = time.time()
        print("\nStarting setup ...")
        self.create_db()
        self.raw_traffic()
        self.dim_time()
        self.dim_detector()
        self.region_lookup()   # added so Gold can rely on it
        print(f"Setup completed in {int(time.time() - t0)} seconds")

    # --------------------------------------------------------------- validation

    def assert_table(self, table):
        """Check that ``table`` is listed exactly once in the managed schema.

        Args:
            table: Unqualified table name to look for.

        Raises:
            AssertionError: If SHOW TABLES does not return exactly one match.
        """
        result = spark.sql(f"SHOW TABLES IN {self.catalog}.{self.db_name}") \
                      .filter(f"tableName = '{table}'")
        if result.count() != 1:
            raise AssertionError(f"The table '{table}' is missing")
        print(f"Found {self.catalog}.{self.db_name}.{table}: Success")

    def validate(self):
        """Verify that the schema and every expected table exist.

        Raises:
            AssertionError: If the schema or any expected table is missing.
        """
        import time
        t0 = time.time()
        print("\nStarting setup validation ...")
        if not self._schema_exists():
            raise AssertionError(
                f"The database '{self.catalog}.{self.db_name}' is missing"
            )
        for tbl in self._TABLES:
            self.assert_table(tbl)
        print(f"Setup validation completed in {int(time.time() - t0)} seconds")

    # ----------------------------------------------------------------- teardown

    def cleanup(self):
        """Drop schema and remove checkpoints (raw remains)."""
        if self._schema_exists():
            print(f"Dropping {self.catalog}.{self.db_name}...", end='')
            spark.sql(f"DROP SCHEMA {self.catalog}.{self.db_name} CASCADE")
            print("Done")
        # The schema no longer exists at this point, so the table-creation guards
        # must fail again until create_db() is re-run.
        self.initialized = False
        print(f"Deleting {self.checkpoint_base}...", end='')
        dbutils.fs.rm(self.checkpoint_base, True)
        print("Done")

# Example interactive run: executes whenever this cell runs.
# setup() creates the catalog/schema and the four tables; validate() then checks
# that the schema and each table are listed, raising AssertionError otherwise.
setup = SetupHelper()
setup.setup()
setup.validate()
# Destructive teardown, intentionally left disabled: cleanup() drops the schema
# with CASCADE and recursively deletes the checkpoint directory.
#setup.cleanup()

