In [0]:
%run ./1_config.py

In [0]:
# --- inline params (dev/qa) ---
import os, importlib
# Resolve the Databricks widget API once. Only this lookup is guarded, so a
# NameError raised by anything else is no longer silently swallowed.
try:
    _widgets = dbutils.widgets
except NameError:
    # `dbutils` is not defined in this interpreter (e.g. outside a Databricks
    # notebook): keep whatever ENV / STORAGE_ACCOUNT / METASTORE_ACCOUNT the
    # process environment already provides.
    _widgets = None

if _widgets is not None:
    _widgets.dropdown("ENV", "dev", ["dev", "qa"], "Environment")
    _widgets.dropdown("STORAGE_ACCOUNT", "trafficsa2", ["trafficsa2", "trafficsaqa"], "Storage account")
    _widgets.text("METASTORE_ACCOUNT", "trafficsa2", "Metastore account")

    # Export the widget selections as environment variables.
    os.environ["ENV"] = _widgets.get("ENV").strip().lower()
    os.environ["STORAGE_ACCOUNT"] = _widgets.get("STORAGE_ACCOUNT").strip()
    # Strip BEFORE applying the fallback: a blank or whitespace-only widget
    # value must fall back to the storage account rather than become "".
    os.environ["METASTORE_ACCOUNT"] = (
        (_widgets.get("METASTORE_ACCOUNT") or "").strip()
        or os.environ["STORAGE_ACCOUNT"]
    )

In [0]:
import os
from typing import Optional
from pyspark.sql import functions as F
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.utils import AnalysisException

class SilverLoader:
    """
    Silver layer loader for SCATS 'Traffic Signal Volume'.

    Reads the Bronze table named by ``conf.bronze_table``, enriches it and
    upserts the result into the Silver Delta table named by
    ``conf.silver_table``:
      - TimeKey: reading timestamp truncated to a 15-minute boundary
        (epoch seconds)
      - DetectorKey: CRC32 of "<NB_SCATS_SITE>:<NB_DETECTOR>"
      - RegionKey: CRC32 of the normalized region code
      - region_code: NM_REGION with whitespace removed, upper-cased
      - record_hash: SHA-256 over TimeKey/DetectorKey/RegionKey, used as the
        MERGE key

    Constructing an instance has side effects: it creates the catalog, the
    schema and the Silver table when they do not exist, and switches the
    session's current catalog and schema.
    """

    def __init__(self, conf_obj: Optional["Config"] = None):
        """
        Args:
            conf_obj: configuration object exposing ``catalog``, ``db_name``,
                ``bronze_table``, ``silver_table`` and ``table_fqn(name)``.
                When omitted (or falsy), the module-level ``conf`` is used.
        """
        self.conf = conf_obj or conf
        self.catalog = self.conf.catalog
        self.db_name = self.conf.db_name
        self.spark: SparkSession = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

        self.bronze_fqn = self.conf.table_fqn(self.conf.bronze_table)
        self.silver_fqn = self.conf.table_fqn(self.conf.silver_table)

        self._bootstrap_uc()
        self._create_silver_if_not_exists()

    # ---------------- bootstrap ----------------
    def _bootstrap_uc(self):
        """Create the catalog and schema if missing and make them current."""
        self.spark.sql(f"CREATE CATALOG IF NOT EXISTS {self.catalog}")
        self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {self.catalog}.{self.db_name}")
        self.spark.sql(f"USE CATALOG {self.catalog}")
        self.spark.sql(f"USE {self.db_name}")

    def _create_silver_if_not_exists(self):
        """Create the Silver Delta table (partitioned by PartitionDate) if missing."""
        # V00..V95: one INT column per interval slot (96 columns).
        cols96 = ",\n                ".join([f"V{i:02d} INT" for i in range(96)])
        self.spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {self.silver_fqn} (
                TimeKey BIGINT,
                DetectorKey BIGINT,
                RegionKey BIGINT,
                region_code STRING,
                {cols96},
                CT_RECORDS INT,
                QT_VOLUME_24HOUR INT,
                CT_ALARM_24HOUR INT,
                PartitionDate DATE,
                record_hash STRING
            )
            USING DELTA
            PARTITIONED BY (PartitionDate)
        """)

    # ---------------- transforms ----------------
    def _add_keys(self, df: DataFrame) -> DataFrame:
        """
        Add ReadingTs, TimeKey, DetectorKey, region_code and RegionKey.

        Expects the columns QT_INTERVAL_COUNT, NB_SCATS_SITE, NB_DETECTOR and
        NM_REGION to be present on ``df``.
        """
        # Try several explicit timestamp/date layouts in order, then Spark's
        # default parsing; coalesce keeps the first non-null result.
        ts = F.coalesce(
            F.to_timestamp("QT_INTERVAL_COUNT", "yyyy-MM-dd HH:mm:ss"),
            F.to_timestamp("QT_INTERVAL_COUNT", "yyyy/MM/dd HH:mm:ss"),
            F.to_timestamp("QT_INTERVAL_COUNT", "dd/MM/yyyy HH:mm:ss"),
            F.to_timestamp("QT_INTERVAL_COUNT", "yyyy-MM-dd"),
            F.to_timestamp("QT_INTERVAL_COUNT", "yyyy/MM/dd"),
            F.to_timestamp("QT_INTERVAL_COUNT")
        )
        df = df.withColumn("ReadingTs", ts)

        # TimeKey = epoch seconds truncated to a multiple of 900 s (15 min)
        df = df.withColumn("TimeKey", (F.unix_timestamp("ReadingTs") / 900).cast("bigint") * 900)

        # DetectorKey = CRC32 of "<site>:<detector>"
        df = df.withColumn(
            "DetectorKey",
            F.crc32(F.concat_ws(":",
                F.col("NB_SCATS_SITE").cast("string"),
                F.col("NB_DETECTOR").cast("string"))
            ).cast("bigint")
        )

        # RegionKey from the normalized region code (whitespace removed, upper-cased)
        df = df.withColumn("region_code",
                           F.upper(F.trim(F.regexp_replace("NM_REGION", r"\s+", ""))))
        df = df.withColumn("RegionKey", F.crc32(F.col("region_code")).cast("bigint"))

        return df

    def _add_hash(self, df: DataFrame) -> DataFrame:
        """Add ``record_hash``: SHA-256 of "TimeKey||DetectorKey||RegionKey" (nulls become "")."""
        return df.withColumn(
            "record_hash",
            F.sha2(F.concat_ws("||",
                               F.coalesce(F.col("TimeKey").cast("string"), F.lit("")),
                               F.coalesce(F.col("DetectorKey").cast("string"), F.lit("")),
                               F.coalesce(F.col("RegionKey").cast("string"), F.lit(""))), 256)
        )

    def _project(self, df: DataFrame) -> DataFrame:
        """Select the known output columns, in a fixed order, skipping any that ``df`` lacks."""
        # NOTE(review): NB_SCATS_SITE, NB_DETECTOR, QT_INTERVAL_COUNT, NM_REGION,
        # ReadingDate and ReadingTs are not declared by the Silver DDL in
        # _create_silver_if_not_exists; the MERGE presumably relies on Delta
        # schema evolution being enabled for these -- confirm.
        base_cols = [
            "TimeKey", "DetectorKey", "RegionKey",
            "NB_SCATS_SITE", "NB_DETECTOR", "QT_INTERVAL_COUNT",
            "NM_REGION", "region_code"
        ]
        vcols = [f"V{i:02d}" for i in range(96) if f"V{i:02d}" in df.columns]   # keep all Vxx present
        other = [
            "CT_RECORDS", "QT_VOLUME_24HOUR", "CT_ALARM_24HOUR",
            "PartitionDate", "record_hash", "ReadingDate", "ReadingTs"
        ]
        cols = [c for c in base_cols + vcols + other if c in df.columns]
        return df.select(*cols)

    # ---------------- upsert logic ----------------
    def _read_bronze(self) -> DataFrame:
        """
        Return the Bronze table as a DataFrame.

        Raises:
            RuntimeError: if Spark reports an AnalysisException while
                resolving the Bronze table (chained as the cause).
        """
        try:
            return self.spark.table(self.bronze_fqn)
        except AnalysisException as err:
            raise RuntimeError(
                f"❌ Bronze table {self.bronze_fqn} not found. Run Bronze loader first."
            ) from err

    def upsert_from_bronze(self, since_load_time: Optional[str] = None) -> int:
        """
        Transform Bronze rows and MERGE them into Silver on ``record_hash``.

        Args:
            since_load_time: optional timestamp string; when given, only
                Bronze rows with ``load_time >= since_load_time`` are used.

        Returns:
            The number of incoming (source) rows fed to the MERGE -- not the
            number of rows the MERGE actually changed.

        Raises:
            RuntimeError: if the Bronze table cannot be resolved.
        """
        bronze = self._read_bronze()

        if since_load_time:
            bronze = bronze.where(F.col("load_time") >= F.to_timestamp(F.lit(since_load_time)))

        # Cached because the frame is consumed twice: by the MERGE and by count().
        df = (bronze
              .transform(self._add_keys)
              .transform(self._add_hash)
              .transform(self._project)
              .cache())

        try:
            df.createOrReplaceTempView("__incoming_silver")

            cols = df.columns
            # record_hash is the join key, so it is excluded from the UPDATE SET list.
            update_set = ", ".join([f"tgt.{c} = src.{c}" for c in cols if c != "record_hash"])
            insert_cols = ", ".join(cols)
            insert_vals = ", ".join([f"src.{c}" for c in cols])

            # NOTE(review): source rows sharing a record_hash are not
            # de-duplicated before the MERGE -- confirm Bronze guarantees
            # uniqueness per (TimeKey, DetectorKey, RegionKey).
            merge_sql = f"""
        MERGE INTO {self.silver_fqn} AS tgt
        USING __incoming_silver AS src
        ON tgt.record_hash = src.record_hash
        WHEN MATCHED THEN UPDATE SET {update_set}
        WHEN NOT MATCHED THEN INSERT ({insert_cols}) VALUES ({insert_vals})
        """
            self.spark.sql(merge_sql)

            return df.count()
        finally:
            # Always release the cache, including when the MERGE or count fails.
            df.unpersist()

    def rebuild_all(self) -> int:
        """
        Truncate the Silver table and reload it from the whole Bronze table.

        Returns:
            The row count reported by ``upsert_from_bronze``.

        Raises:
            RuntimeError: if the Bronze table cannot be resolved; in that
                case the Silver table is left untouched.
        """
        # Check Bronze first so a missing source cannot leave Silver emptied.
        self._read_bronze()
        self.spark.sql(f"TRUNCATE TABLE {self.silver_fqn}")
        return self.upsert_from_bronze()

    def validate(self, show_sample: int = 5):
        """
        Print the Silver row count and optionally show a few sample rows.

        Args:
            show_sample: number of rows to show; 0 (or a negative/falsy
                value) skips the sample. The notebook ``display`` helper is
                used when available, otherwise ``DataFrame.show()``.
        """
        df = self.spark.table(self.silver_fqn)
        print(f"✅ {self.silver_fqn}: {df.count()} rows")
        if show_sample and show_sample > 0:
            sample = df.limit(show_sample)
            # `display` is only defined in notebook environments; guard just
            # the name lookup so errors raised while rendering still surface.
            try:
                render = display
            except NameError:
                render = None
            if render is not None:
                render(sample)
            else:
                sample.show()


# ---------------- example ----------------
# Full rebuild of the Silver table followed by a quick sanity check.
# NOTE(review): `conf` is not defined in this notebook; presumably it is
# provided by the `%run ./1_config.py` cell -- confirm.
# Constructing the loader creates the catalog/schema/Silver table if missing.
SL = SilverLoader(conf)
# rebuild_all() truncates the Silver table, then reloads it from Bronze and
# returns the row count reported by upsert_from_bronze().
rows = SL.rebuild_all()
print(f"🔁 Silver rows written: {rows}")
# Prints the Silver row count and shows a small sample (5 rows by default).
SL.validate()