In [0]:
%run ./1_config.py

In [0]:
# --- inline params (dev/qa) ---
import os, importlib
try:
    # Declare the notebook widgets and export their values as environment
    # variables for the code that runs afterwards. `dbutils` only exists inside
    # a Databricks notebook; anywhere else the NameError handler below leaves
    # os.environ untouched.
    dbutils.widgets.dropdown("ENV", "dev", ["dev", "qa"], "Environment")
    dbutils.widgets.dropdown("STORAGE_ACCOUNT", "trafficsa2", ["trafficsa2", "trafficsaqa"], "Storage account")
    dbutils.widgets.text("METASTORE_ACCOUNT", "trafficsa2", "Metastore account")
    os.environ["ENV"] = dbutils.widgets.get("ENV").strip().lower()
    os.environ["STORAGE_ACCOUNT"] = dbutils.widgets.get("STORAGE_ACCOUNT").strip()
    # Strip BEFORE testing for emptiness: a whitespace-only text widget value is
    # truthy, so stripping after the `or` would export an empty account name
    # instead of falling back to the storage account.
    os.environ["METASTORE_ACCOUNT"] = (
        (dbutils.widgets.get("METASTORE_ACCOUNT") or "").strip()
        or os.environ["STORAGE_ACCOUNT"]
    )
except NameError:
    # Deliberate best-effort: no widgets outside Databricks.
    pass


In [0]:
import datetime
from typing import Optional, List
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import (
    col, current_timestamp, to_date, input_file_name,
    to_timestamp, coalesce
)

class LoadRawTraffic:
    """
    Bronze loader for SCATS 'Traffic Signal Volume' CSVs.

    Creates the catalog, schema and partitioned Delta table, then appends CSV
    data either one file per day (``batch_load``) or through a ``cloudFiles``
    stream (``stream_load``). Every loaded row is given ``PartitionDate``,
    ``load_time`` and ``source_file`` columns.
    """

    # Names of the 96 volume columns, V00 .. V95 (shared by the DDL and the casts).
    _VOLUME_COLS = tuple(f"V{i:02d}" for i in range(96))

    def __init__(self, catalog: str, table_name: str, checkpoint_dir: Optional[str] = None, env: Optional[str] = None):
        """
        Resolve configuration, target table name and checkpoint location.

        Args:
            catalog: Catalog that holds the bronze schema and table.
            table_name: Table name handed to ``Config.table_fqn``.
            checkpoint_dir: Explicit checkpoint directory; when omitted it is
                ``<checkpoint_base>/bronze/<table_name with '.' -> '_'>``.
            env: Environment passed to ``Config``.
        """
        # NOTE(review): when `env` is not given the catalog name is passed to
        # Config as the environment — confirm Config accepts that.
        self.conf = Config(env or catalog)
        self.catalog = catalog
        self.db_name = self.conf.db_name
        self.landing_zone = self.conf.raw_data_path
        self.table_name = table_name
        self.table_fqn = self.conf.table_fqn(table_name)
        base_chk = f"{self.conf.checkpoint_base}/bronze"
        self.checkpoint_dir = checkpoint_dir or f"{base_chk}/{self.table_name.replace('.', '_')}"
        self.spark = SparkSession.getActiveSession() or SparkSession.builder.getOrCreate()

    def _bootstrap_uc(self) -> None:
        """Create the catalog and schema if missing and make them current."""
        self.spark.sql(f"CREATE CATALOG IF NOT EXISTS {self.catalog}")
        self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {self.catalog}.{self.db_name}")
        self.spark.sql(f"USE CATALOG {self.catalog}")
        self.spark.sql(f"USE {self.db_name}")

    @staticmethod
    def _raw_table_columns_sql() -> str:
        """Return the ``V00 INT, ..., V95 INT`` column list for the DDL."""
        cols = [f"{name} INT" for name in LoadRawTraffic._VOLUME_COLS]
        # The separator only keeps the generated DDL aligned when printed.
        return ",\n                ".join(cols)

    def _create_raw_table_if_not_exists(self) -> None:
        """Create the bronze Delta table, partitioned by ``PartitionDate``."""
        cols96 = self._raw_table_columns_sql()
        self.spark.sql(f"""
            CREATE TABLE IF NOT EXISTS {self.table_fqn} (
                NB_SCATS_SITE INT,
                QT_INTERVAL_COUNT STRING,
                NB_DETECTOR INT,
                {cols96},
                NM_REGION STRING,
                CT_RECORDS INT,
                QT_VOLUME_24HOUR INT,
                CT_ALARM_24HOUR INT,
                PartitionDate DATE,
                load_time TIMESTAMP,
                source_file STRING
            )
            USING DELTA
            PARTITIONED BY (PartitionDate)
        """)

    @staticmethod
    def _parse_partition_date(df: "DataFrame") -> "DataFrame":
        """
        Add ``PartitionDate`` derived from ``QT_INTERVAL_COUNT``.

        Three explicit timestamp layouts are tried in order, then Spark's
        default parsing; the first non-null result is truncated to a date.
        """
        return df.withColumn(
            "PartitionDate",
            to_date(
                coalesce(
                    to_timestamp(col("QT_INTERVAL_COUNT"), "yyyy-MM-dd HH:mm:ss"),
                    to_timestamp(col("QT_INTERVAL_COUNT"), "yyyy/MM/dd HH:mm:ss"),
                    to_timestamp(col("QT_INTERVAL_COUNT"), "dd/MM/yyyy HH:mm:ss"),
                    to_timestamp(col("QT_INTERVAL_COUNT")),
                )
            )
        )

    @staticmethod
    def _drop_if_exists(df: "DataFrame", cols: List[str]) -> "DataFrame":
        """Drop those of ``cols`` that are present in ``df``; ignore the rest."""
        present = [c for c in cols if c in df.columns]
        return df.drop(*present) if present else df

    @staticmethod
    def _cast_volume_cols_int(df: "DataFrame") -> "DataFrame":
        """Cast every V00..V95 column present in ``df`` to ``int``."""
        existing = set(df.columns)
        casts = {c: col(c).cast("int") for c in LoadRawTraffic._VOLUME_COLS if c in existing}
        # One withColumns call adds a single projection; looping withColumn
        # would add up to 96 of them and bloat the query plan.
        return df.withColumns(casts) if casts else df

    def create_db(self) -> None:
        """Ensure catalog, schema and the bronze table exist."""
        self._bootstrap_uc()
        self._create_raw_table_if_not_exists()

    def batch_load(self, start_date: str = "2025-05-01", end_date: str = "2025-05-01") -> None:
        """
        Append one ``VSDATA_<yyyymmdd>.csv`` file per day to the bronze table.

        Args:
            start_date: First day to load, ISO format (``YYYY-MM-DD``).
            end_date: Last day to load (inclusive), ISO format.

        Raises:
            ValueError: If either date is not a valid ISO date.

        A failure on one file is printed and the loop continues with the next
        day (best-effort per file).
        """
        print(f"üì¶ Batch load: {start_date} ‚Üí {end_date} from {self.landing_zone}")
        start = datetime.date.fromisoformat(start_date)
        end = datetime.date.fromisoformat(end_date)

        if end < start:
            # An inverted range yields no days; say so instead of silently doing nothing.
            print(f"Nothing to load: end_date {end_date} is before start_date {start_date}")
            return

        for offset in range((end - start).days + 1):
            day = start + datetime.timedelta(offset)
            filename = f"VSDATA_{day.strftime('%Y%m%d')}.csv"
            path = f"{self.landing_zone}/{filename}"
            try:
                df = (self.spark.read.option("header", True).csv(path))
                if "QT_INTERVAL_COUNT" not in df.columns:
                    raise ValueError("Column QT_INTERVAL_COUNT is missing in the file.")

                df = self._drop_if_exists(df, ["_rescued_data"])
                df = self._cast_volume_cols_int(df)
                # Use the file-metadata column, as stream_load does, instead of
                # the deprecated input_file_name() so both paths record the
                # source file the same way.
                df = (df.transform(self._parse_partition_date)
                        .withColumn("load_time", current_timestamp())
                        .withColumn("source_file", col("_metadata.file_path")))

                (df.write.format("delta")
                    .mode("append")
                    .option("mergeSchema", "true")
                    .partitionBy("PartitionDate")
                    .saveAsTable(self.table_fqn))

                print(f"‚úÖ Loaded {filename}")
            except Exception as e:
                print(f"‚ö†Ô∏è Failed to load {filename}: {e}")

    def stream_load(self, file_pattern: str = "VSDATA_202506*.csv", trigger_once: bool = True, reset_checkpoint: bool = True) -> None:
        """
        Append files matching ``file_pattern`` through a ``cloudFiles`` stream.

        Args:
            file_pattern: Glob, relative to the landing zone, of files to read.
            trigger_once: ``True`` uses ``trigger(once=True)``; ``False`` uses
                ``trigger(availableNow=True)``. The query is awaited either way.
            reset_checkpoint: When ``True`` the streaming checkpoint directory
                (which also contains the schema location) is removed first.
                NOTE(review): with the checkpoint gone the stream presumably
                re-reads every matching file and appends it again — confirm
                this default is intended for repeated runs.
        """
        print(f"üåä Streaming load for pattern {file_pattern}")
        stream_path = f"{self.landing_zone}/{file_pattern}"
        stream_chk = f"{self.checkpoint_dir}/streaming"
        schema_loc = f"{stream_chk}/schema"

        if reset_checkpoint:
            print(f"üßπ Cleaning checkpoint: {stream_chk}")
            try:
                dbutils.fs.rm(stream_chk, recurse=True)
                print("‚úÖ Checkpoint cleared.")
            except Exception as e:
                # Best-effort: also covers `dbutils` being undefined.
                print(f"‚ö†Ô∏è Could not clear checkpoint ({e}). Continuing...")

        reader = (self.spark.readStream.format("cloudFiles")
                    .option("cloudFiles.format", "csv")
                    .option("cloudFiles.inferColumnTypes", "true")
                    .option("header", True)
                    .option("cloudFiles.schemaLocation", schema_loc)
                    .load(stream_path))

        stream_df = (reader.transform(lambda d: self._drop_if_exists(d, ["_rescued_data"]))
                           .transform(self._cast_volume_cols_int)
                           .transform(self._parse_partition_date)
                           .withColumn("load_time", current_timestamp())
                           .withColumn("source_file", col("_metadata.file_path")))

        writer = (stream_df.writeStream.format("delta")
                    .option("checkpointLocation", stream_chk)
                    .option("mergeSchema", "true")
                    .partitionBy("PartitionDate")
                    .outputMode("append"))

        writer = writer.trigger(once=True) if trigger_once else writer.trigger(availableNow=True)
        query = writer.toTable(self.table_fqn)
        query.awaitTermination()
        print("‚úÖ Streaming load completed.")

    def validate_table(self) -> None:
        """
        Print the row count and number of distinct ``PartitionDate`` values.

        Raises:
            Exception: Whatever reading or counting the table raised; the
                failure is printed first and then re-raised so that a caller
                cannot mistake a failed validation for success.
        """
        print(f"üîé Validating {self.table_fqn} ...")
        try:
            df = self.spark.table(self.table_fqn)
            total = df.count()
            # Count on the cluster rather than collecting and sorting every
            # partition value just to take its length.
            n_parts = df.select("PartitionDate").distinct().count()
        except Exception as e:
            print(f"‚ùå Validation failed: {e}")
            raise
        print(f"‚úÖ {self.table_fqn}: {total} rows across {n_parts} partitions.")

# run cell (bronze): build the loader from the active config, create the
# catalog/schema/table, batch-load one day, stream the VSDATA_202506* files,
# then validate the table.
conf = Config()  # presumably picks up the ENV/widget values set at the top — confirm in the config notebook
bronze = LoadRawTraffic(catalog=conf.catalog, table_name=conf.bronze_table, env=conf.env)

bronze.create_db()
bronze.batch_load(start_date="2025-05-01", end_date="2025-05-01")
bronze.stream_load(file_pattern="VSDATA_202506*.csv", trigger_once=True)  
bronze.validate_table()  # reports row and partition counts for the bronze table
# NOTE(review): this line is reached whenever the calls above return; whether a
# failed load or validation stops the cell depends on LoadRawTraffic's error handling.
print("‚úÖ bronze load/validate OK")
