In [0]:
# 01_Bronze.py
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import current_timestamp, lit
import os

spark = SparkSession.builder.getOrCreate()

# ─── 1) Derive a unique process_id ────────────────────────────────────────────
# Tags every ingested row with the run that produced it, in the form
# "daily_ingest|<job_id>_<run_id>".
# Each key is read directly from the runtime conf with an explicit default, so
# a key that is not set yields "interactive" instead of raising.
job_id     = spark.conf.get("spark.databricks.job.id",    "interactive")
run_id     = spark.conf.get("spark.databricks.job.runId", "interactive")
process_id = f"daily_ingest|{job_id}_{run_id}"


# ─── 2) Define the Bronze schema ─────────────────────────────────────────────
# Every leaf field is a nullable string (event_timestamp included — it is kept
# as text here); `device` is a nullable nested struct with `os` and `region`.
bronze_schema = (
    StructType()
    .add("user_id",         StringType(), True)
    .add("app_name",        StringType(), True)
    .add("event_type",      StringType(), True)
    .add("event_timestamp", StringType(), True)
    .add(
        "device",
        StructType()
        .add("os",     StringType(), True)
        .add("region", StringType(), True),
        True,
    )
)

# ─── 3) Ensure the raw folder exists under DBFS /tmp ─────────────────────────
raw_path = "/tmp/raw/telemetry"
# mkdirs only creates the directory (and any missing parents) when it is not
# already there, so it is called unconditionally. This avoids probing the
# local /dbfs mount, which is a different access path from the dbutils.fs API
# used to create the folder.
dbutils.fs.mkdirs(f"dbfs:{raw_path}")
# Optionally drop in sample JSON:
# dbutils.fs.put(f"dbfs:{raw_path}/sample1.json", '{"user_id":"u1","app_name":"PS",...}', True)

# ─── 4) Read the raw JSON into our Bronze schema ─────────────────────────────
# The schema is supplied explicitly rather than inferred, and the "multiline"
# option is enabled so a single JSON record may span several lines.
raw_df = (
    spark.read
    .format("json")
    .option("multiline", True)
    .schema(bronze_schema)
    .load(raw_path)
)

# ─── 5) Enrich with audit columns ─────────────────────────────────────────────
# ingest_ts  — timestamp taken from current_timestamp() at ingestion.
# process_id — the constant run identifier derived earlier in this script.
bronze_df = raw_df.withColumn("ingest_ts", current_timestamp())
bronze_df = bronze_df.withColumn("process_id", lit(process_id))

# ─── 6) Write to Bronze Delta (overwrite for idempotence) ────────────────────
# Each run replaces the contents at bronze_path rather than appending to them;
# the mergeSchema option is passed through to the Delta writer.
bronze_path = "dbfs:/tmp/bronze/cc_events"
(
    bronze_df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(bronze_path)
)

# ─── 7) Register Bronze in Hive Metastore for SQL access ─────────────────────

# Switch the session to the legacy HMS catalog.
spark.sql("USE CATALOG spark_catalog;")

# Select the target schema (or whichever HMS database you prefer).
spark.sql("USE SCHEMA default;")

# Register the Delta files at bronze_path as a table; IF NOT EXISTS makes the
# statement a no-op when the table has already been registered.
spark.sql(f"""
  CREATE TABLE IF NOT EXISTS bronze_cc_events
  USING DELTA
  LOCATION '{bronze_path}'
""")

# ─── 8) Sanity-check ─────────────────────────────────────────────────────────
# Read the Delta data back from bronze_path and render it in the notebook.
display(spark.read.load(bronze_path, format="delta"))