# In [0]:
# Scratch
#01_Bronze
# Databricks notebook source
#01_Bronze notebook
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import current_timestamp, lit, input_file_name
import os
import builtins

# Reset step: recursively remove whatever is currently stored at the Bronze
# location before this run writes to it.
# NOTE(review): step 6 appends to this same path, so with this reset in place
# each run starts from an emptied location -- confirm that is intended.
dbutils.fs.rm(
    "dbfs:/tmp/bronze/cc_events/",
    recurse=True,
)

# 1) Derive a unique process_id, with a safe fallback for interactive runs.
# Each key is read individually via spark.conf.get(key, default) instead of
# fetching the whole config map: when a key is not set, the default
# "interactive" is returned rather than raising.
job_id     = spark.conf.get("spark.databricks.job.id",   "interactive")
run_id     = spark.conf.get("spark.databricks.job.runId", "interactive")
# Stamped onto every ingested row as the "process_id" audit column (step 5).
process_id = f"daily_ingest|{job_id}_{run_id}"


# 2) Define the Bronze schema
# Every leaf field is a nullable string; "device" is a nullable nested struct
# holding the "os" and "region" strings.
bronze_schema = StructType(
    [
        StructField(top_level, StringType(), True)
        for top_level in ("user_id", "app_name", "event_type", "event_timestamp")
    ]
    + [
        StructField(
            "device",
            StructType(
                [
                    StructField(nested, StringType(), True)
                    for nested in ("os", "region")
                ]
            ),
            True,
        )
    ]
)

# 3) Ensure the raw folder exists under DBFS /tmp
raw_dbfs = "dbfs:/tmp/raw/telemetry/"
# Local-mount spelling of the same folder. It is no longer consulted in this
# step, but stays defined in case other cells reference it.
raw_local = "/dbfs/tmp/raw/telemetry/"
# mkdirs only creates the directory (and any missing parents) when it is
# absent, so it is called unconditionally. This avoids probing the driver's
# local filesystem through the /dbfs mount with os.path.exists.
dbutils.fs.mkdirs(raw_dbfs)
# Optionally burst in sample JSON here for a first run:
# dbutils.fs.put(f"{raw_dbfs}sample1.json", '{"user_id":"u1","app_name":"PS",...}', True)

# 4) Read the raw JSON into our Bronze schema
# The schema is supplied explicitly rather than inferred, and "multiline"
# allows a single JSON record to span several lines of a file.
raw_df = (
    spark.read
    .option("multiline", True)
    .schema(bronze_schema)
    .json(raw_dbfs)
)

# 5) Enrich with audit columns
# ingest_ts: timestamp taken when the query runs.
bronze_df = raw_df.withColumn("ingest_ts", current_timestamp())
# process_id: the constant run identifier derived in step 1.
bronze_df = bronze_df.withColumn("process_id", lit(process_id))

# 6) Write to Bronze Delta (append-only)
# Rows are added to whatever already exists at bronze_path; nothing is
# overwritten by this call.
bronze_path = "dbfs:/tmp/bronze/cc_events/"
bronze_df.write.save(bronze_path, format="delta", mode="append")

# 7) Sanity-check your Bronze table
# Re-read the Delta output from storage (not the in-memory DataFrame) so the
# rendered rows reflect what was actually persisted.
display(spark.read.load(bronze_path, format="delta"))
