In [0]:
# Bronze Layer Ingestion for telemetry1 events
# Spark 4.0.0, serverless cluster compatible

from pyspark.sql.functions import (
    current_timestamp, lit, input_file_name, date_format,
    sha2, concat_ws, col
)
from pyspark.sql.types import (
    StructType, StructField, StringType, TimestampType, StructType
)
import uuid

# 0) Derive a unique process_id. The job and run ids are read from the Spark
#    conf and fall back to "interactive" when they cannot be read; a fresh
#    UUID is always appended so every execution gets a distinct id.
def _conf_or_interactive(key):
    """Return the Spark conf value for *key*, or "interactive" on any lookup error."""
    try:
        return spark.conf.get(key)
    except Exception:
        # Best-effort lookup: any ordinary failure falls back to the default,
        # but KeyboardInterrupt/SystemExit are no longer swallowed the way a
        # bare `except:` would.
        return "interactive"


job_id = _conf_or_interactive("spark.databricks.job.id")
run_id = _conf_or_interactive("spark.databricks.job.runId")
process_id = f"bronze_ingest|{job_id}_{run_id}|{uuid.uuid4()}"

# 1) Raw telemetry schema: four flat fields plus a nested "device" struct.
#    Every field is declared nullable.
telemetry_schema = (
    StructType()
    .add("user_id", StringType(), True)
    .add("app_name", StringType(), True)
    .add("event_type", StringType(), True)
    .add("event_timestamp", TimestampType(), True)
    .add(
        "device",
        StructType()
        .add("os", StringType(), True)
        .add("region", StringType(), True),
        True,
    )
)

# 2) Storage locations: raw landing zone (input) and bronze layer (output)
raw_path = "dbfs:/tmp/raw/telemetry1/"
bronze_path = "dbfs:/tmp/bronze/telemetry1/"

# Create the bronze directory up front
dbutils.fs.mkdirs(bronze_path)

# Load the raw JSON files using the explicit telemetry schema
raw_df = spark.read.json(raw_path, schema=telemetry_schema)

# 3) Enrich with audit, lineage, and metadata.
#    All derived columns are added in one projection directly over raw_df so
#    the hidden `_metadata` column of the file source can be referenced.
#    `_metadata.file_path` replaces input_file_name(), which is deprecated on
#    Databricks and not supported on Unity Catalog standard-access / serverless
#    compute. Column order matches the previous withColumn chain.

# Business fields that feed the row fingerprint, in a fixed order.
# NOTE(review): concat_ws skips NULL inputs, so two rows that differ only in
# *which* field is NULL can produce the same hash. The inputs are left as-is
# to keep hashes comparable with rows already written; confirm whether
# downstream dedup needs NULL-aware hashing.
_hash_inputs = [
    col("user_id"),
    col("app_name"),
    col("event_type"),
    col("event_timestamp").cast("string"),
    col("device.os"),
    col("device.region"),
]

bronze_df = raw_df.select(
    "*",
    current_timestamp().alias("ingest_ts"),                    # ingestion timestamp
    lit(process_id).alias("process_id"),                       # unique process/run id
    col("_metadata.file_path").alias("file_name"),             # source file for lineage
    lit("telemetry1").alias("source_system"),                  # source identifier
    # current_timestamp() yields one value per query, so this is the date of ingest_ts
    date_format(current_timestamp(), "yyyy-MM-dd").alias("ingest_date"),
    sha2(concat_ws("|", *_hash_inputs), 256).alias("record_hash"),
    # Partition by event date (string, yyyy-MM-dd); NULL event_timestamp gives a NULL key
    date_format(col("event_timestamp"), "yyyy-MM-dd").alias("partition_key"),
)


# 4) Append the enriched rows to the bronze Delta location, partitioned by
#    partition_key
(
    bronze_df.write
    .partitionBy("partition_key")
    .mode("append")
    .format("delta")
    .save(bronze_path)
)

# 5) Sanity-check: report the total row count of the bronze table, then
#    display five rows without truncating column values
bronze_row_count = spark.read.format("delta").load(bronze_path).count()
print("Bronze count:", bronze_row_count)
spark.read.format("delta").load(bronze_path).show(5, truncate=False)
