In [0]:
#01_Bronze
# Databricks notebook source
#01_Bronze notebook
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import current_timestamp, lit, input_file_name
import os
import builtins

# Pipeline locations. bronze_path is defined once so the reset below and the
# Delta write in step 6 are guaranteed to target the same folder.
bronze_path = "dbfs:/tmp/bronze/cc_events/"
raw_dbfs = "dbfs:/tmp/raw/telemetry/"
raw_local = "/dbfs/tmp/raw/telemetry/"

# 0) Delete the old Bronze folder entirely.
# NOTE: because the folder is removed on every run, the "append" write in
# step 6 always lands in an empty location, i.e. each run rebuilds the table.
dbutils.fs.rm(bronze_path, recurse=True)

# 1) Derive a unique process_id, with a safe fallback.
# spark.conf.get(key, default) returns the default when the key is not set,
# so both ids fall back to "interactive" in that case.
job_id     = spark.conf.get("spark.databricks.job.id",    "interactive")
run_id     = spark.conf.get("spark.databricks.job.runId", "interactive")
process_id = f"daily_ingest|{job_id}_{run_id}"

# 2) Define the Bronze schema.
# Every field is a nullable string (event_timestamp included); `device` is a
# nested struct with `os` and `region`.
bronze_schema = StructType([
    StructField("user_id",         StringType(), True),
    StructField("app_name",        StringType(), True),
    StructField("event_type",      StringType(), True),
    StructField("event_timestamp", StringType(), True),
    StructField("device", StructType([
        StructField("os",     StringType(), True),
        StructField("region", StringType(), True)
    ]), True)
])

# 3) Ensure the raw folder exists under DBFS /tmp.
# raw_local is the same folder as raw_dbfs, addressed through the local
# /dbfs path so it can be checked with os.path.
if not os.path.exists(raw_local):
    dbutils.fs.mkdirs(raw_dbfs)
    # Optionally burst in sample JSON here for a first run:
    # dbutils.fs.put(f"{raw_dbfs}sample1.json", '{"user_id":"u1","app_name":"PS",...}', True)

# 4) Read the raw JSON into our Bronze schema.
# NOTE(review): multiline=True expects each file to hold a single JSON
# document; line-delimited JSON (what DataFrameWriter.json produces) should be
# read with it off. Confirm the format of the files under raw_dbfs.
raw_df = (
    spark.read
    .schema(bronze_schema)
    .option("multiline", True)
    .json(raw_dbfs)
)

# 5) Enrich with audit columns: ingestion time and the id of this run.
bronze_df = (
    raw_df
    .withColumn("ingest_ts",  current_timestamp())
    .withColumn("process_id", lit(process_id))
)

# 6) Write to Bronze Delta (append mode).
(
    bronze_df.write
    .format("delta")
    .mode("append")
    .save(bronze_path)
)

# 7) Sanity-check your Bronze table
display(spark.read.format("delta").load(bronze_path))


In [0]:
# Create telemetry data: generate synthetic events and write them out as JSON.

# 1) Imports & parameters
import uuid

from pyspark.sql.functions import (
    rand, floor, concat, lit,
    current_timestamp, unix_timestamp,
    when, col, struct, array, element_at
)

process_id = f"demo_ingest|{uuid.uuid4()}"
output_path = "dbfs:/tmp/raw/telemetry1/"

num_unique      = 50_000   # rows generated before duplication
user_pool_size  = 10_000   # user ids are drawn from user_0 .. user_9999
seconds_per_day = 86_400   # event timestamps fall within the last 24 hours
null_user_rate  = 0.01     # share of rows whose user_id is nulled out
bad_event_rate  = 0.01     # share of rows whose event_type becomes "BAD_EVENT"

apps    = ["Photos","Mail","Chat","Store","Search"]
events  = ["click","view","purchase","login","logout","scroll"]
oss     = ["iOS","Android","Windows","Linux"]
regions = ["IN","US","EU","APAC","LATAM"]


def _random_choice(values):
    """Return a Column that picks one element of `values` at random per row.

    Each call builds its own rand() expression, so columns built from separate
    calls are drawn independently of each other.
    """
    # element_at uses 1-based indexes for arrays, hence the +1 on an index
    # that floor(rand() * n) yields in the range [0, n).
    index = (floor(rand() * len(values)) + 1).cast("int")
    return element_at(array(*[lit(v) for v in values]), index)


# 2) Build 50k unique events.
# NOTE: rand() is called without a seed, so every run generates different data.
clean_events = (
    spark.range(num_unique)
         .withColumn(
             "user_id",
             # Cast explicitly to string rather than relying on concat's
             # implicit int -> string coercion.
             concat(lit("user_"),
                    floor(rand() * user_pool_size).cast("int").cast("string"))
         )
         .withColumn("app_name",   _random_choice(apps))
         .withColumn("event_type", _random_choice(events))
         .withColumn(
             "event_timestamp",
             (unix_timestamp() - floor(rand() * seconds_per_day)).cast("timestamp")
         )
         .withColumn(
             "device",
             struct(
                 _random_choice(oss).alias("os"),
                 _random_choice(regions).alias("region")
             )
         )
)

# 3) Inject ~1% nulls / ~1% garbage.
# Built from clean_events under a new expression so re-running this step never
# stacks a second round of corruption on top of the first.
base = (
    clean_events
      .withColumn(
          "user_id",
          when(rand() < null_user_rate, None).otherwise(col("user_id"))
      )
      .withColumn(
          "event_type",
          when(rand() < bad_event_rate, lit("BAD_EVENT")).otherwise(col("event_type"))
      )
)

# 4) Duplicate to reach 100k total
telemetry1 = base.union(base)

# 5) Add audit columns & write out as JSON
telemetry1 = (
    telemetry1
    .withColumn("ingest_ts",  current_timestamp())
    .withColumn("process_id", lit(process_id))
)

# mode("overwrite") replaces any existing output at output_path, so no
# separate delete of the folder is needed beforehand.
telemetry1.write.mode("overwrite").json(output_path)

# 6) Sanity-check: read the output back once and reuse the DataFrame, so the
# JSON schema is only inferred a single time.
written = spark.read.json(output_path)
print("Total events written:", written.count())
written.show(5, truncate=False)
