In [0]:
# Create Telemetry Data
import uuid

# Identifier for this ingest run: a fixed "demo_ingest" prefix, a pipe
# separator, and a freshly generated random UUID.
process_id = "demo_ingest|" + str(uuid.uuid4())

# 1) Imports & parameters
from pyspark.sql.functions import (
    rand, floor, concat, lit,
    current_timestamp, unix_timestamp,
    when, col, struct, array, element_at
)

# Number of base rows generated by spark.range() before the self-union.
num_unique = 50_000

# Value pools that the randomly assigned categorical columns draw from.
apps = ["Photos", "Mail", "Chat", "Store", "Search"]
events = ["click", "view", "purchase", "login", "logout", "scroll"]
oss = ["iOS", "Android", "Windows", "Linux"]
regions = ["IN", "US", "EU", "APAC", "LATAM"]

# 2) Build 50k unique events
def _pick_random(options):
    """Return a column expression that selects one entry of `options` per row."""
    pool = array(*[lit(option) for option in options])
    # element_at() indexes arrays from 1, so shift the 0-based draw up by one.
    position = (floor(rand() * len(options)) + 1).cast("int")
    return element_at(pool, position)


# spark.range() supplies the `id` column; everything else is synthesized.
base = spark.range(num_unique)

# user_id is "user_" followed by a random integer in [0, 9999].
base = base.withColumn(
    "user_id",
    concat(lit("user_"), floor(rand() * 10000).cast("int")),
)
base = base.withColumn("app_name", _pick_random(apps))
base = base.withColumn("event_type", _pick_random(events))

# Push each event back from "now" by a random offset of up to 86400 seconds.
base = base.withColumn(
    "event_timestamp",
    (unix_timestamp() - floor(rand() * 86400)).cast("timestamp"),
)

# Nested struct column with two independently drawn fields.
base = base.withColumn(
    "device",
    struct(
        _pick_random(oss).alias("os"),
        _pick_random(regions).alias("region"),
    ),
)

# 3) Inject ~1% nulls / ~1% garbage
# Null out user_id on rows whose random draw falls below 0.01.
base = base.withColumn(
    "user_id",
    when(rand() < 0.01, lit(None)).otherwise(col("user_id")),
)

# Replace event_type with the invalid value "BAD_EVENT" on rows whose
# (separate) random draw falls below 0.01.
base = base.withColumn(
    "event_type",
    when(rand() < 0.01, lit("BAD_EVENT")).otherwise(col("event_type")),
)

# 4) Duplicate to reach 100k total
# 5) Add audit columns & write out as JSON
telemetry1 = (
    base.union(base)  # stack the frame on itself to double the row count
    .withColumn("ingest_ts", current_timestamp())  # when this ingest ran
    .withColumn("process_id", lit(process_id))  # which run produced the row
)

# Single source of truth for the output location, shared by the cleanup,
# the write, and the read-back check.
output_path = "dbfs:/tmp/raw/telemetry1/"

# Clear any previous output. mode("overwrite") already replaces existing
# data, so this explicit removal is belt-and-braces.
dbutils.fs.rm(output_path, recurse=True)
telemetry1.write.mode("overwrite").json(output_path)

# 6) Sanity-check
# Read the output back once and reuse the DataFrame, so the JSON schema is
# inferred a single time rather than once per action.
written = spark.read.json(output_path)
print("Total events written:", written.count())
written.show(5, truncate=False)
