In [0]:
from pyspark.sql import functions as F

# Load the three bronze tables this notebook inspects. The names stay bound at
# notebook scope so later cells can reuse them.
rapid_clean = spark.table("workspace.bronze.rapid_step_tests_clean")
device_clean = spark.table("workspace.bronze.device_messages_clean")
joined = spark.table("workspace.bronze.step_test_device_messages")

# Report the row count of each table; each count() runs its own Spark job.
for label, frame in (
    ("rapid_step_tests_clean:", rapid_clean),
    ("device_messages_clean:", device_clean),
    ("step_test_device_messages:", joined),
):
    print(label, frame.count())

# Show a 20-row sample of the joined table, restricted to the columns of interest.
preview_columns = (
    "customer",
    "device_id",
    "start_ts",
    "stop_ts",
    "event_ts",
    "sensor_type",
    "distance",
)
display(joined.select(*preview_columns).limit(20))


In [0]:
from pyspark.sql import functions as F

# Summarise the time span covered by the cleaned step tests and the shortest /
# longest test duration.
rapid_tests = spark.table("workspace.bronze.rapid_step_tests_clean")

# Difference of the two columns after casting each to long.
# NOTE(review): presumably start_ts/stop_ts are timestamp columns, in which case
# the long cast yields epoch seconds and this difference is in seconds — confirm.
duration_s = F.col("stop_ts").cast("long") - F.col("start_ts").cast("long")

time_range = rapid_tests.select(
    F.min("start_ts").alias("min_start"),
    F.max("stop_ts").alias("max_stop"),
    F.min(duration_s).alias("min_duration_s"),
    F.max(duration_s).alias("max_duration_s"),
)
display(time_range)


In [0]:
# Frequency table of test durations in the joined table: one output row per
# distinct duration value, most frequent first, capped at 20 rows.
step_messages = spark.table("workspace.bronze.step_test_device_messages")

# stop_ts minus start_ts, each cast to long first.
# NOTE(review): assumes both are timestamp columns so the result is seconds — confirm.
test_duration = (
    F.col("stop_ts").cast("long") - F.col("start_ts").cast("long")
).alias("duration_s")

duration_counts = step_messages.select(test_duration).groupBy("duration_s").count()
most_common_first = duration_counts.orderBy(F.col("count").desc())
display(most_common_first.limit(20))


In [0]:
from pyspark.sql import functions as F

# Single-row summary of the joined table: total rows plus several distinct counts.
joined = spark.table("workspace.bronze.step_test_device_messages")

# NOTE(review): a multi-column countDistinct counts distinct column combinations;
# Spark's COUNT(DISTINCT a, b, ...) is documented to skip rows where any of the
# listed columns is NULL — confirm that is acceptable for these keys.
summary_metrics = [
    F.count("*").alias("rows"),
    F.countDistinct("device_id").alias("distinct_devices"),
    F.countDistinct("customer").alias("distinct_customers"),
    # A "test" is identified here by its (start_ts, stop_ts, device_id) triple.
    F.countDistinct("start_ts", "stop_ts", "device_id").alias("distinct_tests"),
    # A "message" is identified here by its (device_id, event_ts, message) triple.
    F.countDistinct("device_id", "event_ts", "message").alias("approx_distinct_messages"),
]

display(joined.agg(*summary_metrics))


Easy: importing the parquet files and creating the cleaned views was straightforward once the schema location was correct. Confusing: test_time looked like an epoch timestamp, but converting it produced dates in 1970, so I had to verify which fields held real timestamps. Ethics risk: linking device telemetry with step-test data can reveal sensitive behavior, so access controls and minimizing the exposed fields matter.