In [0]:
# Load the two raw source tables and take a first look at their columns and rows.
# The tables are read by fully qualified name from the `bronze` schema, using the
# same plural table names the later load cell in this notebook reads, so this
# cell does not depend on whichever schema happens to be current.
df_device_raw = spark.table("bronze.device_messages_raw")
df_steps_raw  = spark.table("bronze.rapid_step_tests_raw")

print("device_messages_raw columns:")
print(df_device_raw.columns)

print("\nrapid_step_tests_raw columns:")
print(df_steps_raw.columns)

# Preview only the first rows; never render the full tables.
display(df_device_raw.limit(10))
display(df_steps_raw.limit(10))


In [0]:
%sql
-- List the schemas (databases) visible to this session.
SHOW SCHEMAS;



In [0]:
%sql
-- Make `bronze` the current schema for this session, then list the tables in it.
USE bronze;
SHOW TABLES;


In [0]:
# Switch the session to the bronze schema so the unqualified names below resolve there.
spark.sql("USE bronze")

df_device_raw = spark.table("device_messages_raw")
df_steps_raw = spark.table("rapid_step_tests_raw")

# Print each table's column list under its own heading.
print("device_messages_raw columns:", df_device_raw.columns, sep="\n")
print("\nrapid_step_tests_raw columns:", df_steps_raw.columns, sep="\n")

# Render a ten-row preview of each raw table.
for raw_frame in (df_device_raw, df_steps_raw):
    display(raw_frame.limit(10))


In [0]:
from pyspark.sql.functions import col, regexp_extract

# `distance` holds text such as "75cm": capture the first run of digits and
# cast it to an integer.
distance_cm_expr = regexp_extract(col("distance"), r"(\d+)", 1).cast("int")

# Columns carried forward for labeling and for the final output.
prepped_columns = ["timestamp", "device_id", "sensor_type", "distance_cm"]

df_device_prepped = (
    df_device_raw
    .withColumn("distance_cm", distance_cm_expr)
    .select(*prepped_columns)
)

display(df_device_prepped.limit(10))


In [0]:
from pyspark.sql.functions import col, when, lit

# Keep only the window columns we need; both bounds are cast to long before
# they are compared with the reading timestamp.
df_steps_window = df_steps_raw.select(
    "device_id",
    col("start_time").cast("long").alias("start_time"),
    col("stop_time").cast("long").alias("stop_time")
)

# A reading belongs to a step test when a window for the same device contains
# its timestamp (BETWEEN is inclusive on both ends).
# NOTE(review): assumes `timestamp` uses the same unit as the long-cast
# start/stop times — confirm against the bronze schemas.
readings = df_device_prepped.alias("d")
windows = df_steps_window.alias("s")
in_window = (
    (col("d.device_id") == col("s.device_id")) &
    col("d.timestamp").between(col("s.start_time"), col("s.stop_time"))
)

# Label each sensor reading as step/no_step if its timestamp falls inside ANY
# step-test window. Semi/anti joins test only for the existence of a matching
# window, so every reading is emitted exactly once. A plain left join would
# repeat a reading once per matching window (overlapping or duplicated test
# rows) and inflate the "step" counts.
df_in_window = (
    readings
    .join(windows, in_window, "left_semi")
    .withColumn("step_label", lit("step"))
    .withColumn("source", lit("step"))
)

# Readings with no matching window (including those with a null device_id or
# timestamp) fall through to the anti join and are labeled no_step/device.
df_outside_window = (
    readings
    .join(windows, in_window, "left_anti")
    .withColumn("step_label", lit("no_step"))
    .withColumn("source", lit("device"))
)

df_labeled = (
    df_in_window
    .unionByName(df_outside_window)
    .select(
        "timestamp",
        "device_id",
        "sensor_type",
        "distance_cm",
        "step_label",
        "source"
    )
)

display(df_labeled.limit(20))


In [0]:
from pyspark.sql.functions import count

# Sanity check: number of labeled readings per step_label value.
label_counts = df_labeled.groupBy("step_label").agg(count("*").alias("row_count"))

label_counts.orderBy("step_label").show()


In [0]:
# Switch to the silver schema so the unqualified table name below resolves there.
spark.sql("USE silver")

# Overwrite the labeled table. The format is stated explicitly as Delta, matching
# the later write of the same table in this notebook, instead of relying on the
# cluster's default table format.
(df_labeled
 .write
 .format("delta")
 .mode("overwrite")
 .saveAsTable("labeled_step_test")
)

print("Saved table: silver.labeled_step_test")


In [0]:
%sql
-- Remove the labeled table if it exists (no error when it is absent).
DROP TABLE IF EXISTS silver.labeled_step_test;


In [0]:
# Unqualified table names in this cell resolve against the silver schema.
spark.sql("USE silver")

# Replace the table's contents with the labeled readings, stored as Delta.
delta_writer = df_labeled.write.format("delta").mode("overwrite")
delta_writer.saveAsTable("labeled_step_test")

print("Saved table: silver.labeled_step_test")


In [0]:
%sql
-- Row counts per label in the persisted table. Ordered by label so the output
-- is deterministic and lines up with the PySpark count computed before the write.
SELECT step_label, COUNT(*) AS row_count
FROM silver.labeled_step_test
GROUP BY step_label
ORDER BY step_label;


In [0]:
%sql
-- Row counts per source value in the persisted table, ordered so the output
-- is deterministic across runs.
SELECT source, COUNT(*) AS row_count
FROM silver.labeled_step_test
GROUP BY source
ORDER BY source;


In [0]:
%sql
-- Sanity check: show up to 50 rows where either label column is missing.
-- An empty result means every persisted row has both step_label and source set.
SELECT *
FROM silver.labeled_step_test
WHERE step_label IS NULL OR source IS NULL
LIMIT 50;


In [0]:
%sql
-- Preview up to 50 rows of the persisted table (no ORDER BY, so which rows
-- appear is not guaranteed).
SELECT *
FROM silver.labeled_step_test
LIMIT 50;


Are we labeling data fairly? We label a sensor reading “step” only when its timestamp falls inside a recorded Rapid Step Test window (start and stop times inclusive) for the same device_id, and “no_step” otherwise. That rule is consistent and reproducible, but it can still mislabel edge cases if the recorded start/stop times are imprecise, if device clocks drift, or if the user moves without stepping during the window.

Are we protecting identity? The labeled table still includes device_id, which could be identifying if it can be linked back to a person. We do not join to any customer identity tables here, and anything shared outside the team should drop or pseudonymize device_id so that it cannot be mapped to a real-world identity.

Are we avoiding medical claims? Yes. This labeling is about activity during a test window (step vs no_step) and does not diagnose medical conditions or infer health outcomes.