In [1]:
# ============================================================
# Sliding-Window Feature Extraction — PySpark
# ============================================================
# Reads the cleaned PAMAP2 parquet produced by data_ingestion,
# segments every (subject, activity) stream into 5-second
# windows, and computes per-window statistics:
#   - mean, std, min, max   for every sensor column
#   - Signal Magnitude Area (SMA) for each triaxial sensor group
# Output: one row per window → ready for classification.
# ============================================================

import os

from pyspark.sql import SparkSession, Window
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.functions import (
    col, lit, floor, abs as spark_abs,
    count,
    mean   as F_mean,
    stddev as F_stddev,
    min    as F_min,
    max    as F_max,
)

# Local Spark session using every available core; 8 shuffle partitions
# instead of Spark's default because the job runs on a single machine.
spark = (
    SparkSession.builder
    .appName("PAMAP2_FeatureEngineering")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("WARN")

# ── Load cleaned parquet from ingestion pipeline ─────────────
# The location can be overridden with the PAMAP2_CLEAN_PATH environment
# variable so the notebook is runnable on machines other than the
# author's; when the variable is unset the original path is used.
INPUT_PATH = os.environ.get(
    "PAMAP2_CLEAN_PATH",
    r"C:/Users/johnu/Desktop/BigDataProject/data/pamap2_clean.parquet",
)

df = spark.read.parquet(INPUT_PATH)
print(f"Loaded {df.count():,} rows  ×  {len(df.columns)} columns")
df.printSchema()

Loaded 2,724,953 rows  ×  44 columns
root
 |-- timestamp: double (nullable = true)
 |-- activity_id: integer (nullable = true)
 |-- heart_rate: double (nullable = true)
 |-- hand_temperature: double (nullable = true)
 |-- hand_acc_16g_x: double (nullable = true)
 |-- hand_acc_16g_y: double (nullable = true)
 |-- hand_acc_16g_z: double (nullable = true)
 |-- hand_acc_6g_x: double (nullable = true)
 |-- hand_acc_6g_y: double (nullable = true)
 |-- hand_acc_6g_z: double (nullable = true)
 |-- hand_gyro_x: double (nullable = true)
 |-- hand_gyro_y: double (nullable = true)
 |-- hand_gyro_z: double (nullable = true)
 |-- hand_mag_x: double (nullable = true)
 |-- hand_mag_y: double (nullable = true)
 |-- hand_mag_z: double (nullable = true)
 |-- chest_temperature: double (nullable = true)
 |-- chest_acc_16g_x: double (nullable = true)
 |-- chest_acc_16g_y: double (nullable = true)
 |-- chest_acc_16g_z: double (nullable = true)
 |-- chest_acc_6g_x: double (nullable = true)
 |-- chest_acc_6g_y

In [2]:
# ============================================================
# 1. Define column groups and window parameters
# ============================================================

WINDOW_SEC = 5.0        # window length in seconds
SAMPLE_HZ  = 100        # IMU sampling rate
EXPECTED_SAMPLES = int(WINDOW_SEC * SAMPLE_HZ)        # samples in a completely full window
MIN_SAMPLES      = int(EXPECTED_SAMPLES * 0.5)         # windows below 50 % fill are rejected later

# ── Sensor columns to aggregate ──────────────────────────────
# Everything that is a DoubleType sensor reading (not metadata)
META_COLS = {"timestamp", "activity_id", "subject_id", "session_type"}

sensor_cols = sorted([
    c for c in df.columns
    if c not in META_COLS
    and isinstance(df.schema[c].dataType, DoubleType)
])

# ── Triaxial groups for SMA ──────────────────────────────────
# SMA = (1/n) * Σ (|x| + |y| + |z|)   over the window
# One SMA value per triaxial sensor × body location
IMU_LOCATIONS = ["hand", "chest", "ankle"]
TRIAXIAL_SENSORS = ["acc_16g", "acc_6g", "gyro", "mag"]

# Each entry is (group_name, x_column, y_column, z_column).
triaxial_groups = [
    (f"{loc}_{sensor}", f"{loc}_{sensor}_x", f"{loc}_{sensor}_y", f"{loc}_{sensor}_z")
    for loc in IMU_LOCATIONS
    for sensor in TRIAXIAL_SENSORS
]

# The axis column names above are synthesised from templates, so check
# them against the real schema now: a mismatch would otherwise only
# surface later, inside the large aggregation.
_available_cols = set(df.columns)
_missing_axes = sorted({
    axis
    for _, *axes in triaxial_groups
    for axis in axes
    if axis not in _available_cols
})
if _missing_axes:
    raise ValueError(f"Triaxial sensor columns missing from input: {_missing_axes}")

PREVIEW_GROUPS = 4      # how many triaxial groups to list in the summary below

print(f"Window        : {WINDOW_SEC}s  ({EXPECTED_SAMPLES} samples @ {SAMPLE_HZ} Hz)")
print(f"Min samples   : {MIN_SAMPLES}  (discard shorter windows)")
print(f"Sensor cols   : {len(sensor_cols)}")
print(f"Triaxial SMA  : {len(triaxial_groups)} groups")
for name, x, y, z in triaxial_groups[:PREVIEW_GROUPS]:
    print(f"   {name}  →  {x}, {y}, {z}")
# Only mention a remainder when there is one (avoids a negative count
# if the group list is ever shortened below the preview size).
_hidden_groups = len(triaxial_groups) - PREVIEW_GROUPS
if _hidden_groups > 0:
    print(f"   ... ({_hidden_groups} more)")

Window        : 5.0s  (500 samples @ 100 Hz)
Min samples   : 250  (discard shorter windows)
Sensor cols   : 40
Triaxial SMA  : 12 groups
   hand_acc_16g  →  hand_acc_16g_x, hand_acc_16g_y, hand_acc_16g_z
   hand_acc_6g  →  hand_acc_6g_x, hand_acc_6g_y, hand_acc_6g_z
   hand_gyro  →  hand_gyro_x, hand_gyro_y, hand_gyro_z
   hand_mag  →  hand_mag_x, hand_mag_y, hand_mag_z
   ... (8 more)


In [3]:
# ============================================================
# 2. Assign each row to a 5-second window
# ============================================================
# Windows are computed *within* each (subject, activity) segment
# so that no window ever straddles two different activities.
#
#   window_id = floor( (t − t_min) / WINDOW_SEC )
#
# t_min is the earliest timestamp in each segment. It is attached
# to every row with a Spark window function (min over the
# partition), so no separate aggregate + join is required.
# ============================================================

# All rows of one subject performing one activity form a segment.
seg_window = Window.partitionBy("subject_id", "activity_id")

df_win = (
    df
    # Segment start time, repeated on every row of the segment.
    .withColumn("_t0", F_min("timestamp").over(seg_window))
    # Zero-based index of the WINDOW_SEC-long slot the row falls into.
    .withColumn(
        "window_id",
        floor((col("timestamp") - col("_t0")) / lit(WINDOW_SEC)).cast("long"),
    )
    .drop("_t0")
)

# Quick look: how many windows per activity
# NOTE: max_window_id is the highest window index reached by any
# subject for that activity, not the total number of windows.
print("=== Windows per activity (sample) ===")
(
    df_win
    .groupBy("activity_id")
    .agg(
        count("*").alias("rows"),
        F_max("window_id").alias("max_window_id"),
    )
    .orderBy("activity_id")
    .show(20, truncate=False)
)

=== Windows per activity (sample) ===


+-----------+------+-------------+
|activity_id|rows  |max_window_id|
+-----------+------+-------------+
|1          |192523|54           |
|2          |185188|57           |
|3          |189931|51           |
|4          |238761|67           |
|5          |98199 |49           |
|6          |164600|50           |
|7          |188107|59           |
|9          |83646 |167          |
|10         |309935|221          |
|11         |54519 |109          |
|12         |117216|156          |
|13         |104944|133          |
|16         |175353|48           |
|17         |238690|75           |
|18         |99878 |54           |
|19         |187188|108          |
|20         |46915 |57           |
|24         |49360 |26           |
+-----------+------+-------------+



In [4]:
# ============================================================
# 3. Build aggregation expressions
# ============================================================
# For every sensor column → mean, std, min, max   (4 features)
# For every triaxial group → SMA                  (1 feature)
# Plus a sample count per window for quality gating.
#
# Everything is expressed as a single list of Column objects
# so the entire extraction runs in ONE groupBy().agg() pass.
# ============================================================

# ── 3a. Per-sensor statistics ────────────────────────────────
# For each sensor column, emit mean / std / min / max in that order.
_STAT_FUNCS = (
    ("mean", F_mean),
    ("std",  F_stddev),
    ("min",  F_min),
    ("max",  F_max),
)
per_sensor_stats = [
    stat_fn(col(sensor)).alias(f"{sensor}_{suffix}")
    for sensor in sensor_cols
    for suffix, stat_fn in _STAT_FUNCS
]

# ── 3b. Signal Magnitude Area (SMA) per triaxial group ──────
# SMA = mean( |x| + |y| + |z| )  over the window
sma_exprs = [
    F_mean(
        spark_abs(col(x_axis)) + spark_abs(col(y_axis)) + spark_abs(col(z_axis))
    ).alias(f"{group}_sma")
    for group, x_axis, y_axis, z_axis in triaxial_groups
]

# Final expression list: the per-window row count (used by the quality
# filter) comes first, followed by the statistics and then the SMAs.
agg_exprs = [
    count("*").alias("sample_count"),
    *per_sensor_stats,
    *sma_exprs,
]

stat_features  = len(per_sensor_stats)
sma_features   = len(sma_exprs)
total_features = stat_features + sma_features

print(f"Stat features (mean/std/min/max) : {stat_features}  ({len(sensor_cols)} cols × 4)")
print(f"SMA features                     : {sma_features}")
print(f"Total feature columns            : {total_features}")
print(f"Aggregation expressions built    : {len(agg_exprs)} (incl. sample_count)")

Stat features (mean/std/min/max) : 160  (40 cols × 4)
SMA features                     : 12
Total feature columns            : 172
Aggregation expressions built    : 173 (incl. sample_count)


In [5]:
# ============================================================
# 4. Execute the windowed aggregation
# ============================================================
# Group keys: subject_id, activity_id, window_id
# → one output row per 5-second window.
# ============================================================

GROUP_KEYS = ["subject_id", "activity_id", "window_id"]

# One output row per (subject, activity, window). The result is small
# but expensive to derive, and it is consumed by several separate
# actions (the count and summary below, plus the filtering, preview
# and save steps that follow). Caching it means the full groupBy over
# the raw samples runs once instead of once per action.
df_features_raw = (
    df_win
    .groupBy(*GROUP_KEYS)
    .agg(*agg_exprs)
    .cache()
)

# The first action materialises the cache.
print(f"Raw feature rows   : {df_features_raw.count():,}")
print(f"Output columns     : {len(df_features_raw.columns)}")

# Show distribution of sample counts (how full are the windows?)
print("\n=== Sample count distribution across windows ===")
df_features_raw.select("sample_count").summary("min", "25%", "50%", "75%", "max").show()

Raw feature rows   : 5,525
Output columns     : 176

=== Sample count distribution across windows ===


+-------+------------+
|summary|sample_count|
+-------+------------+
|    min|           1|
|    25%|         500|
|    50%|         500|
|    75%|         500|
|    max|         501|
+-------+------------+



In [6]:
# ============================================================
# 5. Quality gate — drop incomplete windows
# ============================================================
# A full 5 s window at 100 Hz = 500 samples.
# Discard any window with < 250 samples (< 50 % full).
# These occur at activity boundaries or due to sensor dropout.
# Also drop the helper columns (window_id, sample_count).
# ============================================================

# Keep only windows holding at least MIN_SAMPLES rows, then strip the
# two bookkeeping columns that are not features.
has_enough_samples = col("sample_count") >= MIN_SAMPLES
df_features = (
    df_features_raw
    .where(has_enough_samples)
    .drop("window_id", "sample_count")
)

# Report how many windows survived the filter and how many were cut.
remaining = df_features.count()
dropped   = df_features_raw.count() - remaining
n_final_cols = len(df_features.columns)

print(f"Windows kept    : {remaining:,}")
print(f"Windows dropped : {dropped:,}  (< {MIN_SAMPLES} samples)")
print(
    f"Final columns   : {n_final_cols}  "
    f"({n_final_cols - 2} features + subject_id + activity_id)"
)

# ── Class balance after windowing ────────────────────────────
print("\n=== Activity distribution (windowed) ===")
windows_per_activity = (
    df_features
    .groupBy("activity_id")
    .count()
    .orderBy("activity_id")
)
windows_per_activity.show(25, truncate=False)

Windows kept    : 5,447
Windows dropped : 78  (< 250 samples)
Final columns   : 174  (172 features + subject_id + activity_id)

=== Activity distribution (windowed) ===


+-----------+-----+
|activity_id|count|
+-----------+-----+
|1          |384  |
|2          |372  |
|3          |379  |
|4          |477  |
|5          |196  |
|6          |328  |
|7          |376  |
|9          |167  |
|10         |620  |
|11         |109  |
|12         |234  |
|13         |210  |
|16         |351  |
|17         |477  |
|18         |200  |
|19         |373  |
|20         |94   |
|24         |100  |
+-----------+-----+



In [7]:
# ============================================================
# 6. Sanity check — peek at a few feature rows
# ============================================================

# Pick a narrow, readable slice of the wide feature table: the two id
# columns, the first six hand 16 g accelerometer features and the first
# three SMA features (in the frame's own column order).
id_cols       = ["subject_id", "activity_id"]
hand_acc_cols = [name for name in df_features.columns if "hand_acc_16g" in name]
sma_cols      = [name for name in df_features.columns if "_sma" in name]

sample_cols = id_cols + hand_acc_cols[:6] + sma_cols[:3]

print("=== Sample feature values (hand accelerometer 16 g) ===")
df_features.select(sample_cols).show(8, truncate=False)

=== Sample feature values (hand accelerometer 16 g) ===


+----------+-----------+-------------------+---------------------+------------------+------------------+-------------------+---------------------+------------------+------------------+------------------+
|subject_id|activity_id|hand_acc_16g_x_mean|hand_acc_16g_x_std   |hand_acc_16g_x_min|hand_acc_16g_x_max|hand_acc_16g_y_mean|hand_acc_16g_y_std   |hand_acc_16g_sma  |hand_acc_6g_sma   |hand_gyro_sma     |
+----------+-----------+-------------------+---------------------+------------------+------------------+-------------------+---------------------+------------------+------------------+------------------+
|103       |3          |0.5844499633268498 |0.004180867909264736 |0.5712149568566967|0.5940995307758121|0.4317561757384165 |0.004511962930591272 |1.4372349472376116|1.5285712471133523|1.413904558684545 |
|103       |3          |0.5774536395759716 |0.004354096668627515 |0.5613958172672966|0.5871381161205174|0.42870209322215114|0.005108805774356302 |1.448109255896683 |1.5547332175270807|

In [8]:
# ============================================================
# 7. Save windowed features as Parquet (partitioned by subject)
# ============================================================

# Destination of the windowed feature table. Can be overridden with the
# PAMAP2_FEATURES_PATH environment variable so the notebook is runnable
# on other machines; when unset, the original path is used.
OUTPUT_PATH = os.environ.get(
    "PAMAP2_FEATURES_PATH",
    r"C:/Users/johnu/Desktop/BigDataProject/data/pamap2_features.parquet",
)

(
    df_features
    # Shuffle by subject before writing so each subject's rows are
    # written by a single task into its partition directory.
    .repartition("subject_id")
    .write
    .mode("overwrite")            # replaces any previous output at OUTPUT_PATH
    .partitionBy("subject_id")    # one sub-directory per subject
    .parquet(OUTPUT_PATH)
)

# ── Verify ───────────────────────────────────────────────────
# Read the data back from disk and report its size and schema.
df_verify = spark.read.parquet(OUTPUT_PATH)
print(f"Saved {df_verify.count():,} windowed feature rows")
print(f"Columns         : {len(df_verify.columns)}")
print(f"Partitions      : {df_verify.select('subject_id').distinct().count()} subjects")
print(f"\nFeature schema (first 20 fields):")
for f in df_verify.schema.fields[:20]:
    print(f"  {f.name:40s}  {f.dataType.simpleString()}")

Saved 5,447 windowed feature rows
Columns         : 174


Partitions      : 9 subjects

Feature schema (first 20 fields):
  activity_id                               int
  ankle_acc_16g_x_mean                      double
  ankle_acc_16g_x_std                       double
  ankle_acc_16g_x_min                       double
  ankle_acc_16g_x_max                       double
  ankle_acc_16g_y_mean                      double
  ankle_acc_16g_y_std                       double
  ankle_acc_16g_y_min                       double
  ankle_acc_16g_y_max                       double
  ankle_acc_16g_z_mean                      double
  ankle_acc_16g_z_std                       double
  ankle_acc_16g_z_min                       double
  ankle_acc_16g_z_max                       double
  ankle_acc_6g_x_mean                       double
  ankle_acc_6g_x_std                        double
  ankle_acc_6g_x_min                        double
  ankle_acc_6g_x_max                        double
  ankle_acc_6g_y_mean                       double
  ankle_acc_6g_y_std 