In [0]:
import os

# Report where the notebook is running from.
print("CWD:", os.getcwd())

# The project root is taken to be the parent of the current working directory,
# and the ETL code is expected in an "etl_pipeline" folder directly under it.
project_root = os.path.dirname(os.getcwd())
etl_dir = os.path.join(project_root, "etl_pipeline")
print("Project root:", project_root)

# Check for the folder once and reuse the answer for both the report and the listing.
etl_dir_found = os.path.exists(etl_dir)
print("etl_pipeline dir exists?", etl_dir_found)
if etl_dir_found:
    print("Files in etl_pipeline:", os.listdir(etl_dir))


In [0]:
import json, os, glob

# Substrings that indicate a notebook contains feature-pipeline / training code.
keywords = [
    "Pipeline(", "ColumnTransformer", "joblib.dump", "X_train_transformed",
    "fit_transform", "train_test_split", "stedi_feature_pipeline"
]


def find_keyword_hits(path, keywords):
    """Return the keywords found in a notebook's code and markdown cells.

    Parameters
    ----------
    path : str
        Path to a notebook file stored as UTF-8 JSON.
    keywords : list of str
        Substrings to look for.

    Returns
    -------
    list of str
        The entries of ``keywords`` that occur in the combined cell source,
        in the same order as ``keywords``. Empty when nothing matches.

    Raises
    ------
    OSError, json.JSONDecodeError
        If the file cannot be opened or is not valid JSON.
    """
    # The context manager closes the file handle even when JSON parsing fails;
    # a bare json.load(open(...)) leaves one handle open per scanned notebook.
    with open(path, "r", encoding="utf-8") as fh:
        nb = json.load(fh)

    # A cell's "source" may be a single string or a list of lines; "".join
    # yields the full cell text in both cases. Other cell types are skipped.
    text = "\n".join(
        "".join(cell.get("source", "")) for cell in nb.get("cells", [])
        if cell.get("cell_type") in ("code", "markdown")
    )
    return [k for k in keywords if k in text]


# Scan every notebook in the current directory. The scan is best-effort: an
# unreadable or malformed notebook is reported and the loop moves on.
for path in sorted(glob.glob("*.ipynb")):
    try:
        hits = find_keyword_hits(path, keywords)
    except Exception as e:
        print(path, "-> couldn't read:", e)
    else:
        if hits:
            print(path, "->", hits)


In [0]:
from pyspark.sql import SparkSession
import pandas as pd

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Read the labeled step-test table and convert it to a pandas DataFrame
# so the rest of the notebook can work with it in pandas.
df = (
    spark
    .table("silver.labeled_step_test")
    .toPandas()
)

# Preview the first rows.
df.head()


In [0]:
# Display the column labels of the loaded DataFrame.
df.columns

In [0]:
from sklearn.model_selection import train_test_split

# Column roles: the target label, then numeric and categorical model inputs.
label_col = "step_label"
feature_cols_numeric = ["distance_cm"]
feature_cols_categorical = ["sensor_type", "device_id"]

# Target series and feature frame (numeric columns first, then categorical).
y = df[label_col]
X = df[[*feature_cols_numeric, *feature_cols_categorical]]

# Hold out 20% for testing. The split is stratified on the label and seeded,
# so it is the same on every run.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Report split sizes and the label proportions in each split.
print("Train:", X_train_raw.shape, "Test:", X_test_raw.shape)
for header, labels in (("Train label %:\n", y_train), ("Test label %:\n", y_test)):
    print(header, labels.value_counts(normalize=True))


In [0]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Per-column-type preprocessing: standardize numeric features and one-hot
# encode categorical ones. handle_unknown="ignore" keeps transform from
# failing on categories that were not present when the encoder was fit.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
numeric_transformer = StandardScaler()

# Route each group of columns to its transformer.
column_steps = [
    ("num", numeric_transformer, feature_cols_numeric),
    ("cat", categorical_transformer, feature_cols_categorical),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Single-step pipeline wrapping the preprocessing.
pipeline = Pipeline(steps=[("preprocess", preprocessor)])

# Fit on the training split only, then apply the fitted transform to the test split.
X_train = pipeline.fit_transform(X_train_raw)
X_test = pipeline.transform(X_test_raw)

print("Transformed shapes:", X_train.shape, X_test.shape)
print("Type:", type(X_train))


In [0]:
from sklearn.linear_model import LogisticRegression

# Baseline 1: logistic regression with the iteration cap set to 300.
# fit() returns the estimator itself, so construction and fitting are chained.
log_reg = LogisticRegression(max_iter=300).fit(X_train, y_train)

# Score on the held-out test split; the bare name displays it as the cell output.
log_reg_score = log_reg.score(X_test, y_test)
log_reg_score


In [0]:
from sklearn.ensemble import RandomForestClassifier

# Baseline 2: random forest with default hyperparameters.
# Tree construction is randomized, so without a seed the test score changes
# from run to run. Seeding with 42 (the same seed used for the train/test
# split) makes the reported score reproducible on Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Score on the held-out test split; the bare name displays it as the cell output.
rf_score = rf.score(X_test, y_test)
rf_score


In [0]:
# Collect the baseline test scores under human-readable model names.
results = dict(
    [
        ("Logistic Regression baseline", log_reg_score),
        ("Random Forest baseline", rf_score),
    ]
)
results


#Markdown
Logistic Regression performed slightly better than Random Forest (0.9511 vs 0.9508 accuracy), though the difference is tiny. For noisy sensor data, Logistic Regression can be more stable because it’s simpler and less likely to overfit, while Random Forest can chase noise even if it sometimes captures complex patterns better. I’m wondering why the scores are so close. Does the heavy class imbalance (about 95% “step”) make accuracy less meaningful? A model that always predicts “step” would already score roughly 0.95, so both models may be doing little better than that majority-class baseline. How do the models compare specifically on the minority “no_step” cases? I also wonder whether tuning Random Forest parameters (like depth or number of trees) would change the result. Testing matters before real use because wrong predictions could affect people wearing the device (missed steps or false alerts) and anyone relying on the data (researchers, clinicians, or decision-makers). Fairness matters because models can silently perform worse on minority cases or certain devices/sensor types. Discipleship likewise requires applying principles consistently and justly rather than unevenly.