In [0]:
from pyspark.sql import SparkSession
import pandas as pd

# Fully-qualified name of the table that holds the labeled step-test rows.
SOURCE_TABLE = "workspace.silver.labeled_step_test"

# Reuse the session that is already active, or start one if none exists.
spark = SparkSession.builder.getOrCreate()

# Keep the Spark DataFrame handle around, then materialize it as a pandas
# DataFrame for the scikit-learn steps that follow.
df_spark = spark.table(SOURCE_TABLE)
df = df_spark.toPandas()

# Preview the first rows as the cell output.
df.head()

In [0]:
# Show the column index of the pandas frame so the exact column names are visible.
df.columns

In [0]:
# Count rows per step_label value; dropna=False keeps missing labels in the
# tally so they show up as their own row instead of being silently dropped.
df["step_label"].value_counts(dropna=False)

In [0]:
# --- Feature / label configuration -------------------------------------

# Columns that already hold numbers and only need scaling.
feature_cols_numeric = [
    "distance_cm",
]

# Columns that hold text or identifiers and must be encoded before modeling.
feature_cols_categorical = [
    "sensorType",
    "deviceId",
]

# Target column: the value the model is meant to predict.
label_col = "step_label"

In [0]:
def _columns_containing(*fragments):
    """Return the df column names whose lowercase form contains every fragment."""
    return [
        name
        for name in df.columns
        if all(fragment in name.lower() for fragment in fragments)
    ]


# Print the full column index so exact spellings can be compared against the
# configured feature and label names.
print(df.columns)

# List likely candidates for each role using case-insensitive substring matches.
print("\nPossible sensor columns:", _columns_containing("sensor"))
print("Possible device columns:", _columns_containing("device"))
print("Possible label columns:", _columns_containing("step", "label"))
print("Possible distance columns:", _columns_containing("dist"))

In [0]:
from sklearn.model_selection import train_test_split

# Build the feature matrix (numeric columns first, then categorical) and the
# target vector from the pandas frame.
feature_columns = feature_cols_numeric + feature_cols_categorical
X = df[feature_columns]
y = df[label_col]

# Hold out 20% of the rows for testing. The fixed seed makes the split
# repeatable, and stratifying on y keeps the label mix similar in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each feature split.
for split_label, split_frame in (("X_train", X_train), ("X_test", X_test)):
    print(f"{split_label} shape:", split_frame.shape)

# Report the relative label frequencies so the stratification can be checked by eye.
print("\nTrain label distribution:\n", y_train.value_counts(normalize=True))
print("\nTest label distribution:\n", y_test.value_counts(normalize=True))

In [0]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Numeric columns are standardized.
numeric_transformer = StandardScaler()

# Categorical columns are one-hot encoded; categories not seen during fit are
# ignored at transform time instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each group of columns to its transformer: (name, transformer, columns).
column_steps = [
    ("num", numeric_transformer, feature_cols_numeric),
    ("cat", categorical_transformer, feature_cols_categorical),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [0]:
from sklearn.pipeline import Pipeline

# Wrap the column preprocessing in a Pipeline with a single "preprocess" step
# so it can be fitted, applied and saved as one object.
pipeline = Pipeline([("preprocess", preprocessor)])

# Display the pipeline as the cell output.
pipeline

In [0]:
# Fit on the training split only, so no statistics from the test rows leak
# into the scaler or the encoder. fit_transform learns the parameters and
# transforms the training rows in one pass, instead of fitting and then
# walking the training data a second time.
X_train_transformed = pipeline.fit_transform(X_train)

# Apply the already-fitted pipeline to the held-out rows (no refitting).
X_test_transformed = pipeline.transform(X_test)

# Sanity check: both outputs should have the same number of columns. The type
# is printed because the result may be a NumPy array or a SciPy sparse matrix.
print("Transformed train shape:", X_train_transformed.shape)
print("Transformed test shape:", X_test_transformed.shape)
print("Type:", type(X_train_transformed))

In [0]:
import joblib
import os

# Persist the pipeline under /tmp, which the original author notes is always
# writable on the cluster node.
pipeline_path = "/tmp/stedi_feature_pipeline.pkl"
joblib.dump(pipeline, pipeline_path)

# Confirm where the file was written and that it is actually on disk.
print(f"Saved pipeline to: {pipeline_path}")
print(f"File exists: {os.path.exists(pipeline_path)}")

In [0]:
import os
import joblib
import numpy as np
import pandas as pd

# Collect every artifact under one export directory, creating it if needed.
export_dir = "/tmp/exports/model"
os.makedirs(export_dir, exist_ok=True)

# 1) Pipeline object.
joblib.dump(pipeline, f"{export_dir}/stedi_feature_pipeline.pkl")

# 2) Transformed feature matrices, train first and then test.
# NOTE(review): if the transformed output is a SciPy sparse matrix, np.save
# stores it as a pickled object array, so the consumer would need
# np.load(..., allow_pickle=True) -- confirm against the loading code.
for split_name, features in (("train", X_train_transformed), ("test", X_test_transformed)):
    np.save(f"{export_dir}/X_{split_name}_transformed.npy", features, allow_pickle=True)

# 3) Labels, pickled through pandas.
y_train.to_pickle(f"{export_dir}/y_train.pkl")
y_test.to_pickle(f"{export_dir}/y_test.pkl")

# Show the destination and list what ended up in it.
print("Saved files to:", export_dir)
print(os.listdir(export_dir))

#Markdown
A consistent feature pipeline helps prevent unfairness because every row of data is processed the same way, every time. That reduces hidden bias caused by inconsistent scaling or encoding and makes the results easier to audit. It also helps avoid data leakage by fitting preprocessing only on the training set. Doctrine and Covenants 130:20–21 reminds me that reliable outcomes come from consistent principles, which connects to fairness in how we apply rules to everyone.