In [0]:
import os, joblib
import numpy as np
from scipy.sparse import issparse

# Artifacts are read from the "etl_pipeline" folder that sits next to the
# current working directory (i.e. under the parent of os.getcwd()).
project_root = os.path.dirname(os.getcwd())
load_dir = os.path.join(project_root, "etl_pipeline")


def _load_artifact(filename):
    """Load one joblib-serialized artifact stored directly under ``load_dir``.

    joblib.load unpickles its input, so only point this at trusted files.
    """
    return joblib.load(os.path.join(load_dir, filename))


# Fitted feature pipeline plus the already-transformed train/test splits.
pipeline = _load_artifact("stedi_feature_pipeline.pkl")
X_train_transformed = _load_artifact("X_train_transformed.pkl")
X_test_transformed = _load_artifact("X_test_transformed.pkl")
y_train = _load_artifact("y_train.pkl")
y_test = _load_artifact("y_test.pkl")

def to_float_matrix(arr: np.ndarray) -> np.ndarray:
    """Return ``arr`` as a dense NumPy array of dtype float.

    Accepted inputs:

    * a SciPy sparse matrix -- densified with ``toarray()``;
    * a 0-d array wrapping a single object -- the object is unwrapped with
      ``item()`` and densified if it is sparse;
    * an object-dtype array -- every element along the first axis is
      converted (sparse -> dense, anything else via
      ``np.array(x, dtype=float)``) and the pieces are stacked with
      ``np.vstack``;
    * any other array-like -- converted with ``np.array(arr, dtype=float)``.

    Args:
        arr: The container to convert.

    Returns:
        A dense ``np.ndarray`` with a float dtype.

    Raises:
        ValueError: Propagated from NumPy when values cannot be converted to
            float or the pieces of an object array cannot be stacked.
    """
    # Test for sparsity first so the branch does not depend on sparse objects
    # exposing ``.ndim`` / ``.dtype``; cast so integer sparse input also comes
    # back as float.
    if issparse(arr):
        return arr.toarray().astype(float, copy=False)

    # No-op for ndarrays; lets plain array-likes (e.g. nested lists) through.
    arr = np.asarray(arr)

    if arr.ndim == 0:
        # A 0-d array is just a wrapper around one Python object.
        inner = arr.item()
        if issparse(inner):
            inner = inner.toarray()
        return np.array(inner, dtype=float)

    if arr.dtype == object:
        pieces = [
            x.toarray() if issparse(x) else np.array(x, dtype=float)
            for x in arr
        ]
        # Stack the list directly: wrapping it in np.array first fails when
        # the pieces have different numbers of rows, and only adds a copy.
        return np.vstack(pieces).astype(float, copy=False)

    return np.array(arr, dtype=float)

# Densify both feature matrices, then flatten both target vectors to 1-D.
X_train, X_test = (
    to_float_matrix(features)
    for features in (X_train_transformed, X_test_transformed)
)
y_train, y_test = (np.ravel(target) for target in (y_train, y_test))

# Cell output: shapes of the four arrays used for modelling.
X_train.shape, X_test.shape, y_train.shape, y_test.shape


In [0]:
import numpy as np
from scipy.sparse import issparse

# NOTE(review): this cell redefines to_float_matrix, which the loading cell
# earlier in the notebook already defines; the later definition silently
# shadows the earlier one. Consider keeping a single definition.
def to_float_matrix(arr: np.ndarray) -> np.ndarray:
    """Return ``arr`` as a dense NumPy array of dtype float.

    Accepted inputs:

    * a SciPy sparse matrix -- densified with ``toarray()``;
    * a 0-d array wrapping a single object -- the object is unwrapped with
      ``item()`` and densified if it is sparse;
    * an object-dtype array -- every element along the first axis is
      converted (sparse -> dense, anything else via
      ``np.array(x, dtype=float)``) and the pieces are stacked with
      ``np.vstack``;
    * any other array-like -- converted with ``np.array(arr, dtype=float)``.

    Args:
        arr: The container to convert.

    Returns:
        A dense ``np.ndarray`` with a float dtype.

    Raises:
        ValueError: Propagated from NumPy when values cannot be converted to
            float or the pieces of an object array cannot be stacked.
    """
    # Test for sparsity first so the branch does not depend on sparse objects
    # exposing ``.ndim`` / ``.dtype``; cast so integer sparse input also comes
    # back as float.
    if issparse(arr):
        return arr.toarray().astype(float, copy=False)

    # No-op for ndarrays; lets plain array-likes (e.g. nested lists) through.
    arr = np.asarray(arr)

    if arr.ndim == 0:
        # A 0-d array is just a wrapper around one Python object.
        inner = arr.item()
        if issparse(inner):
            inner = inner.toarray()
        return np.array(inner, dtype=float)

    if arr.dtype == object:
        pieces = [
            x.toarray() if issparse(x) else np.array(x, dtype=float)
            for x in arr
        ]
        # Stack the list directly: wrapping it in np.array first fails when
        # the pieces have different numbers of rows, and only adds a copy.
        return np.vstack(pieces).astype(float, copy=False)

    return np.array(arr, dtype=float)

# Dense float feature matrices for the train and test splits.
X_train, X_test = map(to_float_matrix, (X_train_transformed, X_test_transformed))

# Targets flattened to 1-D (a no-op when they are already 1-D).
y_train, y_test = map(np.ravel, (y_train, y_test))

# Cell output: shapes of the four arrays used for modelling.
X_train.shape, X_test.shape, y_train.shape, y_test.shape


In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Search space: L2-penalised logistic regression over four values of C and
# two solvers (8 candidates in total).
log_reg_params = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=["l2"],
    solver=["lbfgs", "liblinear"],
)

# Exhaustive search scored by accuracy with 3-fold cross-validation.
log_reg_grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=300),
    param_grid=log_reg_params,
    scoring="accuracy",
    cv=3,
)
log_reg_grid.fit(X_train, y_train)

# Keep the winning configuration and its mean CV score for later comparison.
log_reg_best_params, log_reg_best_score = (
    log_reg_grid.best_params_,
    log_reg_grid.best_score_,
)

log_reg_best_params, log_reg_best_score


In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Search space: 3 forest sizes x 4 depth limits x 2 split minimums x 2 leaf
# minimums (48 candidates in total).
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Exhaustive search scored by accuracy with 3-fold cross-validation;
# n_jobs=-1 runs the candidate fits in parallel, and the fixed random_state
# keeps the forests reproducible.
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

# Keep the winning configuration and its mean CV score for later comparison.
rf_best_params, rf_best_score = rf_grid.best_params_, rf_grid.best_score_

rf_best_params, rf_best_score


In [0]:
# Best mean cross-validated accuracy of each tuned model, keyed by display name.
results = dict(
    [
        ("Logistic Regression (tuned)", log_reg_best_score),
        ("Random Forest (tuned)", rf_best_score),
    ]
)
results


In [0]:
# Pick the winner by best_score_. The random forest is chosen only when it is
# strictly better; a tie falls through to logistic regression.
best_model_name, best_model = (
    ("Random Forest", rf_grid.best_estimator_)
    if rf_best_score > log_reg_best_score
    else ("Logistic Regression", log_reg_grid.best_estimator_)
)

best_model_name, best_model

In [0]:
import os, joblib

# Persist the selected model into the "etl_pipeline" folder under the parent
# of the current working directory.
project_root = os.path.dirname(os.getcwd())
save_dir = os.path.join(project_root, "etl_pipeline")
model_path = os.path.join(save_dir, "stedi_best_model.pkl")

joblib.dump(best_model, model_path)

# Confirm where the model went and what the folder now holds.
print("Saved best model to:", model_path)
print("Folder contains:", os.listdir(save_dir))

In [0]:
from sklearn.metrics import confusion_matrix

# `lr_pred` is not assigned anywhere in this notebook, so on a fresh kernel
# (Restart & Run All) this cell raised NameError. Compute the test-set
# predictions here from the tuned logistic regression; GridSearchCV refits
# the best configuration on the full training data by default, which is what
# best_estimator_ exposes.
# NOTE(review): assumes lr_pred was originally produced by this tuned
# estimator on X_test -- confirm against the cell that was removed.
lr_pred = log_reg_grid.best_estimator_.predict(X_test)

print(confusion_matrix(y_test, lr_pred))
# rows = true [no_step, step], cols = predicted [no_step, step]
# NOTE(review): confusion_matrix orders classes by sorted label value; the
# no_step/step order above presumes that matches the label encoding -- verify.

#Markdown
Both tuned models performed the same. Logistic Regression reached a best 3-fold CV accuracy of 0.9511214840660257 with C=0.01, penalty=l2, solver=lbfgs, and Random Forest also reached 0.9511214840660257 with max_depth=5, n_estimators=50, min_samples_split=2, min_samples_leaf=1. On the test set, both achieved 0.9511166072597658 accuracy, so I selected Logistic Regression as the final model since it matches performance while being simpler to interpret and reuse later.

That said, accuracy is misleading here because the classes are imbalanced. The model predicted “step” for every test row (confusion matrix [[0, 2016], [0, 39225]]), giving 0.00 recall for the no_step class. Its test accuracy is therefore exactly the share of “step” rows in the test set (39225 / 41241 ≈ 0.9511), which is what a trivial always-“step” baseline would score. With more time, I would tune using a metric like balanced accuracy or macro F1 and test class_weight="balanced" to improve minority-class detection. Hyperparameter tuning can unintentionally increase unfairness when it optimizes a metric that rewards majority-class performance. Transparency matters because it forces honest reporting of what the model actually does, aligning with the gospel principle of truthful evaluation and steady improvement.