In [None]:
# ============================================================
# Multi-Class Activity Classification — PySpark MLlib
# ============================================================
# Pipeline: StringIndexer → VectorAssembler → StandardScaler
#           → Classifier
#
# Models : Logistic Regression, Random Forest,
#          Multilayer Perceptron, Linear SVM (OneVsRest)
# Tuning : CrossValidator  (3-fold)
# Metrics: Accuracy, Weighted F1
# ============================================================

from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col

from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    StringIndexer, IndexToString, VectorAssembler, StandardScaler,
)
from pyspark.ml.classification import (
    LogisticRegression,
    RandomForestClassifier,
    MultilayerPerceptronClassifier,
    LinearSVC,
    OneVsRest,
)
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import (
    MulticlassClassificationEvaluator,
)

import time

# -- Spark session ---------------------------------------------
# Settings applied on top of the app name and the local[4] master.
_spark_conf = {
    "spark.driver.memory": "4g",
    "spark.sql.shuffle.partitions": "4",
    "spark.python.worker.reuse": "true",
}

_builder = SparkSession.builder.appName("PAMAP2_ModelTraining").master("local[4]")
for _conf_key, _conf_value in _spark_conf.items():
    _builder = _builder.config(_conf_key, _conf_value)

# Reuses an already-running session if one exists, otherwise starts one.
spark = _builder.getOrCreate()

# Only warnings and errors are written to the Spark log output.
spark.sparkContext.setLogLevel("WARN")
print(f"Spark version : {spark.version}")

In [None]:
# ============================================================
# 1. Load features and prepare label / feature columns
# ============================================================

from pyspark.sql.functions import isnan, when

INPUT_PATH = r"C:/Users/johnu/Desktop/BigDataProject/data/pamap2_features.parquet"

# Keep the untouched frame under its own name so the cleaning step below
# always starts from the data as read from disk.
df_raw = spark.read.parquet(INPUT_PATH)
print(f"Loaded {df_raw.count():,} rows  x  {len(df_raw.columns)} columns")

# -- Identify feature columns (all DoubleType, excluding keys) -
META = {"subject_id", "activity_id"}
feature_cols = sorted(
    c for c in df_raw.columns
    if c not in META
    and isinstance(df_raw.schema[c].dataType, DoubleType)
)
print(f"Feature columns : {len(feature_cols)}")

# -- Replace NaN with 0 (stddev columns produce NaN for
#    single-value windows) then drop any remaining nulls ------
# All replacements are done in ONE projection. Calling withColumn once per
# feature column in a loop stacks one projection per column, which Spark's
# documentation warns can create very large plans (and even a
# StackOverflowException). Column order and names are unchanged.
_feature_set = set(feature_cols)
df = df_raw.select(*[
    when(isnan(col(c)), 0.0).otherwise(col(c)).alias(c)
    if c in _feature_set
    else col(c)
    for c in df_raw.columns
])

# NaN is now 0.0; rows that still hold a null in any feature are removed.
df_clean = df.na.drop(subset=feature_cols)
print(f"After NaN-fix + null-drop : {df_clean.count():,} rows")

# -- Train / test split (80/20) --------------------------------
train_df, test_df = df_clean.randomSplit([0.8, 0.2], seed=42)
# Both splits are reused by every model cell, so mark them for caching;
# the counts below are the first actions run on them.
train_df.cache()
test_df.cache()

print(f"\nTrain : {train_df.count():,}")
print(f"Test  : {test_df.count():,}")

In [None]:
# ============================================================
# 2. Shared pipeline stages & evaluation helpers
# ============================================================
# These three stages are reused by every model:
#   StringIndexer  -> maps activity_id (int) to 0-based label
#   VectorAssembler -> packs feature columns into a Vector
#   StandardScaler  -> zero-mean, unit-variance scaling
#
# The classifier is appended per-model to form the full Pipeline.
# ============================================================

# -- Label indexer ---------------------------------------------
# handleInvalid="keep": labels not seen while fitting are kept (Spark puts
# them in an extra index bucket) instead of raising or being dropped.
label_indexer = StringIndexer(
    inputCol="activity_id",
    outputCol="label",
    handleInvalid="keep",
)

# -- Feature assembly ------------------------------------------
# Packs every column listed in feature_cols into the "features_raw" vector.
assembler = (
    VectorAssembler()
    .setInputCols(feature_cols)
    .setOutputCol("features_raw")
    .setHandleInvalid("keep")
)

# -- Standard scaler -------------------------------------------
# Centers and scales "features_raw" into the "features" column that the
# classifiers consume.
scaler = (
    StandardScaler(withMean=True, withStd=True)
    .setInputCol("features_raw")
    .setOutputCol("features")
)

# -- Evaluators ------------------------------------------------
# One evaluator per reported metric; both compare "label" to "prediction".
eval_accuracy, eval_f1 = [
    MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=_metric,
    )
    for _metric in ("accuracy", "f1")
]

# -- Store results ---------------------------------------------
# Filled by evaluate_model(): one dict per evaluated model.
results = list()

def evaluate_model(model_name, cv_model, test_data):
    """
    Score a fitted model on the test set and record the outcome.

    Transforms ``test_data`` with ``cv_model``, computes accuracy and
    weighted F1 with the shared evaluators, stores a summary dict in the
    module-level ``results`` list, and prints a short report.

    Re-running a model cell must not duplicate its row in the comparison
    table or the saved JSON, so an existing entry with the same
    ``model_name`` is replaced in place (keeping its position) instead of
    a second entry being appended.

    Parameters
    ----------
    model_name : str
        Display name; also the key that identifies the entry in ``results``.
    cv_model : object with a ``transform(dataset)`` method
        The fitted model (e.g. a CrossValidatorModel) used for prediction.
    test_data : DataFrame
        Held-out data passed to ``cv_model.transform``.

    Returns
    -------
    The predictions returned by ``cv_model.transform(test_data)``.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    t0 = time.perf_counter()
    predictions = cv_model.transform(test_data)
    acc = eval_accuracy.evaluate(predictions)
    f1  = eval_f1.evaluate(predictions)
    elapsed = time.perf_counter() - t0

    entry = {
        "model": model_name,
        "accuracy": round(acc, 4),
        "f1_weighted": round(f1, 4),
        "eval_time_s": round(elapsed, 1),
    }

    # Overwrite a previous run of the same model; append only if it is new.
    for idx, existing in enumerate(results):
        if existing["model"] == model_name:
            results[idx] = entry
            break
    else:
        results.append(entry)

    print(f"\n{'=' * 56}")
    print(f"  {model_name}")
    print(f"{'=' * 56}")
    print(f"  Accuracy     : {acc:.4f}")
    print(f"  Weighted F1  : {f1:.4f}")
    print(f"  Eval time    : {elapsed:.1f}s")
    print(f"{'=' * 56}")

    return predictions

print("Shared stages ready.")

In [None]:
# ============================================================
# 3. Logistic Regression (multinomial)
# ============================================================

# -- Estimator -------------------------------------------------
# elasticNetParam=0.0 -> pure L2 penalty; its strength (regParam) is tuned.
lr = LogisticRegression(
    featuresCol="features", labelCol="label",
    family="multinomial", maxIter=100, elasticNetParam=0.0,
)

# -- Pipeline: shared stages followed by the classifier --------
lr_pipeline = Pipeline().setStages([label_indexer, assembler, scaler, lr])

# -- Hyper-parameter grid --------------------------------------
lr_grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()

# -- 3-fold cross-validation, model selection by weighted F1 ---
lr_cv = CrossValidator(
    estimator=lr_pipeline, estimatorParamMaps=lr_grid, evaluator=eval_f1,
    numFolds=3, parallelism=1, seed=42,
)

print(f"Logistic Regression -- grid size: {len(lr_grid)}, folds: 3")

# -- Fit (timed) and evaluate on the held-out split ------------
t0 = time.time()
lr_cv_model = lr_cv.fit(train_df)
print(f"Training time: {time.time() - t0:.1f}s")

lr_preds = evaluate_model("Logistic Regression", lr_cv_model, test_df)

In [None]:
# ============================================================
# 4. Random Forest
# ============================================================

# -- Estimator -------------------------------------------------
rf = RandomForestClassifier(featuresCol="features", labelCol="label", seed=42)

# -- Pipeline: shared stages followed by the classifier --------
rf_pipeline = Pipeline().setStages([label_indexer, assembler, scaler, rf])

# -- Hyper-parameter grid --------------------------------------
# Only the number of trees varies; the depth axis holds a single value.
rf_grid_builder = ParamGridBuilder()
rf_grid_builder = rf_grid_builder.addGrid(rf.numTrees, [20, 50])
rf_grid_builder = rf_grid_builder.addGrid(rf.maxDepth, [5])
rf_grid = rf_grid_builder.build()

# -- 3-fold cross-validation, model selection by weighted F1 ---
rf_cv = CrossValidator(
    estimator=rf_pipeline, estimatorParamMaps=rf_grid, evaluator=eval_f1,
    numFolds=3, parallelism=1, seed=42,
)

print(f"Random Forest -- grid size: {len(rf_grid)}, folds: 3")

# -- Fit (timed) and evaluate on the held-out split ------------
t0 = time.time()
rf_cv_model = rf_cv.fit(train_df)
print(f"Training time: {time.time() - t0:.1f}s")

rf_preds = evaluate_model("Random Forest", rf_cv_model, test_df)

In [None]:
# ============================================================
# 5. Multilayer Perceptron (MLP) Classifier
# ============================================================
# Feed-forward neural network with configurable hidden layers.
# Natively supports multi-class via softmax output layer.
# ============================================================

# -- Network dimensions ----------------------------------------
# Input width = number of assembled feature columns; output width = number
# of distinct activity ids present in the training split.
NUM_FEATURES = len(feature_cols)
NUM_CLASSES  = train_df.select("activity_id").distinct().count()

# One hidden layer of 64 units between the input and output layers.
mlp_layers = [NUM_FEATURES, 64, NUM_CLASSES]

# -- Estimator -------------------------------------------------
mlp = MultilayerPerceptronClassifier(
    featuresCol="features", labelCol="label",
    layers=mlp_layers, blockSize=128, seed=42,
)

# -- Pipeline: shared stages followed by the classifier --------
mlp_pipeline = Pipeline().setStages([label_indexer, assembler, scaler, mlp])

# -- Hyper-parameter grid --------------------------------------
mlp_grid = ParamGridBuilder().addGrid(mlp.maxIter, [50, 100]).build()

# -- 3-fold cross-validation, model selection by weighted F1 ---
mlp_cv = CrossValidator(
    estimator=mlp_pipeline, estimatorParamMaps=mlp_grid, evaluator=eval_f1,
    numFolds=3, parallelism=1, seed=42,
)

print(f"MLP -- grid size: {len(mlp_grid)}, folds: 3")
print(f"  Layers: [{NUM_FEATURES}, 64, {NUM_CLASSES}]")

# -- Fit (timed) and evaluate on the held-out split ------------
t0 = time.time()
mlp_cv_model = mlp_cv.fit(train_df)
print(f"Training time: {time.time() - t0:.1f}s")

mlp_preds = evaluate_model("Multilayer Perceptron", mlp_cv_model, test_df)

In [None]:
# ============================================================
# 6. Linear SVM (OneVsRest for multi-class)
# ============================================================
# Spark's LinearSVC is binary-only.  We wrap it with OneVsRest
# which trains one binary SVC per class and combines them.
# ============================================================

# -- Base binary classifier ------------------------------------
lsvc = LinearSVC(featuresCol="features", labelCol="label", maxIter=50)

# -- Multi-class wrapper ---------------------------------------
# OneVsRest wraps the binary SVC so it can be trained on all classes.
ovr = OneVsRest(classifier=lsvc, featuresCol="features", labelCol="label")

# -- Pipeline: shared stages followed by the classifier --------
svm_pipeline = Pipeline().setStages([label_indexer, assembler, scaler, ovr])

# -- Hyper-parameter grid --------------------------------------
# The tuned parameter belongs to the wrapped LinearSVC, not to OneVsRest.
svm_grid = ParamGridBuilder().addGrid(lsvc.regParam, [0.01, 0.1]).build()

# -- 3-fold cross-validation, model selection by weighted F1 ---
svm_cv = CrossValidator(
    estimator=svm_pipeline, estimatorParamMaps=svm_grid, evaluator=eval_f1,
    numFolds=3, parallelism=1, seed=42,
)

print(f"Linear SVM (OVR) -- grid size: {len(svm_grid)}, folds: 3")

# -- Fit (timed) and evaluate on the held-out split ------------
t0 = time.time()
svm_cv_model = svm_cv.fit(train_df)
print(f"Training time: {time.time() - t0:.1f}s")

svm_preds = evaluate_model("Linear SVM (OVR)", svm_cv_model, test_df)

In [None]:
# ============================================================
# 7. Comparison summary
# ============================================================

from pyspark.sql import Row

# -- Results table ---------------------------------------------
# One Row per entry in `results`, with the columns put in display order.
summary_columns = ["model", "accuracy", "f1_weighted", "eval_time_s"]
df_results = (
    spark.createDataFrame([Row(**entry) for entry in results])
    .select(*summary_columns)
)

banner = "=" * 64
print(banner)
print("  MODEL COMPARISON — PAMAP2 Activity Recognition")
print(banner)

# Highest weighted F1 first.
df_results.orderBy("f1_weighted", ascending=False).show(truncate=False)

# -- Best model ------------------------------------------------
# Chosen by the (rounded) weighted F1 stored in `results`.
best = max(results, key=lambda entry: entry["f1_weighted"])
print(f"Best model by weighted-F1: {best['model']}")
print(f"  Accuracy    : {best['accuracy']}")
print(f"  Weighted F1 : {best['f1_weighted']}")

In [None]:
# ============================================================
# 8. Save the best model and results to disk
# ============================================================

import json

OUTPUT_DIR = r"C:/Users/johnu/Desktop/BigDataProject/data"

# -- Save results as JSON --------------------------------------
# The metrics list is serialized first, then written in one call.
results_path = f"{OUTPUT_DIR}/model_results.json"
results_json = json.dumps(results, indent=2)
with open(results_path, "w") as out_file:
    out_file.write(results_json)
print(f"Results saved to {results_path}")

# -- Save the best CrossValidator model ------------------------
# Map each display name used in `results` to its fitted CrossValidatorModel
# so the winner can be looked up by name.
best_models = dict(zip(
    ["Logistic Regression", "Random Forest",
     "Multilayer Perceptron", "Linear SVM (OVR)"],
    [lr_cv_model, rf_cv_model, mlp_cv_model, svm_cv_model],
))
best_cv = best_models[best["model"]]

# Only the winning fitted pipeline (bestModel) is written; anything already
# stored at model_path is overwritten.
model_path = f"{OUTPUT_DIR}/best_model"
best_cv.bestModel.write().overwrite().save(model_path)
print(f"Best model ({best['model']}) saved to {model_path}")