In [0]:
import mlflow
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import pyspark.sql.functions as F
from pyspark.sql.functions import col

# Do NOT call mlflow.pyspark.ml.autolog() — crashes on serverless
CATALOG = 'ecommerce'
print('Imports done')

Imports done


In [0]:
# MLflow Connectivity Test (Run First)
# Always run this first to confirm MLflow is working before you start training.

mlflow.set_experiment('/ecom_purchase_prediction_2')

with mlflow.start_run(run_name='connectivity_test') as run:
    mlflow.log_param('test', 'ok')
    mlflow.log_metric('test_metric', 1.0)
    print(f'MLflow working! Run ID: {run.info.run_id}')
    print('Check left sidebar -> Experiments -> /ecom_purchase_prediction_2')


MLflow working! Run ID: 4acea46d7f5a4c88915bc7c29aab9921
Check left sidebar -> Experiments -> /ecom_purchase_prediction_2


In [0]:
# Clear all MLflow experiments (delete all runs in the experiment)
from mlflow.tracking import MlflowClient

client = MlflowClient()
experiment = mlflow.get_experiment_by_name('/ecom_purchase_prediction_2')
if experiment:
    runs = client.search_runs(experiment.experiment_id)
    for run in runs:
        client.delete_run(run.info.run_id)
    print('All runs deleted from experiment /ecom_purchase_prediction_2.')
else:
    print('Experiment not found.')

All runs deleted from experiment /ecom_purchase_prediction_2.


In [0]:
# Load Data from Day 5
df_all   = spark.table(f'{CATALOG}.gold.ml_dataset')
df_train = df_all.filter(col('split') == 'train')
df_test  = df_all.filter(col('split') == 'test')

print(f'Train rows: {df_train.count():,}')
print(f'Test rows:  {df_test.count():,}')

# 5 numerical features only — avoids the maxBins error from string-encoded categories
FEATURE_COLS = [
    'total_events', 'total_sessions', 'total_views',
    'total_cart_adds', 'avg_price_viewed'
]
LABEL_COL  = 'will_purchase'
WEIGHT_COL = 'class_weight'

display(df_train.select(FEATURE_COLS + [LABEL_COL, WEIGHT_COL]).limit(5))


Train rows: 4,579,212
Test rows:  1,142,381


total_events,total_sessions,total_views,total_cart_adds,avg_price_viewed,will_purchase,class_weight
1,1,0,1,0.0,0,0.5888
1,1,0,1,0.0,0,0.5888
1,1,0,1,0.0,0,0.5888
1,1,0,1,0.0,0,0.5888
1,1,0,1,0.0,0,0.5888


Why 5 features only? favourite_brand has 2,414 unique values which causes a RandomForest error (maxBins=32 by default). Keeping only the 5 numerical features avoids this entirely. These 5 features carry the strongest predictive signal anyway.

In [0]:
# Define Evaluators
# Define once, reuse for every model.

# AUC — primary metric (correct for imbalanced data)
auc_evaluator = BinaryClassificationEvaluator(
    labelCol         = LABEL_COL,
    rawPredictionCol = 'rawPrediction',
    metricName       = 'areaUnderROC'
)

# F1 — secondary metric
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol      = LABEL_COL,
    predictionCol = 'prediction',
    metricName    = 'f1'
)
print('Evaluators ready')


Evaluators ready


In [0]:
# Train Logistic Regression
# Logistic Regression needs StandardScaler because our features have very different ranges (total_events mean=22, avg_price_viewed mean=307). Without scaling, the model gets confused by the magnitude difference.

# Step 1: Combine all features into one vector
assembler = VectorAssembler(
    inputCols     = FEATURE_COLS,
    outputCol     = 'features_raw',
    handleInvalid = 'skip'
)

# Step 2: Scale features to same range (required for Logistic Regression)
scaler = StandardScaler(
    inputCol  = 'features_raw',
    outputCol = 'features',
    withMean  = True,
    withStd   = True
)

# Step 3: Define model
lr = LogisticRegression(
    featuresCol = 'features',
    labelCol    = LABEL_COL,
    weightCol   = WEIGHT_COL,   # class weights from Day 5
    maxIter     = 100,
    regParam    = 0.01          # small regularisation to prevent overfitting
)

# Chain all steps into one pipeline
lr_pipeline = Pipeline(stages=[assembler, scaler, lr])

# Train with MLflow tracking
with mlflow.start_run(run_name='logistic_regression') as run:
    mlflow.log_param('model',    'LogisticRegression')
    mlflow.log_param('maxIter',  100)
    mlflow.log_param('regParam', 0.01)
    mlflow.log_param('features', str(FEATURE_COLS))

    print('Training Logistic Regression...')
    lr_model = lr_pipeline.fit(df_train)
    print('Done!')

    lr_preds = lr_model.transform(df_test)
    lr_auc   = auc_evaluator.evaluate(lr_preds)
    lr_f1    = f1_evaluator.evaluate(lr_preds)

    mlflow.log_metric('test_auc', lr_auc)
    mlflow.log_metric('test_f1',  lr_f1)

    lr_run_id = run.info.run_id
    print(f'Logistic Regression -> AUC: {lr_auc:.4f}  |  F1: {lr_f1:.4f}')


Training Logistic Regression...
Done!
Logistic Regression -> AUC: 0.9125  |  F1: 0.8901


In [0]:
# Inspect Logistic Regression Results
# See sample predictions
display(
    lr_preds.select('will_purchase', 'prediction', 'probability',
                    'total_events', 'total_cart_adds').limit(10)
)

# Confusion matrix — how many did the model get right vs wrong
print('=== CONFUSION MATRIX ===')
lr_preds.groupBy('will_purchase', 'prediction') \
    .count() \
    .orderBy('will_purchase', 'prediction') \
    .show()

# will_purchase=1, prediction=1 -> correctly predicted buyer     (True Positive)
# will_purchase=1, prediction=0 -> missed a buyer                (False Negative)
# will_purchase=0, prediction=1 -> wrongly predicted buyer       (False Positive)
# will_purchase=0, prediction=0 -> correctly predicted non-buyer (True Negative)



will_purchase,prediction,probability,total_events,total_cart_adds
0,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.6193027973883906"",""0.38069720261160944""]}",1,1
0,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.6193027973883906"",""0.38069720261160944""]}",1,1
0,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.6193027973883906"",""0.38069720261160944""]}",1,1
0,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.6193027973883906"",""0.38069720261160944""]}",1,1
0,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.6193027973883906"",""0.38069720261160944""]}",1,1
0,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.6193027973883906"",""0.38069720261160944""]}",1,1
0,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.6193027973883906"",""0.38069720261160944""]}",1,1
0,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.6193027973883906"",""0.38069720261160944""]}",1,1
0,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.6193027973883906"",""0.38069720261160944""]}",1,1
0,0.0,"{""type"":""1"",""size"":null,""indices"":null,""values"":[""0.6193027973883906"",""0.38069720261160944""]}",1,1


=== CONFUSION MATRIX ===
+-------------+----------+------+
|will_purchase|prediction| count|
+-------------+----------+------+
|            0|       0.0|893473|
|            0|       1.0| 76826|
|            1|       0.0| 52234|
|            1|       1.0|119848|
+-------------+----------+------+



Train Random Forest
Random Forest does NOT need StandardScaler. Trees split on feature value thresholds — the actual scale does not matter. We also use only 5 numerical features to avoid the maxBins error that occurs with string-encoded categories.


In [0]:
# For RF: VectorAssembler only — no scaler needed
assembler_rf = VectorAssembler(
    inputCols     = FEATURE_COLS,
    outputCol     = 'features',       # RF reads from 'features' directly
    handleInvalid = 'skip'
)

rf = RandomForestClassifier(
    featuresCol = 'features',
    labelCol    = LABEL_COL,
    weightCol   = WEIGHT_COL,
    numTrees    = 50,                 # 50 trees — good balance of speed vs accuracy
    maxDepth    = 5,                  # keep shallow to avoid overfitting
    seed        = 42                  # fixed seed = reproducible results
)

rf_pipeline = Pipeline(stages=[assembler_rf, rf])

with mlflow.start_run(run_name='random_forest') as run:
    mlflow.log_param('model',    'RandomForest')
    mlflow.log_param('numTrees', 50)
    mlflow.log_param('maxDepth', 5)
    mlflow.log_param('features', str(FEATURE_COLS))

    print('Training Random Forest (takes longer than LR)...')
    rf_model = rf_pipeline.fit(df_train)
    print('Done!')

    rf_preds = rf_model.transform(df_test)
    rf_auc   = auc_evaluator.evaluate(rf_preds)
    rf_f1    = f1_evaluator.evaluate(rf_preds)

    mlflow.log_metric('test_auc', rf_auc)
    mlflow.log_metric('test_f1',  rf_f1)

    rf_run_id = run.info.run_id
    print(f'Random Forest -> AUC: {rf_auc:.4f}  |  F1: {rf_f1:.4f}')


Training Random Forest (takes longer than LR)...
Done!
Random Forest -> AUC: 0.9438  |  F1: 0.8829


Feature Importances (Random Forest)
Random Forest tells you which features contributed most to its predictions — for free, after training. This helps you understand what actually drives purchase behaviour in your data.


In [0]:
rf_fitted = rf_model.stages[-1]   # last stage of the pipeline = the RF model

importances = list(zip(FEATURE_COLS, rf_fitted.featureImportances))
importances.sort(key=lambda x: x[1], reverse=True)

print('=== FEATURE IMPORTANCES ===')
print(f'{"Feature":<25} {"Importance":>10}')
print('-' * 38)
for feat, imp in importances:
    bar = '#' * int(imp * 60)
    print(f'{feat:<25} {imp:>10.4f}  {bar}')


=== FEATURE IMPORTANCES ===
Feature                   Importance
--------------------------------------
total_cart_adds               0.6783  ########################################
total_events                  0.2173  #############
total_sessions                0.0600  ###
total_views                   0.0428  ##
avg_price_viewed              0.0016  


> Tune Random Forest (Try Better Parameters)
Tuning means trying different settings to see if we can improve AUC. We try numTrees=20 vs 100 and maxDepth=5 vs 10. Each combination trains a separate model and logs its AUC to MLflow so you can compare.

> This is simple manual tuning — not CrossValidator. We train each combination separately and log it as its own MLflow run. Easy to understand, easy to debug.


In [0]:
# Define the combinations you want to try
param_combos = [
    {'numTrees': 20,  'maxDepth': 5},
    {'numTrees': 20,  'maxDepth': 10},
    {'numTrees': 100, 'maxDepth': 5},
    {'numTrees': 100, 'maxDepth': 10},
]

best_auc       = 0
best_params    = None
best_run_id    = None

for params in param_combos:
    run_name = f'rf_trees{params["numTrees"]}_depth{params["maxDepth"]}'

    with mlflow.start_run(run_name=run_name) as run:
        mlflow.log_param('model',    'RandomForest')
        mlflow.log_param('numTrees', params['numTrees'])
        mlflow.log_param('maxDepth', params['maxDepth'])

        rf_tune = RandomForestClassifier(
            featuresCol = 'features',
            labelCol    = LABEL_COL,
            weightCol   = WEIGHT_COL,
            numTrees    = params['numTrees'],
            maxDepth    = params['maxDepth'],
            seed        = 42
        )
        pipeline_tune = Pipeline(stages=[assembler_rf, rf_tune])

        print(f'Training: numTrees={params["numTrees"]} maxDepth={params["maxDepth"]}...')
        model_tune  = pipeline_tune.fit(df_train)
        preds_tune  = model_tune.transform(df_test)
        auc_tune    = auc_evaluator.evaluate(preds_tune)
        f1_tune     = f1_evaluator.evaluate(preds_tune)

        mlflow.log_metric('test_auc', auc_tune)
        mlflow.log_metric('test_f1',  f1_tune)

        print(f'  AUC: {auc_tune:.4f}  |  F1: {f1_tune:.4f}')

        if auc_tune > best_auc:
            best_auc    = auc_tune
            best_params = params
            best_run_id = run.info.run_id

print(f'\nBest params: {best_params}')
print(f'Best AUC:    {best_auc:.4f}')
print(f'Best run ID: {best_run_id}')


Training: numTrees=20 maxDepth=5...
  AUC: 0.9444  |  F1: 0.8804
Training: numTrees=20 maxDepth=10...
  AUC: 0.9648  |  F1: 0.9025
Training: numTrees=100 maxDepth=5...
  AUC: 0.9441  |  F1: 0.8821
Training: numTrees=100 maxDepth=10...
  AUC: 0.9657  |  F1: 0.9064

Best params: {'numTrees': 100, 'maxDepth': 10}
Best AUC:    0.9657
Best run ID: f98d8dd38638437ab739745f6547db73


In [0]:

print('=' * 55)
print('DAY 6 — MODEL COMPARISON SUMMARY')
print('=' * 55)
print(f'{"Model":<35} {"AUC":>8} {"F1":>8}')
print('-' * 55)
print(f'{"Logistic Regression":<35} {lr_auc:>8.4f} {lr_f1:>8.4f}')
print(f'{"Random Forest (numTrees=50, depth=5)":<35} {rf_auc:>8.4f} {rf_f1:>8.4f}')
print(f'{"Best Tuned RF (" + str(best_params) + ")":<35} {best_auc:>8.4f}')
print('=' * 55)
print(f'\nWinner: Random Forest (tuned)')
print(f'Best AUC: {best_auc:.4f}')
print('This model gets registered in MLflow Model Registry on Day 7.')


DAY 6 — MODEL COMPARISON SUMMARY
Model                                    AUC       F1
-------------------------------------------------------
Logistic Regression                   0.9125   0.8901
Random Forest (numTrees=50, depth=5)   0.9438   0.8829
Best Tuned RF ({'numTrees': 100, 'maxDepth': 10})   0.9657

Winner: Random Forest (tuned)
Best AUC: 0.9657
This model gets registered in MLflow Model Registry on Day 7.
