#### Step 1: Load Features & Prepare Data


In [0]:
# Step 1.1: Load the gold table
ml_features = spark.table("gold.ml_features")

#Step 1.2: Sample the dataset to avoid kernel crash (0.1% of ~109M rows)
df_sampled = ml_features.sample(fraction=0.001,seed=42)

# Step 1.3: Drop the nulls
df_sampled = df_sampled.dropna()

# Step 1.4: Convert to Pandas
df_ml = df_sampled.toPandas()

# Step 1.5: Separate features and label
X = df_ml.drop('label', axis=1)
y = df_ml['label']

# Step 1.5: verify
print("Feature Columns:", X.columns.tolist())
print("Number of rows:", X.shape[0])
print("Label distribution:\n", y.value_counts())

Feature Columns: ['price_log', 'hour', 'day_of_week', 'is_weekend', 'time_since_first_event']
Number of rows: 109010
Label distribution:
 label
0    107340
1      1670
Name: count, dtype: int64


#### Step 2: Train-Test Split

In [0]:
from sklearn.model_selection import train_test_split

# Train-test split with stratify
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Print row counts
print("Training rows:", X_train.shape[0])
print("Test rows:", X_test.shape[0])

# Print label distribution safely
print("\nTrain label distribution:", y_train.value_counts().to_dict())
print("Test label distribution:", y_test.value_counts().to_dict())

type(y)

Training rows: 87208
Test rows: 21802

Train label distribution: {0: 85872, 1: 1336}
Test label distribution: {0: 21468, 1: 334}


pandas.core.series.Series

#### Step 3: MLflow Experiment -Logistic Regression

In [0]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score,confusion_matrix

#Start MLflow experiment
mlflow.set_experiment("/Shared/day12_purchase_prediction")

with mlflow.start_run(run_name="logistic_regression_v1"):
    # Log parameters
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("random_state", 42)

    #Train the model
    model = LogisticRegression(max_iter=200, class_weight="balanced") #class_weight balances for imbalance
    model.fit(X_train, y_train)

    #Predict on test set
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1]

    #Compute metrics
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("roc_auc", roc)

    #Optional: log confusion matrix as artifact
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)

    #Log the model
    mlflow.sklearn.log_model(model, artifact_path="model")

print(f"Run finished. Accuracy: {acc:.4f}, ROC AUC: {roc:.4f}")

Confusion Matrix:
 [[11778  9690]
 [  149   185]]




Run finished. Accuracy: 0.5487, ROC AUC: 0.5769


- #### Step 4: Random Forest with Class Weights

In [0]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix

#Start MLflow experiment
mlflow.set_experiment("/Shared/day12_purchase_prediction")

with mlflow.start_run(run_name="random_forest_v1"):
    #Log parameters
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("n_estimators", 150)
    mlflow.log_param("max_depth", 10)
    mlflow.log_param("class_weight", "balanced")
    mlflow.log_param("random_state", 42)

    #Train the model
    rf = RandomForestClassifier(
        n_estimators=150,
        max_depth=10,
        class_weight="balanced",
        random_state=42
    )
    rf.fit(X_train, y_train)

    #Predict on test set
    y_pred = rf.predict(X_test)
    y_proba = rf.predict_proba(X_test)[:, 1]

    #Compute metrics
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm= confusion_matrix(y_test, y_pred)

    #log metrics
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("roc_auc", roc)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    #Optional: log confusion matrix as artifact
    print("Confusion Matrix:\n", cm)

    #Log the model
    mlflow.sklearn.log_model(rf, artifact_path="model")

print(f"Random Forest run finished. Accuracy: {acc:.4f}, ROC-AUC: {roc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

Confusion Matrix:
 [[16521  4947]
 [  202   132]]




Random Forest run finished. Accuracy: 0.7638, ROC-AUC: 0.6323, Precision: 0.0260, Recall: 0.3952, F1: 0.0488


#### Step 5: Gradient Boosting (Ensemble)

In [0]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

# End any active MLflow run
mlflow.end_run()

# Set experiment
mlflow.set_experiment("/Shared/day12_purchase_prediction")

with mlflow.start_run(run_name="gradient_boosting_weighted"):
    # Log parameters
    mlflow.log_param("model_type", "GradientBoosting")
    mlflow.log_param("n_estimators", 300)
    mlflow.log_param("max_depth", 5)
    mlflow.log_param("learning_rate", 0.05)
    mlflow.log_param("threshold", 0.5)
    mlflow.log_param("random_state", 42)

    # Compute sample weights for class imbalance
    sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)

    # Train model with sample weights
    gb = GradientBoostingClassifier(
        n_estimators=300,
        max_depth=5,
        learning_rate=0.05,
        random_state=42
    )
    gb.fit(X_train, y_train, sample_weight=sample_weights)

    # Predict probabilities
    y_proba = gb.predict_proba(X_test)[:, 1]

    # Apply threshold for minority class
    threshold = 0.5
    y_pred = (y_proba > threshold).astype(int)

    # Compute metrics
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("roc_auc", roc)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    # Optional: print confusion matrix
    print("Confusion Matrix:\n", cm)

    # Log the model
    mlflow.sklearn.log_model(gb, artifact_path="model")

print(f"Gradient Boosting Weighted (threshold={threshold}) run finished. Accuracy: {acc:.4f}, ROC-AUC: {roc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")


Confusion Matrix:
 [[15359  6109]
 [  177   157]]




Gradient Boosting Weighted (threshold=0.5) run finished. Accuracy: 0.7117, ROC-AUC: 0.6384, Precision: 0.0251, Recall: 0.4701, F1: 0.0476


#### Step 6: Extra Trees

In [0]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

# End any active MLflow run
mlflow.end_run()

# Set experiment
mlflow.set_experiment("/Shared/day12_purchase_prediction")

with mlflow.start_run(run_name="extra_trees_weighted"):
    # Log parameters
    mlflow.log_param("model_type", "ExtraTrees")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", 10)
    mlflow.log_param("random_state", 42)
    mlflow.log_param("threshold", 0.3)
    
    # Compute sample weights for class imbalance
    sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)
    
    # Train model
    et = ExtraTreesClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42
    )
    et.fit(X_train, y_train, sample_weight=sample_weights)
    
    # Predict probabilities
    y_proba = et.predict_proba(X_test)[:, 1]
    
    # Apply threshold
    threshold = 0.3
    y_pred = (y_proba > threshold).astype(int)
    
    # Compute metrics
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    # Log metrics
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("roc_auc", roc)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    
    # Print confusion matrix
    print("Confusion Matrix:\n", cm)
    
    # Log the model
    mlflow.sklearn.log_model(et, artifact_path="model")

print(f"Extra Trees Weighted (threshold={threshold}) run finished. Accuracy: {acc:.4f}, ROC-AUC: {roc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")


Confusion Matrix:
 [[ 2625 18843]
 [   16   318]]




Extra Trees Weighted (threshold=0.3) run finished. Accuracy: 0.1350, ROC-AUC: 0.5935, Precision: 0.0166, Recall: 0.9521, F1: 0.0326


#### Step 7: AdaBoost

In [0]:
from sklearn.ensemble import AdaBoostClassifier

# End any active MLflow run
mlflow.end_run()

with mlflow.start_run(run_name="adaboost_weighted"):
    # Log parameters
    mlflow.log_param("model_type", "AdaBoost")
    mlflow.log_param("n_estimators", 150)
    mlflow.log_param("learning_rate", 0.1)
    mlflow.log_param("random_state", 42)
    mlflow.log_param("threshold", 0.4)
    
    # Compute sample weights for class imbalance
    sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)
    
    # Train model
    ab = AdaBoostClassifier(
        n_estimators=150,
        learning_rate=0.1,
        random_state=42
    )
    ab.fit(X_train, y_train, sample_weight=sample_weights)
    
    # Predict probabilities
    y_proba = ab.predict_proba(X_test)[:, 1]
    
    # Apply threshold
    threshold = 0.4
    y_pred = (y_proba > threshold).astype(int)
    
    # Compute metrics
    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_proba)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    # Log metrics
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("roc_auc", roc)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    
    # Print confusion matrix
    print("Confusion Matrix:\n", cm)
    
    # Log the model
    mlflow.sklearn.log_model(ab, artifact_path="model")

print(f"AdaBoost Weighted (threshold={threshold}) run finished. Accuracy: {acc:.4f}, ROC-AUC: {roc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")


Confusion Matrix:
 [[ 1655 19813]
 [    3   331]]




AdaBoost Weighted (threshold=0.4) run finished. Accuracy: 0.0911, ROC-AUC: 0.5342, Precision: 0.0164, Recall: 0.9910, F1: 0.0323


#### Step 8: Train models & find best per-model results

In [0]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
import numpy as np

# End any active MLflow run
mlflow.end_run()

# Set experiment
mlflow.set_experiment("/Shared/day12_purchase_prediction")

# Define models and parameter grids
models = {
    "RandomForest": {"model": RandomForestClassifier, "params": {"n_estimators": [100,150], "max_depth": [5,10]}},
    "GradientBoosting": {"model": GradientBoostingClassifier, "params": {"n_estimators": [200,300], "max_depth": [3,5], "learning_rate":[0.05,0.1]}},
    "ExtraTrees": {"model": ExtraTreesClassifier, "params": {"n_estimators": [100,150], "max_depth": [5,10]}},
    "AdaBoost": {"model": AdaBoostClassifier, "params": {"n_estimators": [100,150], "learning_rate":[0.05,0.1]}}
}

# Thresholds to try for minority class
thresholds = [0.1, 0.3, 0.5, 0.6]

# Store best results per model
best_results = {}

# Loop over models
for model_name, m_info in models.items():
    print(f"Running model: {model_name}")
    ModelClass = m_info["model"]
    param_grid = m_info["params"]
    
    best_f1 = -1
    best_metrics = None
    
    # Grid search over parameters
    import itertools
    keys, values = zip(*param_grid.items())
    for param_set in itertools.product(*values):
        params = dict(zip(keys, param_set))
        
        # Compute sample weights for class imbalance if supported
        sample_weights = compute_sample_weight(class_weight="balanced", y=y_train) if model_name != "AdaBoost" else None
        
        # Train model
        model = ModelClass(**params, random_state=42)
        if sample_weights is not None:
            model.fit(X_train, y_train, sample_weight=sample_weights)
        else:
            model.fit(X_train, y_train)
        
        # Predict probabilities
        y_proba = model.predict_proba(X_test)[:,1]
        
        # Try all thresholds
        for threshold in thresholds:
            y_pred = (y_proba > threshold).astype(int)
            
            acc = accuracy_score(y_test, y_pred)
            roc = roc_auc_score(y_test, y_proba)
            precision = precision_score(y_test, y_pred, zero_division=0)
            recall = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)
            cm = confusion_matrix(y_test, y_pred)
            
            # Save if F1 improves
            if f1 > best_f1:
                best_f1 = f1
                best_metrics = {
                    "model_name": model_name,
                    "params": params,
                    "threshold": threshold,
                    "accuracy": acc,
                    "roc_auc": roc,
                    "precision": precision,
                    "recall": recall,
                    "f1": f1,
                    "confusion_matrix": cm
                }
    
    best_results[model_name] = best_metrics
    print(f"Best {model_name}: ROC-AUC={best_metrics['roc_auc']:.4f}, F1={best_metrics['f1']:.4f}, Threshold={best_metrics['threshold']}")


Running model: RandomForest
Best RandomForest: ROC-AUC=0.6323, F1=0.0513, Threshold=0.6
Running model: GradientBoosting
Best GradientBoosting: ROC-AUC=0.6416, F1=0.0560, Threshold=0.6
Running model: ExtraTrees
Best ExtraTrees: ROC-AUC=0.5956, F1=0.0405, Threshold=0.6
Running model: AdaBoost
Best AdaBoost: ROC-AUC=0.5341, F1=0.0302, Threshold=0.1


#### Step 9: Overall Best model

In [0]:
import pprint

# Find overall best model by F1 score
overall_best = max(best_results.values(), key=lambda x: x['f1'])

print("="*80)
print("OVERALL BEST MODEL")
print(f"Model: {overall_best['model_name']}")
print(f"Params: {overall_best['params']}")
print(f"Threshold: {overall_best['threshold']}")
print(f"Accuracy: {overall_best['accuracy']:.4f}")
print(f"ROC-AUC: {overall_best['roc_auc']:.4f}")
print(f"Precision: {overall_best['precision']:.4f}")
print(f"Recall: {overall_best['recall']:.4f}")
print(f"F1: {overall_best['f1']:.4f}")
print("Confusion Matrix:")
print(overall_best['confusion_matrix'])
print("="*80)

# Optional: log overall best model to MLflow
mlflow.end_run()
mlflow.set_experiment("/Shared/day12_purchase_prediction")
with mlflow.start_run(run_name="overall_best_model"):
    mlflow.log_param("model_name", overall_best['model_name'])
    mlflow.log_params(overall_best['params'])
    mlflow.log_param("threshold", overall_best['threshold'])
    
    mlflow.log_metric("accuracy", overall_best['accuracy'])
    mlflow.log_metric("roc_auc", overall_best['roc_auc'])
    mlflow.log_metric("precision", overall_best['precision'])
    mlflow.log_metric("recall", overall_best['recall'])
    mlflow.log_metric("f1_score", overall_best['f1'])
    
    # Log the model artifact
    # Re-train the model with best params and sample weights
    ModelClass = {
        "RandomForest": RandomForestClassifier,
        "GradientBoosting": GradientBoostingClassifier,
        "ExtraTrees": ExtraTreesClassifier,
        "AdaBoost": AdaBoostClassifier
    }[overall_best['model_name']]
    
    model_params = overall_best['params']
    sample_weights = compute_sample_weight(class_weight="balanced", y=y_train) if overall_best['model_name'] != "AdaBoost" else None
    model = ModelClass(**model_params, random_state=42)
    if sample_weights is not None:
        model.fit(X_train, y_train, sample_weight=sample_weights)
    else:
        model.fit(X_train, y_train)
    
    mlflow.sklearn.log_model(model, artifact_path="model")

print("Overall best model logged to MLflow successfully!")


OVERALL BEST MODEL
Model: GradientBoosting
Params: {'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.1}
Threshold: 0.6
Accuracy: 0.8547
ROC-AUC: 0.6416
Precision: 0.0311
Recall: 0.2814
F1: 0.0560
Confusion Matrix:
[[18540  2928]
 [  240    94]]




Overall best model logged to MLflow successfully!
