In [0]:
import os
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from scipy.stats import randint, uniform

import pandas as pd
import numpy as np

In [0]:
dataset = spark.table("default.dataset")

In [0]:
split_day_row = dataset.select(expr("percentile_approx(time_since_test_start, 0.8)").alias("split_day")).collect()
split_day = int(split_day_row[0]["split_day"])

print(f"Split day: {split_day}")

# Create train and test sets based on time
train_data = dataset.filter(col("time_since_test_start") <= split_day)
test_data = dataset.filter(col("time_since_test_start") > split_day)

print(f"Training set: {train_data.count()} rows")
print(f"Test set: {test_data.count()} rows")

Split day: 21
Training set: 72010 rows
Test set: 14422 rows


In [0]:
numerical_features = [
    'age', 'months_since_register', 'credit_card_limit',
    'hist_spent', 'hist_count', 'rolling_spent_30d', 'rolling_count_30d', 
    'hist_offer_completion_rate', 'discount_value', 'min_value', 'duration'
]

boolean_features = [
    'is_new_customer', 'is_continuous_customer', 'is_tenured_customer', 
    'is_high_tenured_customer', 'is_extreme_tenured_customer',
    'has_email', 'has_mobile', 'has_social', 'has_web'
]

categorical_features = ['offer_type', 'gender']

all_features = numerical_features + boolean_features + categorical_features

# Select features and target, convert to pandas
train_pandas = train_data.select(all_features + ["label"]).toPandas()
test_pandas = test_data.select(all_features + ["label"]).toPandas()

X_train = train_pandas[all_features]
y_train = train_pandas["label"]
X_test = test_pandas[all_features]
y_test = test_pandas["label"]

for col in numerical_features:
    if col in X_train.columns:
        X_train[col] = X_train[col].fillna(X_train[col].mean())
        X_test[col] = X_test[col].fillna(X_train[col].mean())  # Use train mean for test

for col in categorical_features:
    if col in X_train.columns:
        X_train[col] = X_train[col].fillna('missing')
        X_test[col] = X_test[col].fillna('missing')

print("Feature engineering completed")
print(f"Numerical features: {len(numerical_features)}")
print(f"Boolean features: {len(boolean_features)}")
print(f"Categorical features: {len(categorical_features)}")

Feature engineering completed
Numerical features: 11
Boolean features: 9
Categorical features: 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].fillna(X_train[col].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[col] = X_test[col].fillna(X_train[col].mean())  # Use train mean for test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].fillna('missing')
A value is trying to b

In [0]:
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean'))
])

# Combine transformers
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_transformer, categorical_features),
    ('num', num_transformer, numerical_features)
], remainder='passthrough')  # This will handle boolean features

clf = Pipeline([
    ('prep', preprocessor),
    ('model', RandomForestClassifier(n_estimators=500, random_state=42))
])

print("sklearn Pipeline created successfully")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

sklearn Pipeline created successfully
Training set shape: (72010, 22)
Test set shape: (14422, 22)


In [0]:
print(f"\nNaN values in X_train:")
print(X_train.isnull().sum())
print(f"\nColumns with NaN values: {X_train.columns[X_train.isnull().any()].tolist()}")


NaN values in X_train:
age                            0
months_since_register          0
credit_card_limit              0
hist_spent                     0
hist_count                     0
rolling_spent_30d              0
rolling_count_30d              0
hist_offer_completion_rate     0
discount_value                 0
min_value                      0
duration                       0
is_new_customer                0
is_continuous_customer         0
is_tenured_customer            0
is_high_tenured_customer       0
is_extreme_tenured_customer    0
has_email                      0
has_mobile                     0
has_social                     0
has_web                        0
offer_type                     0
gender                         0
dtype: int64

Columns with NaN values: []


In [0]:
# param_distributions = {
#     'model__n_estimators': randint(300, 500),  # Random integers between 50-500
#     'model__max_depth': [10, 20, 30, 50, None],  # Discrete choices including None
#     'model__min_samples_split': randint(2, 20),  # Random integers between 2-20
#     'model__min_samples_leaf': randint(1, 10),  # Random integers between 1-10
#     'model__max_features': ['sqrt', 'log2', None],  # Feature sampling strategies
# }

# n_iter = 20  # Number of random combinations to try
# print("Starting hyperparameter tuning with RandomizedSearchCV...")
# print(f"Random parameter combinations to test: {n_iter}")
# print("This is more efficient than testing all possible combinations!")

# # Create RandomizedSearchCV object
# random_search = RandomizedSearchCV(
#     estimator=clf,
#     param_distributions=param_distributions,
#     n_iter=n_iter,  # Number of random combinations to try
#     cv=3,  # 5-fold cross-validation
#     scoring='roc_auc',  # Use ROC-AUC as the scoring metric
#     n_jobs=-1,  # Use all available cores
#     verbose=1,  # Show progress
#     random_state=42  # For reproducibility
# )

# # Fit the random search
# print("Training models with random hyperparameter combinations...")
# random_search.fit(X_train, y_train)

# print("Hyperparameter tuning completed!")
# print(f"Best ROC-AUC score: {random_search.best_score_:.4f}")
# print("Best parameters:")
# for param, value in random_search.best_params_.items():
#     print(f"  {param}: {value}")

# # Use the best model for predictions
# best_clf = random_search.best_estimator_
# print("\nMaking predictions with the best model...")
# y_pred = best_clf.predict(X_test)
# y_proba = best_clf.predict_proba(X_test)[:, 1]

# print("Predictions completed with optimized model")

In [0]:
# print(f"Best Cross-Validation ROC-AUC Score: {random_search.best_score_:.4f}")
# print(f"Best Parameters:")
# for param, value in random_search.best_params_.items():
#     print(f"  {param}: {value}")

# # Show top 5 parameter combinations from random search
# results_df = pd.DataFrame(random_search.cv_results_)
# top_5 = results_df.nlargest(5, 'mean_test_score')[['params', 'mean_test_score', 'std_test_score']]

# print(f"\nTop 5 Random Parameter Combinations:")
# for i, (idx, row) in enumerate(top_5.iterrows(), 1):
#     print(f"{i}. Score: {row['mean_test_score']:.4f} (+/- {row['std_test_score']*2:.4f})")
#     print(f"   Parameters: {row['params']}")

# # Show parameter space coverage
# print(f"\nRandomized Search Statistics:")
# print(f"Total combinations tested: {len(results_df)}")
# print(f"Best score improvement over baseline: {random_search.best_score_ - 0.5:.4f}")
# print(f"Standard deviation of scores: {results_df['mean_test_score'].std():.4f}")

In [0]:
print("Training the sklearn model...")
clf.fit(X_train, y_train)

print("Model training completed")
print("Making predictions on test set...")
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

print("Predictions completed")

Training the sklearn model...
Model training completed
Making predictions on test set...
Predictions completed


In [0]:
print("Classification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

# Extract metrics from classification_report
report_dict = classification_report(y_test, y_pred, output_dict=True)

# Get ROC AUC score
auc_score = roc_auc_score(y_test, y_proba)
print(f"\nROC AUC Score: {auc_score:.4f}")

# Extract metrics for each class from classification_report
precision_0 = report_dict['0']['precision']
recall_0 = report_dict['0']['recall']
f1_0 = report_dict['0']['f1-score']

precision_1 = report_dict['1']['precision']
recall_1 = report_dict['1']['recall']
f1_1 = report_dict['1']['f1-score']

accuracy = report_dict['accuracy']

# Calculate confusion matrix for display
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

print(f"\nConfusion Matrix:")
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")

print(f"\nDetailed Metrics (from sklearn classification_report):")
print(f"Accuracy: {accuracy:.4f}")
print(f"Class 0 - Precision: {precision_0:.4f}, Recall: {recall_0:.4f}, F1: {f1_0:.4f}")
print(f"Class 1 - Precision: {precision_1:.4f}, Recall: {recall_1:.4f}, F1: {f1_1:.4f}")

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.75      0.80      7433
           1       0.76      0.86      0.81      6989

    accuracy                           0.80     14422
   macro avg       0.81      0.80      0.80     14422
weighted avg       0.81      0.80      0.80     14422


ROC AUC Score: 0.8956

Confusion Matrix:
True Negatives: 5584
False Positives: 1849
False Negatives: 997
True Positives: 5992

Detailed Metrics (from sklearn classification_report):
Accuracy: 0.8027
Class 0 - Precision: 0.8485, Recall: 0.7512, F1: 0.7969
Class 1 - Precision: 0.7642, Recall: 0.8573, F1: 0.8081


In [0]:
# clf = best_clf

model = clf.named_steps['model']
feature_importances = model.feature_importances_

# Get feature names from the preprocessor
encoder = clf.named_steps['prep'].named_transformers_['cat']
cat_names = encoder.get_feature_names_out(categorical_features)
all_feature_names = list(cat_names) + numerical_features + boolean_features

importance_df = pd.DataFrame({
    'feature': all_feature_names,
    'importance': feature_importances
}).sort_values('importance', ascending=False)

print("Top 20 Most Important Features:")
print(importance_df.head(20).to_string(index=False))

print(f"\nTotal features: {len(all_feature_names)}")
print(f"Top 5 features:")
for i, (idx, row) in enumerate(importance_df.head(5).iterrows()):
    print(f"  {i+1}. {row['feature']}: {row['importance']:.4f}")

Top 20 Most Important Features:
                   feature  importance
     months_since_register    0.238809
                       age    0.176240
         credit_card_limit    0.162147
            discount_value    0.067997
                  duration    0.064170
                 min_value    0.062090
  offer_type_informational    0.049897
            gender_missing    0.033020
hist_offer_completion_rate    0.022688
           is_new_customer    0.016234
       is_tenured_customer    0.011884
                  gender_F    0.011393
       offer_type_discount    0.011143
                  gender_M    0.009545
                has_social    0.009499
  is_high_tenured_customer    0.008472
           offer_type_bogo    0.007521
         rolling_spent_30d    0.007016
                hist_spent    0.006873
                   has_web    0.005088

Total features: 27
Top 5 features:
  1. months_since_register: 0.2388
  2. age: 0.1762
  3. credit_card_limit: 0.1621
  4. discount_value: 0.0680
  

In [0]:
current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

metrics_data = []
description = "Hybrid PySpark + sklearn RandomForest with historical, rolling features, tenure segments and credit limit"
features_str = ",".join(all_features)

metrics_rows = [
    {"type": "metric", "metric": "precision", "class": "0", "name": "precision_class_0", "value": precision_0},
    {"type": "metric", "metric": "recall", "class": "0", "name": "recall_class_0", "value": recall_0},
    {"type": "metric", "metric": "f1-score", "class": "0", "name": "f1_class_0", "value": f1_0},
    {"type": "metric", "metric": "precision", "class": "1", "name": "precision_class_1", "value": precision_1},
    {"type": "metric", "metric": "recall", "class": "1", "name": "recall_class_1", "value": recall_1},
    {"type": "metric", "metric": "f1-score", "class": "1", "name": "f1_class_1", "value": f1_1},
    {"type": "metric", "metric": "accuracy", "class": "", "name": "accuracy", "value": accuracy},
    {"type": "metric", "metric": "roc_auc", "class": "", "name": "roc_auc", "value": auc_score}
]

for row in metrics_rows:
    row.update({
        "model_name": "RandomForest",
        "run_date": current_time,
        "description": description,
        "features": features_str
    })

metrics_df = pd.DataFrame(metrics_rows)

importance_export = importance_df.copy()
importance_export["type"] = "feature_importance"
importance_export["metric"] = ""
importance_export["class"] = ""
importance_export["name"] = importance_export["feature"]
importance_export["value"] = importance_export["importance"]
importance_export["model_name"] = "RandomForest"
importance_export["run_date"] = current_time
importance_export["description"] = description
importance_export["features"] = features_str

importance_export = importance_export[metrics_df.columns]

combined_results = pd.concat([metrics_df, importance_export], ignore_index=True)

In [0]:
combined_results_spark = spark.createDataFrame(combined_results)

combined_results_spark.write.mode("append").saveAsTable("metrics")

print(f"Results saved to Databricks table: metrics")
print(f"Total records added: {len(combined_results)}")
print(f"Records saved at: {current_time}")

Results saved to Databricks table: metrics
Total records added: 35
Records saved at: 2025-08-13 00:10:45
