
# Bank Marketing Campaign Classification

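An improved pipeline that uses one-hot encoding, a stratified train/test split, engineered features, and tuned tree-based models (with and without SMOTE oversampling) to lift recall and F1 on the term-deposit subscription prediction task.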


## 1. Imports

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    recall_score,
    precision_score,
    roc_auc_score,
    precision_recall_curve,
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline

import matplotlib.pyplot as plt
import seaborn as sns
import shap
import joblib

sns.set(style="whitegrid")

## 2. Load the Data

In [None]:

# Locate the project root: use the current working directory when it holds a
# "datasets" folder, otherwise fall back to its parent (covers launching the
# notebook from a sub-directory such as src/).
PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / "datasets").exists():
    parent_root = PROJECT_ROOT.parent
    if not (parent_root / "datasets").exists():
        raise FileNotFoundError("Could not locate datasets folder. Run from project root or src directory.")
    PROJECT_ROOT = parent_root

# Input data location and the output tree (plots, models, predictions).
DATA_DIR = PROJECT_ROOT / "datasets" / "bank-marketing"
OUTPUT_DIR = PROJECT_ROOT / "outputs"
PLOTS_DIR = OUTPUT_DIR / "plots"
MODELS_DIR = OUTPUT_DIR / "models"
PREDICTIONS_DIR = OUTPUT_DIR / "predictions"

# Create every output directory up front; directories that already exist are
# left untouched.
for out_dir in (OUTPUT_DIR, PLOTS_DIR, MODELS_DIR, PREDICTIONS_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Outputs directory: {OUTPUT_DIR}")


In [22]:

# Read the semicolon-delimited campaign file and report its size together
# with the relative frequency of each target class.
data_path = DATA_DIR / "bank-full.csv"
df = pd.read_csv(data_path, sep=";")

n_rows, n_cols = df.shape
print(f"Loaded {n_rows:,} rows and {n_cols} columns from {data_path}")

class_share = df["y"].value_counts(normalize=True).rename("proportion")
print(class_share)


Loaded 45,211 rows and 17 columns
y
no     0.883015
yes    0.116985
Name: proportion, dtype: float64



## 3. Preprocessing

- Remove extreme outliers (rows 3 or more standard deviations from the column mean) on `balance`, `age`, `duration`, and `campaign`.
- Separate categorical vs numeric columns for proper encoding.
- Stratified train/test split to preserve class balance.


In [23]:

# Keep only rows whose value lies strictly within 3 standard deviations of the
# column mean. Columns are handled one after another, so each mean/std is
# computed on the rows that survived the previous columns' filters.
# NOTE: this rebinds `df`, so re-running the cell filters the already-filtered
# frame again; reload the data first when re-executing.
for column in ("balance", "age", "duration", "campaign"):
    centered = df[column] - df[column].mean()
    within_three_sigma = (centered.abs() / df[column].std()) < 3
    df = df.loc[within_three_sigma]

print(f"After outlier filtering: {df.shape[0]:,} rows remain.")


After outlier filtering: 42,338 rows remain.


In [24]:

# Separate predictors from the binary target ("yes" -> 1, "no" -> 0).
y = df["y"].map({"yes": 1, "no": 0})
X = df.drop(columns="y")

# Every non-numeric column is treated as categorical; the rest are numeric.
categorical_cols = X.select_dtypes(exclude=["number"]).columns.tolist()
numeric_cols = [name for name in X.columns if name not in categorical_cols]

# Template preprocessor for the raw (non-engineered) feature set:
# one-hot encode categoricals, scale numerics without centering.
one_hot = OneHotEncoder(handle_unknown="ignore")
scaler = StandardScaler(with_mean=False)
preprocess_template = ColumnTransformer(
    transformers=[
        ("categorical", one_hot, categorical_cols),
        ("numeric", scaler, numeric_cols),
    ]
)

# Stratified 80/20 split so both parts keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Negative-to-positive ratio in the training set, later used as
# scale_pos_weight for the class-weighted XGBoost model.
n_positive = y_train.sum()
n_negative = len(y_train) - n_positive
pos_weight = n_negative / n_positive
print(f"Positive class weight (train): {pos_weight:.2f}")
print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")


Positive class weight (train): 8.58
Train size: 33870, Test size: 8468


### Feature Engineering

Create domain-specific features that might improve predictive power.

In [None]:
# Create engineered features
def engineer_features(df, balance_median=None):
    """Return a copy of ``df`` with additional engineered columns.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``campaign``, ``previous``, ``pdays``,
        ``balance``, ``age`` and ``duration``. ``day`` is optional.
    balance_median : float, optional
        Cut-off used for the ``high_balance`` flag. When omitted, the median
        of ``df['balance']`` is used (the original behaviour). Pass the
        training-set median when transforming validation/test data so that
        every split is binarised with the same cut-off.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with these columns added: ``contact_per_campaign``,
        ``days_since_last``, ``was_contacted_before``,
        ``balance_to_age_ratio``, ``high_balance``, ``duration_minutes``,
        ``long_call`` and, when ``day`` is present, ``is_month_start`` and
        ``is_month_end``.
    """
    df = df.copy()

    # Contact frequency features
    df['contact_per_campaign'] = df['campaign'] / (df['previous'] + 1)
    # pdays == -1 is the "never contacted" sentinel; map it to 999.
    # Vectorised replacement instead of a per-row Python lambda.
    df['days_since_last'] = df['pdays'].replace(-1, 999)
    df['was_contacted_before'] = (df['pdays'] != -1).astype(int)

    # Financial features
    df['balance_to_age_ratio'] = df['balance'] / (df['age'] + 1)
    if balance_median is None:
        balance_median = df['balance'].median()
    df['high_balance'] = (df['balance'] > balance_median).astype(int)

    # Call duration features (highly predictive but use cautiously)
    df['duration_minutes'] = df['duration'] / 60
    df['long_call'] = (df['duration'] > 300).astype(int)  # > 5 minutes

    # Time-based features
    if 'day' in df.columns:
        df['is_month_start'] = (df['day'] <= 10).astype(int)
        df['is_month_end'] = (df['day'] >= 20).astype(int)

    return df

# Run the same feature-engineering step on each split independently
X_train_eng, X_test_eng = (engineer_features(split) for split in (X_train, X_test))

# Report the widened feature count and the names of the added columns
added_columns = set(X_train_eng.columns) - set(X_train.columns)
print(f"Features after engineering: {X_train_eng.shape[1]}")
print(f"New features: {added_columns}")

In [None]:
# Rebuild the column lists on the engineered frame: non-numeric columns are
# one-hot encoded, all remaining (original + engineered) columns are scaled.
categorical_cols_eng = X_train_eng.select_dtypes(exclude=["number"]).columns.tolist()
numeric_cols_eng = [
    name for name in X_train_eng.columns if name not in categorical_cols_eng
]

# Unfitted preprocessor; each model pipeline below clones it before fitting.
encoder_eng = OneHotEncoder(handle_unknown="ignore")
scaler_eng = StandardScaler()
preprocess_eng = ColumnTransformer(
    transformers=[
        ("categorical", encoder_eng, categorical_cols_eng),
        ("numeric", scaler_eng, numeric_cols_eng),
    ]
)

print(f"Categorical features: {len(categorical_cols_eng)}")
print(f"Numeric features: {len(numeric_cols_eng)}")

### Helper functions

In [None]:
def find_best_threshold(probs, y_true, search=np.linspace(0.1, 0.9, 81)):
    """Grid-search the decision threshold that gives the highest F1.

    Each candidate in ``search`` is applied as ``probs >= threshold`` and
    scored with ``f1_score`` against ``y_true``. Returns a
    ``(threshold, f1)`` tuple; on ties the earliest candidate wins. An empty
    ``search`` raises ``ValueError``.
    """
    candidates = list(search)
    f1_values = [
        f1_score(y_true, (probs >= cutoff).astype(int)) for cutoff in candidates
    ]
    # max() keeps the first maximal position, matching first-wins tie-breaking
    best_pos = max(range(len(candidates)), key=f1_values.__getitem__)
    return candidates[best_pos], f1_values[best_pos]

def find_best_threshold_pr_curve(probs, y_true):
    """Pick the F1-maximising threshold from the precision-recall curve.

    Computes F1 at every point returned by ``precision_recall_curve`` (a tiny
    epsilon in the denominator avoids division by zero) and returns
    ``(threshold, f1)`` for the best point. If the best point has no matching
    entry in the thresholds array, 0.5 is returned as the threshold.
    """
    prec, rec, cutoffs = precision_recall_curve(y_true, probs)
    f1_curve = 2 * (prec * rec) / (prec + rec + 1e-10)
    peak = np.argmax(f1_curve)
    if peak < len(cutoffs):
        chosen = cutoffs[peak]
    else:
        # The curve has one more point than there are thresholds
        chosen = 0.5
    return chosen, f1_curve[peak]

def evaluate(name, probs, y_true, threshold=None, use_pr_curve=True):
    """Print and return classification metrics for predicted probabilities.

    Parameters
    ----------
    name : str
        Label used in the printed report and stored in the metrics dict.
    probs : array-like
        Predicted positive-class probabilities (must support ``>=`` and
        ``.astype``).
    y_true : array-like
        Ground-truth binary labels.
    threshold : float, optional
        Decision threshold. When ``None`` it is selected to maximise F1 on
        the same ``probs``/``y_true`` that are then scored, so the reported
        numbers are optimistic if these come from the held-out set.
    use_pr_curve : bool
        Selects ``find_best_threshold_pr_curve`` (True) or the grid search
        ``find_best_threshold`` (False) when ``threshold`` is ``None``.

    Returns
    -------
    tuple
        ``(preds, threshold, metrics)`` where ``preds`` are the 0/1
        predictions and ``metrics`` is a dict with keys ``name``,
        ``threshold``, ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``roc_auc``.
    """
    if threshold is None:
        chooser = find_best_threshold_pr_curve if use_pr_curve else find_best_threshold
        threshold, _ = chooser(probs, y_true)

    preds = (probs >= threshold).astype(int)

    # Key order matters: it becomes the column order of the comparison table
    metrics = {
        'name': name,
        'threshold': threshold,
        'accuracy': accuracy_score(y_true, preds),
        'precision': precision_score(y_true, preds),
        'recall': recall_score(y_true, preds),
        'f1': f1_score(y_true, preds),
        'roc_auc': roc_auc_score(y_true, probs),
    }

    print(f"\n{name} @ threshold {threshold:.3f}")
    print(f"  Accuracy:  {metrics['accuracy']:.4f}")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall:    {metrics['recall']:.4f}")
    print(f"  F1:        {metrics['f1']:.4f}")
    print(f"  ROC AUC:   {metrics['roc_auc']:.4f}")

    return preds, threshold, metrics

## 4. Logistic Regression (one-hot + class weights)

In [None]:
# Class-weighted, L2-regularised logistic regression on the engineered features
log_reg = LogisticRegression(C=0.1, class_weight="balanced", max_iter=1000)
log_pipeline = Pipeline(
    [
        ("preprocess", clone(preprocess_eng)),
        ("model", log_reg),
    ]
)
log_pipeline.fit(X_train_eng, y_train)

# Column 1 of predict_proba is the positive ("yes") class probability
log_probs = log_pipeline.predict_proba(X_test_eng)[:, 1]
log_preds, log_threshold, log_metrics = evaluate(
    "Logistic Regression", log_probs, y_test
)

## 5. Tuned XGBoost (best current performer)

## 5a. XGBoost with SMOTE (to handle class imbalance)

In [None]:
# XGBoost trained on SMOTE-balanced data. No scale_pos_weight is set here;
# the resampling step is what addresses the class imbalance.
smote_xgb_params = dict(
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",
    random_state=42,
    n_estimators=400,
    learning_rate=0.03,
    max_depth=5,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.15,
    reg_alpha=0.1,
    reg_lambda=1.5,
)
smote_xgb_clf = XGBClassifier(**smote_xgb_params)

# imblearn's pipeline applies SMOTE during fit only; prediction skips it
smote_xgb_pipeline = ImbPipeline(
    [
        ("preprocess", clone(preprocess_eng)),
        ("smote", SMOTE(k_neighbors=5, random_state=42)),
        ("model", smote_xgb_clf),
    ]
)
smote_xgb_pipeline.fit(X_train_eng, y_train)

smote_xgb_probs = smote_xgb_pipeline.predict_proba(X_test_eng)[:, 1]
smote_xgb_preds, smote_xgb_threshold, smote_xgb_metrics = evaluate(
    "XGBoost + SMOTE", smote_xgb_probs, y_test
)

## 5b. Random Forest with SMOTE

In [None]:
# Random forest fitted on SMOTE-oversampled training data
rf_params = dict(
    n_estimators=300,
    max_depth=12,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt',
    n_jobs=-1,
    random_state=42,
)
rf_clf = RandomForestClassifier(**rf_params)

# Preprocess -> oversample the minority class (fit only) -> forest
smote_rf_pipeline = ImbPipeline(
    [
        ("preprocess", clone(preprocess_eng)),
        ("smote", SMOTE(k_neighbors=5, random_state=42)),
        ("model", rf_clf),
    ]
)
smote_rf_pipeline.fit(X_train_eng, y_train)

smote_rf_probs = smote_rf_pipeline.predict_proba(X_test_eng)[:, 1]
smote_rf_preds, smote_rf_threshold, smote_rf_metrics = evaluate(
    "Random Forest + SMOTE", smote_rf_probs, y_test
)

## 5c. LightGBM with SMOTE (often best for imbalanced data)

In [None]:
# LightGBM with SMOTE (often superior for imbalanced datasets)
lgbm_clf = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=6,
    num_leaves=31,
    min_child_samples=20,
    subsample=0.8,
    # LightGBM ignores `subsample` (bagging_fraction) while `subsample_freq`
    # (bagging_freq) is at its default of 0. Re-sample rows at every
    # iteration so the 0.8 row subsampling configured above takes effect.
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
    verbose=-1,
)

# Preprocess -> oversample the minority class (applied during fit only) -> model
smote_lgbm_pipeline = ImbPipeline(
    steps=[
        ("preprocess", clone(preprocess_eng)),
        ("smote", SMOTE(random_state=42, k_neighbors=5)),
        ("model", lgbm_clf),
    ]
)

smote_lgbm_pipeline.fit(X_train_eng, y_train)
# Column 1 of predict_proba is the positive-class probability
smote_lgbm_probs = smote_lgbm_pipeline.predict_proba(X_test_eng)[:, 1]
smote_lgbm_preds, smote_lgbm_threshold, smote_lgbm_metrics = evaluate("LightGBM + SMOTE", smote_lgbm_probs, y_test)

In [None]:
# Class-weighted XGBoost: same hyper-parameters as the SMOTE variant, but the
# imbalance is handled through scale_pos_weight instead of resampling.
xgb_params = dict(
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",
    random_state=42,
    scale_pos_weight=pos_weight,
    n_estimators=400,
    learning_rate=0.03,
    max_depth=5,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.15,
    reg_alpha=0.1,
    reg_lambda=1.5,
)
xgb_clf = XGBClassifier(**xgb_params)

xgb_pipeline = Pipeline(
    [
        ("preprocess", clone(preprocess_eng)),
        ("model", xgb_clf),
    ]
)
xgb_pipeline.fit(X_train_eng, y_train)

xgb_probs = xgb_pipeline.predict_proba(X_test_eng)[:, 1]
xgb_preds, xgb_threshold, xgb_metrics = evaluate(
    "XGBoost (tuned)", xgb_probs, y_test
)

## 6. Confusion Matrices

## 5d. Model Comparison

In [None]:
# Collect every model's metric dict into one table, best F1 first
all_metrics = [log_metrics, xgb_metrics, smote_xgb_metrics, smote_rf_metrics, smote_lgbm_metrics]
comparison_df = pd.DataFrame(all_metrics).sort_values('f1', ascending=False)

banner = "=" * 80
print("\n" + banner)
print("MODEL COMPARISON (sorted by F1 score)")
print(banner)
print(comparison_df.to_string(index=False))
print(banner)

# One horizontal bar chart per metric, each ordered by that metric
metrics_to_plot = ['f1', 'precision', 'recall']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, metric in zip(axes, metrics_to_plot):
    ranked = comparison_df.sort_values(metric, ascending=True)
    metric_label = metric.capitalize()
    ax.barh(ranked['name'], ranked[metric])
    ax.set_xlabel(metric_label)
    ax.set_title(f'{metric_label} Comparison')
    ax.set_xlim([0, 1])

plt.tight_layout()
comparison_plot_path = PLOTS_DIR / "model_comparison.png"
plt.savefig(comparison_plot_path)
plt.show()
print(f"\nSaved comparison plot to {comparison_plot_path}")

In [None]:
def plot_confusion(y_true, y_pred, title):
    """Draw, save and show a confusion-matrix heatmap.

    The figure is written to ``PLOTS_DIR`` under a file name derived from
    ``title`` (lower-cased, spaces replaced by underscores, suffixed with
    ``_confusion_matrix.png``), and the saved path is printed.
    """
    matrix = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(title)
    fig.tight_layout()

    slug = title.lower().replace(' ', '_')
    outfile = PLOTS_DIR / f"{slug}_confusion_matrix.png"
    fig.savefig(outfile)
    plt.show()
    print(f"Saved confusion matrix to {outfile}")

# (display name, 0/1 test predictions, threshold used) for every fitted model
model_results = [
    ("Logistic Regression", log_preds, log_threshold),
    ("XGBoost", xgb_preds, xgb_threshold),
    ("XGBoost + SMOTE", smote_xgb_preds, smote_xgb_threshold),
    ("Random Forest + SMOTE", smote_rf_preds, smote_rf_threshold),
    ("LightGBM + SMOTE", smote_lgbm_preds, smote_lgbm_threshold),
]

# One heatmap per model; the threshold is embedded in the plot title
for model_name, model_preds, model_thr in model_results:
    plot_title = f"{model_name} (thr={model_thr:.3f})"
    plot_confusion(y_test, model_preds, plot_title)

## 7. Persist Models

In [None]:
# Bundle each fitted pipeline with its decision threshold and pickle it
fitted_models = [
    ("logistic_regression", log_pipeline, log_threshold),
    ("xgboost", xgb_pipeline, xgb_threshold),
    ("xgboost_smote", smote_xgb_pipeline, smote_xgb_threshold),
    ("random_forest_smote", smote_rf_pipeline, smote_rf_threshold),
    ("lightgbm_smote", smote_lgbm_pipeline, smote_lgbm_threshold),
]
models_to_save = {
    key: {"pipeline": pipe, "threshold": thr} for key, pipe, thr in fitted_models
}

for model_name, model_data in models_to_save.items():
    model_path = MODELS_DIR / f"{model_name}.pkl"
    joblib.dump(model_data, model_path)
    print(f"Saved {model_name} to {model_path}")

# comparison_df is sorted by F1 descending, so its first row is the winner
top_row = comparison_df.iloc[0]
best_model_name = top_row['name']
print(f"\nBest model by F1 score: {best_model_name}")
print(f"F1 Score: {top_row['f1']:.4f}")

## 8. Generate Predictions and Export

In [None]:
# Side-by-side table of the true label plus each model's probability and
# thresholded prediction on the test set.
prediction_columns = {
    'actual': y_test,
    'log_reg_prob': log_probs,
    'log_reg_pred': log_preds,
    'xgb_prob': xgb_probs,
    'xgb_pred': xgb_preds,
    'xgb_smote_prob': smote_xgb_probs,
    'xgb_smote_pred': smote_xgb_preds,
    'rf_smote_prob': smote_rf_probs,
    'rf_smote_pred': smote_rf_preds,
    'lgbm_smote_prob': smote_lgbm_probs,
    'lgbm_smote_pred': smote_lgbm_preds,
}
pred_output = pd.DataFrame(prediction_columns)

# The row index is not written, so the CSV holds only the columns above
predictions_path = PREDICTIONS_DIR / "classification_predictions.csv"
pred_output.to_csv(predictions_path, index=False)
print(f"Saved predictions to {predictions_path}")
print(f"Prediction file shape: {pred_output.shape}")

## 9. Model Interpretability with SHAP

In [None]:
# SHAP analysis for the SMOTE-trained tree model with the highest F1.
# comparison_df is sorted by F1 (descending), so the first row whose name
# contains "SMOTE" is the best of the three resampled models
# (LightGBM, XGBoost or Random Forest).
best_tree_model_name = comparison_df[comparison_df['name'].str.contains('SMOTE')].iloc[0]['name']

if 'LightGBM' in best_tree_model_name:
    shap_pipeline = smote_lgbm_pipeline
    model_label = "LightGBM + SMOTE"
elif 'XGBoost' in best_tree_model_name:
    shap_pipeline = smote_xgb_pipeline
    model_label = "XGBoost + SMOTE"
else:
    shap_pipeline = smote_rf_pipeline
    model_label = "Random Forest + SMOTE"

print(f"Generating SHAP values for: {model_label}")

# Explain the model in its own input space: the preprocessed (one-hot + scaled)
# matrix, with the transformer's generated feature names.
feature_names = shap_pipeline.named_steps["preprocess"].get_feature_names_out()
X_test_encoded = shap_pipeline.named_steps["preprocess"].transform(X_test_eng)

# Explain a reproducible random subset of at most 1000 test rows
sample_size = min(1000, X_test_encoded.shape[0])
sample_indices = np.random.RandomState(42).choice(
    X_test_encoded.shape[0], sample_size, replace=False
)
X_sample = X_test_encoded[sample_indices]

# The preprocessor may return a sparse matrix; densify it for the explainer
if hasattr(X_sample, "toarray"):
    X_sample = X_sample.toarray()

explainer = shap.TreeExplainer(shap_pipeline.named_steps["model"])
shap_values = explainer.shap_values(X_sample)

# Depending on the model type and the installed shap version, a binary
# classifier yields either a 2-D array, a list with one array per class, or a
# 3-D (samples, features, classes) array. Reduce all of these to the
# positive-class 2-D matrix so the bar plot is a single-output importance chart.
if isinstance(shap_values, list):
    shap_values = shap_values[1]
elif getattr(shap_values, "ndim", 2) == 3:
    shap_values = shap_values[:, :, 1]

shap.summary_plot(
    shap_values,
    feature_names=feature_names,
    plot_type="bar",
    max_display=15,
    show=False,
)
plt.tight_layout()
shap_plot_path = PLOTS_DIR / f"{model_label.lower().replace(' ', '_')}_shap_summary_bar.png"
plt.savefig(shap_plot_path, bbox_inches="tight")
plt.close()
print(f"Saved SHAP summary plot to {shap_plot_path}")