# Bank Marketing Campaign Classification

This notebook builds a classification pipeline to predict whether a customer will subscribe to a term deposit based on the UCI Bank Marketing dataset.

## Imports

In [1]:
# Standard library
from pathlib import Path
import warnings

# Data manipulation
import pandas as pd
import numpy as np

# Scikit-learn: preprocessing and evaluation
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import clone
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    recall_score,
    precision_score,
    roc_auc_score,
    precision_recall_curve,
)

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Imbalanced learning
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Model persistence
import joblib

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Model interpretability
import shap

# Notebook-wide configuration
# One seed shared by the stochastic steps below (splits, CV folds, SMOTE).
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Hide FutureWarning messages so they do not clutter cell output.
warnings.filterwarnings(action="ignore", category=FutureWarning)

# Plot styling: seaborn "whitegrid" theme first, then the figure resolution,
# so that the dpi setting is applied on top of the theme.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.dpi": 100})

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

Configure project paths and load the Bank Marketing dataset.

In [2]:
# Locate the project root: the current directory if it holds "datasets",
# otherwise its parent; fail loudly when neither does.
PROJECT_ROOT = Path.cwd()
for _candidate in (PROJECT_ROOT, PROJECT_ROOT.parent):
    if (_candidate / "datasets").exists():
        PROJECT_ROOT = _candidate
        break
else:
    raise FileNotFoundError("Could not locate datasets folder. Run from project root or src directory.")

# Input and output locations, all derived from the project root
DATA_DIR = PROJECT_ROOT / "datasets" / "bank-marketing"
OUTPUT_DIR = PROJECT_ROOT / "outputs"
PLOTS_DIR, MODELS_DIR, PREDICTIONS_DIR = (
    OUTPUT_DIR / subdir for subdir in ("plots", "models", "predictions")
)

# Make sure every output folder exists (no error if it is already there)
for path in (PLOTS_DIR, MODELS_DIR, PREDICTIONS_DIR):
    path.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Outputs directory: {OUTPUT_DIR}")

Project root: /Users/zihanghuang/data-mining
Outputs directory: /Users/zihanghuang/data-mining/outputs


In [3]:
# Read the semicolon-delimited Bank Marketing CSV
data_path = DATA_DIR / "bank-full.csv"
df_raw = pd.read_csv(data_path, sep=";")

# Summarise size and class balance of the target column "y"
n_rows, n_cols = df_raw.shape
target_counts = df_raw["y"].value_counts()

print(f"Loaded {n_rows:,} rows and {n_cols} columns")
print("\nTarget distribution:")
print(df_raw["y"].value_counts(normalize=True).rename("proportion").to_string())
print(f"\nClass imbalance ratio: {target_counts['no'] / target_counts['yes']:.2f}:1")

Loaded 45,211 rows and 17 columns

Target distribution:
y
no     0.883015
yes    0.116985

Class imbalance ratio: 7.55:1


## Preprocessing

### Outlier Removal
Remove extreme outliers (absolute z-score of 4 or more) in `balance`, `age`, and `campaign` to reduce noise. The `duration` column is dropped in the same step.

In [4]:
# Outlier filtering with a z-score rule on selected numeric columns.
# `duration` is removed first: it is only known once a call has ended (data leakage).
df = df_raw.drop(columns=["duration"])
outlier_cols = ["balance", "age", "campaign"]

# For each column, flag the rows whose absolute z-score is below 4.
# A row is kept only when it passes the check in every column, so despite its
# name `outlier_mask` is True for the rows that are retained.
keep_flags = {
    col: ((df[col] - df[col].mean()) / df[col].std()).abs() < 4
    for col in outlier_cols
}
outlier_mask = pd.DataFrame(keep_flags).all(axis=1)

df = df[outlier_mask].reset_index(drop=True)

rows_before = df_raw.shape[0]
rows_after = df.shape[0]
print(f"Rows before: {rows_before:,}")
print(f"Rows after outlier removal: {rows_after:,}")
print(f"Rows removed: {rows_before - rows_after:,} ({100 * (1 - rows_after / rows_before):.1f}%)")
print("\nNote: 'duration' column dropped (only known post-call, causes data leakage)")

Rows before: 45,211
Rows after outlier removal: 44,220
Rows removed: 991 (2.2%)

Note: 'duration' column dropped (only known post-call, causes data leakage)


In [5]:
# Feature matrix (everything except "y") and the target encoded as yes -> 1, no -> 0
target_encoding = {"yes": 1, "no": 0}
X = df.drop(columns=["y"])
y = df["y"].map(target_encoding)

# Column names grouped by dtype family
categorical_cols = list(X.select_dtypes(include=["object", "category"]).columns)
numeric_cols = list(X.select_dtypes(include=["number"]).columns)

print(f"Categorical features ({len(categorical_cols)}): {categorical_cols}")
print(f"Numeric features ({len(numeric_cols)}): {numeric_cols}")

Categorical features (9): ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Numeric features (6): ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']


## Feature Engineering

Create domain-specific features to improve model predictive power.

**Note**: The `duration` feature was removed from the dataset as it represents call duration, which is only known after a call ends and would cause data leakage in a real prediction scenario.

In [6]:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    
    # -1 means never contacted, don't use 999 which creates outliers
    df["was_contacted_before"] = (df["pdays"] != -1).astype(int)
    
    # For those contacted before, use log transform (adds 1 to avoid log(0))
    # For never contacted, use median of contacted group
    contacted_mask = df["pdays"] != -1
    if contacted_mask.sum() > 0:
        median_pdays = df.loc[contacted_mask, "pdays"].median()
        df["pdays_transformed"] = df["pdays"].replace(-1, median_pdays)
        df["pdays_log"] = np.log1p(df["pdays_transformed"])
    else:
        df["pdays_log"] = 0
    
    # Recency buckets, more informative than raw days
    df["contact_recency"] = pd.cut(
        df["pdays"].replace(-1, 9999),
        bins=[-1, 7, 30, 90, 180, 365, 10000],
        labels=["week", "month", "quarter", "half_year", "year", "never"]
    ).astype(str)
    
    # Previous campaign success is extremely predictive (~65% conversion if success)
    df["prev_success"] = (df["poutcome"] == "success").astype(int)
    df["prev_failure"] = (df["poutcome"] == "failure").astype(int)
    df["prev_unknown"] = (df["poutcome"] == "unknown").astype(int)
    
    # Interaction: contacted before AND had success
    df["contacted_and_success"] = (df["was_contacted_before"] & df["prev_success"]).astype(int)
    
    # === Contact history features ===
    df["contact_intensity"] = df["campaign"] / (df["previous"] + 1)
    df["total_contacts"] = df["campaign"] + df["previous"]
    df["high_campaign_effort"] = (df["campaign"] > df["campaign"].median()).astype(int)
    
    # === Financial features ===
    df["balance_per_age"] = df["balance"] / (df["age"] + 1)
    df["has_positive_balance"] = (df["balance"] > 0).astype(int)
    df["has_high_balance"] = (df["balance"] > df["balance"].quantile(0.75)).astype(int)
    df["has_loan_or_default"] = ((df["loan"] == "yes") | (df["default"] == "yes")).astype(int)
    
    # === Demographic interactions ===
    df["young_single"] = ((df["age"] < 30) & (df["marital"] == "single")).astype(int)
    df["retired_age"] = (df["age"] >= 60).astype(int)
    
    # === Time-based features ===
    if "day" in df.columns:
        df["is_month_start"] = (df["day"] <= 10).astype(int)
        df["is_month_end"] = (df["day"] >= 20).astype(int)
    
    # Month seasonality (certain months have higher conversion)
    high_conversion_months = ["mar", "oct", "sep", "dec"]
    df["is_high_conversion_month"] = df["month"].isin(high_conversion_months).astype(int)
    
    # Drop intermediate columns
    df = df.drop(columns=["pdays_transformed"], errors="ignore")
    
    return df


# Build the engineered feature matrix from the raw feature matrix
X_engineered = engineer_features(X)

# Report which columns the engineering step introduced
new_features = set(X_engineered.columns).difference(X.columns)
print(f"Original features: {X.shape[1]}")
print(f"Engineered features: {X_engineered.shape[1]}")
print(f"New features added ({len(new_features)}): {sorted(new_features)}")

Original features: 15
Engineered features: 34
New features added (19): ['balance_per_age', 'contact_intensity', 'contact_recency', 'contacted_and_success', 'has_high_balance', 'has_loan_or_default', 'has_positive_balance', 'high_campaign_effort', 'is_high_conversion_month', 'is_month_end', 'is_month_start', 'pdays_log', 'prev_failure', 'prev_success', 'prev_unknown', 'retired_age', 'total_contacts', 'was_contacted_before', 'young_single']


In [7]:
# Re-derive the column groups now that engineered columns exist
categorical_cols_eng = list(X_engineered.select_dtypes(include=["object", "category"]).columns)
numeric_cols_eng = list(X_engineered.select_dtypes(include=["number"]).columns)

# Preprocessing: one-hot encode the categorical columns (categories unseen at
# fit time are ignored, dense output) and standardise the numeric columns.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore", sparse_output=False),
            categorical_cols_eng,
        ),
        (
            "num",
            StandardScaler(),
            numeric_cols_eng,
        ),
    ]
)

print(f"Categorical features: {len(categorical_cols_eng)}")
print(f"Numeric features: {len(numeric_cols_eng)}")

Categorical features: 10
Numeric features: 24


### Train/Test Split

Use stratified splits to preserve the class distribution across the training, validation, and test sets.

In [8]:
# Two-stage stratified split giving 60% train / 20% validation / 20% test.
# Stage 1 holds out 20% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_engineered, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Stage 2 takes 25% of the remaining 80% (= 20% overall) as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=RANDOM_STATE
)

# Ratio of negatives to positives in the training split, used as a class weight
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
pos_weight = n_negative / n_positive

print(f"Training set: {len(X_train):,} samples (60%)")
print(f"Validation set: {len(X_val):,} samples (20%) - for early stopping")
print(f"Test set: {len(X_test):,} samples (20%)")
print(f"Positive class weight: {pos_weight:.2f}")
print("\nTrain class distribution:")
print(y_train.value_counts(normalize=True).rename({0: "no", 1: "yes"}).to_string())

Training set: 26,532 samples (60%)
Validation set: 8,844 samples (20%) - for early stopping
Test set: 8,844 samples (20%)
Positive class weight: 7.50

Train class distribution:
y
no     0.882331
yes    0.117669


### Helper Functions

Utility functions for threshold optimization and model evaluation.

In [9]:
def find_optimal_threshold(y_true: np.ndarray, y_proba: np.ndarray) -> tuple[float, float]:
    """Pick the probability cut-off that maximises F1 on the given labels.

    Parameters
    ----------
    y_true : np.ndarray
        Ground-truth binary labels.
    y_proba : np.ndarray
        Predicted scores/probabilities for the positive class.

    Returns
    -------
    tuple[float, float]
        ``(threshold, f1)`` for the best point on the precision-recall curve.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)

    # F1 at every curve point; the small epsilon guards against 0/0
    numerator = 2 * (precision * recall)
    f1_scores = numerator / (precision + recall + 1e-10)

    # The curve has one more point than there are thresholds; the final point
    # has no threshold attached, so it is excluded from the search.
    candidate_f1 = f1_scores[:-1]
    best_idx = candidate_f1.argmax()
    return thresholds[best_idx], candidate_f1[best_idx]


def evaluate_model(
    name: str,
    y_true: np.ndarray,
    y_proba: np.ndarray,
    threshold: float = None
) -> dict:
    """Score probabilistic predictions at a threshold and print a short report.

    Parameters
    ----------
    name : str
        Label used in the printed report and stored in the result.
    y_true : np.ndarray
        Ground-truth binary labels.
    y_proba : np.ndarray
        Predicted probabilities for the positive class.
    threshold : float, optional
        Decision cut-off. When omitted, the F1-optimal threshold is computed
        from ``y_true``/``y_proba`` themselves, which makes the reported
        metrics optimistic for the data being scored.

    Returns
    -------
    dict
        Keys ``name``, ``threshold``, ``accuracy``, ``precision``, ``recall``,
        ``f1``, ``roc_auc``, ``predictions`` and ``probabilities``.
    """
    if threshold is None:
        threshold = find_optimal_threshold(y_true, y_proba)[0]

    # Hard labels: positive when the probability reaches the threshold
    y_pred = (y_proba >= threshold).astype(int)

    metrics = {"name": name, "threshold": threshold}
    metrics["accuracy"] = accuracy_score(y_true, y_pred)
    metrics["precision"] = precision_score(y_true, y_pred, zero_division=0)
    metrics["recall"] = recall_score(y_true, y_pred, zero_division=0)
    metrics["f1"] = f1_score(y_true, y_pred, zero_division=0)
    metrics["roc_auc"] = roc_auc_score(y_true, y_proba)  # threshold-independent
    metrics["predictions"] = y_pred
    metrics["probabilities"] = y_proba

    banner = "=" * 50
    print(f"\n{banner}")
    print(f"{name} @ threshold {threshold:.3f}")
    print(banner)
    report_rows = (
        ("Accuracy:", "accuracy"),
        ("Precision:", "precision"),
        ("Recall:", "recall"),
        ("F1 Score:", "f1"),
        ("ROC AUC:", "roc_auc"),
    )
    for label, key in report_rows:
        # Labels are left-aligned to a common width so the values line up
        print(f"  {label:<10} {metrics[key]:.4f}")

    return metrics

## Model Training

Train multiple classifiers with different approaches to handle class imbalance:
1. **Class weights**: Built-in parameter to penalize misclassification of minority class
2. **SMOTE**: Synthetic Minority Over-sampling Technique to balance training data

In [10]:
# Accumulator for per-model result dicts, filled as models are evaluated
all_results = list()

## Bayesian Hyperparameter Optimization

Use `BayesSearchCV` from scikit-optimize to find optimal hyperparameters for each model. Bayesian optimization is more efficient than grid search as it uses past evaluation results to choose the next hyperparameters to evaluate.

In [None]:
# Bayesian Optimization imports
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# 5-fold stratified, shuffled CV splitter (stratification keeps the class ratio per fold)
bayes_cv = StratifiedKFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True)

# SMOTE oversampler for the minority class.
# NOTE(review): it only stays out of the validation folds if it is used inside
# an imblearn pipeline; confirm where it is consumed.
smote = SMOTE(random_state=RANDOM_STATE)

# Bayesian optimization results, filled by later cells
bayes_results = dict()

: 

### Rule-based Representation Learner (RRL)

RRL (Rule-based Representation Learner) is an interpretable rule-based model that learns logical rules from data. It provides transparent decision-making through human-readable rules.

In [None]:

# RRL data preparation: encode the engineered features with DBEncoder and wrap
# the train/validation/test splits in DataLoaders.
import torch
from torch.utils.data import DataLoader, TensorDataset
import importlib
import rrl.models
from rrl.models import RRL
from rrl.utils import DBEncoder
from itertools import product

# Seed torch so weight initialisation and batch shuffling are reproducible
# (the numpy seed set at the top of the notebook does not affect torch).
torch.manual_seed(RANDOM_STATE)

# Set device for RRL
# Force CPU usage to avoid potential MPS instability/crashes during repeated model initialization
rrl_device = torch.device("cpu")
print(f"Using device for RRL: {rrl_device}", flush=True)

# Describe every engineered column to DBEncoder as 'discrete' or 'continuous'.
# Anything that is not numeric (object, category, string dtypes) is discrete;
# the previous `dtype == 'object'` check would have mislabelled category or
# string-dtype columns as continuous.
print("Preparing engineered features for RRL...", flush=True)
f_list = [
    [col, 'continuous' if pd.api.types.is_numeric_dtype(X_engineered[col]) else 'discrete']
    for col in X_engineered.columns
]
f_df = pd.DataFrame(f_list)

# Initialize DBEncoder with engineered features
db_enc = DBEncoder(f_df, discrete=False)

# Create target dataframes (DBEncoder is called with DataFrame targets)
y_train_df, y_val_df, y_test_df = (
    target.to_frame(name='y') for target in (y_train, y_val, y_test)
)

# Fit encoder on training data only
db_enc.fit(X_train, y_train_df)

# Transform all splits; keep_stat=True is passed for the training split only
X_rrl_train, y_rrl_train = db_enc.transform(X_train, y_train_df, normalized=True, keep_stat=True)
X_rrl_val, y_rrl_val = db_enc.transform(X_val, y_val_df, normalized=True, keep_stat=False)
X_rrl_test, y_rrl_test = db_enc.transform(X_test, y_test_df, normalized=True, keep_stat=False)

print(f"RRL Training samples: {X_rrl_train.shape[0]:,}", flush=True)
print(f"RRL Validation samples: {X_rrl_val.shape[0]:,}", flush=True)
print(f"RRL Test samples: {X_rrl_test.shape[0]:,}", flush=True)

# Create DataLoaders
print("Creating tensors...", flush=True)
rrl_batch_size = 256

# Force float32 numpy arrays before building tensors. Per the original
# author's note, DBEncoder.transform can return object-dtype arrays, which
# made tensor creation hang.
X_rrl_train, X_rrl_val, X_rrl_test, y_rrl_train, y_rrl_val, y_rrl_test = (
    np.asarray(arr, dtype=np.float32)
    for arr in (X_rrl_train, X_rrl_val, X_rrl_test, y_rrl_train, y_rrl_val, y_rrl_test)
)

# torch.from_numpy shares memory with the numpy arrays (no copy), so later
# in-place edits of the arrays are visible through the tensors.
X_train_tensor, y_train_tensor = torch.from_numpy(X_rrl_train), torch.from_numpy(y_rrl_train)
X_val_tensor, y_val_tensor = torch.from_numpy(X_rrl_val), torch.from_numpy(y_rrl_val)
X_test_tensor, y_test_tensor = torch.from_numpy(X_rrl_test), torch.from_numpy(y_rrl_test)

rrl_train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
rrl_val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
rrl_test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training loader shuffles; it gets its own seeded generator so the
# batch order does not depend on whatever else consumed the global torch RNG.
rrl_train_loader = DataLoader(
    rrl_train_dataset,
    batch_size=rrl_batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(RANDOM_STATE),
)
rrl_val_loader = DataLoader(rrl_val_dataset, batch_size=rrl_batch_size, shuffle=False)
rrl_test_loader = DataLoader(rrl_test_dataset, batch_size=rrl_batch_size, shuffle=False)

discrete_flen = db_enc.discrete_flen
continuous_flen = db_enc.continuous_flen
rrl_output_dim = y_rrl_train.shape[1]  # Width of the encoded target

print(f"Discrete features: {discrete_flen}, Continuous features: {continuous_flen}", flush=True)
print(f"RRL output dimension: {rrl_output_dim}", flush=True)

# RRL hyperparameter search: exhaustive grid over learning rate, temperature
# and hidden-layer structure, selecting on the validation F1 score.
import gc
import logging
import traceback

# Hyperparameter search space
param_grid = {
    'lr': [0.001, 0.005, 0.01],
    'temperature': [0.05, 0.1, 0.2],
    'structure': [[16, 16, 8], [32, 32, 16], [64, 32, 16]]
}

best_val_f1 = -float('inf')
best_params = None
best_model_state = None
best_dim_list = None

print("Starting RRL hyperparameter search...", flush=True)
print("=" * 60, flush=True)
all_combos = list(product(param_grid['lr'], param_grid['temperature'], param_grid['structure']))
print(f"Testing {len(all_combos)} configurations", flush=True)

# Configure logging once to ensure outputs are visible
logging.basicConfig(level=logging.INFO, format='%(asctime)s - [%(levelname)s] - %(message)s')

# Stability settings
torch.set_num_threads(1)
print("Set torch threads to 1 for stability", flush=True)

# Data validation. The arrays are patched in place so that tensors created
# from them with torch.from_numpy (shared memory) see the cleaned values too.
# NaN and +/-Inf are all mapped to 0, matching the warning text; the default
# np.nan_to_num would turn Inf into the largest finite float instead.
print("Validating data...", flush=True)
for _split_name, _split_array in (
    ("X_rrl_train", X_rrl_train),
    ("X_rrl_val", X_rrl_val),
    ("X_rrl_test", X_rrl_test),
):
    if not np.isfinite(_split_array).all():
        print(f"WARNING: NaN or Inf found in {_split_name}! Replacing with 0 in-place.", flush=True)
        np.nan_to_num(_split_array, copy=False, nan=0.0, posinf=0.0, neginf=0.0)

print(f"Data shapes: Train {X_rrl_train.shape}, Val {X_rrl_val.shape}", flush=True)

for i, (lr, temp, struct) in enumerate(all_combos, start=1):
    # Layer sizes: (discrete, continuous) input widths, hidden layers, output width
    current_dim_list = [(discrete_flen, continuous_flen)] + list(struct) + [rrl_output_dim]

    print(f"[{i}/{len(all_combos)}] Initializing model with lr={lr}, temp={temp}, struct={struct}", flush=True)

    # Re-seed before every configuration so that each one starts from the same
    # RNG state and the comparison is not affected by evaluation order.
    torch.manual_seed(RANDOM_STATE)

    candidate_model = None
    try:
        candidate_model = RRL(
            dim_list=current_dim_list,
            device=rrl_device,
            use_not=True,
            is_rank0=False, # Set to False to prevent RRL from messing with logging handlers repeatedly
            save_best=False,
            distributed=False,
            use_skip=False,
            save_path=MODELS_DIR / "rrl_search_tmp.pth",
            temperature=temp
        )

        # Use fewer epochs during search for speed
        candidate_model.train_model(
            data_loader=rrl_train_loader,
            valid_loader=rrl_val_loader,
            epoch=30,
            lr=lr,
            weight_decay=1e-5,
            show_progress=False
        )

        # NOTE(review): the second value returned by test() is treated as the F1 score - confirm against rrl.models.RRL.test
        _, val_f1 = candidate_model.test(
            test_loader=rrl_val_loader,
            set_name='Validation',
            show_progress=False
        )

        print(f"[{i}/{len(all_combos)}] lr={lr}, temp={temp}, struct={struct} -> Val F1: {val_f1:.4f}", flush=True)

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_params = {'lr': lr, 'temperature': temp, 'structure': list(struct)}
            best_dim_list = current_dim_list
            # clone(): on CPU, detach().cpu() returns a view of the live
            # parameters, so copy them to get an independent snapshot.
            best_model_state = {
                k: v.detach().cpu().clone()
                for k, v in candidate_model.net.state_dict().items()
            }

    except Exception as e:
        # A failing configuration is reported and skipped; the search goes on
        print(f"Error verifying configuration {i}: {e}", flush=True)
        traceback.print_exc()

    finally:
        # Release the candidate whether it succeeded or failed
        del candidate_model
        gc.collect()

if best_model_state is None:
    raise RuntimeError("Hyperparameter search did not produce a valid model.")

print("=" * 60, flush=True)
print(f"Best parameters: {best_params}")
print(f"Best validation F1: {best_val_f1:.4f}")
print("=" * 60, flush=True)

# Output locations for the selected RRL model
rrl_model_path = MODELS_DIR / 'rrl_bank_model.pth'
rrl_log_path = MODELS_DIR / 'rrl_bank_log.txt'
rrl_rules_path = MODELS_DIR / 'rrl_bank_rules.txt'

# Rebuild the winning architecture and restore the weights captured during the search
best_temperature = best_params['temperature']
rrl_model = RRL(
    dim_list=best_dim_list,
    device=rrl_device,
    use_not=True,
    is_rank0=True,
    distributed=False,
    save_best=False,
    use_skip=False,
    temperature=best_temperature,
)
rrl_model.net.load_state_dict(best_model_state)
rrl_model.net.eval()

# Constructor arguments stored next to the weights so the model can be
# re-instantiated when the checkpoint is loaded again
best_rrl_args = dict(
    dim_list=best_dim_list,
    use_not=True,
    use_skip=False,
    estimated_grad=False,
    use_nlaf=False,
    alpha=0.999,
    beta=8,
    gamma=1,
    temperature=best_temperature,
)

torch.save(dict(model_state_dict=best_model_state, rrl_args=best_rrl_args), rrl_model_path)
print(f"Saved best RRL model to {rrl_model_path}", flush=True)

# Score the restored model on the held-out test loader
print("Testing best RRL model...", flush=True)
rrl_model.test(test_loader=rrl_test_loader, set_name='Test', show_progress=True)

# Write the learned rules to a text file
print("Generating rules...", flush=True)
with rrl_rules_path.open('w') as rules_file:
    rrl_model.rule_print(
        db_enc.X_fname,
        db_enc.y_fname,
        rrl_train_loader,
        file=rules_file,
        mean=db_enc.mean,
        std=db_enc.std,
    )

print(f"RRL Model saved to {rrl_model_path}", flush=True)
print(f"RRL Rules saved to {rrl_rules_path}", flush=True)


Using device for RRL: cpu
Preparing engineered features for RRL...
RRL Training samples: 26,532
RRL Validation samples: 8,844
RRL Test samples: 8,844
Creating tensors...
Discrete features: 40, Continuous features: 24
RRL output dimension: 2
Starting RRL hyperparameter search...
Testing 27 configurations




### RRL Evaluation

Load the trained RRL model and evaluate using the standard `evaluate_model` function for comparison with other models.

In [None]:
from tqdm.auto import tqdm as tqdm_auto


def _rrl_positive_proba(net, loader, device, desc):
    """Return the softmax probability of output column 1 for every sample in `loader`.

    Samples come back in loader order, so the loader must not shuffle if the
    result is to be compared with an externally held label vector.
    """
    batch_probas = []
    with torch.no_grad():
        for X_batch, _ in tqdm_auto(loader, desc=desc, unit="batch"):
            outputs = net(X_batch.to(device))
            # Column 1 is taken as the "yes" class, as in the original cell.
            # NOTE(review): confirm DBEncoder orders the encoded target as [no, yes].
            batch_probas.append(torch.softmax(outputs, dim=1)[:, 1].cpu().numpy())
    if not batch_probas:
        return np.empty(0, dtype=np.float32)
    return np.concatenate(batch_probas)


print(f"Loading RRL model from {rrl_model_path}...")

if not rrl_model_path.exists():
    print(f"Error: Model file not found at {rrl_model_path}")
else:
    # SECURITY NOTE: weights_only=False unpickles arbitrary Python objects.
    # That is acceptable here only because the checkpoint was written by this
    # notebook; never load an untrusted file this way.
    checkpoint = torch.load(rrl_model_path, map_location=rrl_device, weights_only=False)
    saved_args = checkpoint['rrl_args']
    model_state_dict = checkpoint['model_state_dict']

    # Re-instantiate RRL with the saved arguments
    rrl_kwargs = dict(
        dim_list=saved_args['dim_list'],
        device=rrl_device,
        use_not=saved_args['use_not'],
        use_skip=saved_args.get('use_skip', False),
        estimated_grad=saved_args.get('estimated_grad', False),
        use_nlaf=saved_args.get('use_nlaf', False),
        alpha=saved_args.get('alpha', 0.999),
        beta=saved_args.get('beta', 8),
        gamma=saved_args.get('gamma', 1),
        distributed=False,
        is_rank0=True,
    )
    # The tuned temperature is stored in the checkpoint; pass it on so the
    # evaluation model is built the same way as the trained one.
    if 'temperature' in saved_args:
        rrl_kwargs['temperature'] = saved_args['temperature']
    rrl_eval = RRL(**rrl_kwargs)

    # Strip a leading 'module.' from parameter names when present
    prefix = 'module.'
    new_state_dict = {
        (k[len(prefix):] if k.startswith(prefix) else k): v
        for k, v in model_state_dict.items()
    }

    rrl_eval.net.load_state_dict(new_state_dict)
    rrl_eval.net.eval()
    print("RRL Model loaded successfully.")

    # Choose the decision threshold on the validation split so that the test
    # metrics are not tuned on the test labels themselves.
    rrl_val_proba = _rrl_positive_proba(rrl_eval.net, rrl_val_loader, rrl_device, "Scoring validation set")
    rrl_threshold = find_optimal_threshold(y_val.values, rrl_val_proba)[0]

    rrl_y_proba = _rrl_positive_proba(rrl_eval.net, rrl_test_loader, rrl_device, "Generating predictions")

    rrl_results = evaluate_model("RRL", y_test.values, rrl_y_proba, threshold=rrl_threshold)

    # Replace any earlier RRL entry so re-running this cell does not create duplicates
    all_results[:] = [res for res in all_results if res.get("name") != "RRL"]
    all_results.append(rrl_results)