## Test BinaryBART
<sup>*</sup>Including ProbitBART and LogisticBART

In [1]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score
from sklearn.datasets import make_classification, load_breast_cancer, load_wine, fetch_openml
import pandas as pd
from bart_playground import *
from bart_playground.bart import DefaultBART, ProbitBART
import bartz

In [2]:
# Parameters
N_TREES = 50
NDPOST = 500
NSKIP = 200
RANDOM_STATE = 42

# If debug then run with only one dataset and record running time
# Otherwise run with all datasets
debug = False

In [3]:
from sklearn.preprocessing import OrdinalEncoder, normalize

def load_mushroom():
    X, y = fetch_openml('mushroom', version=1, return_X_y=True)
    for col in X.select_dtypes('category'):
        # -1 in codes indicates NaN by pandas convention
        X[col] = X[col].cat.codes
    X = normalize(X)
    y_array = y.to_numpy().reshape(-1, 1)
    y_arm = OrdinalEncoder(dtype=int).fit_transform(y_array).flatten()
    
    return X, y_arm

def load_mushroom_encoded():
    X, y_arm = load_mushroom()
    
    n_arm = np.max(y_arm) + 1
    dim = X.shape[1] * n_arm # total number of encoded covariates (location-encoded for each arm) 
    act_dim = X.shape[1] # number of covariates
    covariates = np.zeros((n_arm * X.shape[0], dim))
    rewards = np.zeros((n_arm * X.shape[0], ))
    for cursor in range(X.shape[0]):
        for a in range(n_arm):
            covariates[cursor * n_arm + a, a * act_dim:(a * act_dim + act_dim)] = X[cursor]
        arm = y_arm[cursor]
        rewards[cursor * n_arm + arm] = 1

    return covariates, rewards

In [4]:
# Load datasets
def load_datasets():
    # Synthetic dataset
    X_syn, y_syn = make_classification(n_samples=400, n_features=8, n_informative=6, 
                                       n_redundant=0, n_classes=2, random_state=RANDOM_STATE)
    
    # Breast cancer dataset
    X_bc, y_bc = load_breast_cancer(return_X_y=True)
    
    # Wine dataset (convert to binary: class 0 vs rest)
    X_wine, y_wine = load_wine(return_X_y=True)
    y_wine = (y_wine == 0).astype(int)
    
    X_mushroom, y_mushroom = load_mushroom()
    X_mr_encoded, y_mr_encoded = load_mushroom_encoded()
    
    return {
        "Synthetic": (X_syn, y_syn),
        "Breast Cancer": (X_bc, y_bc),
        "Wine Binary": (X_wine, y_wine),
        "Mushroom": (X_mushroom, y_mushroom),
        "Mushroom Encoded": (X_mr_encoded, y_mr_encoded)
    }

In [5]:
def evaluate_model(model, model_name, X_train, X_test, y_train, y_test):
    """Evaluate a single model and return metrics"""
    
    if model_name == "Bartz":
        # Bartz regression treating 0/1 as continuous
        fit_result = bartz.BART.gbart(
            x_train=X_train.T, y_train=y_train.astype(float),
            x_test=X_test.T,
            ntree=N_TREES, ndpost=NDPOST, nskip=NSKIP,
            seed=RANDOM_STATE,
            printevery=NDPOST + NSKIP + 100
        )
        btpred_all = fit_result.predict(np.transpose(X_test))
        btpred = np.mean(np.array(btpred_all), axis=0)
        y_pred_prob = np.clip(btpred, 1e-9, 1 - 1e-9)
        y_pred = (y_pred_prob > 0.5).astype(int)
        
    elif model_name == "ProbitBART" or model_name == "LogisticBART":
        # Proper binary BART
        model.fit(X_train, y_train)
        proba_output = model.predict_proba(X_test)
        y_pred_prob = proba_output[:, 1]
        y_pred = model.predict(X_test)
        
    elif model_name == "RandomForestClassifier":
        # Native binary classifier
        model.fit(X_train, y_train)
        y_pred_prob = model.predict_proba(X_test)[:, 1]
        y_pred = model.predict(X_test)
        
    else:
        # Regression methods treating 0/1 as continuous
        model.fit(X_train, y_train)
        raw_pred = model.predict(X_test)
        y_pred_prob = np.clip(raw_pred, 1e-9, 1 - 1e-9)
        y_pred = (y_pred_prob > 0.5).astype(int)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    logloss = log_loss(y_test, y_pred_prob)
    auc = roc_auc_score(y_test, y_pred_prob)
    
    return {'Accuracy': accuracy, 'LogLoss': logloss, 'AUC': auc}

In [6]:
from bart_playground.bart import LogisticBART

old_settings = np.seterr(invalid='raise')

datasets = load_datasets()
results = []

In [7]:
for name, (X, y) in datasets.items():
    # Print dataset shapes
    print(f"Dataset: {name}\nX shape: {X.shape}, y shape: {y.shape}")
    # Print 0-1 distribution of y
    print(f"y distribution: {pd.Series(y).value_counts(normalize=True).to_dict()}")

Dataset: Synthetic
X shape: (400, 8), y shape: (400,)
y distribution: {0: 0.5, 1: 0.5}
Dataset: Breast Cancer
X shape: (569, 30), y shape: (569,)
y distribution: {1: 0.6274165202108963, 0: 0.37258347978910367}
Dataset: Wine Binary
X shape: (178, 13), y shape: (178,)
y distribution: {0: 0.6685393258426966, 1: 0.33146067415730335}
Dataset: Mushroom
X shape: (8124, 22), y shape: (8124,)
y distribution: {0: 0.517971442639094, 1: 0.48202855736090594}
Dataset: Mushroom Encoded
X shape: (16248, 44), y shape: (16248,)
y distribution: {0.0: 0.5, 1.0: 0.5}


In [8]:
metrics = None
 
def record_evaluation_results(dataset_name, X, y):
    global metrics
    
    print(f"\n=== Testing on {dataset_name} ===")
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=RANDOM_STATE, stratify=y
    )
    
    # Define models
    models = {
        "RFClassifier": RandomForestClassifier(n_estimators=N_TREES, random_state=RANDOM_STATE),
        "RFRegressor": RandomForestRegressor(n_estimators=N_TREES, random_state=RANDOM_STATE),
        "Bartz": "placeholder",
        "DefaultBART": DefaultBART(n_trees=N_TREES, ndpost=NDPOST, nskip=NSKIP, random_state=RANDOM_STATE),
        "ProbitBART": ProbitBART(n_trees=N_TREES, ndpost=NDPOST, nskip=NSKIP, random_state=RANDOM_STATE),
        "LogisticBART": LogisticBART(n_trees=N_TREES, ndpost=NDPOST, nskip=NSKIP, random_state=RANDOM_STATE)
    }
    
    for model_name, model in models.items():
        print(f"  Training {model_name}...")
        
        X_tr, X_te = X_train, X_test
        
        metrics = evaluate_model(model, model_name, X_tr, X_te, y_train, y_test)
        
        result = {'Dataset': dataset_name, 'Model': model_name, **metrics}
        results.append(result)
        print(f"    Acc: {metrics['Accuracy']:.3f}, LogLoss: {metrics['LogLoss']:.3f}, AUC: {metrics['AUC']:.4f}")
        
    return results

In [9]:
if not debug:
    for dataset_name, (X, y) in list(datasets.items())[0:4]: # skip mushroom encoded for brevity
        record_evaluation_results(dataset_name, X, y)

INFO:2025-06-08 10:56:38,504:jax._src.xla_bridge:867: Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory
INFO:jax._src.xla_bridge:Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory



=== Testing on Synthetic ===
  Training RFClassifier...
    Acc: 0.900, LogLoss: 2.072, AUC: 0.9000
  Training RFRegressor...
    Acc: 0.858, LogLoss: 0.341, AUC: 0.9315
  Training Bartz...
    Acc: 0.875, LogLoss: 0.329, AUC: 0.9475
  Training DefaultBART...


Iterations: 100%|██████████| 700/700 [00:11<00:00, 59.95it/s] 


    Acc: 0.867, LogLoss: 0.322, AUC: 0.9511
  Training ProbitBART...


Iterations: 100%|██████████| 700/700 [00:08<00:00, 85.04it/s] 


    Acc: 0.817, LogLoss: 0.552, AUC: 0.8756
  Training LogisticBART...


Iterations: 100%|██████████| 700/700 [00:15<00:00, 43.90it/s]


    Acc: 0.867, LogLoss: 0.313, AUC: 0.9497

=== Testing on Breast Cancer ===
  Training RFClassifier...
    Acc: 0.924, LogLoss: 1.575, AUC: 0.9204
  Training RFRegressor...
    Acc: 0.942, LogLoss: 0.118, AUC: 0.9892
  Training Bartz...
    Acc: 0.947, LogLoss: 0.172, AUC: 0.9848
  Training DefaultBART...


Iterations: 100%|██████████| 700/700 [00:06<00:00, 103.34it/s]


    Acc: 0.942, LogLoss: 0.140, AUC: 0.9857
  Training ProbitBART...


Iterations: 100%|██████████| 700/700 [00:06<00:00, 100.37it/s]


    Acc: 0.936, LogLoss: 0.264, AUC: 0.9879
  Training LogisticBART...


Iterations: 100%|██████████| 700/700 [00:13<00:00, 51.03it/s]


    Acc: 0.971, LogLoss: 0.107, AUC: 0.9931

=== Testing on Wine Binary ===
  Training RFClassifier...
    Acc: 0.926, LogLoss: 1.535, AUC: 0.9028
  Training RFRegressor...
    Acc: 0.926, LogLoss: 0.519, AUC: 0.9498
  Training Bartz...
    Acc: 0.944, LogLoss: 0.124, AUC: 0.9877
  Training DefaultBART...


Iterations: 100%|██████████| 700/700 [00:04<00:00, 144.14it/s]


    Acc: 0.963, LogLoss: 0.132, AUC: 0.9846
  Training ProbitBART...


Iterations: 100%|██████████| 700/700 [00:06<00:00, 115.67it/s]


    Acc: 0.815, LogLoss: 0.463, AUC: 0.9861
  Training LogisticBART...


Iterations: 100%|██████████| 700/700 [00:12<00:00, 54.76it/s]


    Acc: 0.944, LogLoss: 0.135, AUC: 0.9892

=== Testing on Mushroom ===
  Training RFClassifier...
    Acc: 1.000, LogLoss: 0.000, AUC: 1.0000
  Training RFRegressor...
    Acc: 1.000, LogLoss: 0.002, AUC: 1.0000
  Training Bartz...
    Acc: 1.000, LogLoss: 0.003, AUC: 1.0000
  Training DefaultBART...


Iterations: 100%|██████████| 700/700 [01:34<00:00,  7.41it/s]


    Acc: 1.000, LogLoss: 0.015, AUC: 1.0000
  Training ProbitBART...


Iterations: 100%|██████████| 700/700 [00:46<00:00, 15.01it/s]


    Acc: 0.979, LogLoss: 0.091, AUC: 0.9984
  Training LogisticBART...


Iterations: 100%|██████████| 700/700 [00:41<00:00, 16.93it/s]


    Acc: 0.996, LogLoss: 0.026, AUC: 0.9999


In [10]:
if debug == True:
    dataset_name, (X, y) = list(datasets.items())[-1]  # Last dataset for debugging
    
    profile = False
    if not profile:
        record_evaluation_results(dataset_name, X, y)
    else:
        %prun -s cumtime -D temp_profile.prof -q record_evaluation_results(dataset_name, X, y)

        fname = "profile_logisticbart"

        !mv temp_profile.prof {fname}.prof
        !gprof2dot -f pstats {fname}.prof -o {fname}.dot
        !dot -Tpng {fname}.dot -o {fname}.png


=== Testing on Mushroom Encoded ===
  Training RFClassifier...
    Acc: 1.000, LogLoss: 0.004, AUC: 0.9998
  Training RFRegressor...
    Acc: 0.999, LogLoss: 0.008, AUC: 0.9998
  Training LogisticBART...


Iterations: 100%|██████████| 400/400 [01:02<00:00,  6.44it/s]


    Acc: 0.997, LogLoss: 0.028, AUC: 0.9999


In [10]:
# Display results
results_df = pd.DataFrame(results)
print("\n" + "="*60)
print("SUMMARY RESULTS")
print("="*60)

# Pivot tables for easy comparison
for metric in ['Accuracy', 'AUC', 'LogLoss']:
    print(f"\n{metric}:")
    pivot = results_df.pivot_table(index='Dataset', columns='Model', values=metric)
    print(pivot.round(3))


SUMMARY RESULTS

Accuracy:
Model          Bartz  DefaultBART  LogisticBART  ProbitBART  RFClassifier  \
Dataset                                                                     
Breast Cancer  0.947        0.942         0.971       0.936         0.924   
Synthetic      0.875        0.867         0.867       0.817         0.900   
Wine Binary    0.944        0.963         0.944       0.815         0.926   

Model          RFRegressor  
Dataset                     
Breast Cancer        0.942  
Synthetic            0.858  
Wine Binary          0.926  

AUC:
Model          Bartz  DefaultBART  LogisticBART  ProbitBART  RFClassifier  \
Dataset                                                                     
Breast Cancer  0.985        0.986         0.993       0.988         0.920   
Synthetic      0.948        0.951         0.950       0.876         0.900   
Wine Binary    0.988        0.985         0.989       0.986         0.903   

Model          RFRegressor  
Dataset             