# Setup

In [9]:
# Standard library imports
import warnings
from time import perf_counter_ns

# Third party imports
import pandas as pd
import xgboost as xgb
from interpret.glassbox import ExplainableBoostingRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Local imports
from asboostreg import SparseAdditiveBoostingRegressor

In [10]:
pd.options.plotting.backend = "plotly"
warnings.filterwarnings("ignore")

# Loading the California housing dataset

In [11]:
X, y = fetch_california_housing(return_X_y=True, as_frame=True)
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [12]:
kf = KFold(n_splits=10, random_state=0)
cv = list(kf.split(X))

# Defining the models

In [13]:
dummy = DummyRegressor()

# Interpretable but not strong
ridgereg = make_pipeline(
    StandardScaler(),
    RidgeCV(cv=cv),
)  # Non Sparse
treereg = DecisionTreeRegressor(max_depth=3)  # Sparse

# Strong but not interpretable
rfreg = RandomForestRegressor()  # Non Sparse
xgbreg = xgb.XGBRegressor()  # Sparse

# Both interpretable and strong
ebmreg = ExplainableBoostingRegressor(interactions=0)  # Non Sparse
sparsereg = SparseAdditiveBoostingRegressor(
    learning_rate=0.01,
    n_estimators=18_000,
    l2_regularization=5.0,
    max_depth=4,
    row_subsample=0.85,
    random_state=0,
    n_iter_no_change=20,
)  # Sparse

# Comparing performance

In [14]:
# Running fast Hyperparameter optimization for Ridge
ridgereg.fit(X, y)
alpha = ridgereg.named_steps["ridgecv"].alpha_
ridgereg = make_pipeline(StandardScaler(), Ridge(alpha=alpha))

In [15]:
def evaluate(model, X_train, X_test, y_train, y_test, **kwargs):
    start = perf_counter_ns()
    model.fit(X_train, y_train, **kwargs)
    end = perf_counter_ns()
    elapsed = (end - start) / 1e9
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print(
        f"{model.__class__.__name__}: {train_score:.3f} (train),"
        f" {test_score:.3f} (test), {elapsed:.3f} (s)"
    )
    return train_score, test_score

In [16]:
train_scores = {
    "Decision Tree": [],
    "Elastic Net": [],
    "XGBoost": [],
    "Random Forest": [],
    "EBM": [],
    "SparseReg": [],
    "Dummy": [],
}
test_scores = {
    "Dummy": [],
    "Decision Tree": [],
    "Elastic Net": [],
    "XGBoost": [],
    "Random Forest": [],
    "EBM": [],
    "SparseReg": [],
}

for i, (train_index, test_index) in enumerate(cv, 1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print(f"Fold {i}")
    print("------")

    train, test = evaluate(dummy, X_train, X_test, y_train, y_test)
    train_scores["Dummy"].append(train)
    test_scores["Dummy"].append(test)

    train, test = evaluate(treereg, X_train, X_test, y_train, y_test)
    train_scores["Decision Tree"].append(train)
    test_scores["Decision Tree"].append(test)

    train, test = evaluate(ridgereg, X_train, X_test, y_train, y_test)
    train_scores["Elastic Net"].append(train)
    test_scores["Elastic Net"].append(test)

    train, test = evaluate(
        xgbreg,
        X_train,
        X_test,
        y_train,
        y_test,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=30,
        verbose=False,
    )
    train_scores["XGBoost"].append(train)
    test_scores["XGBoost"].append(test)

    train, test = evaluate(rfreg, X_train, X_test, y_train, y_test)
    train_scores["Random Forest"].append(train)
    test_scores["Random Forest"].append(test)

    train, test = evaluate(ebmreg, X_train, X_test, y_train, y_test)
    train_scores["EBM"].append(train)
    test_scores["EBM"].append(test)

    train, test = evaluate(
        sparsereg, X_train, X_test, y_train, y_test, validation_set=(X_test, y_test)
    )
    train_scores["SparseReg"].append(train)
    test_scores["SparseReg"].append(test)

    print()

Fold 1
------
DummyRegressor: 0.000 (train), -0.000 (test), 0.001 (s)
DecisionTreeRegressor: 0.535 (train), 0.520 (test), 0.065 (s)
Pipeline: 0.606 (train), 0.610 (test), 0.007 (s)
XGBRegressor: 0.938 (train), 0.849 (test), 1.082 (s)
RandomForestRegressor: 0.974 (train), 0.815 (test), 30.524 (s)
ExplainableBoostingRegressor: 0.783 (train), 0.774 (test), 3.738 (s)
SparseAdditiveBoostingRegressor: 0.751 (train), 0.757 (test), 50.063 (s)

Fold 2
------
DummyRegressor: 0.000 (train), -0.000 (test), 0.000 (s)
DecisionTreeRegressor: 0.540 (train), 0.473 (test), 0.063 (s)
Pipeline: 0.609 (train), 0.576 (test), 0.008 (s)
XGBRegressor: 0.938 (train), 0.828 (test), 1.042 (s)
RandomForestRegressor: 0.974 (train), 0.794 (test), 29.877 (s)
ExplainableBoostingRegressor: 0.785 (train), 0.740 (test), 3.590 (s)
SparseAdditiveBoostingRegressor: 0.503 (train), 0.466 (test), 48.862 (s)

Fold 3
------
DummyRegressor: 0.000 (train), -0.001 (test), 0.000 (s)
DecisionTreeRegressor: 0.533 (train), 0.538 (test)

In [19]:
test_scores_df = pd.DataFrame(test_scores)
test_scores_df

Unnamed: 0,Dummy,Decision Tree,Elastic Net,XGBoost,Random Forest,EBM,SparseReg
0,-9.238224e-05,0.519848,0.60966,0.849227,0.815203,0.773837,0.756509
1,-0.0004577164,0.472878,0.576329,0.828193,0.793642,0.739553,0.465886
2,-0.001009159,0.537544,0.590484,0.833853,0.803716,0.753132,0.737851
3,-4.325266e-05,0.527948,0.615331,0.843024,0.814622,0.76648,0.54944
4,-5.910429e-05,0.520796,0.613184,0.847933,0.822098,0.776031,0.501127
5,-0.0004370797,0.519115,0.590524,0.832766,0.812028,0.76201,0.745576
6,-0.0002159296,0.551868,0.608638,0.839166,0.814184,0.758246,0.740616
7,-3.202562e-06,0.530788,0.591641,0.841493,0.823664,0.777471,0.551265
8,-7.267582e-07,0.500882,0.597019,0.812652,0.791354,0.734388,0.54152
9,-2.696465e-05,0.565437,0.639456,0.851851,0.833884,0.792962,0.568409


In [20]:
print(test_scores_df.round(3).to_latex())

\begin{tabular}{lrrrrrrr}
\toprule
 & Dummy & Decision Tree & Elastic Net & XGBoost & Random Forest & EBM & SparseReg \\
\midrule
0 & -0.000000 & 0.520000 & 0.610000 & 0.849000 & 0.815000 & 0.774000 & 0.757000 \\
1 & -0.000000 & 0.473000 & 0.576000 & 0.828000 & 0.794000 & 0.740000 & 0.466000 \\
2 & -0.001000 & 0.538000 & 0.590000 & 0.834000 & 0.804000 & 0.753000 & 0.738000 \\
3 & -0.000000 & 0.528000 & 0.615000 & 0.843000 & 0.815000 & 0.766000 & 0.549000 \\
4 & -0.000000 & 0.521000 & 0.613000 & 0.848000 & 0.822000 & 0.776000 & 0.501000 \\
5 & -0.000000 & 0.519000 & 0.591000 & 0.833000 & 0.812000 & 0.762000 & 0.746000 \\
6 & -0.000000 & 0.552000 & 0.609000 & 0.839000 & 0.814000 & 0.758000 & 0.741000 \\
7 & -0.000000 & 0.531000 & 0.592000 & 0.841000 & 0.824000 & 0.777000 & 0.551000 \\
8 & -0.000000 & 0.501000 & 0.597000 & 0.813000 & 0.791000 & 0.734000 & 0.542000 \\
9 & -0.000000 & 0.565000 & 0.639000 & 0.852000 & 0.834000 & 0.793000 & 0.568000 \\
\bottomrule
\end{tabular}

