In [24]:
import numpy as np
import pandas as pd
from asboostreg import SparseAdditiveBoostingRegressor
from interpret import show
from interpret.glassbox import ExplainableBoostingRegressor
from pmlb import fetch_data
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

In [17]:
def interpretation(data_name: str, asbr_params: dict = None):
    if asbr_params is None:
        asbr_params = {}
    X, y = fetch_data(data_name, return_X_y=True)
    asbr = SparseAdditiveBoostingRegressor(
        random_state=0,
        n_iter_no_change=15,
        **asbr_params,
    )
    asbr.fit(X, y)
    asbr.plot_model_information()
    asbr.explain(X)
    ebm = ExplainableBoostingRegressor(interactions=0, random_state=0)
    ebm.fit(X, y)
    show(ebm.explain_global())
    return asbr, ebm

# Easy case

In [18]:
params = dict(
    l2_regularization=4.2,
    learning_rate=0.22,
    max_bins=399,
    max_leaves=58,
    min_samples_leaf=1,
    n_estimators=542,
    row_subsample=0.82,
)
asbr, ebm = interpretation("197_cpu_act");

The following features were not selected: ['feature_1', 'feature_3', 'feature_4', 'feature_6', 'feature_7', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_15', 'feature_16', 'feature_17', 'feature_19']



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



# Constant case

In [21]:
X, y = fetch_data("215_2dplanes", return_X_y=True)
pd.DataFrame(X).nunique()

0    2
1    3
2    3
3    3
4    3
5    3
6    3
7    3
8    3
9    3
dtype: int64

In [27]:
kfold = KFold(shuffle=True, random_state=0)
linear = make_pipeline(
    OneHotEncoder(
        drop="first",
    ),
    ElasticNetCV(
        l1_ratio=np.linspace(0.1, 1, 10),
        cv=kfold,
        n_jobs=5,
        random_state=0,
        selection="random",
    )
)
linear.fit(X, y)
linear_refit = make_pipeline(
    OneHotEncoder(
        drop="first",
    ),
    ElasticNet(
        l1_ratio=linear[-1].l1_ratio_,
        alpha=linear[-1].alpha_,
        random_state=0,
        selection="random",
    )
)
scores = cross_val_score(linear_refit, X, y, cv=kfold)

In [28]:
print(scores.mean(), scores.std()/np.sqrt(len(scores)))

0.7053315883120342 0.0008978287404647299


# Hard case

In [6]:
params3 = dict(
    l2_regularization=0.9,
    learning_rate=0.27,
    max_bins=786,
    max_leaves=39,
    min_samples_leaf=1,
    n_estimators=734,
    row_subsample=0.88,
)
asbr3, ebm3 = interpretation("201_pol", params3);


invalid value encountered in divide


invalid value encountered in divide



The following features were not selected: ['feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47']



invalid value encountered in add




Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

