In [None]:
import lightgbm as lgbm
import matplotlib.pyplot as plt
import seaborn as sns
from gamexplainer.datasets import dataset_from_fun
from sklearn.model_selection import train_test_split
from gamexplainer import GamExplainer
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from synthetic_fun import fun_without_interaction, base_fun
from sklearn.metrics import mean_squared_error, r2_score
from collections import defaultdict
%load_ext autoreload
%autoreload 2

In [None]:
# Matplotlib configuration: typeset every text element with LaTeX and load the
# `bm` package, which the bold-math subplot titles (\bm{x}_i) used later need.
# NOTE(review): usetex presumably needs a LaTeX toolchain on this machine — confirm.
plt.rcParams.update({
    'text.usetex': True,
    'text.latex.preamble': r'\usepackage{bm}',
})

## Create dataset

In [None]:
# Seeded NumPy random generator; passed to dataset_from_fun as `rnd_gen` in the next cell.
noise_gen = np.random.default_rng(seed=42)

In [None]:
# Synthetic data set generated from `fun_without_interaction`; the target is column "y".
synth_df = dataset_from_fun(
    n_sample=10000,
    n_features=5,
    fun=fun_without_interaction,
    random_state=42,
    rnd_gen=noise_gen,
)
features, target = synth_df.drop("y", axis=1), synth_df["y"]

# Order-preserving (unshuffled) split: the last 20% of the rows become the test
# set, then 25% of the remaining 80% become the validation set (60/20/20 overall).
X_rest, X_test, y_rest, y_test = train_test_split(features, target, test_size=0.2, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.25, shuffle=False)

# LightGBM Dataset wrappers for the three splits.
train_data = lgbm.Dataset(X_train, label=y_train)
val_data = lgbm.Dataset(X_val, label=y_val, reference=train_data)
test_data = lgbm.Dataset(X_test, label=y_test)

## Grid search to find the best regressor

In [None]:
# Hyper-parameter grid. The integer sequences are rounded before the cast:
# np.geomspace(..., dtype=int) truncates, so floating-point error could turn an
# intended value such as 64 into 63.
parameters = {
    "n_estimators": np.rint(np.geomspace(10, 1000, num=3)).astype(int),
    "num_leaves": np.rint(np.geomspace(32, 256, num=4)).astype(int),
    "learning_rate": np.geomspace(1e-4, 1e-1, num=4),
}
CV_regressor = GridSearchCV(lgbm.LGBMRegressor(), parameters, verbose=3, scoring="neg_root_mean_squared_error")
# Early stopping is requested through a callback: the `early_stopping_rounds`
# keyword of `fit` is deprecated in LightGBM 3.3 and removed in 4.0, while the
# callback form works on both old and new releases.
CV_regressor.fit(
    X_train,
    y_train,
    eval_set=[(X_val.values, y_val.values)],
    callbacks=[lgbm.early_stopping(stopping_rounds=50)],
)
print(CV_regressor.best_params_)

The best hyper-parameters found by the grid search are:
- n_estimators = 1000
- num_leaves = 32
- learning_rate = 0.1

In [None]:
# Train it without fine-tuning to save time (hyper-parameters taken from the grid search above).
forest = lgbm.LGBMRegressor(n_estimators=1000, num_leaves=32, learning_rate=0.1)
# Early stopping goes through a callback: the `early_stopping_rounds` keyword of
# `fit` is deprecated in LightGBM 3.3 and removed in 4.0.
forest.fit(
    X_train,
    y_train,
    eval_set=[(X_val.values, y_val.values)],
    callbacks=[lgbm.early_stopping(stopping_rounds=50)],
)

## RMSE and $R^2$ of the forest predictions on the test set

In [None]:
# RMSE of the forest on the test set, computed as sqrt(MSE): the `squared=False`
# shortcut of mean_squared_error is deprecated since scikit-learn 1.4 and
# removed in 1.6, whereas this form works on every version.
np.sqrt(mean_squared_error(y_test, forest.predict(X_test)))

In [None]:
# R^2 of the forest predictions against the true test targets.
r2_score(y_test, forest.predict(X_test))

## Set `save_pdf` below to choose whether the figure is saved to disk

In [None]:
import os
import pathlib

# Switch for writing the sampling-comparison figure to disk; `file_out` stays
# None when saving is disabled.
save_pdf = True
file_out = "plots/sampling_comparison.pdf" if save_pdf else None
if file_out is not None:
    # Make sure the output directory exists before the figure is written.
    out_dir = pathlib.Path(os.path.dirname(file_out))
    out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Fit one GAM explanation of the forest per threshold-sampling strategy and
# keep the fitted explainer, keyed by strategy name, for the plots below.
sampling_methods = ["all", "quantile", "equal", "kmeans", "equi_size"]
results = {}
# Iterate over the list itself rather than enumerate(...): the index was unused
# and tqdm cannot infer a total (hence no progress bar length) from enumerate.
for sampling_method in tqdm(sampling_methods):
    explainer = GamExplainer(sample_method=sampling_method,
                             n_spline_terms=5,
                             n_inter_terms=0,
                             sample_n=75)
    explainer.explain(forest, lam_search_space=[0.1, 0.5, 1])
    results[sampling_method] = explainer

## Sampling comparison

In [None]:
import matplotlib.patches as mpatches

In [None]:
# Font/figure settings are applied BEFORE the figure is created: rcParams are
# read when artists are created, so updating them at the end of the cell only
# styled the figure on a second run of the cell.
params = {'legend.fontsize': 25,
          'figure.figsize': (20, 5),
          'axes.titlesize': 30,
          'xtick.labelsize': 18,
          'ytick.labelsize': 20}
plt.rcParams.update(params)

# One title per sampling method, in the same order as `sampling_methods`.
# All entries are raw strings ("\e" is an invalid escape in a normal string).
labels = [r"\emph{All-Thresholds}", r"\emph{$K$-Quantile}", r"\emph{Equi-Width}", r"\emph{$K$-Means}",
          r"\emph{Equi-Size}"]

feat = "x_2"

fig, axs = plt.subplots(1, 5, sharey="all", tight_layout=True, figsize=(20, 5))


def fun(x):
    """Real function behind the data for `feat`: a logistic curve in 50 * (x - 0.5)."""
    return np.exp((x - 0.5) * 50) / (np.exp((x - 0.5) * 50) + 1)


# The reference curve is identical in every panel, so evaluate it once.
xs = np.linspace(0, 1, 1000)
ys = fun(xs)

all_thresholds = results["all"].get_feature_thresholds()
# Sorted so that the density curve is drawn left to right whatever order the
# explainer returns the thresholds in (the KDE fit itself is order-independent).
feat_thresholds = np.sort(np.asarray(all_thresholds[feat]))
kde = KernelDensity(kernel='gaussian', bandwidth=0.2)
kde.fit(feat_thresholds.reshape(-1, 1))
# score_samples returns the log-density at each threshold.
y_kde_plot = kde.score_samples(feat_thresholds.reshape(-1, 1))

kde_offset = 1  # vertical shift applied to the KDE curve
y_lim = [-0.2, 1.7]

_, rug_plot_col = sns.color_palette(n_colors=2)
points_col = rug_plot_col
kde_fill_col, dist_fun_color, kde_contour_col = sns.color_palette("Blues", n_colors=3)

for ax, sampling_method, label in zip(axs, sampling_methods, labels):
    ax.set_ylim(y_lim)

    # Background: shifted KDE of all the thresholds of `feat`.
    ax.plot(feat_thresholds, y_kde_plot + kde_offset, color=kde_contour_col)
    ax.fill_between(feat_thresholds, y_kde_plot + kde_offset, y2=0, color=kde_fill_col)

    # Thresholds kept by this sampling method: rug on the x axis plus their
    # position on the real function.
    sampling_results = np.array(results[sampling_method].sampled[feat])
    sns.rugplot(x=sampling_results, ax=ax, color=rug_plot_col, height=0.07, linewidth=1)
    ax.scatter(sampling_results, fun(sampling_results), color=points_col, marker="o", s=10)

    # Real function
    ax.plot(xs, ys, color=dist_fun_color, marker=None)

    ax.set_title(label)

blue_patch = mpatches.Patch(color=dist_fun_color, label='Original thresholds distribution')
orange_patch = mpatches.Patch(color=points_col, label='Sampled thresholds distribution')
plt.figlegend(handles=[blue_patch, orange_patch], bbox_to_anchor=(0.8, 0.05), ncol=2)

# Save before showing: some backends release the figure once it has been shown.
if save_pdf:
    fig.savefig(file_out, bbox_inches="tight")
plt.show()

## Sampling strategies comparison

In [None]:
# Pickle file used to cache the per-sampling-method losses (`acc_methods`)
# that the cells below load, compute and save.
PRECOMPUTED_PATH = "precomputed_results/sampling_comparison_synthetic.pickle"

### Import the results if available

In [None]:
import os
import pickle

# Load the cached results only when the file is actually there; previously a
# missing file raised FileNotFoundError and stopped a "Run All".
# SECURITY: pickle.load can execute arbitrary code — only load a file that you
# generated yourself with this notebook.
if os.path.exists(PRECOMPUTED_PATH):
    with open(PRECOMPUTED_PATH, 'rb') as f:
        acc_methods = pickle.load(f)
else:
    print(f"No precomputed results at {PRECOMPUTED_PATH!r}: run the computation cell below.")

In [None]:
# Sampling strategies compared in the sweep (same names as passed to `sample_method`).
sampling_methods = ["all", "quantile", "equal", "kmeans", "equi_size"]
# Values of `sample_n` (K on the plot's x axis) to sweep: 100, 200, ..., 2400.
range_m = range(100, 2500, 100)

### Compute the results if not available

In [None]:
import os

# Settings shared by every run of the sweep; `sample_n` and `sample_method`
# are supplied per run instead of mutating this dict inside the loop.
# NOTE(review): `n_inter_terms` is not set here, unlike the other explainer
# cells — confirm that the GamExplainer default is the intended one.
explanation_params = {"verbose": False,
                      "feat_importance_method": "gain",
                      "n_spline_terms": 5,
                      }

# Set to True to ignore an existing results file and recompute everything.
force_recompute = False

if force_recompute or not os.path.exists(PRECOMPUTED_PATH):
    # For each K and each sampling strategy, explain the forest and record the
    # explainer's `loss_res`.
    acc_methods = defaultdict(list)
    for m in tqdm(range_m):
        for sampling_method in sampling_methods:
            explainer = GamExplainer(**explanation_params, sample_n=m, sample_method=sampling_method)
            explainer.explain(forest, lam_search_space=[0.01, 1])
            acc_methods[sampling_method].append(explainer.loss_res)
else:
    print(f"Found precomputed results at {PRECOMPUTED_PATH!r}: skipping the computation "
          "(set force_recompute = True to redo it).")

#### Save the results to avoid repeating computations

In [None]:
import os
import pickle

# open(..., 'wb') does not create missing parent directories, so make sure the
# target folder exists before writing the cache file.
out_dir = os.path.dirname(PRECOMPUTED_PATH)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

with open(PRECOMPUTED_PATH, 'wb') as f:
    pickle.dump(acc_methods, f)

### Plot the results

In [None]:
import os

# Style settings first: rcParams (including the figure size) are read when the
# figure and its artists are created, so updating them after plotting only
# took effect on a second run of the cell.
params = {'legend.fontsize': 32,
          'figure.figsize': (15, 10),
          'axes.labelsize': 32,
          'xtick.labelsize': 32,
          'ytick.labelsize': 20}
plt.rcParams.update(params)

# Legend label and marker per sampling method, in the order of `sampling_methods`.
labels = [r"\emph{All-Thresholds}", r"\emph{$K$-Quantile}", r"\emph{Equi-Width}", r"\emph{$K$-Means}",
          r"\emph{Equi-Size}"]
markers = ["o", "s", "*", "x", "D"]
colors = sns.color_palette(n_colors=len(sampling_methods))

# One curve per sampling method: recorded loss as a function of K.
fig, ax = plt.subplots()
for sampling_method, label, marker, colour in zip(sampling_methods, labels, markers, colors):
    ax.plot(range_m, acc_methods[sampling_method], f'{marker}--', mfc='none', mec=colour, ms=11,
            label=label)
ax.set_xlabel("$K$")
ax.set_ylabel("RMSE")
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=3, fancybox=False, shadow=False)
fig.tight_layout()

file_out = "plots/rmse_sampling_comparison.pdf"
# Honour the notebook-wide `save_pdf` switch like the other figure cells, and
# create the output folder so saving cannot fail on a missing directory.
if save_pdf:
    os.makedirs(os.path.dirname(file_out), exist_ok=True)
    fig.savefig(file_out)

## Spline reconstructions

In [None]:
# Explainer used for the spline-reconstruction figure and the final metrics:
# 5 spline terms, no interaction terms, 1200 thresholds sampled with "equi_size".
explanation_params = dict(
    n_spline_terms=5,
    sample_method="equi_size",
    sample_n=1200,
    verbose=True,
    n_inter_terms=0,
)

explainer = GamExplainer(**explanation_params)
explainer.explain(forest, lam_search_space=[0.01, 0.05, 0.1, 1])

In [None]:
import os
import pathlib

# Switch for writing the spline-reconstruction figure to disk; `file_out`
# stays None when saving is disabled.
save_pdf = True
file_out = "plots/generators.pdf" if save_pdf else None
if file_out is not None:
    # Make sure the output directory exists before the figure is written.
    out_dir = pathlib.Path(os.path.dirname(file_out))
    out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Style settings first: rcParams are read when the figure and its artists are
# created, so updating them at the end of the cell only took effect on a re-run.
params = {'xtick.labelsize': 28,
          'ytick.labelsize': 28,
          'legend.fontsize': 30,
          'axes.titley': 1,
          'axes.titlepad': 26,
          'axes.labelsize': 32}
plt.rcParams.update(params)

n_row, n_col = 1, 5
# Panel titles (indexed by feature): the generating function of each feature.
texts = [r"$\bm{x}_1$",
         r"$\sin\left(20\bm{x}_2\right)$",
         r"$\frac{\exp\left(50(\bm{x}_3 -0.5)\right)}{\exp\left(50(\bm{x}_3 -0.5)\right) + 1}$",
         r"$\frac{\arctan\left(10\bm{x}_4\right)- \sin\left(10\bm{x}_4\right)}{2}$",
         r"$\frac{2}{\bm{x}_5 +1}$"]

font_sizes = [30, 30, 40, 40, 40]
# Plain-text titles when LaTeX rendering is switched off.
if not plt.rcParams['text.usetex']:
    texts = ["First spline", "Second spline", "Third spline", "Fourth spline", "Fifth spline"]

fig = plt.figure(figsize=(30, 6))
axes = fig.subplots(n_row, n_col, sharey=True)

# (index in gam.terms, term) for every univariate spline term, ordered by
# feature. The loop below iterates this list; it used to be built but ignored
# in favour of gam.terms[i], which relied on the raw term order and did not
# skip intercept/tensor terms.
terms = [(i, x) for i, x in enumerate(explainer.gam.terms) if not x.isintercept and not x.istensor]
terms.sort(key=lambda x: x[1].feature)

for ax, (term_idx, term) in zip(axes, terms):
    # Learned spline: partial dependence of the term, centred on its mean.
    grid = explainer.gam.generate_X_grid(term=term_idx)
    pdep, _ = explainer.gam.partial_dependence(term=term_idx, X=grid, width=0.95)
    feature_values = grid[:, term.feature]
    centered_pdep = pdep - np.average(pdep)
    ax.plot(feature_values, centered_pdep, label="Spline learned", lw=3)

    # Generating function of the same feature, centred the same way.
    real_fun_aux = base_fun(term.feature)(feature_values)
    real_fun_centered = real_fun_aux - np.average(real_fun_aux)
    ax.plot(feature_values, real_fun_centered, label="Generating function", ls='--', lw=3)

    ax.set_title(texts[term.feature], loc='center', fontsize=font_sizes[term.feature])

# Single legend anchored relative to the first panel.
axes[0].legend(bbox_to_anchor=(3.9, -0.1), ncol=2)

plt.subplots_adjust(hspace=0.3)
if save_pdf:
    fig.savefig(file_out, bbox_inches="tight")

## RMSE and $R^2$ of the GAM on the test set, measured against the true targets and against the forest's predictions

In [None]:
# RMSE of the GAM against the true test targets, computed as sqrt(MSE): the
# `squared=False` shortcut of mean_squared_error is deprecated since
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test, explainer.gam.predict(X_test)))

In [None]:
# RMSE of the GAM against the forest's own test predictions (how closely the
# GAM reproduces the forest), computed as sqrt(MSE): the `squared=False`
# shortcut is deprecated since scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(forest.predict(X_test), explainer.gam.predict(X_test)))

In [None]:
# R^2 of the GAM predictions against the true test targets.
r2_score(y_test, explainer.gam.predict(X_test))

In [None]:
# R^2 of the GAM predictions with the forest's test predictions taken as reference.
r2_score(forest.predict(X_test), explainer.gam.predict(X_test))