In [None]:
import os
import pickle

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy.random import default_rng
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from gamexplainer import GamExplainer
from gamexplainer.datasets import dataset_from_fun
from synthetic_fun import fun_interaction

In [None]:
# Render figure text through LaTeX so that labels written as \emph{...} are typeset.
plt.rcParams.update({'text.usetex': True})

# First run synth_inter_benchmark.py to compute the results in precomputed_results/inter_strategies_bench.pickle

In [None]:
# Read back the benchmark results that synth_inter_benchmark.py stored on disk.
# NOTE: unpickling can execute arbitrary code -- only load files you generated yourself.
results_path = "precomputed_results/inter_strategies_bench.pickle"
with open(results_path, "rb") as results_file:
    dict_res = pickle.load(results_file)

In [None]:
# Tabulate the "map" entry of the benchmark results as a DataFrame.
map_scores = dict_res["map"]
df_res = pd.DataFrame.from_dict(map_scores)

In [None]:
# Work on an independent copy so that df_res keeps the raw benchmark table untouched.
sorted_df = df_res.copy(deep=True)

In [None]:
# LaTeX legend label for each raw strategy column.
col_label = {
    "h_stat": r"\emph{H-Stat}",
    "count_path": r"\emph{Count-Path}",
    "pair_gain": r"\emph{Pair-Gain}",
    "gain_path": r"\emph{Gain-Path}",
}

# For every strategy, add a labelled column holding the same values in descending order.
# Each strategy is sorted on its own, so rows no longer line up across the labelled columns.
for raw_name, pretty_name in col_label.items():
    sorted_df[pretty_name] = sorted(sorted_df[raw_name].to_list(), reverse=True)

# The plot only needs the labelled columns: drop the raw strategy columns from a new frame.
to_be_displayed = sorted_df.drop(columns=list(col_label))

In [None]:
# Apply the font-size settings before the figure and axes are created.
params = {'legend.fontsize': 24,
          # Default size for figures created later; the figure below passes its own figsize.
          'figure.figsize': (10, 5),
          'axes.labelsize': 30,
          'axes.titlesize': 30,
          'xtick.labelsize': 30,
          'ytick.labelsize': 30}
plt.rcParams.update(params)

# One line per strategy: the MAP values of each labelled column, already sorted in descending order.
fig, ax = plt.subplots(figsize=(15, 10))
sns.lineplot(data=to_be_displayed, lw=4, ax=ax)
ax.set(xlabel='Interaction set sorted by MAP', ylabel='MAP')

# savefig does not create missing directories, so make sure the output folder exists first.
output_path = "plots/interactions_detection.pdf"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
fig.savefig(output_path, bbox_inches="tight")

In [None]:
# Print the summary statistics of every column of sorted_df as a LaTeX table.
summary_stats = sorted_df.describe()
print(summary_stats.to_latex())

In [None]:
from scipy import stats

# Welch's t-test (variances not assumed equal) between the Count-Path and Gain-Path MAP values.
count_path_map = sorted_df["count_path"].to_numpy()
gain_path_map = sorted_df["gain_path"].to_numpy()
stats.ttest_ind(count_path_map, gain_path_map, equal_var=False)

In [None]:
import itertools

# Welch's t-test for every pair of strategies; report the pairs whose means differ at the 5% level.
# Iterate over df_res rather than sorted_df: sorted_df also carries a sorted, LaTeX-labelled copy of
# each strategy column, so its columns would pit every strategy against its own duplicate and
# repeat each genuine comparison several times.
for col1, col2 in itertools.combinations(df_res.columns, 2):
    p_value = stats.ttest_ind(df_res[col1].to_numpy(), df_res[col2].to_numpy(), equal_var=False).pvalue
    if p_value <= 0.05:
        print(f"Different means between {col1} and {col2}")

## Difference in prediction

In [None]:
# Build the synthetic dataset from fun_interaction (100000 samples, 5 features).
# The seed 42 is used both for random_state and for the generator handed over as rnd_gen.
noise_gen = default_rng(seed=42)
synth_df = dataset_from_fun(
    n_sample=100000,
    n_features=5,
    fun=fun_interaction,
    random_state=42,
    rnd_gen=noise_gen,
    real_interactions=((0, 1), (0, 4), (1, 4)),
)

# Order-preserving (unshuffled) 60/20/20 split: 20% held out for test,
# then 25% of the remaining 80% becomes the validation part.
features = synth_df.drop("y", axis=1)
target = synth_df["y"]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, shuffle=False
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, shuffle=False
)

# Gradient-boosted forest fitted on the training part only.
forest = lgbm.LGBMRegressor(
    n_estimators=1000,
    num_leaves=32,
    learning_rate=0.1,
    n_jobs=10,
)
forest.fit(X_train, y_train)

In [None]:
# Mean squared error of the forest on the held-out test set.
forest_test_pred = forest.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=forest_test_pred)

In [None]:
# R^2 of the forest on the held-out test set.
forest_test_pred = forest.predict(X_test)
r2_score(y_true=y_test, y_pred=forest_test_pred)

In [None]:
# Settings forwarded verbatim to GamExplainer.
explanation_params = dict(
    n_spline_terms=5,
    sample_method="equi_size",
    sample_n=1200,
    inter_max_distance=32,
    verbose=True,
    n_inter_terms=3,
)

# Explain the fitted forest; the listed values are passed as the lam search space.
explainer = GamExplainer(**explanation_params)
lam_candidates = [0.01, 0.05, 0.1, 1]
explainer.explain(forest, lam_search_space=lam_candidates)

In [None]:
# Mean squared error of the explainer's GAM against the true test targets.
gam_test_pred = explainer.gam.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=gam_test_pred)

In [None]:
# Mean squared error between the GAM's and the forest's predictions on the test set
# (the forest's output is the reference here).
forest_test_pred = forest.predict(X_test)
gam_test_pred = explainer.gam.predict(X_test)
mean_squared_error(y_true=forest_test_pred, y_pred=gam_test_pred)

In [None]:
# R^2 of the explainer's GAM against the true test targets.
gam_test_pred = explainer.gam.predict(X_test)
r2_score(y_true=y_test, y_pred=gam_test_pred)

In [None]:
# R^2 of the GAM's predictions with the forest's predictions taken as the reference values.
forest_test_pred = forest.predict(X_test)
gam_test_pred = explainer.gam.predict(X_test)
r2_score(y_true=forest_test_pred, y_pred=gam_test_pred)