# Ablation Study: Size of the MC Variance Depending on the Dataset Size

For this study, we apply the following formula directly to the groundtruth function, i.e., we omit the model and thus eliminate other error sources such as model bias and model variance. Transferred to the model feature effects (PDP and ALE), this corresponds to assuming a perfect model fit. The formula is shown for the PDP; the ALE case is analogous.

\begin{equation}
    \widehat{\text{Variance}}_{MC}(x_S; PDP_f, \widehat{PDP}_f) = \frac{1}{K}\sum_{k=1}^K(PDP_f(x_S) - \widehat{PDP}_f^{(k)}(x_S))^2
\end{equation}

Note: We perform this ablation study only for the SimpleCombined and Friedman1 groundtruth functions, since these are the only ones for which we can derive the feature effects analytically.

In [59]:
import yaml
import os
from joblib import dump
from pathlib import Path
from collections import defaultdict
import numpy as np

from current_research_feature_effects.mappings import map_dataset_to_groundtruth
from current_research_feature_effects.data_generating.data_generation import generate_data, Groundtruth
from current_research_feature_effects.feature_effects import compute_theoretical_effects, compute_ales, compute_pdps

In [None]:
# Root directory for everything this ablation study writes to disk.
EXPERIMENT_PATH = Path("../experiments/mc_ablation_study")
# Create the directory (including missing parents); do nothing if it already exists.
EXPERIMENT_PATH.mkdir(parents=True, exist_ok=True)

In [42]:
def _groundtruth_from_config(name, config):
    """Build the groundtruth object for one named entry of the datasets config."""
    return map_dataset_to_groundtruth(
        config["groundtruth"],
        # One (marginal type, parameter tuple) pair per configured feature
        [(spec["marginal"]["type"], tuple(spec["marginal"]["params"])) for spec in config["features"].values()],
        np.array(config["correlation_matrix"]),
        feature_names=list(config["features"].keys()),
        name=name,
    )


# Read the dataset definitions shared by the experiments.
with open("../configs/datasets.yaml", "r") as file:
    datasets_config = yaml.safe_load(file)

# One groundtruth per configured dataset, in the order given by the YAML file.
groundtruths = [_groundtruth_from_config(name, config) for name, config in datasets_config.items()]

In [43]:
# Dataset sizes: 50 log-spaced values from 10**1 to 10**6 (floats; cast to int where used)
N_SAMPLES = np.logspace(start=1, stop=6, num=50)
# Signal-to-noise ratios handed to the data generator
SNRS = np.array([0, 5])
# Number of Monte Carlo repetitions per (snr, n_samples) combination
K = 50
# Number of quantile grid points at which the feature effects are evaluated
N_GRID_POINTS = 100
# Repetition i uses the seed BASE_SEED + i
BASE_SEED = 99_999

In [None]:
def perform_mc_ablation_study(groundtruth: Groundtruth) -> dict:
    """Estimate the Monte Carlo error of PDP and ALE curves computed on the groundtruth itself.

    For every signal-to-noise ratio in ``SNRS`` and every sample size in
    ``N_SAMPLES``, ``K`` datasets are generated with the seeds
    ``BASE_SEED + 0, ..., BASE_SEED + K - 1``. On each dataset the PDP and ALE
    of ``groundtruth`` are computed on a fixed quantile grid and compared with
    the theoretical curves; the squared differences are averaged over the
    ``K`` repetitions.

    Args:
        groundtruth: Groundtruth function. It is passed to ``generate_data`` as
            the data-generating process and to ``compute_pdps`` /
            ``compute_ales`` in place of a fitted model.

    Returns:
        Nested plain ``dict`` indexed as
        ``results[effect][snr][n_samples][feature]`` with ``effect`` in
        ``{"pdp", "ale"}``. ``snr`` and ``n_samples`` are the entries of
        ``SNRS`` and ``N_SAMPLES`` as iterated (``n_samples`` is not truncated
        to an integer in the key). Each leaf is the mean squared difference
        between estimated and theoretical effect for that feature.

        Plain dicts are returned instead of ``defaultdict`` objects with a
        lambda factory because a local lambda cannot be pickled, which would
        make saving the result with ``joblib.dump`` fail.
    """
    feature_names = groundtruth.feature_names
    # Evaluation grid: theoretical quantiles of each feature at evenly spaced probability levels
    quantiles = np.linspace(0.0001, 0.9999, N_GRID_POINTS, endpoint=True)
    grid_values = [groundtruth.get_theoretical_quantiles(feature, quantiles) for feature in feature_names]

    # Reference curves the Monte Carlo estimates are compared against
    ale_groundtruth_theoretical = compute_theoretical_effects(
        groundtruth, "ale", feature_names, grid_values=grid_values, center_curves=True, remove_first_last=False
    )
    pdp_groundtruth_theoretical = compute_theoretical_effects(
        groundtruth, "pdp", feature_names, grid_values=grid_values, center_curves=True, remove_first_last=False
    )

    results = {"pdp": {}, "ale": {}}

    # NOTE(review): only the first return value of generate_data is used below; if the SNR
    # only affects the remaining return values, both SNR settings give identical results — confirm.
    for snr in SNRS:
        for n_samples in N_SAMPLES:
            # Running sums of squared errors per feature; start at 0.0 and become
            # whatever type the squared differences have on the first addition.
            pdp_variances = defaultdict(float)
            ale_variances = defaultdict(float)
            for run in range(K):
                # N_SAMPLES holds floats, so the sample size is truncated to an int here
                X_mc, *_ = generate_data(
                    groundtruth=groundtruth, n_train=int(n_samples), n_test=1, snr=snr, seed=BASE_SEED + run
                )

                pdp = compute_pdps(
                    groundtruth,
                    X_mc,
                    feature_names,
                    grid_values=grid_values,
                    center_curves=True,
                    remove_first_last=False,
                )

                ale = compute_ales(
                    groundtruth,
                    X_mc,
                    feature_names,
                    grid_values=grid_values,
                    center_curves=True,
                    remove_first_last=False,
                )

                # Effects are indexed by feature position (assumes the order of feature_names — TODO confirm)
                for idx, feature in enumerate(feature_names):
                    pdp_variances[feature] += (pdp[idx] - pdp_groundtruth_theoretical[idx]) ** 2
                    ale_variances[feature] += (ale[idx] - ale_groundtruth_theoretical[idx]) ** 2

            # Average over the K repetitions; setdefault creates the per-SNR level on first use
            results["pdp"].setdefault(snr, {})[n_samples] = {k: v / K for k, v in pdp_variances.items()}
            results["ale"].setdefault(snr, {})[n_samples] = {k: v / K for k, v in ale_variances.items()}

    return results

## Simple Combined

In [44]:
# Run the ablation study for the first groundtruth built from the datasets config.
# NOTE(review): assumes the first entry of ../configs/datasets.yaml is the Simple Combined dataset — confirm.
simplecomb_groundtruth = groundtruths[0]
results_simplecomb = perform_mc_ablation_study(simplecomb_groundtruth)

In [None]:
# Persist the Simple Combined results in a sub-directory named str(simplecomb_groundtruth).
simplecomb_dir = EXPERIMENT_PATH / str(simplecomb_groundtruth)
simplecomb_dir.mkdir(parents=True, exist_ok=True)
dump(results_simplecomb, simplecomb_dir / "ablation_results.joblib")

## Friedman1

In [None]:
# Run the ablation study for the second groundtruth built from the datasets config.
# NOTE(review): assumes the second entry of ../configs/datasets.yaml is the Friedman1 dataset — confirm.
friedman1_groundtruth = groundtruths[1]
results_friedman1 = perform_mc_ablation_study(friedman1_groundtruth)

In [None]:
# Persist the Friedman1 results in a sub-directory named str(friedman1_groundtruth).
friedman1_dir = EXPERIMENT_PATH / str(friedman1_groundtruth)
friedman1_dir.mkdir(parents=True, exist_ok=True)
dump(results_friedman1, friedman1_dir / "ablation_results.joblib")

## Analysis

In [3]:
...

Ellipsis