In [None]:
# Courtesy of https://randomrealizations.com/posts/shap-from-scratch/

import math
from itertools import chain, combinations
from typing import Any, Callable, Iterable, Optional

import numpy as np

class ShapFromScratchExplainer():
    r"""Brute-force SHAP explainer that enumerates every feature subset.

    For each instance and feature, the SHAP value is computed with the
    classic Shapley formula (equation 4 in the SHAP paper):

        \phi_j = \sum_{S \subseteq F \setminus \{j\}}
                 \frac{|S|! (|F| - |S| - 1)!}{|F|!}
                 [\hat{f}_{S \cup \{j\}}(x) - \hat{f}_S(x)]

    where the subset model \hat{f}_S is approximated by averaging the model
    output over a background dataset (see ``_subset_model_approximation``).

    The enumeration visits 2^(n_features - 1) subsets per SHAP value, so this
    class is only practical for a small number of features.
    """

    def __init__(self,
                 model: Callable[[np.ndarray], np.ndarray],
                 background_dataset: np.ndarray,
                 max_samples: Optional[int] = None,
                 random_state: Optional[int] = None):
        """
        Parameters
        ----------
        model
            Callable that takes a 2D array (rows = instances) and returns
            one prediction per row, e.g. ``estimator.predict``.
        background_dataset
            2D array-like used to marginalise out the features that are
            not part of a subset.
        max_samples
            If truthy, a random subset of at most this many background rows
            is used (sampled without replacement). ``None`` (or 0) keeps the
            full background dataset.
        random_state
            Seed forwarded to ``np.random.default_rng`` so the background
            subsample is reproducible. ``None`` draws fresh entropy.

        Raises
        ------
        ValueError
            If ``background_dataset`` is not 2-dimensional.
        """
        self.model = model
        # Accept DataFrames / nested lists; ndarrays pass through unchanged.
        background_dataset = np.asarray(background_dataset)
        if background_dataset.ndim != 2:
            raise ValueError(
                "background_dataset must be 2D, got "
                f"{background_dataset.ndim} dimension(s)"
            )
        if max_samples:  # Randomly sample a subset of the background dataset
            max_samples = min(max_samples, background_dataset.shape[0])
            rng = np.random.default_rng(random_state)
            self.background_dataset = rng.choice(background_dataset,
                                                 size=max_samples,
                                                 replace=False, axis=0)
        else:  # Use the full background dataset
            self.background_dataset = background_dataset

    def shap_values(self, X: np.ndarray) -> np.ndarray:
        """Compute SHAP values for instances in DataFrame or 2D array.

        Parameters
        ----------
        X
            2D array-like, one instance per row, with the same number of
            columns as the background dataset.

        Returns
        -------
        np.ndarray
            Float array of the same shape as ``X``; entry ``[i, j]`` is the
            SHAP value of feature ``j`` for instance ``i``.

        Raises
        ------
        ValueError
            If ``X`` is not 2D or its column count differs from the
            background dataset.
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(f"X must be 2D, got {X.ndim} dimension(s)")
        n_instances, n_features = X.shape
        if n_features != self.background_dataset.shape[1]:
            raise ValueError(
                f"X has {n_features} features but the background dataset has "
                f"{self.background_dataset.shape[1]}"
            )
        shap_values = np.empty(X.shape)
        for i in range(n_instances):
            for j in range(n_features):
                shap_values[i, j] = self._compute_single_shap_value(j, X[i])
        return shap_values

    def _compute_single_shap_value(self,
                                   feature: int,
                                   instance: np.ndarray) -> float:
        """
        Compute a single SHAP value given feature of interest and instance
        (equation 4 in SHAP paper)
        """
        n_features = len(instance)
        shap_value = 0.0
        for subset in self._get_all_other_feature_subsets(n_features, feature):
            n_subset = len(subset)

            # Subset model prediction with the feature of interest added ...
            prediction_with_feature = self._subset_model_approximation(
                subset + (feature,), instance
            )
            # ... and with only the subset's features fixed.
            prediction_without_feature = self._subset_model_approximation(
                subset, instance
            )

            # Weight of this subset (equation 4, left factor) times the
            # marginal contribution of the feature (equation 4, right factor).
            factor = self._permutation_factor(n_features, n_subset)
            shap_value += factor * (prediction_with_feature
                                    - prediction_without_feature)
        return shap_value

    def _get_all_subsets(self, items: list) -> Iterable:
        """Generate all possible subsets (as tuples), from empty to full."""
        return chain.from_iterable(combinations(items, r) for r in range(len(items)+1))

    def _get_all_other_feature_subsets(self,
                                       n_features: int,
                                       feature_of_interest: int) -> Iterable:
        """Generate all subsets of features excluding the feature of interest"""
        all_other_features = [j for j in range(n_features) if j != feature_of_interest]
        return self._get_all_subsets(all_other_features)

    def _permutation_factor(self, n_features: int, n_subset: int) -> float:
        """
        Shapley weight |S|! (|F| - |S| - 1)! / |F|! of a subset of size
        ``n_subset`` out of ``n_features`` features (equation 4 in SHAP
        paper, left side).

        ``math.factorial`` raises ValueError when ``n_subset >= n_features``.
        """
        return (
            math.factorial(n_subset)
            * math.factorial(n_features - n_subset - 1)
            / math.factorial(n_features)
        )

    def _subset_model_approximation(self,
                                    feature_subset: tuple[int, ...],
                                    instance: np.ndarray) -> float:
        r"""
        Approximate subset model prediction (Equation 11 in SHAP paper)
        \hat{f}_S(x) = E_{x_{\bar{S}}}[f(x_S, x_{\bar{S}})]
        for feature subset S on single instance x.

        Every background row has its columns in ``feature_subset`` overwritten
        with the instance's values; the remaining columns keep their
        background values. The model output is then averaged over the rows.
        """
        # Copy so the stored background dataset is never mutated.
        masked_background_dataset = self.background_dataset.copy()
        for j in range(masked_background_dataset.shape[1]):
            if j in feature_subset:
                masked_background_dataset[:, j] = instance[j]
        conditional_expectation_of_model = np.mean(
            self.model(masked_background_dataset)
        )
        return conditional_expectation_of_model

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Load the diabetes dataset as NumPy arrays (features X, target y)
X, y = load_diabetes(as_frame=False, return_X_y=True)

# One seed shared by the split and the stochastic estimators, so that
# Restart Kernel -> Run All reproduces the same fitted models.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=RANDOM_STATE)

# Train some models (the random forest and gradient boosting fits are
# randomised, so they are seeded explicitly)
lin_model = LinearRegression().fit(X_train, y_train)
rfr_model = RandomForestRegressor(random_state=RANDOM_STATE).fit(X_train, y_train)
gbt_model = GradientBoostingRegressor(random_state=RANDOM_STATE).fit(X_train, y_train)

In [None]:
# TODO:
# (1) Compute SHAP values using ShapFromScratchExplainer and compute them using the SHAP library (KernelSHAP)
# (2) Compare the two SHAP values (should be similar)