# Notebook for parallel computation of n-Shapley Values

In [None]:
# papermill parameter: notebook id
# Used further down as the index into the list of compute jobs (job_id = aid),
# so each notebook run handles exactly one (data set, classifier, example) job.
aid = 0

In [None]:
# second compute wave: uncomment the line below to offset the notebook id by 1000
#aid = aid + 1000

In [None]:
import numpy as np

import os

import datasets
import nshap

from itertools import product

import paperutil

%load_ext autoreload
%autoreload 2

### The different compute jobs

In [None]:
# The experiment grid: every data set is paired with every classifier and
# with each of the observation indices 0..99.
data_sets = ['folk_income', 'folk_travel', 'housing', 'credit', 'iris']
classifiers = ['rf', 'knn', 'gam', 'gbtree']
examples = list(range(100))

# Enumerate the jobs as (data set, classifier, example) tuples. The data set
# varies slowest and the example index fastest, so a job id maps to a fixed job.
all_jobs = [
    (ds_name, clf_name, obs_idx)
    for ds_name in data_sets
    for clf_name in classifiers
    for obs_idx in examples
]
print(len(all_jobs), 'different compute jobs')

In [None]:
# Sanity check: print the number of training rows of every data set in the grid.
for data_set in data_sets:
    # only the first returned value (the training features) is needed here
    X_train, *_ = datasets.load_dataset(data_set)
    n_train_rows = X_train.shape[0]
    print(data_set, n_train_rows)

### The current job

In [None]:
# Resolve the notebook id into the concrete job handled by this run.
job_id = aid
# each job is a (data set, classifier, example) tuple
dataset, classifier, example = all_jobs[job_id]
# the example index doubles as the random seed of the job
random_seed = example

print(job_id, dataset, classifier, example, random_seed)

### Create output dir structure, if it does not already exist

In [None]:
# Create <results>/<dataset>/<classifier> in one call.
# exist_ok=True makes this safe when many jobs start in parallel: a separate
# "exists?" check followed by os.mkdir can raise FileExistsError if another job
# creates the directory in between. makedirs also creates missing parent
# directories instead of failing on them.
os.makedirs( f'../../results/n_shapley_values/{dataset}/{classifier}', exist_ok=True )

### Load the dataset

In [None]:
# Load the data set selected by the current job. load_dataset returns five
# values, unpacked here as the train/test features, the train/test targets
# and the feature names.
X_train, X_test, Y_train, Y_test, feature_names = datasets.load_dataset(dataset)

In [None]:
# Ask the datasets module whether the current data set is a classification
# problem; this flag decides below which score is printed and which model
# output is explained.
is_classification = datasets.is_classification(dataset)

### Train the classifier

In [None]:
# Obtain the fitted model for this job. Only the data set and classifier names
# are passed in, not the arrays loaded above.
# NOTE(review): presumably train_classifier loads the training data itself — confirm.
clf = paperutil.train_classifier(dataset, classifier)

In [None]:
# sklearn is not imported anywhere else in this notebook, so import it here;
# without it this cell fails with a NameError.
import sklearn.metrics

# Report the test-set score of the trained model: accuracy for classification
# data sets, mean squared error otherwise.
if is_classification:
    print( sklearn.metrics.accuracy_score(Y_test, clf.predict(X_test)) )
else:
    print( sklearn.metrics.mean_squared_error(Y_test, clf.predict(X_test)) )

### n-Shapley Values

In [None]:
# Compute the n-Shapley Values of one test observation and store them as JSON.
# The computation is run for two background sample sizes (at most 500 and at
# most 5000 training points); results that already exist on disk are skipped.
i_datapoint = example
froot = f'../../results/n_shapley_values/{dataset}/{classifier}/observation_{i_datapoint}'

# the observation to explain, as a single-row 2d array (loop-invariant)
x_explain = X_test[i_datapoint, :].reshape((1,-1))

if is_classification:
    # the predicted class of the observation. The single element is extracted
    # explicitly because int() on an array with ndim > 0 is deprecated in NumPy.
    prediction = int( np.asarray( clf.predict(x_explain) ).item() )

# Choose which model output is explained, and the matching file name tag.
# Precedence: 'gam' always uses the decision function; otherwise classifiers
# use the probability of the predicted class; otherwise the plain prediction.
if classifier == 'gam':
    model_output, tag, vfunc_kwargs = clf.decision_function, 'decision', {}
elif is_classification:
    model_output, tag, vfunc_kwargs = clf.predict_proba, 'proba', {'target': prediction}
else:
    model_output, tag, vfunc_kwargs = clf.predict, 'predict', {}

for max_samples in [500, 5000]:
    # never ask for more background samples than there are training points
    num_samples = min(max_samples, X_train.shape[0])
    fname = froot + f'_{tag}_{num_samples}.JSON'
    # Skip finished work before building the value function. If the training set
    # is smaller than both caps, both iterations map to the same file and the
    # second one is skipped here as well.
    if os.path.exists(fname):
        continue
    # the value function
    vfunc = nshap.vfunc.interventional_shap(model_output, X_train, num_samples=num_samples, random_state=0, **vfunc_kwargs)
    # compute and save n-Shapley Values
    n_shapley_values = nshap.n_shapley_values(x_explain, vfunc)
    n_shapley_values.save(fname)