# Example Covertype dataset
- In this notebook, we show how to use `ShapEngine` to compute Beta Shapley values.

In [1]:
import os, sys, argparse
from time import time
import numpy as np
np.random.seed(2022)
sys.path.append('../betashap')
from ShapEngine import ShapEngine
import utils
import data

# Load data 
- We use the **Covertype** dataset (https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html) and suppose that 10% of the data points in the training dataset are mislabeled.
- The function `data.load_data` loads the train, val, and test datasets, as well as the indices of the noisy samples (`noisy_index`).

In [2]:
# Experiment configuration: the task type, the dataset name, and the
# keyword arguments that are later unpacked into `data.load_data`.
problem = 'classification'
dataset = 'covertype'
dargs = dict(
    n_data_to_be_valued=200,
    n_val=200,
    n_test=1000,
)

In [3]:
# Load the train / validation / test splits together with `noisy_index`.
(X, y), (X_val, y_val), (X_test, y_test), noisy_index = data.load_data(
    problem, dataset, **dargs
)

# Report the feature-matrix shape of each split between two separator lines.
separator = '-' * 30
print(separator)
for split_name, features in (('Train', X), ('Val', X_val), ('Test', X_test)):
    print(f'{split_name} X: {features.shape}')
print(separator)

--------------------------------------------------
Covertype
--------------------------------------------------
number of samples: 200
------------------------------
Train X: (200, 54)
Val X: (200, 54)
Test X: (1000, 54)
------------------------------


# Compute Beta Shapley

In [4]:
# Settings passed to `ShapEngine` below.
model_family = 'logistic'
metric = 'accuracy'
# NOTE(review): presumably a Gelman-Rubin convergence threshold — confirm in ShapEngine.
GR_threshold = 1.05
# Weight pairs handed to `shap_engine.run`; presumably each (a, b) pair is
# reported as a 'Beta(a:b)' entry in the results — confirm in ShapEngine.
weights_list = [
    (1, 16),
    (1, 4),
    (1, 1),
    (4, 1),
    (16, 1),
]

In [None]:
# Evaluate values
shap_engine=ShapEngine(X=X, y=y, X_val=X_val, y_val=y_val, 
                       problem=problem, model_family=model_family, 
                       metric=metric, GR_threshold=GR_threshold)
%time shap_engine.run(weights_list=weights_list)

Source is initialized. A unit of sample is one data point
Start: Marginal Contribution Calculation!


In [None]:
# `shap_engine.results` stores one vector of data values per key;
# list the keys that are available.
value_names = list(shap_engine.results.keys())
print(f'List of data values: {value_names}')

In [None]:
# Show the Beta(4:1) data values of the first 10 data points.
beta_4_1_values = shap_engine.results['Beta(4:1)']
first_ten_values = beta_4_1_values[:10]
print(f'First 10 data values: {first_ten_values}')

In [None]:
# Element-wise ratio of the two engine matrices (presumably accumulated
# marginal contributions and their observation counts — confirm in ShapEngine).
# The 1e-16 term keeps the division defined where the denominator is zero.
mc_numerator = shap_engine.MC_obs_card_mat
mc_denominator = shap_engine.MC_count_obs_card_mat + 1e-16
MC_mat = mc_numerator / mc_denominator

print(f'Shape of MC_mat : {MC_mat.shape}')
first_sample_mc = MC_mat[0]
print(f'Marginal contributions of the first sample: {first_sample_mc}')

# Marginal contributions for clean and noisy samples
- Figure 2 in the manuscript shows a smooth curve because it is based on 50 independent runs, whereas the plot below comes from a single run.

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Rows of MC_mat are indexed by sample: every row index that is not listed in
# `noisy_index` is treated as clean. A set gives O(1) membership tests instead
# of rescanning `noisy_index` for every row.
noisy_lookup = set(noisy_index)
clean_index = [i for i in range(MC_mat.shape[0]) if i not in noisy_lookup]

# Averaging over rows (axis=0) leaves one value per column, so the x-axis must
# span the number of columns (shape[1]) rather than the number of rows
# (shape[0]); the two only coincide when MC_mat is square.
cardinality_axis = np.arange(MC_mat.shape[1])
plt.plot(cardinality_axis, np.mean(MC_mat[clean_index], axis=0), label='Clean')
plt.plot(cardinality_axis, np.mean(MC_mat[noisy_index], axis=0), label='Noisy')
plt.legend(fontsize=15)
plt.xlabel('Cardinality', fontsize=15)
# The trailing semicolon suppresses the `Text(...)` repr of the last call.
plt.ylabel('Marginal Contributions', fontsize=15);

# Performance on downstream ML tasks

In [None]:
# Summarise the downstream experiments for the fitted engine and list the
# task names under which results are stored.
result_dict = utils.summary_experiments(
    shap_engine, noisy_index, X_test, y_test
)
available_tasks = list(result_dict.keys())
print(f'Available ML tasks: {available_tasks}')

In [None]:
# Example: the result of the noisy-label detection task is stored under the
# 'noisy' key of `result_dict`.
# For each method, it shows the recall, precision, and F1-score of that method.
result_dict['noisy'] 