In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import copy
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
import sys
import math
import scipy.spatial.distance
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.tree import DecisionTreeRegressor
import os
import shutil
import glob
import shap
from sklearn.model_selection import train_test_split

cmap='viridis'
sys.path.insert(1, '../')
from LibHelperFuncs import *

rstate = 0
np.random.seed(rstate)
njob = -1
global_trees = 1000

np.random.seed(rstate)

from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.neural_network import MLPRegressor

import time

In [2]:
originalDat = np.genfromtxt("../../Data/data_bas_preprocessing.csv", delimiter=',')
originalDat_pd = pd.read_csv("../../Data/data_bas_preprocessing_pd.csv")

X = originalDat[:,:-1]
Y = originalDat[:,-1]
C = originalDat_pd.columns


In [21]:
t1 = time.time()

rf = RandomForestRegressor(n_estimators=global_trees, random_state=rstate, n_jobs=-1)
rf.fit(X, Y)

t2 = time.time()

exp = shap.TreeExplainer(rf, feature_perturbation='tree_path_dependant')
s_vals = exp.shap_values(X, approximate=True)
ov_shap = np.mean(np.abs(s_vals), axis=0)

t3 = time.time()

In [24]:
print("Time to train model:",t2-t1)
print("Time to compute SHAP values:",t3-t2)

Time to train model: 8.400600910186768
Time to compute SHAP values: 1.397200584411621


In [25]:
t1 = time.time()

pca = PCA(n_components=50)
pca.fit(X)
# comps = compute_sparse_components(pca.components_, X.shape[1], 0.85)
comps = pca.components_
X_r = (comps @ X.T).T

rf = RandomForestRegressor(n_estimators=global_trees, random_state=rstate, n_jobs=-1)
rf.fit(X_r, Y)

t2 = time.time()

exp = shap.TreeExplainer(rf, feature_perturbation='tree_path_dependant')
s_vals = exp.shap_values(X_r, approximate=True)
ov_reduced_shap = mean_carried_shap(s_vals, pca.components_, X)

t3 = time.time()

In [26]:
print("Time to train model:",t2-t1)
print("Time to compute SHAP values:",t3-t2)

Time to train model: 3.276597023010254
Time to compute SHAP values: 1.319150447845459


In [30]:
t1 = time.time()

rd = KernelRidge().fit(X, Y)

t2 = time.time()

exp = shap.KernelExplainer(rd.predict, shap.sample(X, 30))
s_vals = exp.shap_values(X, nsamples=30)
ov_shap = np.mean(np.abs(s_vals), axis=0)

t3 = time.time()

HBox(children=(FloatProgress(value=0.0, max=1617.0), HTML(value='')))




In [31]:
print("Time to train model:",t2-t1)
print("Time to compute SHAP values:",t3-t2)

Time to train model: 0.1050872802734375
Time to compute SHAP values: 551.5335009098053


In [32]:
t1 = time.time()

pca = PCA(n_components=10)
pca.fit(X)
comps = pca.components_
X_r = (comps @ X.T).T

rx = KernelRidge().fit(X_r, Y)

t2 = time.time()

exp = shap.KernelExplainer(rx.predict, shap.sample(X_r, 30))
print("Explainer done")
s_vals = exp.shap_values(X_r, nsamples=30)
ov_reduced_shap = mean_carried_shap(s_vals, pca.components_, X)

t3 = time.time()

Explainer done


HBox(children=(FloatProgress(value=0.0, max=1617.0), HTML(value='')))




In [33]:
print("Time to train model:",t2-t1)
print("Time to compute SHAP values:",t3-t2)

Time to train model: 0.09407877922058105
Time to compute SHAP values: 32.37304377555847


In [40]:
rrows = np.random.randint(0, X.shape[0], size=100)

Xsub = X[rrows,:]

In [81]:
t1 = time.time()

mlp = MLPRegressor(hidden_layer_sizes=(128, 64, 32, 8 ), random_state=rstate, max_iter=2000, learning_rate='adaptive', shuffle=True).fit(X, Y)

t2 = time.time()

# exp = shap.Explainer(mlp.predict, Xsub)
# s_vals = exp(Xsub)
# ov_shap = np.mean(np.abs(s_vals.values), axis=0)

# t3 = time.time()

In [82]:
print("Time to train model:",t2-t1)
print("Time to compute SHAP values:",t3-t2)

Time to train model: 0.8252248764038086
Time to compute SHAP values: -55081.16129231453


In [77]:
t1 = time.time()

pca = PCA(n_components=50)
pca.fit(X)
comps = pca.components_
X_r = (comps @ X.T).T
Xr_sub = X_r[rrows,:]

mlp = MLPRegressor(hidden_layer_sizes=(128, 64, 32, 8), random_state=rstate, max_iter=2000, learning_rate='adaptive', shuffle=True).fit(X_r, Y)

t2 = time.time()

# expr = shap.Explainer(mlp.predict, Xr_sub)
# s_valsr = expr(Xr_sub)

# t3 = time.time()

In [78]:
print("Time to train model:",t2-t1)
print("Time to compute SHAP values:",t3-t2)

Time to train model: 1.6799993515014648
Time to compute SHAP values: -55064.79333472252


In [91]:
t1 = time.time()

rd = LinearRegression()
rd.fit(X, Y)

t2 = time.time()

exp = shap.LinearExplainer(rd, X)
s_vals = exp.shap_values(X)
ov_shap = np.mean(np.abs(s_vals), axis=0)

t3 = time.time()

In [92]:
print("Time to train model:",t2-t1)
print("Time to compute SHAP values:",t3-t2)

Time to train model: 0.024019718170166016
Time to compute SHAP values: 0.009007453918457031


In [93]:
t1 = time.time()

pca = PCA(n_components=50)
pca.fit(X)
comps = pca.components_
X_r = (comps @ X.T).T

rd = LinearRegression()
rd.fit(X_r, Y)

t2 = time.time()

exp = shap.LinearExplainer(rd, X_r)
s_vals = exp.shap_values(X_r)
ov_reduced_shap = mean_carried_shap(s_vals, pca.components_, X)

t3 = time.time()

In [94]:
print("Time to train model:",t2-t1)
print("Time to compute SHAP values:",t3-t2)

Time to train model: 0.020017147064208984
Time to compute SHAP values: 0.0220181941986084


In [87]:
t1 = time.time()

rd = DecisionTreeRegressor()
rd.fit(X, Y)

t2 = time.time()

exp = shap.TreeExplainer(rd, X)
s_vals = exp.shap_values(X)
ov_shap = np.mean(np.abs(s_vals), axis=0)

t3 = time.time()

In [88]:
print("Time to train model:",t2-t1)
print("Time to compute SHAP values:",t3-t2)

Time to train model: 0.14662623405456543
Time to compute SHAP values: 0.996739387512207


In [89]:
t1 = time.time()

pca = PCA(n_components=50)
pca.fit(X)
comps = pca.components_
X_r = (comps @ X.T).T

rd = DecisionTreeRegressor()
rd.fit(X_r, Y)

t2 = time.time()

exp = shap.TreeExplainer(rd, X_r)
s_vals = exp.shap_values(X_r)
ov_reduced_shap = mean_carried_shap(s_vals, pca.components_, X)

t3 = time.time()

In [90]:
print("Time to train model:",t2-t1)
print("Time to compute SHAP values:",t3-t2)

Time to train model: 0.0710597038269043
Time to compute SHAP values: 0.4658963680267334
