Skip to content

Commit

Permalink
fix: Xgboost sklearn model parameters that should be passed to xgboos…
Browse files Browse the repository at this point in the history
…t.DMatrix (shap#3314)

* Make sure to save xgboost sklearn class parameters for future use

* Propagate xgboost sklearn params to the DMatrix if created

* Add test for testing the propagation of dmatrix kwargs

* Refactor according to PR comments

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Remove typehint that was breaking tests + add docstring

* Fix unittest according to PR comments

* Make code more xgboost specific as requested in PR review

* Safeguard attribute retrieval as requested by @connortann

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
vancromy and pre-commit-ci[bot] committed Oct 14, 2023
1 parent 3ac66fb commit 3dad62b
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 1 deletion.
28 changes: 27 additions & 1 deletion shap/explainers/_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,9 @@ def shap_values(self, X, y=None, tree_limit=None, approximate=False, check_addit
if self.model.model_type == "xgboost":
import xgboost
if not isinstance(X, xgboost.core.DMatrix):
X = xgboost.DMatrix(X)
# Retrieve any DMatrix properties if they have been set on the TreeEnsemble Class
dmatrix_props = getattr(self.model, "_xgb_dmatrix_props", {})
X = xgboost.DMatrix(X, **dmatrix_props)
if tree_limit == -1:
tree_limit = 0
try:
Expand Down Expand Up @@ -992,6 +994,9 @@ def __init__(self, model, data=None, data_missing=None, model_output=None):
self.model_output = "probability_doubled" # with predict_proba we need to double the outputs to match
else:
self.model_output = "probability"
# Some properties of the sklearn API are passed to a DMatrix object in xgboost
# We need to make sure we do the same here - GH #3313
self._xgb_dmatrix_props = get_xgboost_dmatrix_properties(model)
elif safe_isinstance(model, "xgboost.sklearn.XGBRegressor"):
self.original_model = model.get_booster()
self.model_type = "xgboost"
Expand All @@ -1003,6 +1008,9 @@ def __init__(self, model, data=None, data_missing=None, model_output=None):
self.tree_limit = getattr(model, "best_ntree_limit", None)
if xgb_loader.num_class > 0:
self.num_stacked_models = xgb_loader.num_class
# Some properties of the sklearn API are passed to a DMatrix object in xgboost
# We need to make sure we do the same here - GH #3313
self._xgb_dmatrix_props = get_xgboost_dmatrix_properties(model)
elif safe_isinstance(model, "xgboost.sklearn.XGBRanker"):
self.original_model = model.get_booster()
self.model_type = "xgboost"
Expand All @@ -1014,6 +1022,9 @@ def __init__(self, model, data=None, data_missing=None, model_output=None):
self.tree_limit = getattr(model, "best_ntree_limit", None)
if xgb_loader.num_class > 0:
self.num_stacked_models = xgb_loader.num_class
# Some properties of the sklearn API are passed to a DMatrix object in xgboost
# We need to make sure we do the same here - GH #3313
self._xgb_dmatrix_props = get_xgboost_dmatrix_properties(model)
elif safe_isinstance(model, "lightgbm.basic.Booster"):
assert_import("lightgbm")
self.model_type = "lightgbm"
Expand Down Expand Up @@ -1649,6 +1660,21 @@ def _recalculate_value(tree, i , level):
self.features = np.where(self.features >= 0, tree_features[self.features], self.features)


def get_xgboost_dmatrix_properties(model):
    """Collect DMatrix-relevant settings from an xgboost sklearn-API model.

    Some constructor arguments of ``xgboost.sklearn.XGBModel`` (e.g. the
    sentinel used for missing values) are normally applied when xgboost
    builds its own ``DMatrix``. When shap builds the ``DMatrix`` itself,
    these settings must be forwarded explicitly before calling predict.

    Parameters
    ----------
    model : xgboost.sklearn.XGBModel
        A fitted sklearn-API xgboost model (attributes are read defensively,
        so any object is accepted).

    Returns
    -------
    dict
        Keyword arguments suitable for passing to ``xgboost.DMatrix``.
    """
    wanted = ("missing", "n_jobs", "enable_categorical", "feature_types")
    kwargs = {name: getattr(model, name) for name in wanted if hasattr(model, name)}

    # The sklearn API names the thread count ``n_jobs``; DMatrix calls it ``nthread``.
    if "n_jobs" in kwargs:
        kwargs["nthread"] = kwargs.pop("n_jobs")
    return kwargs

def get_xgboost_json(model):
""" This gets a JSON dump of an XGBoost model while ensuring the features names are their indexes.
"""
Expand Down
24 changes: 24 additions & 0 deletions tests/explainers/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -1215,6 +1215,30 @@ def test_xgboost_regression(self):
expected_diff = np.abs(explanation.values.sum(1) + explanation.base_values - predicted).max()
assert expected_diff < 1e-4, "SHAP values don't sum to model output!"

def test_xgboost_dmatrix_propagation(self):
    """
    Test that xgboost sklearn attributes are properly passed to the DMatrix
    initiated during shap value calculation. see GH #3313
    """
    xgboost = pytest.importorskip("xgboost")

    X, y = shap.datasets.adult(n_points=100)

    # Randomly add missing data to the input where missing data is encoded as 1e-8
    X_nan = X.copy()
    X_nan.loc[
        X_nan.sample(frac=0.3, random_state=42).index,
        X_nan.columns.to_series().sample(frac=0.5, random_state=42),
    ] = 1e-8

    # missing=1e-8 matches the sentinel injected above; the explainer must
    # forward it to the DMatrix it builds internally (GH #3313), otherwise
    # those cells are treated as real feature values.
    clf = xgboost.XGBClassifier(missing=1e-8, random_state=42)
    clf.fit(X_nan, y)
    margin = clf.predict(X_nan, output_margin=True)
    explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X_nan)
    # check that SHAP values sum to model output
    assert np.allclose(margin, explainer.expected_value + shap_values.sum(axis=1))

def test_xgboost_direct(self):
xgboost = pytest.importorskip("xgboost")

Expand Down

0 comments on commit 3dad62b

Please sign in to comment.