From 3dad62b74445ba3342bc9f79bc413c4584271d83 Mon Sep 17 00:00:00 2001 From: Yann Van Crombrugge <59556820+vancromy@users.noreply.github.com> Date: Sat, 14 Oct 2023 10:45:49 +0200 Subject: [PATCH] fix: Xgboost sklearn model parameters that should be passed to xgboost.DMatrix (#3314) * Make sure to save xgboost sklearn class parameters for future use * Propagate xgboost sklearn params to the DMatrix if created * Add test for testing the propagation of dmatrix kwargs * Refactor according to PR comments * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove typehint that was breaking tests + add docstring * Fix unittest according to PR comments * Make code more xgboost specific as requested in PR review * Safeguard attribute retrieval as pointed out by @connortann --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- shap/explainers/_tree.py | 28 +++++++++++++++++++++++++++- tests/explainers/test_tree.py | 24 ++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/shap/explainers/_tree.py b/shap/explainers/_tree.py index 7fb791227..f6ecc34a7 100644 --- a/shap/explainers/_tree.py +++ b/shap/explainers/_tree.py @@ -356,7 +356,9 @@ def shap_values(self, X, y=None, tree_limit=None, approximate=False, check_addit if self.model.model_type == "xgboost": import xgboost if not isinstance(X, xgboost.core.DMatrix): - X = xgboost.DMatrix(X) + # Retrieve any DMatrix properties if they have been set on the TreeEnsemble Class + dmatrix_props = getattr(self.model, "_xgb_dmatrix_props", {}) + X = xgboost.DMatrix(X, **dmatrix_props) if tree_limit == -1: tree_limit = 0 try: @@ -992,6 +994,9 @@ def __init__(self, model, data=None, data_missing=None, model_output=None): self.model_output = "probability_doubled" # with predict_proba we need to double the outputs to match else: self.model_output = "probability" + # Some properties of the sklearn API are
passed to a DMatrix object in xgboost + # We need to make sure we do the same here - GH #3313 + self._xgb_dmatrix_props = get_xgboost_dmatrix_properties(model) elif safe_isinstance(model, "xgboost.sklearn.XGBRegressor"): self.original_model = model.get_booster() self.model_type = "xgboost" @@ -1003,6 +1008,9 @@ def __init__(self, model, data=None, data_missing=None, model_output=None): self.tree_limit = getattr(model, "best_ntree_limit", None) if xgb_loader.num_class > 0: self.num_stacked_models = xgb_loader.num_class + # Some properties of the sklearn API are passed to a DMatrix object in xgboost + # We need to make sure we do the same here - GH #3313 + self._xgb_dmatrix_props = get_xgboost_dmatrix_properties(model) elif safe_isinstance(model, "xgboost.sklearn.XGBRanker"): self.original_model = model.get_booster() self.model_type = "xgboost" @@ -1014,6 +1022,9 @@ def __init__(self, model, data=None, data_missing=None, model_output=None): self.tree_limit = getattr(model, "best_ntree_limit", None) if xgb_loader.num_class > 0: self.num_stacked_models = xgb_loader.num_class + # Some properties of the sklearn API are passed to a DMatrix object in xgboost + # We need to make sure we do the same here - GH #3313 + self._xgb_dmatrix_props = get_xgboost_dmatrix_properties(model) elif safe_isinstance(model, "lightgbm.basic.Booster"): assert_import("lightgbm") self.model_type = "lightgbm" @@ -1649,6 +1660,21 @@ def _recalculate_value(tree, i , level): self.features = np.where(self.features >= 0, tree_features[self.features], self.features) +def get_xgboost_dmatrix_properties(model): + """ + Retrieves properties from an xgboost.sklearn.XGBModel instance that should be passed to the xgboost.core.DMatrix object before calling predict on the model + """ + properties_to_pass = ["missing", "n_jobs", "enable_categorical", "feature_types"] + dmatrix_attributes = {} + for attribute in properties_to_pass: + if hasattr(model, attribute): + dmatrix_attributes[attribute] = getattr(model, 
attribute) + + # Convert sklearn n_jobs to xgboost nthread + if "n_jobs" in dmatrix_attributes: + dmatrix_attributes["nthread"] = dmatrix_attributes.pop("n_jobs") + return dmatrix_attributes + def get_xgboost_json(model): """ This gets a JSON dump of an XGBoost model while ensuring the features names are their indexes. """ diff --git a/tests/explainers/test_tree.py b/tests/explainers/test_tree.py index 5288d4be9..b76d73407 100644 --- a/tests/explainers/test_tree.py +++ b/tests/explainers/test_tree.py @@ -1215,6 +1215,30 @@ def test_xgboost_regression(self): expected_diff = np.abs(explanation.values.sum(1) + explanation.base_values - predicted).max() assert expected_diff < 1e-4, "SHAP values don't sum to model output!" + def test_xgboost_dmatrix_propagation(self): + """ + Test that xgboost sklearn attributes are properly passed to the DMatrix + initiated during shap value calculation. see GH #3313 + """ + xgboost = pytest.importorskip("xgboost") + + X, y = shap.datasets.adult(n_points=100) + + # Randomly add missing data to the input where missing data is encoded as 1e-8 + X_nan = X.copy() + X_nan.loc[ + X_nan.sample(frac=0.3, random_state=42).index, + X_nan.columns.to_series().sample(frac=0.5, random_state=42), + ] = 1e-8 + + clf = xgboost.XGBClassifier(missing=1e-8, random_state=42) + clf.fit(X_nan, y) + margin = clf.predict(X_nan, output_margin=True) + explainer = shap.TreeExplainer(clf) + shap_values = explainer.shap_values(X_nan) + # check that SHAP values sum to model output + assert np.allclose(margin, explainer.expected_value + shap_values.sum(axis=1)) + def test_xgboost_direct(self): xgboost = pytest.importorskip("xgboost")