WIP: [python-package] preserve params when copying Booster (fixes #5539) #6101

Draft · wants to merge 4 commits into master

Changes from 1 commit:
merge master
jameslamb committed Aug 18, 2024
commit 5ec27acc8d0063100d68cf0481380b9705f51d71
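
For context, here is a minimal sketch of the behavior this PR's tests pin down (illustrative data and params, not taken from the diff; assumes only that lightgbm and numpy are installed): deepcopying a trained Booster should carry the training params over to the copy.

```python
# Minimal sketch of the behavior under test (illustrative, not part of this diff).
from copy import deepcopy

import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(42)
X = rng.random((100, 2))
y = rng.random(100)

params = {"boosting": "gbdt", "num_leaves": 5, "verbosity": -1}
bst = lgb.train(params=params, num_boost_round=2, train_set=lgb.Dataset(X, label=y))

# Before the fix targeted by #5539, the copy could lose these params.
bst2 = deepcopy(bst)
assert bst2.params == bst.params
```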
11 changes: 3 additions & 8 deletions tests/c_api_test/test_.py
@@ -200,14 +200,9 @@ def test_booster(tmp_path):
     free_dataset(test)
     booster2 = ctypes.c_void_p()
     num_total_model = ctypes.c_int(0)
-    LIB.LGBM_BoosterCreateFromModelfile(
-        c_str('model.txt'),
-        ctypes.byref(num_total_model),
-        ctypes.byref(booster2))
-    LIB.LGBM_BoosterResetParameter(
-        booster2,
-        c_str("app=binary metric=auc num_leaves=29 verbose=0"))
-    data = np.loadtxt(str(binary_example_dir / 'binary.test'), dtype=np.float64)
+    LIB.LGBM_BoosterCreateFromModelfile(c_str(str(model_path)), ctypes.byref(num_total_model), ctypes.byref(booster2))
+    LIB.LGBM_BoosterResetParameter(booster2, c_str("app=binary metric=auc num_leaves=29 verbose=0"))
+    data = np.loadtxt(str(binary_example_dir / "binary.test"), dtype=np.float64)
     mat = data[:, 1:]
     preb = np.empty(mat.shape[0], dtype=np.float64)
     num_preb = ctypes.c_int64(0)
107 changes: 65 additions & 42 deletions tests/python_package_test/test_basic.py
@@ -15,7 +15,7 @@
 import lightgbm as lgb
 from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame, pd_Series
 
-from .utils import BOOSTING_TYPES, dummy_obj, load_breast_cancer, mse_obj
+from .utils import BOOSTING_TYPES, dummy_obj, load_breast_cancer, mse_obj, np_assert_array_equal
 
 
 def test_basic(tmp_path):
@@ -895,50 +895,78 @@ def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Datas
     assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"]
 
 
-@pytest.mark.parametrize('boosting_type', BOOSTING_TYPES)
-def test_booster_deepcopy_preserves_parameters(boosting_type):
-    orig_params = {
-        'boosting': boosting_type,
-        'feature_fraction': 0.708,
-        'num_leaves': 5,
-        'verbosity': -1
-    }
-    bst = lgb.train(
-        params=orig_params,
-        num_boost_round=2,
-        train_set=lgb.Dataset(np.random.rand(100, 2))
-    )
+# NOTE: this intentionally contains values where num_leaves is <, ==, and > 2^max_depth
+@pytest.mark.parametrize(("max_depth", "num_leaves"), [(-1, 3), (-1, 50), (5, 3), (5, 31), (5, 32), (8, 3), (8, 31)])
+def test_max_depth_warning_is_not_raised_if_num_leaves_is_also_provided(capsys, num_leaves, max_depth):
+    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
+    lgb.Booster(
+        params={
+            "objective": "binary",
+            "max_depth": max_depth,
+            "num_leaves": num_leaves,
+            "num_iterations": 1,
+            "verbose": 0,
+        },
+        train_set=lgb.Dataset(X, label=y),
+    )
+    assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
+
+
+# NOTE: max_depth < 5 is significant here because the default is num_leaves=31. With max_depth=5,
+# a full depth-wise tree would have 2^5 = 32 leaves.
+@pytest.mark.parametrize("max_depth", [1, 2, 3, 4])
+def test_max_depth_warning_is_not_raised_if_max_depth_gt_1_and_lt_5_and_num_leaves_omitted(capsys, max_depth):
+    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
+    lgb.Booster(
+        params={
+            "objective": "binary",
+            "max_depth": max_depth,
+            "num_iterations": 1,
+            "verbose": 0,
+        },
+        train_set=lgb.Dataset(X, label=y),
+    )
+    assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
+
+
+@pytest.mark.parametrize("max_depth", [5, 6, 7, 8, 9])
+def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(capsys, max_depth):
+    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
+    lgb.Booster(
+        params={
+            "objective": "binary",
+            "max_depth": max_depth,
+            "num_iterations": 1,
+            "verbose": 0,
+        },
+        train_set=lgb.Dataset(X, label=y),
+    )
+    expected_warning = (
+        f"[LightGBM] [Warning] Provided parameters constrain tree depth (max_depth={max_depth}) without explicitly "
+        f"setting 'num_leaves'. This can lead to underfitting. To resolve this warning, pass 'num_leaves' (<={2**max_depth}) "
+        "in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity."
+    )
+    assert expected_warning in capsys.readouterr().out
+
+
@pytest.mark.parametrize("boosting_type", BOOSTING_TYPES)
def test_booster_deepcopy_preserves_parameters(boosting_type, default_rng):
orig_params = {"boosting": boosting_type, "feature_fraction": 0.708, "num_leaves": 5, "verbosity": -1}
bst = lgb.train(params=orig_params, num_boost_round=2, train_set=lgb.Dataset(default_rng.random(100, 2)))
bst2 = deepcopy(bst)
assert bst2.params == bst.params
assert bst.params["num_leaves"] == 5
assert bst.params["verbosity"] == -1

# passed-in params shouldn't have been modified outside of lightgbm
assert orig_params == {
'boosting': boosting_type,
'feature_fraction': 0.708,
'num_leaves': 5,
'verbosity': -1
}
assert orig_params == {"boosting": boosting_type, "feature_fraction": 0.708, "num_leaves": 5, "verbosity": -1}


-@pytest.mark.parametrize('boosting_type', BOOSTING_TYPES)
-def test_booster_params_kwarg_overrides_params_from_model_string(boosting_type):
-    orig_params = {
-        'boosting': boosting_type,
-        'feature_fraction': 0.708,
-        'num_leaves': 5,
-        'verbosity': -1
-    }
-    bst = lgb.train(
-        params=orig_params,
-        num_boost_round=2,
-        train_set=lgb.Dataset(np.random.rand(100, 2))
-    )
-    bst2 = lgb.Booster(
-        params={'num_leaves': 7},
-        model_str=bst.model_to_string()
-    )
+@pytest.mark.parametrize("boosting_type", BOOSTING_TYPES)
+def test_booster_params_kwarg_overrides_params_from_model_string(boosting_type, default_rng):
+    orig_params = {"boosting": boosting_type, "feature_fraction": 0.708, "num_leaves": 5, "verbosity": -1}
+    bst = lgb.train(params=orig_params, num_boost_round=2, train_set=lgb.Dataset(default_rng.random((100, 2))))
+    bst2 = lgb.Booster(params={"num_leaves": 7}, model_str=bst.model_to_string())
 
     # params should have been updated on the Python object and the C++ side
     assert bst2.params["num_leaves"] == 7
@@ -949,9 +977,4 @@ def test_booster_params_kwarg_overrides_params_from_model_string(boosting_type):
         raise RuntimeError
 
     # passed-in params shouldn't have been modified outside of lightgbm
-    assert orig_params == {
-        'boosting': boosting_type,
-        'feature_fraction': 0.708,
-        'num_leaves': 5,
-        'verbosity': -1
-    }
+    assert orig_params == {"boosting": boosting_type, "feature_fraction": 0.708, "num_leaves": 5, "verbosity": -1}
2 changes: 1 addition & 1 deletion tests/python_package_test/utils.py
@@ -11,7 +11,7 @@

 import lightgbm as lgb
 
-BOOSTING_TYPES = ['gbdt', 'dart', 'goss', 'rf']
+BOOSTING_TYPES = ["gbdt", "dart", "goss", "rf"]
 SERIALIZERS = ["pickle", "joblib", "cloudpickle"]
 
