Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support to specifying position using position_column parameter #6825

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
ruff format new test
  • Loading branch information
NProkoptsev committed Feb 27, 2025
commit 9c38551f18d6fb8a29f4848e8f07a4a79680064e
51 changes: 39 additions & 12 deletions tests/python_package_test/test_engine.py
Original file line number Diff line number Diff line change
@@ -847,8 +847,11 @@ def test_ranking_with_position_information_with_dataset_constructor(tmp_path):
positions_from_get = lgb_train.get_position()
np.testing.assert_array_equal(positions_from_get, positions)


# Appends queries and positions to the dataset file
def append_queries_and_positions_to_file(file_dataset_in, file_query_in, positions, out_path):
def append_queries_and_positions_to_file(
file_dataset_in, file_query_in, positions, out_path
):
queries = []
query_id = 0
with open(file_query_in, "r") as f:
@@ -868,7 +871,8 @@ def append_queries_and_positions_to_file(file_dataset_in, file_query_in, positio


@pytest.mark.skipif(
getenv("TASK", "") == "cuda", reason="Positions in learning to rank is not supported in CUDA version yet"
getenv("TASK", "") == "cuda",
reason="Positions in learning to rank is not supported in CUDA version yet",
)
def test_ranking_with_position_and_group_information_in_single_file(tmp_path):
rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"
@@ -882,8 +886,18 @@ def test_ranking_with_position_and_group_information_in_single_file(tmp_path):
)

    # append queries and positions to the dataset file. They will have 301 and 302 feature indexes
append_queries_and_positions_to_file(str(tmp_path / "rank.intermediate"), str(rank_example_dir / "rank.train.query"), positions, str(tmp_path / "rank.train"))
append_queries_and_positions_to_file(str(rank_example_dir / "rank.test"), str(rank_example_dir / "rank.test.query"), None, str(tmp_path / "rank.test"))
append_queries_and_positions_to_file(
str(tmp_path / "rank.intermediate"),
str(rank_example_dir / "rank.train.query"),
positions,
str(tmp_path / "rank.train"),
)
append_queries_and_positions_to_file(
str(rank_example_dir / "rank.test"),
str(rank_example_dir / "rank.test.query"),
None,
str(tmp_path / "rank.test"),
)

# Training with single file
params = {
@@ -897,12 +911,14 @@ def test_ranking_with_position_and_group_information_in_single_file(tmp_path):
"min_sum_hessian_in_leaf": 5.0,
"group_column": 301,
"position_column": 302,
"label_column": 0
"label_column": 0,
}

lgb_train = lgb.Dataset(str(tmp_path / "rank.train"), params=params)
lgb_valid = [lgb_train.create_valid(str(tmp_path / "rank.test"), params=params)]
gbm_unbiased_with_single_file = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=50)
gbm_unbiased_with_single_file = lgb.train(
params, lgb_train, valid_sets=lgb_valid, num_boost_round=50
)

# Training with query files and list of positions
params = {
@@ -915,15 +931,26 @@ def test_ranking_with_position_and_group_information_in_single_file(tmp_path):
"min_data_in_leaf": 50,
"min_sum_hessian_in_leaf": 5.0,
# ignore position and group column
"ignore_column": "301,302"
"ignore_column": "301,302",
}
copyfile(str(rank_example_dir / "rank.train.query"), str(tmp_path / "rank.train.query"))
copyfile(str(rank_example_dir / "rank.test.query"), str(tmp_path / "rank.test.query"))
lgb_train = lgb.Dataset(str(tmp_path / "rank.train"), params=params, position=positions)
copyfile(
str(rank_example_dir / "rank.train.query"), str(tmp_path / "rank.train.query")
)
copyfile(
str(rank_example_dir / "rank.test.query"), str(tmp_path / "rank.test.query")
)
lgb_train = lgb.Dataset(
str(tmp_path / "rank.train"), params=params, position=positions
)
lgb_valid = [lgb_train.create_valid(str(tmp_path / "rank.test"))]
gbm_unbiased_with_multiple_files = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=50)
gbm_unbiased_with_multiple_files = lgb.train(
params, lgb_train, valid_sets=lgb_valid, num_boost_round=50
)
# the performance of the unbiased LambdaMART when using query files and list of positions should match the performance of the unbiased LambdaMART when using single file with group and position columns
assert gbm_unbiased_with_multiple_files.best_score["valid_0"]["ndcg@3"] == gbm_unbiased_with_single_file.best_score["valid_0"]["ndcg@3"]
assert (
gbm_unbiased_with_multiple_files.best_score["valid_0"]["ndcg@3"]
== gbm_unbiased_with_single_file.best_score["valid_0"]["ndcg@3"]
)


def test_early_stopping():
Loading
Oops, something went wrong.