Add support to specifying position using position_column parameter #6825

Open
wants to merge 9 commits into master
Changes from 1 commit
add tests
NProkoptsev committed Feb 27, 2025
commit 0b66061135d3c7564ea8b1763ef87f3cf2356444
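Based on the test added in this commit, here is a minimal sketch of the single-file usage that position_column appears to enable (column indexes, file names and variable names are placeholders mirroring the test below, not a definitive API description):

    params = {
        "objective": "lambdarank",
        "label_column": 0,       # relevance label in the first column of each row
        "group_column": 301,     # query id stored as sparse feature 301
        "position_column": 302,  # presentation position stored as sparse feature 302
    }
    train_set = lgb.Dataset("rank.train", params=params)  # positions read from the file itself
    booster = lgb.train(params, train_set)

The alternative path (separate query file plus an explicit position list) is exercised in the second half of the test.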
78 changes: 78 additions & 0 deletions tests/python_package_test/test_engine.py
@@ -847,6 +847,84 @@ def test_ranking_with_position_information_with_dataset_constructor(tmp_path):
    positions_from_get = lgb_train.get_position()
    np.testing.assert_array_equal(positions_from_get, positions)

# Appends queries and positions to the dataset file
def append_queries_and_positions_to_file(file_dataset_in, file_query_in, positions, out_path):
    queries = []
    query_id = 0
    with open(file_query_in, "r") as f:
        for line in f:
            query_count = int(line.strip())
            queries.extend([query_id] * query_count)
            query_id += 1
    with open(file_dataset_in, "r") as f_in:
        with open(out_path, "w") as f_out:
            if positions is not None:
                for line, query, position in zip(f_in, queries, positions):
                    f_out.write(f"{line.strip()} 301:{query} 302:{position}\n")
            else:
                for line, query in zip(f_in, queries):
                    # adding dummy position
                    f_out.write(f"{line.strip()} 301:{query} 302:1\n")
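
# Illustrative example (assumed input, for clarity): given a LibSVM-style row such as
# "2 1:0.5 3:0.1" with query id 4 and position 7, the helper above would write
# "2 1:0.5 3:0.1 301:4 302:7", i.e. the query id and position are appended as sparse
# features with indexes 301 and 302.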


@pytest.mark.skipif(
    getenv("TASK", "") == "cuda", reason="Positions in learning to rank is not supported in CUDA version yet"
)
def test_ranking_with_position_and_group_information_in_single_file(tmp_path):
    rank_example_dir = Path(__file__).absolute().parents[2] / "examples" / "lambdarank"

    # simulate position bias for the train dataset and put the train dataset with biased labels into the temp directory
    positions = simulate_position_bias(
        str(rank_example_dir / "rank.train"),
        str(rank_example_dir / "rank.train.query"),
        str(tmp_path / "rank.intermediate"),
        baseline_feature=34,
    )

    # append queries and positions to the dataset file; they will have feature indexes 301 and 302
    append_queries_and_positions_to_file(
        str(tmp_path / "rank.intermediate"),
        str(rank_example_dir / "rank.train.query"),
        positions,
        str(tmp_path / "rank.train"),
    )
    append_queries_and_positions_to_file(
        str(rank_example_dir / "rank.test"),
        str(rank_example_dir / "rank.test.query"),
        None,
        str(tmp_path / "rank.test"),
    )

    # Training with single file
    params = {
        "objective": "lambdarank",
        "verbose": -1,
        "eval_at": [3],
        "metric": "ndcg",
        "bagging_freq": 1,
        "bagging_fraction": 0.9,
        "min_data_in_leaf": 50,
        "min_sum_hessian_in_leaf": 5.0,
        "group_column": 301,
        "position_column": 302,
        "label_column": 0,
    }
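    # Clarifying note (assumption based on the file written above): label_column 0 points at the
    # relevance label at the start of each row, while 301 and 302 address the query-id and position
    # values appended by the helper; they are consumed as group/position information rather than as
    # regular features, which is presumably why the second configuration below ignores them explicitly.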

    lgb_train = lgb.Dataset(str(tmp_path / "rank.train"), params=params)
    lgb_valid = [lgb_train.create_valid(str(tmp_path / "rank.test"), params=params)]
    gbm_unbiased_with_single_file = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=50)

    # Training with query files and a list of positions
    params = {
        "objective": "lambdarank",
        "verbose": -1,
        "eval_at": [3],
        "metric": "ndcg",
        "bagging_freq": 1,
        "bagging_fraction": 0.9,
        "min_data_in_leaf": 50,
        "min_sum_hessian_in_leaf": 5.0,
        # ignore the position and group columns
        "ignore_column": "301,302",
    }
    copyfile(str(rank_example_dir / "rank.train.query"), str(tmp_path / "rank.train.query"))
    copyfile(str(rank_example_dir / "rank.test.query"), str(tmp_path / "rank.test.query"))
    lgb_train = lgb.Dataset(str(tmp_path / "rank.train"), params=params, position=positions)
    lgb_valid = [lgb_train.create_valid(str(tmp_path / "rank.test"))]
    gbm_unbiased_with_multiple_files = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=50)
    # the performance of unbiased LambdaMART trained with query files and a list of positions should
    # match that of unbiased LambdaMART trained from a single file with group and position columns
    assert gbm_unbiased_with_multiple_files.best_score["valid_0"]["ndcg@3"] == gbm_unbiased_with_single_file.best_score["valid_0"]["ndcg@3"]
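    # (Illustrative reasoning: exact equality is expected because both runs train on the same
    # features, labels, groups and positions with identical parameters; only the loading path differs.)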


def test_early_stopping():
    X, y = load_breast_cancer(return_X_y=True)