From 2f8cb9ae99ddd2ed99afb9e514182bdcc7b62b70 Mon Sep 17 00:00:00 2001 From: "a.makhin" Date: Mon, 13 Feb 2023 18:22:46 +0300 Subject: [PATCH 1/4] BUG Gale-Shapley --- .../feature_selection/gale_shapley.py | 17 +- .../test_gale_shapley_transform.py | 194 +++++++----------- 2 files changed, 78 insertions(+), 133 deletions(-) diff --git a/etna/transforms/feature_selection/gale_shapley.py b/etna/transforms/feature_selection/gale_shapley.py index 61b3b003c..192645975 100644 --- a/etna/transforms/feature_selection/gale_shapley.py +++ b/etna/transforms/feature_selection/gale_shapley.py @@ -294,8 +294,7 @@ def _compute_gale_shapley_steps_number(top_k: int, n_segments: int, n_features: @staticmethod def _gale_shapley_iteration( - segment_features_ranking: Dict[str, List[str]], - feature_segments_ranking: Dict[str, List[str]], + segment_features_ranking: Dict[str, List[str]], feature_segments_ranking: Dict[str, List[str]] ) -> Dict[str, str]: """Build matching for all the segments. @@ -310,10 +309,7 @@ def _gale_shapley_iteration( dict of segment x feature """ gssegments = [ - SegmentGaleShapley( - name=name, - ranked_candidates=ranked_candidates, - ) + SegmentGaleShapley(name=name, ranked_candidates=ranked_candidates) for name, ranked_candidates in segment_features_ranking.items() ] gsfeatures = [ @@ -361,17 +357,14 @@ def fit(self, df: pd.DataFrame) -> "GaleShapleyFeatureSelectionTransform": table=relevance_table.T, ascending=not self.relevance_table.greater_is_better ) gale_shapley_steps_number = self._compute_gale_shapley_steps_number( - top_k=self.top_k, - n_segments=len(segment_features_ranking), - n_features=len(feature_segments_ranking), + top_k=self.top_k, n_segments=len(segment_features_ranking), n_features=len(feature_segments_ranking) ) last_step_features_number = self.top_k % len(segment_features_ranking) for step in range(gale_shapley_steps_number): matches = self._gale_shapley_iteration( - segment_features_ranking=segment_features_ranking, - feature_segments_ranking=feature_segments_ranking, + segment_features_ranking=segment_features_ranking, feature_segments_ranking=feature_segments_ranking ) - if step == gale_shapley_steps_number - 1: + if step == gale_shapley_steps_number - 1 and last_step_features_number != 0: selected_features = self._process_last_step( matches=matches, relevance_table=relevance_table, diff --git a/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py b/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py index ba92e786d..3a4c8991c 100644 --- a/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py +++ b/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py @@ -19,6 +19,32 @@ from tests.test_transforms.utils import assert_transformation_equals_loaded_original +@pytest.fixture +def get_ts_with_exog_galeshapley(random_seed) -> TSDataset: + np.random.seed(random_seed) + + periods = 30 + df_1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-15", periods=periods)}) + df_1["segment"] = "segment_1" + df_1["target"] = np.random.uniform(10, 20, size=periods) + + df_2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-15", periods=periods)}) + df_2["segment"] = "segment_2" + df_2["target"] = np.random.uniform(-15, 5, size=periods) + + df = pd.concat([df_1, df_2]).reset_index(drop=True) + df = TSDataset.to_dataset(df) + tsds = TSDataset(df, freq="D") + df = tsds.to_pandas(flatten=True) + df_exog = df.copy().drop(columns=["target"]) + df_exog["weekday"] = df_exog["timestamp"].dt.weekday + df_exog["monthday"] = df_exog["timestamp"].dt.day + df_exog["month"] = df_exog["timestamp"].dt.month + df_exog["year"] = df_exog["timestamp"].dt.year + ts = TSDataset(df=TSDataset.to_dataset(df), df_exog=TSDataset.to_dataset(df_exog), freq="D") + return ts + + @pytest.fixture def ts_with_large_regressors_number(random_seed) -> TSDataset: df = generate_periodic_df(periods=100, start_time="2020-01-01", n_segments=3, period=7, scale=10) @@ -68,32 +94,14 @@ def segment() -> SegmentGaleShapley: @pytest.fixture def matcher() -> GaleShapleyMatcher: segments = [ - SegmentGaleShapley( - name="segment_1", - ranked_candidates=["regressor_1", "regressor_2", "regressor_3"], - ), - SegmentGaleShapley( - name="segment_2", - ranked_candidates=["regressor_1", "regressor_3", "regressor_2"], - ), - SegmentGaleShapley( - name="segment_3", - ranked_candidates=["regressor_2", "regressor_3", "regressor_1"], - ), + SegmentGaleShapley(name="segment_1", ranked_candidates=["regressor_1", "regressor_2", "regressor_3"]), + SegmentGaleShapley(name="segment_2", ranked_candidates=["regressor_1", "regressor_3", "regressor_2"]), + SegmentGaleShapley(name="segment_3", ranked_candidates=["regressor_2", "regressor_3", "regressor_1"]), ] features = [ - FeatureGaleShapley( - name="regressor_1", - ranked_candidates=["segment_3", "segment_1", "segment_2"], - ), - FeatureGaleShapley( - name="regressor_2", - ranked_candidates=["segment_2", "segment_3", "segment_1"], - ), - FeatureGaleShapley( - name="regressor_3", - ranked_candidates=["segment_1", "segment_2", "segment_3"], - ), + FeatureGaleShapley(name="regressor_1", ranked_candidates=["segment_3", "segment_1", "segment_2"]), + FeatureGaleShapley(name="regressor_2", ranked_candidates=["segment_2", "segment_3", "segment_1"]), + FeatureGaleShapley(name="regressor_3", ranked_candidates=["segment_1", "segment_2", "segment_3"]), ] gsh = GaleShapleyMatcher(segments=segments, features=features) return gsh @@ -178,13 +186,7 @@ def test_get_ranked_list_features(relevance_matrix: pd.DataFrame, ascending: boo @pytest.mark.parametrize( "top_k,n_segments,n_features,expected", - ( - (20, 10, 50, 2), - (27, 10, 40, 3), - (15, 4, 16, 4), - (7, 10, 50, 1), - (30, 5, 20, 1), - ), + ((20, 10, 50, 2), (27, 10, 40, 3), (15, 4, 16, 4), (7, 10, 50, 1), (30, 5, 20, 1)), ) def test_compute_gale_shapley_steps_number(top_k: int, n_segments: int, n_features: int, expected: int): result = GaleShapleyFeatureSelectionTransform._compute_gale_shapley_steps_number( @@ -216,11 +218,7 @@ def test_compute_gale_shapley_steps_number(top_k: int, n_segments: int, n_featur "segment_3": ["regressor_4", "regressor_3", "regressor_1", "regressor_2"], }, ["regressor_2", "regressor_3", "regressor_1", "regressor_4"], - { - "segment_1": [], - "segment_2": [], - "segment_3": [], - }, + {"segment_1": [], "segment_2": [], "segment_3": []}, ), ), ) @@ -290,38 +288,30 @@ def test_gale_shapley_matcher_break_match(matcher: GaleShapleyMatcher): ( [ SegmentGaleShapley( - name="segment_1", - ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"], + name="segment_1", ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"] ), SegmentGaleShapley( - name="segment_2", - ranked_candidates=["regressor_1", "regressor_3", "regressor_2", "regressor_4"], + name="segment_2", ranked_candidates=["regressor_1", "regressor_3", "regressor_2", "regressor_4"] ), SegmentGaleShapley( - name="segment_3", - ranked_candidates=["regressor_2", "regressor_4", "regressor_1", "regressor_3"], + name="segment_3", ranked_candidates=["regressor_2", "regressor_4", "regressor_1", "regressor_3"] ), SegmentGaleShapley( - name="segment_4", - ranked_candidates=["regressor_3", "regressor_1", "regressor_4", "regressor_2"], + name="segment_4", ranked_candidates=["regressor_3", "regressor_1", "regressor_4", "regressor_2"] ), ], [ FeatureGaleShapley( - name="regressor_1", - ranked_candidates=["segment_2", "segment_1", "segment_3", "segment_4"], + name="regressor_1", ranked_candidates=["segment_2", "segment_1", "segment_3", "segment_4"] ), FeatureGaleShapley( - name="regressor_2", - ranked_candidates=["segment_1", "segment_2", "segment_3", "segment_4"], + name="regressor_2", ranked_candidates=["segment_1", "segment_2", "segment_3", "segment_4"] ), FeatureGaleShapley( - name="regressor_3", - ranked_candidates=["segment_3", "segment_2", "segment_4", "segment_1"], + name="regressor_3", ranked_candidates=["segment_3", "segment_2", "segment_4", "segment_1"] ), FeatureGaleShapley( - name="regressor_4", - ranked_candidates=["segment_3", "segment_1", "segment_4", "segment_2"], + name="regressor_4", ranked_candidates=["segment_3", "segment_1", "segment_4", "segment_2"] ), ], { @@ -334,38 +324,30 @@ def test_gale_shapley_matcher_break_match(matcher: GaleShapleyMatcher): ( [ SegmentGaleShapley( - name="segment_1", - ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"], + name="segment_1", ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"] ), SegmentGaleShapley( - name="segment_2", - ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"], + name="segment_2", ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"] ), SegmentGaleShapley( - name="segment_3", - ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"], + name="segment_3", ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"] ), SegmentGaleShapley( - name="segment_4", - ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"], + name="segment_4", ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"] ), ], [ FeatureGaleShapley( - name="regressor_1", - ranked_candidates=["segment_2", "segment_1", "segment_3", "segment_4"], + name="regressor_1", ranked_candidates=["segment_2", "segment_1", "segment_3", "segment_4"] ), FeatureGaleShapley( - name="regressor_2", - ranked_candidates=["segment_1", "segment_2", "segment_3", "segment_4"], + name="regressor_2", ranked_candidates=["segment_1", "segment_2", "segment_3", "segment_4"] ), FeatureGaleShapley( - name="regressor_3", - ranked_candidates=["segment_3", "segment_2", "segment_4", "segment_1"], + name="regressor_3", ranked_candidates=["segment_3", "segment_2", "segment_4", "segment_1"] ), FeatureGaleShapley( - name="regressor_4", - ranked_candidates=["segment_3", "segment_1", "segment_4", "segment_2"], + name="regressor_4", ranked_candidates=["segment_3", "segment_1", "segment_4", "segment_2"] ), ], { @@ -391,39 +373,18 @@ def test_gale_shapley_matcher_break_match(matcher: GaleShapleyMatcher): ), ], [ - FeatureGaleShapley( - name="regressor_1", - ranked_candidates=["segment_3", "segment_1", "segment_2"], - ), - FeatureGaleShapley( - name="regressor_2", - ranked_candidates=["segment_3", "segment_2", "segment_1"], - ), - FeatureGaleShapley( - name="regressor_3", - ranked_candidates=["segment_3", "segment_1", "segment_2"], - ), - FeatureGaleShapley( - name="regressor_4", - ranked_candidates=["segment_1", "segment_2", "segment_3"], - ), - FeatureGaleShapley( - name="regressor_5", - ranked_candidates=["segment_1", "segment_3", "segment_2"], - ), + FeatureGaleShapley(name="regressor_1", ranked_candidates=["segment_3", "segment_1", "segment_2"]), + FeatureGaleShapley(name="regressor_2", ranked_candidates=["segment_3", "segment_2", "segment_1"]), + FeatureGaleShapley(name="regressor_3", ranked_candidates=["segment_3", "segment_1", "segment_2"]), + FeatureGaleShapley(name="regressor_4", ranked_candidates=["segment_1", "segment_2", "segment_3"]), + FeatureGaleShapley(name="regressor_5", ranked_candidates=["segment_1", "segment_3", "segment_2"]), ], - { - "segment_1": "regressor_5", - "segment_2": "regressor_2", - "segment_3": "regressor_1", - }, + {"segment_1": "regressor_5", "segment_2": "regressor_2", "segment_3": "regressor_1"}, ), ), ) def test_gale_shapley_result( - segments: List[SegmentGaleShapley], - features: List[FeatureGaleShapley], - expected: Dict[str, str], + segments: List[SegmentGaleShapley], features: List[FeatureGaleShapley], expected: Dict[str, str] ): matcher = GaleShapleyMatcher(segments=segments, features=features) matches = matcher() @@ -488,11 +449,7 @@ def test_gale_shapley_result( "regressor_4": ["segment_1", "segment_2", "segment_3"], "regressor_5": ["segment_1", "segment_3", "segment_2"], }, - { - "segment_1": "regressor_5", - "segment_2": "regressor_2", - "segment_3": "regressor_1", - }, + {"segment_1": "regressor_5", "segment_2": "regressor_2", "segment_3": "regressor_1"}, ), ), ) @@ -510,41 +467,25 @@ def test_gale_shapley_transform_gale_shapley_iteration( "matches,n,greater_is_better,expected", ( ( - { - "segment_1": "regressor_4", - "segment_2": "regressor_7", - "segment_3": "regressor_5", - }, + {"segment_1": "regressor_4", "segment_2": "regressor_7", "segment_3": "regressor_5"}, 2, False, ["regressor_5", "regressor_7"], ), ( - { - "segment_1": "regressor_4", - "segment_2": "regressor_7", - "segment_3": "regressor_5", - }, + {"segment_1": "regressor_4", "segment_2": "regressor_7", "segment_3": "regressor_5"}, 1, True, ["regressor_4"], ), ( - { - "segment_1": "regressor_3", - "segment_2": "regressor_2", - "segment_3": "regressor_1", - }, + {"segment_1": "regressor_3", "segment_2": "regressor_2", "segment_3": "regressor_1"}, 2, False, ["regressor_1", "regressor_2"], ), ( - { - "segment_1": "regressor_3", - "segment_2": "regressor_2", - "segment_3": "regressor_1", - }, + {"segment_1": "regressor_3", "segment_2": "regressor_2", "segment_3": "regressor_1"}, 3, False, ["regressor_1", "regressor_2", "regressor_3"], @@ -622,3 +563,14 @@ def test_work_with_non_regressors(ts_with_exog): ) def test_save_load(transform, ts_with_large_regressors_number): assert_transformation_equals_loaded_original(transform=transform, ts=ts_with_large_regressors_number) + + +def test_right_number_features_with_integer_division(get_ts_with_exog_galeshapley): + top_k = len(get_ts_with_exog_galeshapley.segments) + transform = GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=top_k) + + transform.fit(get_ts_with_exog_galeshapley.to_pandas()) + df = transform.transform(get_ts_with_exog_galeshapley.to_pandas()) + + remaining_columns = df.columns.get_level_values("feature").unique().tolist() + assert len(remaining_columns) == top_k + 1 From 4cf57d843f94d9c876f3d2de54fe2957d705ca63 Mon Sep 17 00:00:00 2001 From: "a.makhin" Date: Mon, 13 Feb 2023 18:26:36 +0300 Subject: [PATCH 2/4] changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2db5e1b52..854e628fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ### Fixed -- +- Fix bug in `GaleShapleyFeatureSelectionTransform` with wrong number of remaining features ([#1110](https://github.com/tinkoff-ai/etna/pull/1110)) - ## [1.15.0] - 2023-01-31 From 626fd0a034781809f34c8d748b5f8001b27f5a34 Mon Sep 17 00:00:00 2001 From: "a.makhin" Date: Mon, 13 Feb 2023 18:34:18 +0300 Subject: [PATCH 3/4] black --- .../feature_selection/gale_shapley.py | 15 +- .../test_gale_shapley_transform.py | 157 ++++++++++++++---- 2 files changed, 132 insertions(+), 40 deletions(-) diff --git a/etna/transforms/feature_selection/gale_shapley.py b/etna/transforms/feature_selection/gale_shapley.py index 192645975..e0bc491d6 100644 --- a/etna/transforms/feature_selection/gale_shapley.py +++ b/etna/transforms/feature_selection/gale_shapley.py @@ -294,7 +294,8 @@ def _compute_gale_shapley_steps_number(top_k: int, n_segments: int, n_features: @staticmethod def _gale_shapley_iteration( - segment_features_ranking: Dict[str, List[str]], feature_segments_ranking: Dict[str, List[str]] + segment_features_ranking: Dict[str, List[str]], + feature_segments_ranking: Dict[str, List[str]], ) -> Dict[str, str]: """Build matching for all the segments. @@ -309,7 +310,10 @@ def _gale_shapley_iteration( dict of segment x feature """ gssegments = [ - SegmentGaleShapley(name=name, ranked_candidates=ranked_candidates) + SegmentGaleShapley( + name=name, + ranked_candidates=ranked_candidates, + ) for name, ranked_candidates in segment_features_ranking.items() ] gsfeatures = [ @@ -357,12 +361,15 @@ def fit(self, df: pd.DataFrame) -> "GaleShapleyFeatureSelectionTransform": table=relevance_table.T, ascending=not self.relevance_table.greater_is_better ) gale_shapley_steps_number = self._compute_gale_shapley_steps_number( - top_k=self.top_k, n_segments=len(segment_features_ranking), n_features=len(feature_segments_ranking) + top_k=self.top_k, + n_segments=len(segment_features_ranking), + n_features=len(feature_segments_ranking), ) last_step_features_number = self.top_k % len(segment_features_ranking) for step in range(gale_shapley_steps_number): matches = self._gale_shapley_iteration( - segment_features_ranking=segment_features_ranking, feature_segments_ranking=feature_segments_ranking + segment_features_ranking=segment_features_ranking, + feature_segments_ranking=feature_segments_ranking, ) if step == gale_shapley_steps_number - 1 and last_step_features_number != 0: selected_features = self._process_last_step( diff --git a/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py b/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py index 3a4c8991c..d1af9222e 100644 --- a/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py +++ b/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py @@ -94,14 +94,32 @@ def segment() -> SegmentGaleShapley: @pytest.fixture def matcher() -> GaleShapleyMatcher: segments = [ - SegmentGaleShapley(name="segment_1", ranked_candidates=["regressor_1", "regressor_2", "regressor_3"]), - SegmentGaleShapley(name="segment_2", ranked_candidates=["regressor_1", "regressor_3", "regressor_2"]), - SegmentGaleShapley(name="segment_3", ranked_candidates=["regressor_2", "regressor_3", "regressor_1"]), + SegmentGaleShapley( + name="segment_1", + ranked_candidates=["regressor_1", "regressor_2", "regressor_3"], + ), + SegmentGaleShapley( + name="segment_2", + ranked_candidates=["regressor_1", "regressor_3", "regressor_2"], + ), + SegmentGaleShapley( + name="segment_3", + ranked_candidates=["regressor_2", "regressor_3", "regressor_1"], + ), ] features = [ - FeatureGaleShapley(name="regressor_1", ranked_candidates=["segment_3", "segment_1", "segment_2"]), - FeatureGaleShapley(name="regressor_2", ranked_candidates=["segment_2", "segment_3", "segment_1"]), - FeatureGaleShapley(name="regressor_3", ranked_candidates=["segment_1", "segment_2", "segment_3"]), + FeatureGaleShapley( + name="regressor_1", + ranked_candidates=["segment_3", "segment_1", "segment_2"], + ), + FeatureGaleShapley( + name="regressor_2", + ranked_candidates=["segment_2", "segment_3", "segment_1"], + ), + FeatureGaleShapley( + name="regressor_3", + ranked_candidates=["segment_1", "segment_2", "segment_3"], + ), ] gsh = GaleShapleyMatcher(segments=segments, features=features) return gsh @@ -186,7 +204,13 @@ def test_get_ranked_list_features(relevance_matrix: pd.DataFrame, ascending: boo @pytest.mark.parametrize( "top_k,n_segments,n_features,expected", - ((20, 10, 50, 2), (27, 10, 40, 3), (15, 4, 16, 4), (7, 10, 50, 1), (30, 5, 20, 1)), + ( + (20, 10, 50, 2), + (27, 10, 40, 3), + (15, 4, 16, 4), + (7, 10, 50, 1), + (30, 5, 20, 1), + ), ) def test_compute_gale_shapley_steps_number(top_k: int, n_segments: int, n_features: int, expected: int): result = GaleShapleyFeatureSelectionTransform._compute_gale_shapley_steps_number( @@ -218,7 +242,11 @@ def test_compute_gale_shapley_steps_number(top_k: int, n_segments: int, n_featur "segment_3": ["regressor_4", "regressor_3", "regressor_1", "regressor_2"], }, ["regressor_2", "regressor_3", "regressor_1", "regressor_4"], - {"segment_1": [], "segment_2": [], "segment_3": []}, + { + "segment_1": [], + "segment_2": [], + "segment_3": [], + }, ), ), ) @@ -288,30 +316,38 @@ def test_gale_shapley_matcher_break_match(matcher: GaleShapleyMatcher): ( [ SegmentGaleShapley( - name="segment_1", ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"] + name="segment_1", + ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"], ), SegmentGaleShapley( - name="segment_2", ranked_candidates=["regressor_1", "regressor_3", "regressor_2", "regressor_4"] + name="segment_2", + ranked_candidates=["regressor_1", "regressor_3", "regressor_2", "regressor_4"], ), SegmentGaleShapley( - name="segment_3", ranked_candidates=["regressor_2", "regressor_4", "regressor_1", "regressor_3"] + name="segment_3", + ranked_candidates=["regressor_2", "regressor_4", "regressor_1", "regressor_3"], ), SegmentGaleShapley( - name="segment_4", ranked_candidates=["regressor_3", "regressor_1", "regressor_4", "regressor_2"] + name="segment_4", + ranked_candidates=["regressor_3", "regressor_1", "regressor_4", "regressor_2"], ), ], [ FeatureGaleShapley( - name="regressor_1", ranked_candidates=["segment_2", "segment_1", "segment_3", "segment_4"] + name="regressor_1", + ranked_candidates=["segment_2", "segment_1", "segment_3", "segment_4"], ), FeatureGaleShapley( - name="regressor_2", ranked_candidates=["segment_1", "segment_2", "segment_3", "segment_4"] + name="regressor_2", + ranked_candidates=["segment_1", "segment_2", "segment_3", "segment_4"], ), FeatureGaleShapley( - name="regressor_3", ranked_candidates=["segment_3", "segment_2", "segment_4", "segment_1"] + name="regressor_3", + ranked_candidates=["segment_3", "segment_2", "segment_4", "segment_1"], ), FeatureGaleShapley( - name="regressor_4", ranked_candidates=["segment_3", "segment_1", "segment_4", "segment_2"] + name="regressor_4", + ranked_candidates=["segment_3", "segment_1", "segment_4", "segment_2"], ), ], { @@ -324,30 +360,38 @@ def test_gale_shapley_matcher_break_match(matcher: GaleShapleyMatcher): ( [ SegmentGaleShapley( - name="segment_1", ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"] + name="segment_1", + ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"], ), SegmentGaleShapley( - name="segment_2", ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"] + name="segment_2", + ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"], ), SegmentGaleShapley( - name="segment_3", ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"] + name="segment_3", + ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"], ), SegmentGaleShapley( - name="segment_4", ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"] + name="segment_4", + ranked_candidates=["regressor_1", "regressor_2", "regressor_3", "regressor_4"], ), ], [ FeatureGaleShapley( - name="regressor_1", ranked_candidates=["segment_2", "segment_1", "segment_3", "segment_4"] + name="regressor_1", + ranked_candidates=["segment_2", "segment_1", "segment_3", "segment_4"], ), FeatureGaleShapley( - name="regressor_2", ranked_candidates=["segment_1", "segment_2", "segment_3", "segment_4"] + name="regressor_2", + ranked_candidates=["segment_1", "segment_2", "segment_3", "segment_4"], ), FeatureGaleShapley( - name="regressor_3", ranked_candidates=["segment_3", "segment_2", "segment_4", "segment_1"] + name="regressor_3", + ranked_candidates=["segment_3", "segment_2", "segment_4", "segment_1"], ), FeatureGaleShapley( - name="regressor_4", ranked_candidates=["segment_3", "segment_1", "segment_4", "segment_2"] + name="regressor_4", + ranked_candidates=["segment_3", "segment_1", "segment_4", "segment_2"], ), ], { @@ -373,18 +417,39 @@ def test_gale_shapley_matcher_break_match(matcher: GaleShapleyMatcher): ), ], [ - FeatureGaleShapley(name="regressor_1", ranked_candidates=["segment_3", "segment_1", "segment_2"]), - FeatureGaleShapley(name="regressor_2", ranked_candidates=["segment_3", "segment_2", "segment_1"]), - FeatureGaleShapley(name="regressor_3", ranked_candidates=["segment_3", "segment_1", "segment_2"]), - FeatureGaleShapley(name="regressor_4", ranked_candidates=["segment_1", "segment_2", "segment_3"]), - FeatureGaleShapley(name="regressor_5", ranked_candidates=["segment_1", "segment_3", "segment_2"]), + FeatureGaleShapley( + name="regressor_1", + ranked_candidates=["segment_3", "segment_1", "segment_2"], + ), + FeatureGaleShapley( + name="regressor_2", + ranked_candidates=["segment_3", "segment_2", "segment_1"], + ), + FeatureGaleShapley( + name="regressor_3", + ranked_candidates=["segment_3", "segment_1", "segment_2"], + ), + FeatureGaleShapley( + name="regressor_4", + ranked_candidates=["segment_1", "segment_2", "segment_3"], + ), + FeatureGaleShapley( + name="regressor_5", + ranked_candidates=["segment_1", "segment_3", "segment_2"], + ), ], - {"segment_1": "regressor_5", "segment_2": "regressor_2", "segment_3": "regressor_1"}, + { + "segment_1": "regressor_5", + "segment_2": "regressor_2", + "segment_3": "regressor_1", + }, ), ), ) def test_gale_shapley_result( - segments: List[SegmentGaleShapley], features: List[FeatureGaleShapley], expected: Dict[str, str] + segments: List[SegmentGaleShapley], + features: List[FeatureGaleShapley], + expected: Dict[str, str], ): matcher = GaleShapleyMatcher(segments=segments, features=features) matches = matcher() @@ -449,7 +514,11 @@ def test_gale_shapley_result( "regressor_4": ["segment_1", "segment_2", "segment_3"], "regressor_5": ["segment_1", "segment_3", "segment_2"], }, - {"segment_1": "regressor_5", "segment_2": "regressor_2", "segment_3": "regressor_1"}, + { + "segment_1": "regressor_5", + "segment_2": "regressor_2", + "segment_3": "regressor_1", + }, ), ), ) @@ -467,25 +536,41 @@ def test_gale_shapley_transform_gale_shapley_iteration( "matches,n,greater_is_better,expected", ( ( - {"segment_1": "regressor_4", "segment_2": "regressor_7", "segment_3": "regressor_5"}, + { + "segment_1": "regressor_4", + "segment_2": "regressor_7", + "segment_3": "regressor_5", + }, 2, False, ["regressor_5", "regressor_7"], ), ( - {"segment_1": "regressor_4", "segment_2": "regressor_7", "segment_3": "regressor_5"}, + { + "segment_1": "regressor_4", + "segment_2": "regressor_7", + "segment_3": "regressor_5", + }, 1, True, ["regressor_4"], ), ( - {"segment_1": "regressor_3", "segment_2": "regressor_2", "segment_3": "regressor_1"}, + { + "segment_1": "regressor_3", + "segment_2": "regressor_2", + "segment_3": "regressor_1", + }, 2, False, ["regressor_1", "regressor_2"], ), ( - {"segment_1": "regressor_3", "segment_2": "regressor_2", "segment_3": "regressor_1"}, + { + "segment_1": "regressor_3", + "segment_2": "regressor_2", + "segment_3": "regressor_1", + }, 3, False, ["regressor_1", "regressor_2", "regressor_3"], From 72a1a515b1c72cf1616b186d91bafa4604b12944 Mon Sep 17 00:00:00 2001 From: "a.makhin" Date: Tue, 14 Feb 2023 12:08:30 +0300 Subject: [PATCH 4/4] fix comment --- .../test_gale_shapley_transform.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py b/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py index d1af9222e..891b77f8f 100644 --- a/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py +++ b/tests/test_transforms/test_feature_selection/test_gale_shapley_transform.py @@ -20,7 +20,7 @@ @pytest.fixture -def get_ts_with_exog_galeshapley(random_seed) -> TSDataset: +def ts_with_exog_galeshapley(random_seed) -> TSDataset: np.random.seed(random_seed) periods = 30 @@ -650,12 +650,12 @@ def test_save_load(transform, ts_with_large_regressors_number): assert_transformation_equals_loaded_original(transform=transform, ts=ts_with_large_regressors_number) -def test_right_number_features_with_integer_division(get_ts_with_exog_galeshapley): - top_k = len(get_ts_with_exog_galeshapley.segments) +def test_right_number_features_with_integer_division(ts_with_exog_galeshapley): + top_k = len(ts_with_exog_galeshapley.segments) transform = GaleShapleyFeatureSelectionTransform(relevance_table=StatisticsRelevanceTable(), top_k=top_k) - transform.fit(get_ts_with_exog_galeshapley.to_pandas()) - df = transform.transform(get_ts_with_exog_galeshapley.to_pandas()) + transform.fit(ts_with_exog_galeshapley.to_pandas()) + df = transform.transform(ts_with_exog_galeshapley.to_pandas()) remaining_columns = df.columns.get_level_values("feature").unique().tolist() assert len(remaining_columns) == top_k + 1