Merge pull request #45 from wwu-mmll/fix/imbalanced_transformer

FIX #44 ImbalancedDataTransformer
wwu-mmll · Mar 15, 2022 · cde82e1 · cde82e1
2 parents 8fc0289 + 456ccb1
commit cde82e1
Show file tree

Hide file tree

Showing 4 changed files with 115 additions and 37 deletions.
diff --git a/examples/basic/imbalanced_data.py b/examples/basic/imbalanced_data.py
@@ -28,9 +28,14 @@
 my_pipe += PipelineElement('StandardScaler')
 
 tested_methods = Categorical(['RandomOverSampler', 'SMOTEENN', 'SVMSMOTE',
-                              'BorderlineSMOTE', 'SMOTE', 'ClusterCentroids'])
+                              'BorderlineSMOTE', 'SMOTE'])
+
+# Only SMOTE gets a non-default input parameter here.
+# All other strategies keep their default settings.
+# Please do not try to optimize over this parameter (i.e. do not put 'config' inside the 'hyperparameters').
 my_pipe += PipelineElement('ImbalancedDataTransformer',
                            hyperparameters={'method_name': tested_methods},
+                           config={"SMOTE": {"k_neighbors": 3}},
                            test_disabled=True)
 
 my_pipe += PipelineElement("RandomForestClassifier", n_estimators=200)

diff --git a/photonai/base/hyperpipe.py b/photonai/base/hyperpipe.py
@@ -305,9 +305,9 @@ def __init__(self, name: Optional[str],
                     - "grid_search": Optimizer that iteratively tests all possible hyperparameter combinations.
                     - "random_grid_search": A variation of the grid search optimization that randomly picks
                         hyperparameter combinations from all possible hyperparameter combinations.
-                    - "sk_opt": Scikit-Optimize based on theories of Baysian optimization.
+                    - "sk_opt": Scikit-Optimize based on theories of Bayesian optimization.
                     - "random_search": randomly chooses hyperparameter from grid-free domain.
-                    - "smac": SMAC based on theories of Baysian optimization.
+                    - "smac": SMAC based on theories of Bayesian optimization.
                     - "nevergrad": Nevergrad based on theories of evolutionary learning.
 
                 - In case an object is given:
@@ -359,7 +359,7 @@ def __init__(self, name: Optional[str],
 
             test_size:
                 The amount of the data that should be left out if no outer_cv is given and
-                eval_final_perfomance is set to True.
+                eval_final_performance is set to True.
 
             calculate_metrics_per_fold:
                 If True, the metrics are calculated for each inner_fold.
@@ -377,11 +377,11 @@ def __init__(self, name: Optional[str],
                 gives only warn and error, 1 gives adds info and 2 adds debug.
 
             learning_curves:
-                Enables larning curve procedure. Evaluate learning process over
+                Enables learning curve procedure. Evaluate learning process over
                 different sizes of input. Depends on learning_curves_cut.
 
             learning_curves_cut:
-                The tested relativ cuts for data size.
+                The tested relative cuts for data size.
 
             performance_constraints:
                 Objects that indicate whether a configuration should

diff --git a/photonai/modelwrapper/imbalanced_data_transformer.py b/photonai/modelwrapper/imbalanced_data_transformer.py
@@ -51,7 +51,7 @@ class ImbalancedDataTransformer(BaseEstimator, TransformerMixin):
         'combine': ["SMOTEENN", "SMOTETomek"],
     }
 
-    def __init__(self, method_name: str = 'RandomUnderSampler', **kwargs):
+    def __init__(self, method_name: str = 'RandomUnderSampler', config: dict = None):
         """
         Instantiates an object that transforms the data into balanced groups according to the given method.
 
@@ -84,20 +84,33 @@ def __init__(self, method_name: str = 'RandomUnderSampler', **kwargs):
                     - SMOTEENN,
                     - SMOTETomek.
 
-            **kwargs:
-                Any parameters to pass to the imbalance strategy object.
+            config:
+                Optional dict that maps a method name to a dict of keyword arguments
+                (presets) for that strategy, e.g. {'SMOTE': {'k_neighbors': 3}}.
+                It is important that the key exactly matches the method_name.
+                If no key is found for a method, it is instantiated with its default settings.
+                Please do not use this parameter inside the 'hyperparameters' to optimize it.
 
         """
         if not __found__:
             raise ModuleNotFoundError("Module imblearn not found or not installed as expected. "
                                       "Please install the requirements.txt in PHOTON main folder.")
 
+        self.config = config
+        self._method_name = None
         self.method_name = method_name
         self.needs_y = True
 
+    @property
+    def method_name(self):
+        return self._method_name
+
+    @method_name.setter
+    def method_name(self, value):
+
         imbalance_type = ''
         for group, possible_strategies in ImbalancedDataTransformer.IMBALANCED_DICT.items():
-            if self.method_name in possible_strategies:
+            if value in possible_strategies:
                 imbalance_type = group
 
         if imbalance_type == "oversampling":
@@ -115,8 +128,17 @@ def __init__(self, method_name: str = 'RandomUnderSampler', **kwargs):
             logger.error(msg)
             raise ValueError(msg)
 
-        desired_class = getattr(home, method_name)
-        self.method = desired_class(**kwargs)
+        desired_class = getattr(home, value)
+        self._method_name = value
+        if self.config is not None and value in self.config:
+            if not isinstance(self.config[value], dict):
+                msg = "Please use for the imbalanced config a format like: " \
+                      "config={'SMOTE': {'sampling_strategy': {0: 9, 1: 12}}}."
+                logger.error(msg)
+                raise ValueError(msg)
+            self.method = desired_class(**self.config[value])
+        else:
+            self.method = desired_class()
 
     def fit_transform(self, X: np.ndarray, y: np.ndarray = None, **kwargs) -> (np.ndarray, np.ndarray):
         """

diff --git a/test/modelwrapper_tests/test_imbalanced_data_transformer.py b/test/modelwrapper_tests/test_imbalanced_data_transformer.py
@@ -1,6 +1,11 @@
 import numpy as np
+import pandas as pd
+from sklearn.datasets import make_classification
+from sklearn.model_selection import StratifiedShuffleSplit
 
 from photonai.modelwrapper.imbalanced_data_transformer import ImbalancedDataTransformer
+from photonai.base import Hyperpipe, PipelineElement
+from photonai.optimization import Categorical
 from test.modelwrapper_tests.test_base_model_wrapper import BaseModelWrapperTest
 
 from imblearn.over_sampling._smote.tests import test_smote
@@ -28,42 +33,39 @@ def test_strategy_oversampling(self):
         """
         sample test of different functions based on imblearn implementation for oversampling methods.
         """
-        sampling_strategy = {0: 9, 1: 12}
         imbalanced_data_transformer = ImbalancedDataTransformer(method_name='SMOTE',
-                                                                sampling_strategy = {0: 9, 1: 12},
-                                                                random_state = test_smote.RND_SEED)
+                                                                config={"SMOTE": {"sampling_strategy": {0: 9, 1: 12},
+                                                                                  "random_state": test_smote.RND_SEED}})
 
         # test_sample_regular_half() -> smote
         X_resampled, y_resampled = imbalanced_data_transformer.fit_transform(test_smote.X, test_smote.Y)
-        X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [
-            1.25192108, -0.22367336
-        ], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [
-                             -0.28162401, -2.10400981
-                         ], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [
-                             0.70472253, -0.73309052
-                         ], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [
-                             0.88407872, 0.35454207
-                         ], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [
-                             -0.18410027, -0.45194484
-                         ], [0.9281014, 0.53085498], [-0.14374509, 0.27370049],
-                         [-0.41635887, -0.38299653], [0.08711622, 0.93259929],
-                         [1.70580611, -0.11219234], [0.36784496, -0.1953161]])
-        y_gt = np.array(
-            [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0])
+        X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
+                         [1.25192108, -0.22367336], [0.53366841, -0.30312976],
+                         [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
+                         [0.83680821, 1.72827342], [0.3084254, 0.33299982],
+                         [0.70472253, -0.73309052], [0.28893132, -0.38761769],
+                         [1.15514042, 0.0129463], [0.88407872, 0.35454207],
+                         [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
+                         [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
+                         [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
+                         [0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.36784496, -0.1953161]])
+        y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0])
         test_smote.assert_allclose(X_resampled, X_gt, rtol=test_smote.R_TOL)
         test_smote.assert_array_equal(y_resampled, y_gt)
 
     def test_strategy_undersampling(self):
         """
         sample test of different functions based on imblearn implementation for undersampling methods.
         """
-        imbalanced_data_transformer = ImbalancedDataTransformer(method_name='InstanceHardnessThreshold',
-                                                                estimator=test_instance_hardness_threshold.ESTIMATOR,
-                                                                sampling_strategy={0: 6, 1: 8},
-                                                                random_state=test_instance_hardness_threshold.RND_SEED)
+        transformer = ImbalancedDataTransformer(method_name='InstanceHardnessThreshold',
+                                                config={"InstanceHardnessThreshold":
+                                                            {"estimator": test_instance_hardness_threshold.ESTIMATOR,
+                                                             "sampling_strategy": {0: 6, 1: 8},
+                                                             "random_state": test_instance_hardness_threshold.RND_SEED}
+                                                        })
 
-        X_resampled, y_resampled = imbalanced_data_transformer.fit_resample(test_instance_hardness_threshold.X,
-                                                                            test_instance_hardness_threshold.Y)
+        X_resampled, y_resampled = transformer.fit_resample(test_instance_hardness_threshold.X,
+                                                            test_instance_hardness_threshold.Y)
         assert X_resampled.shape == (15, 2)
         assert y_resampled.shape == (15,)
 
@@ -72,7 +74,9 @@ def test_strategy_combine(self):
         sample test of different functions based on imblearn implementation for oversampling methods.
         """
         imbalanced_data_transformer = ImbalancedDataTransformer(method_name='SMOTETomek',
-                                                              random_state=test_smote_tomek.RND_SEED)
+                                                                config={"SMOTETomek":
+                                                                            {"random_state": test_smote_tomek.RND_SEED}
+                                                                        })
         X_resampled, y_resampled = imbalanced_data_transformer.fit_resample(test_smote_tomek.X, test_smote_tomek.Y)
         X_gt = np.array(
             [
@@ -97,3 +101,50 @@ def test_strategy_combine(self):
         y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])
         test_smote_tomek.assert_allclose(X_resampled, X_gt, rtol=test_smote_tomek.R_TOL)
         test_smote_tomek.assert_array_equal(y_resampled, y_gt)
+
+    def test_config_parameter(self):
+        with self.assertRaises(ValueError):
+             ImbalancedDataTransformer(method_name='SMOTETomek', config={"SMOTETomek": test_smote_tomek.RND_SEED})
+
+    def test_different_strategies(self):
+        def target_relative(y_true, y_pred):
+            return (y_true == 0).sum() / len(y_true)
+
+        def target_absolute(y_true, y_pred):
+            return len(y_true)
+
+        X, y = make_classification(weights=[0.9, 0.1], n_samples=300)
+
+        my_pipe = Hyperpipe('balanced_pipe',
+                            optimizer='grid_search',
+                            metrics=['accuracy', target_relative, target_absolute],
+                            best_config_metric="accuracy",
+                            inner_cv=StratifiedShuffleSplit(n_splits=3, test_size=0.2),
+                            verbosity=1,
+                            project_folder='./tmp/')
+
+        my_pipe += PipelineElement('StandardScaler')
+
+        tested_methods = Categorical(['RandomOverSampler', 'RandomUnderSampler'])
+        my_pipe += PipelineElement('ImbalancedDataTransformer',
+                                   hyperparameters={'method_name': tested_methods},
+                                   test_disabled=False)
+        my_pipe += PipelineElement("LogisticRegression")
+        my_pipe.fit(X, y)
+
+        # -> test samples per strategy
+        test_perf = pd.DataFrame([], columns=["config", "acc", "class_distribution", "absolute_samples"])
+
+        for i, test_config_item in enumerate(my_pipe.results.outer_folds[0].tested_config_list):
+            config = test_config_item.config_dict["ImbalancedDataTransformer__method_name"]
+            acc = round(test_config_item.metrics_train[0].value, 3)
+            relative = round(test_config_item.metrics_train[2].value, 3)
+            absolute = round(test_config_item.metrics_train[4].value, 3)
+            test_perf = test_perf.append(pd.Series([config, acc, relative, absolute], index=test_perf.columns),
+                                         ignore_index=True)
+
+        self.assertGreater(test_perf[test_perf["config"] == "RandomOverSampler"]["absolute_samples"].tolist()[0],
+                           test_perf[test_perf["config"] == "RandomUnderSampler"]["absolute_samples"].tolist()[0])
+
+        self.assertEqual(test_perf[test_perf["config"] == "RandomOverSampler"]["class_distribution"].tolist()[0],
+                         test_perf[test_perf["config"] == "RandomUnderSampler"]["class_distribution"].tolist()[0])