Skip to content

Commit

Permalink
Merge pull request #45 from wwu-mmll/fix/imbalanced_transformer
Browse files Browse the repository at this point in the history
FIX #44 ImbalancedDataTransformer
  • Loading branch information
RLeenings committed Mar 15, 2022
2 parents 8fc0289 + 456ccb1 commit cde82e1
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 37 deletions.
7 changes: 6 additions & 1 deletion examples/basic/imbalanced_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,14 @@
my_pipe += PipelineElement('StandardScaler')

tested_methods = Categorical(['RandomOverSampler', 'SMOTEENN', 'SVMSMOTE',
'BorderlineSMOTE', 'SMOTE', 'ClusterCentroids'])
'BorderlineSMOTE', 'SMOTE'])

# Only SMOTE got a different input parameter.
# All other strategies stay with the default setting.
# Please do not try to optimize over this parameter (i.e. do not put 'config' inside the 'hyperparameters').
my_pipe += PipelineElement('ImbalancedDataTransformer',
hyperparameters={'method_name': tested_methods},
config={"SMOTE": {"k_neighbors": 3}},
test_disabled=True)

my_pipe += PipelineElement("RandomForestClassifier", n_estimators=200)
Expand Down
10 changes: 5 additions & 5 deletions photonai/base/hyperpipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,9 +305,9 @@ def __init__(self, name: Optional[str],
- "grid_search": Optimizer that iteratively tests all possible hyperparameter combinations.
- "random_grid_search": A variation of the grid search optimization that randomly picks
hyperparameter combinations from all possible hyperparameter combinations.
- "sk_opt": Scikit-Optimize based on theories of Baysian optimization.
- "sk_opt": Scikit-Optimize based on theories of bayesian optimization.
- "random_search": randomly chooses hyperparameter from grid-free domain.
- "smac": SMAC based on theories of Baysian optimization.
- "smac": SMAC based on theories of bayesian optimization.
- "nevergrad": Nevergrad based on theories of evolutionary learning.
- In case an object is given:
Expand Down Expand Up @@ -359,7 +359,7 @@ def __init__(self, name: Optional[str],
test_size:
The amount of the data that should be left out if no outer_cv is given and
eval_final_perfomance is set to True.
eval_final_performance is set to True.
calculate_metrics_per_fold:
If True, the metrics are calculated for each inner_fold.
Expand All @@ -377,11 +377,11 @@ def __init__(self, name: Optional[str],
gives only warn and error, 1 adds info and 2 adds debug.
learning_curves:
Enables larning curve procedure. Evaluate learning process over
Enables learning curve procedure. Evaluate learning process over
different sizes of input. Depends on learning_curves_cut.
learning_curves_cut:
The tested relativ cuts for data size.
The tested relative cuts for data size.
performance_constraints:
Objects that indicate whether a configuration should
Expand Down
34 changes: 28 additions & 6 deletions photonai/modelwrapper/imbalanced_data_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class ImbalancedDataTransformer(BaseEstimator, TransformerMixin):
'combine': ["SMOTEENN", "SMOTETomek"],
}

def __init__(self, method_name: str = 'RandomUnderSampler', **kwargs):
def __init__(self, method_name: str = 'RandomUnderSampler', config: dict = None):
"""
Instantiates an object that transforms the data into balanced groups according to the given method.
Expand Down Expand Up @@ -84,20 +84,33 @@ def __init__(self, method_name: str = 'RandomUnderSampler', **kwargs):
- SMOTEENN,
- SMOTETomek.
**kwargs:
Any parameters to pass to the imbalance strategy object.
config:
Each strategy has a set of presets. This parameter is necessary
to select the appropriate settings for the selected method.
It is important that the key exactly matches the method_name.
If no key is found for a method, it will be started with the default settings.
Please do not use this parameter inside the 'hyperparameters' to optimize it.
"""
if not __found__:
raise ModuleNotFoundError("Module imblearn not found or not installed as expected. "
"Please install the requirements.txt in PHOTON main folder.")

self.config = config
self._method_name = None
self.method_name = method_name
self.needs_y = True

@property
def method_name(self):
    """Return the name of the imbalance strategy currently set on this transformer."""
    # Backing field is written by the ``method_name`` setter after validation.
    return self._method_name

@method_name.setter
def method_name(self, value):

imbalance_type = ''
for group, possible_strategies in ImbalancedDataTransformer.IMBALANCED_DICT.items():
if self.method_name in possible_strategies:
if value in possible_strategies:
imbalance_type = group

if imbalance_type == "oversampling":
Expand All @@ -115,8 +128,17 @@ def __init__(self, method_name: str = 'RandomUnderSampler', **kwargs):
logger.error(msg)
raise ValueError(msg)

desired_class = getattr(home, method_name)
self.method = desired_class(**kwargs)
desired_class = getattr(home, value)
self._method_name = value
if self.config is not None and value in self.config:
if not isinstance(self.config[value], dict):
msg = "Please use for the imbalanced config a format like: " \
"config={'SMOTE': {'sampling_strategy': {0: 9, 1: 12}}}."
logger.error(msg)
raise ValueError(msg)
self.method = desired_class(**self.config[value])
else:
self.method = desired_class()

def fit_transform(self, X: np.ndarray, y: np.ndarray = None, **kwargs) -> (np.ndarray, np.ndarray):
"""
Expand Down
101 changes: 76 additions & 25 deletions test/modelwrapper_tests/test_imbalanced_data_transformer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedShuffleSplit

from photonai.modelwrapper.imbalanced_data_transformer import ImbalancedDataTransformer
from photonai.base import Hyperpipe, PipelineElement
from photonai.optimization import Categorical
from test.modelwrapper_tests.test_base_model_wrapper import BaseModelWrapperTest

from imblearn.over_sampling._smote.tests import test_smote
Expand Down Expand Up @@ -28,42 +33,39 @@ def test_strategy_oversampling(self):
"""
sample test of different functions based on imblearn implementation for oversampling methods.
"""
sampling_strategy = {0: 9, 1: 12}
imbalanced_data_transformer = ImbalancedDataTransformer(method_name='SMOTE',
sampling_strategy = {0: 9, 1: 12},
random_state = test_smote.RND_SEED)
config={"SMOTE": {"sampling_strategy": {0: 9, 1: 12},
"random_state": test_smote.RND_SEED}})

# test_sample_regular_half() -> smote
X_resampled, y_resampled = imbalanced_data_transformer.fit_transform(test_smote.X, test_smote.Y)
X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141], [
1.25192108, -0.22367336
], [0.53366841, -0.30312976], [1.52091956, -0.49283504], [
-0.28162401, -2.10400981
], [0.83680821, 1.72827342], [0.3084254, 0.33299982], [
0.70472253, -0.73309052
], [0.28893132, -0.38761769], [1.15514042, 0.0129463], [
0.88407872, 0.35454207
], [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [
-0.18410027, -0.45194484
], [0.9281014, 0.53085498], [-0.14374509, 0.27370049],
[-0.41635887, -0.38299653], [0.08711622, 0.93259929],
[1.70580611, -0.11219234], [0.36784496, -0.1953161]])
y_gt = np.array(
[0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0])
X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
[1.25192108, -0.22367336], [0.53366841, -0.30312976],
[1.52091956, -0.49283504], [-0.28162401, -2.10400981],
[0.83680821, 1.72827342], [0.3084254, 0.33299982],
[0.70472253, -0.73309052], [0.28893132, -0.38761769],
[1.15514042, 0.0129463], [0.88407872, 0.35454207],
[1.31301027, -0.92648734], [-1.11515198, -0.93689695],
[-0.18410027, -0.45194484], [0.9281014, 0.53085498],
[-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
[0.08711622, 0.93259929], [1.70580611, -0.11219234], [0.36784496, -0.1953161]])
y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0])
test_smote.assert_allclose(X_resampled, X_gt, rtol=test_smote.R_TOL)
test_smote.assert_array_equal(y_resampled, y_gt)

def test_strategy_undersampling(self):
"""
sample test of different functions based on imblearn implementation for undersampling methods.
"""
imbalanced_data_transformer = ImbalancedDataTransformer(method_name='InstanceHardnessThreshold',
estimator=test_instance_hardness_threshold.ESTIMATOR,
sampling_strategy={0: 6, 1: 8},
random_state=test_instance_hardness_threshold.RND_SEED)
transformer = ImbalancedDataTransformer(method_name='InstanceHardnessThreshold',
config={"InstanceHardnessThreshold":
{"estimator": test_instance_hardness_threshold.ESTIMATOR,
"sampling_strategy": {0: 6, 1: 8},
"random_state": test_instance_hardness_threshold.RND_SEED}
})

X_resampled, y_resampled = imbalanced_data_transformer.fit_resample(test_instance_hardness_threshold.X,
test_instance_hardness_threshold.Y)
X_resampled, y_resampled = transformer.fit_resample(test_instance_hardness_threshold.X,
test_instance_hardness_threshold.Y)
assert X_resampled.shape == (15, 2)
assert y_resampled.shape == (15,)

Expand All @@ -72,7 +74,9 @@ def test_strategy_combine(self):
sample test of different functions based on imblearn implementation for combine methods.
"""
imbalanced_data_transformer = ImbalancedDataTransformer(method_name='SMOTETomek',
random_state=test_smote_tomek.RND_SEED)
config={"SMOTETomek":
{"random_state": test_smote_tomek.RND_SEED}
})
X_resampled, y_resampled = imbalanced_data_transformer.fit_resample(test_smote_tomek.X, test_smote_tomek.Y)
X_gt = np.array(
[
Expand All @@ -97,3 +101,50 @@ def test_strategy_combine(self):
y_gt = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])
test_smote_tomek.assert_allclose(X_resampled, X_gt, rtol=test_smote_tomek.R_TOL)
test_smote_tomek.assert_array_equal(y_resampled, y_gt)

def test_config_parameter(self):
    """A ``config`` entry that is not a dict must be rejected with a ValueError."""
    # Hand over the seed constant directly instead of wrapping it in a
    # parameter dict such as {"random_state": seed}.
    malformed_config = {"SMOTETomek": test_smote_tomek.RND_SEED}
    with self.assertRaises(ValueError):
        ImbalancedDataTransformer(method_name='SMOTETomek', config=malformed_config)

def test_different_strategies(self):
    """
    Run a small Hyperpipe that switches between RandomOverSampler and
    RandomUnderSampler and compare the training data each strategy produced.

    Two custom metrics expose properties of the (resampled) training targets:
    the share of class 0 and the absolute number of samples. Oversampling must
    yield more training samples than undersampling, while both strategies
    must end up with the same class distribution.
    """
    def target_relative(y_true, y_pred):
        # Fraction of samples that belong to class 0; predictions are ignored.
        return (y_true == 0).sum() / len(y_true)

    def target_absolute(y_true, y_pred):
        # Number of samples the metric was evaluated on; predictions are ignored.
        return len(y_true)

    X, y = make_classification(weights=[0.9, 0.1], n_samples=300)

    my_pipe = Hyperpipe('balanced_pipe',
                        optimizer='grid_search',
                        metrics=['accuracy', target_relative, target_absolute],
                        best_config_metric="accuracy",
                        inner_cv=StratifiedShuffleSplit(n_splits=3, test_size=0.2),
                        verbosity=1,
                        project_folder='./tmp/')

    my_pipe += PipelineElement('StandardScaler')

    tested_methods = Categorical(['RandomOverSampler', 'RandomUnderSampler'])
    my_pipe += PipelineElement('ImbalancedDataTransformer',
                               hyperparameters={'method_name': tested_methods},
                               test_disabled=False)
    my_pipe += PipelineElement("LogisticRegression")
    my_pipe.fit(X, y)

    # -> test samples per strategy
    # Collect plain rows first and build the frame once: DataFrame.append was
    # deprecated in pandas 1.4 and removed in pandas 2.0, and appending row by
    # row copies the whole frame on every iteration.
    rows = []
    for test_config_item in my_pipe.results.outer_folds[0].tested_config_list:
        config = test_config_item.config_dict["ImbalancedDataTransformer__method_name"]
        # NOTE(review): indices 0/2/4 presumably pick one aggregate entry per
        # metric (accuracy, target_relative, target_absolute) - confirm against
        # the layout of ``metrics_train``.
        acc = round(test_config_item.metrics_train[0].value, 3)
        relative = round(test_config_item.metrics_train[2].value, 3)
        absolute = round(test_config_item.metrics_train[4].value, 3)
        rows.append([config, acc, relative, absolute])

    test_perf = pd.DataFrame(rows, columns=["config", "acc", "class_distribution", "absolute_samples"])

    over = test_perf[test_perf["config"] == "RandomOverSampler"]
    under = test_perf[test_perf["config"] == "RandomUnderSampler"]

    self.assertGreater(over["absolute_samples"].tolist()[0],
                       under["absolute_samples"].tolist()[0])

    self.assertEqual(over["class_distribution"].tolist()[0],
                     under["class_distribution"].tolist()[0])

0 comments on commit cde82e1

Please sign in to comment.