count warnings -> content based warnings testing, improve cnn example
lucasplagwitz committed Aug 28, 2020
1 parent f3a0110 commit a0d00b1
Showing 15 changed files with 131 additions and 68 deletions.
examples/neural_networks/dataset.py
@@ -1,3 +1,7 @@
# content by J. Brownlee:
# https://machinelearningmastery.com/cnn-models-for-human-activity-recognition-time-series-classification/
# HAR-Dataset: https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones

from numpy import dstack
from pandas import read_csv
import numpy as np
@@ -38,7 +42,7 @@ def load_dataset_group(group, prefix=''):


# load the dataset, returns train and test X and y elements
def load_dataset(prefix=''):
def load_har(prefix=''):
# load all train
trainX, trainy = load_dataset_group('train', prefix + '/')
print(trainX.shape, trainy.shape)
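For orientation, a minimal usage sketch of the renamed loader — this is not part of the diff, the prefix path is hypothetical, and the shapes assume the standard UCI HAR layout (windows of 128 timesteps over 9 inertial channels, 6 activity classes):

from examples.neural_networks.dataset import load_har

# hypothetical local path to the unpacked archive
X, y = load_har(prefix='/path/to/UCI HAR Dataset')
print(X.shape, y.shape)  # roughly (n_windows, 128, 9) plus matching targets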
9 changes: 5 additions & 4 deletions examples/neural_networks/keras_cnn_classification.py
@@ -1,6 +1,7 @@
# cnn model with PHOTONAI
# example: https://machinelearningmastery.com/cnn-models-for-human-activity-recognition-time-series-classification/
# HARDataset: https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones
# content by J. Brownlee:
# https://machinelearningmastery.com/cnn-models-for-human-activity-recognition-time-series-classification/
# HAR-Dataset: https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones
# required file: data.py from examples/neural_network

from keras.utils import data_utils
@@ -12,7 +13,7 @@
from keras.layers.convolutional import MaxPooling1D
from sklearn.model_selection import KFold

from examples.neural_networks.data import load_dataset
from examples.neural_networks.dataset import load_har

from photonai.base import Hyperpipe, PipelineElement, OutputSettings
from photonai.optimization import Categorical
@@ -27,7 +28,7 @@
archive_format='zip'
)

X, y = load_dataset(prefix=dataset_path.replace('.zip', ''))
X, y = load_har(prefix=dataset_path.replace('.zip', ''))

n_timesteps, n_features, n_outputs = X.shape[1], X.shape[2], 6
model = Sequential()
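The rest of this file is collapsed in the diff; judging from the cited Brownlee tutorial, the model definition continues roughly as below. This is a sketch, not the committed code — it assumes the tutorial's layer stack and the Dense, Dropout and Flatten imports that the collapsed hunk would carry:

model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu',
                 input_shape=(n_timesteps, n_features)))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(Dropout(0.5))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(n_outputs, activation='softmax'))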
87 changes: 71 additions & 16 deletions examples/neural_networks/keras_cnn_optimization.py
@@ -1,7 +1,9 @@
# optimized cnn model with PHOTONAI
# example: https://machinelearningmastery.com/cnn-models-for-human-activity-recognition-time-series-classification/
# HARDataset: https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones
# required file: data.py from examples/neural_network
# content by J. Brownlee:
# https://machinelearningmastery.com/cnn-models-for-human-activity-recognition-time-series-classification/
# HAR-Dataset: https://archive.ics.uci.edu/ml/datasets/human+activity+recognition+using+smartphones
# required file: dataset.py from examples/neural_network

import os

from keras.utils import data_utils
@@ -12,12 +14,14 @@
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator

from examples.neural_networks.data import load_dataset
from examples.neural_networks.dataset import load_har

from photonai.base import Hyperpipe, PipelineElement, OutputSettings, PhotonRegistry
from photonai.modelwrapper.keras_base_models import KerasBaseClassifier
from photonai.optimization import IntegerRange
from photonai.optimization import IntegerRange, BooleanSwitch

dataset_path = data_utils.get_file(
fname='UCI HAR Dataset.zip',
@@ -29,23 +33,67 @@
archive_format='zip'
)

X, y = load_dataset(prefix=dataset_path.replace('.zip', ''))
X, y = load_har(prefix=dataset_path.replace('.zip', ''))


# Transformer and Estimator Definition
class MyCnnScaler(BaseEstimator):

def __init__(self, standardize: bool = True,):
# it is important that you name your params the same in the constructor
# stub as well as in your class variables!
self.standardize = standardize

def fit(self, data, targets=None, **kwargs):
"""
Adjust the underlying model or method to the data.
Returns
-------
IMPORTANT: must return self!
"""
return self

def transform(self, X, targets=None, **kwargs):
"""
Apply the method's logic to the data.
"""
# remove overlap
cut = int(X.shape[1] / 2)
longX = X[:, -cut:, :]
# flatten windows
longX = longX.reshape((longX.shape[0] * longX.shape[1], longX.shape[2]))
# flatten train and test
flatX = X.reshape((X.shape[0] * X.shape[1], X.shape[2]))
# standardize
if self.standardize:
s = StandardScaler()
# fit on training data
s.fit(longX)
# apply to training and test data
flatX = s.transform(flatX)
# reshape
flatX = flatX.reshape((X.shape))
return flatX


class MyOptimizedCnnEstimator(KerasBaseClassifier):

def __init__(self, n_filters: int = 64, epochs: int = 10, verbosity: int = 1):
def __init__(self, n_filters: int = 64,
kernel_size: int = 3,
epochs: int = 10,
verbosity: int = 1):
# it is important that you name your params the same in the constructor
# stub as well as in your class variables!
model = self.build_model(n_filters, X.shape[1], X.shape[2], 6)
model = self.build_model(n_filters, kernel_size, X.shape[1], X.shape[2], 6)
super(MyOptimizedCnnEstimator, self).__init__(model=model,
epochs=epochs,
nn_batch_size=32,
multi_class=True,
verbosity=verbosity)

@classmethod
def build_model(cls, n_filters, n_timesteps, n_features, n_outputs):
@staticmethod
def build_model(n_filters, kernel_size, n_timesteps, n_features, n_outputs):
model = Sequential()
model.add(Conv1D(filters=n_filters, kernel_size=3, activation='relu', input_shape=(n_timesteps, n_features)))
model.add(Conv1D(filters=n_filters, kernel_size=3, activation='relu'))
@@ -65,27 +113,34 @@ def build_model(cls, n_timesteps, n_features, n_outputs):
registry = PhotonRegistry(custom_elements_folder=custom_elements_folder)

# This needs to be done only once on your device
registry.register(photon_name='MyCnnScaler',
class_str='keras_cnn_optimization.MyCnnScaler',
element_type='Transformer')

registry.register(photon_name='MyOptimizedCnnEstimator',
class_str='keras_cnn_optimization.MyOptimizedCnnEstimator',
element_type='Estimator')

# This needs to be done every time you run the script
registry.activate()


# DESIGN YOUR PIPELINE
my_pipe = Hyperpipe('cnn_keras_multiclass_pipe',
optimizer='sk_opt',
optimizer_params={'n_configurations': 10},
optimizer_params={'n_configurations': 30},
metrics=['accuracy'],
best_config_metric='accuracy',
outer_cv=KFold(n_splits=3),
inner_cv=KFold(n_splits=2),
outer_cv=KFold(n_splits=5),
inner_cv=KFold(n_splits=3),
verbosity=1,
output_settings=OutputSettings(project_folder='./tmp/'))

my_pipe += PipelineElement('MyCnnScaler', hyperparameters={'standardize': BooleanSwitch()})

my_pipe += PipelineElement('MyOptimizedCnnEstimator',
hyperparameters={'n_filters': IntegerRange(8, 256)},
epochs=3, verbosity=1)
hyperparameters={'n_filters': IntegerRange(8, 256),
'kernel_size': IntegerRange(2, 11)},
epochs=10, verbosity=0)

# NOW TRAIN YOUR PIPELINE
my_pipe.fit(X, y)
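A quick standalone check of the custom transformer defined above — purely illustrative, with toy sizes: MyCnnScaler fits its scaling statistics on the non-overlapping second half of each window, applies them to the flattened data, and hands back an array of the original (windows, timesteps, channels) shape.

import numpy as np

X_demo = np.random.randn(16, 128, 9)  # hypothetical HAR-like batch
scaler = MyCnnScaler(standardize=True)
X_scaled = scaler.fit(X_demo).transform(X_demo)
assert X_scaled.shape == X_demo.shape  # the transform is shape-preserving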
28 changes: 14 additions & 14 deletions photonai/base/hyperpipe.py
@@ -63,23 +63,23 @@ class OutputSettings:
if the chosen estimators have an attribute 'coef_' or 'feature_importances_'.
* `project_folder` [bool, default=True]:
If True, PHOTON writes a summary_file, the results of the hyperparameter optimization, the best model and the
If True, PHOTONAI writes a summary_file, the results of the hyperparameter optimization, the best model and the
console output to the filesystem into the given project folder.
* `project_folder` [str, default='']:
The output folder in which all files generated by the PHOTON project are saved to.
The output folder in which all files generated by the PHOTONAI project are saved to.
* `result_file_mode` [str, default='best']
The possible save_modes for .photon file.
* `user_id` [str]:
The user name of the according PHOTON Wizard login
The user name of the according PHOTONAI Wizard login
* `wizard_object_id` [str]:
The object id to map the designed pipeline in the PHOTON Wizard to the results in the PHOTON CORE Database
The object id to map the designed pipeline in the PHOTONAI Wizard to the results in the PHOTONAI CORE Database
* `wizard_project_name` [str]:
How the project is titled in the PHOTON Wizard
How the project is titled in the PHOTONAI Wizard
"""
def __init__(self,
mongodb_connect_url: str = None,
@@ -332,10 +332,10 @@ def __init__(self, name,
# check if both calculate_metrics_per_fold and calculate_metrics_across_folds are False
if not calculate_metrics_across_folds and not calculate_metrics_per_fold:
raise NotImplementedError("Apparently, you've set calculate_metrics_across_folds=False and "
"calculate_metrics_per_fold=False. In this case PHOTON does not calculate "
"calculate_metrics_per_fold=False. In this case PHOTONAI does not calculate "
"any metrics which doesn't make any sense. Set at least one to True.")
if inner_cv is None:
msg = "PHOTON requires an inner_cv split. Please enable inner cross-validation. For example: Hyperpipe(..." \
msg = "PHOTONAI requires an inner_cv split. Please enable inner cross-validation. For example: Hyperpipe(..." \
" inner_cv = KFold(n_splits = 3), ...). Ensure you import the cross_validation object first."
logger.error(msg)
raise AttributeError(msg)
@@ -486,7 +486,7 @@ def sanity_check_metrics(self):
if self.best_config_metric is not None:
if isinstance(self.best_config_metric, list):
warning_text = "Best Config Metric must be a single metric given as string, no list. " \
"PHOTON chose the first one from the list of metrics to calculate."
"PHOTONAI chose the first one from the list of metrics to calculate."

self.best_config_metric = self.best_config_metric[0]
logger.warning(warning_text)
Expand All @@ -509,13 +509,13 @@ def sanity_check_metrics(self):
self.metrics = list(filter(None, self.metrics))
else:
error_msg = "No metrics were chosen. Please choose metrics to quantify your performance and set " \
"the best_config_metric that PHOTON optimizes for."
"the best_config_metric that PHOTONAI optimizes for."
logger.error(error_msg)
raise ValueError(error_msg)

if self.best_config_metric is None and self.metrics is not None and len(self.metrics) > 0:
self.best_config_metric = self.metrics[0]
warning_text = "No best config metric was given, so PHOTON chose the first in the list of metrics as " \
warning_text = "No best config metric was given, so PHOTONAI chose the first in the list of metrics as " \
"criteria for choosing the best configuration."
logger.warning(warning_text)
warnings.warn(warning_text)
@@ -673,7 +673,7 @@ def _prepare_result_logging(self, start_time):
if self.permutation_id is not None:
self.results.permutation_id = self.permutation_id

# save wizard information to photon db in order to map results to the wizard design object
# save wizard information to PHOTONAI db in order to map results to the wizard design object
if self.output_settings and hasattr(self.output_settings, 'wizard_object_id'):
if self.output_settings.wizard_object_id:
self.name = self.output_settings.wizard_object_id
@@ -916,7 +916,7 @@ def _input_data_sanity_checks(self, data, targets, **kwargs):
nr_of_nans = len(np.where(nans_in_y == 1)[0])
if nr_of_nans > 0:
logger.info("You have " + str(nr_of_nans) + " Nans in your target vector, "
"PHOTON erases every data item that has a Nan Target")
"PHOTONAI erases every data item that has a Nan Target")
self.data.X = self.data.X[~nans_in_y]
self.data.y = self.data.y[~nans_in_y]
except Exception as e:
@@ -1021,9 +1021,9 @@ def fit(self, data, targets, **kwargs):
self.output_settings._update_settings(self.name, start.strftime("%Y-%m-%d_%H-%M-%S"))

logger.photon_system_log('***************************************************************************************************************')
logger.photon_system_log('PHOTON ANALYSIS: ' + self.name)
logger.photon_system_log('PHOTONAI ANALYSIS: ' + self.name)
logger.photon_system_log('***************************************************************************************************************')
logger.info("Preparing data and PHOTON objects for analysis...")
logger.info("Preparing data and PHOTONAI objects for analysis...")

# loop over outer cross validation
if self.nr_of_processes > 1:
21 changes: 10 additions & 11 deletions test/base_tests/test_hyperpipe.py
@@ -114,26 +114,25 @@ def my_func(X, y, **kwargs):
def test_sanity(self):
# make sure that no metrics means raising an error
with self.assertRaises(ValueError):
hyperpipe = Hyperpipe("hp_name", inner_cv=self.inner_cv_object)
Hyperpipe("hp_name", inner_cv=self.inner_cv_object)

# make sure that if no best config metric is given, PHOTON raises a warning
with warnings.catch_warnings(record=True) as w:
hyperpipe = Hyperpipe("hp_name", inner_cv=self.inner_cv_object, metrics=["accuracy", "f1_score"])
assert len(w) == 1
Hyperpipe("hp_name", inner_cv=self.inner_cv_object, metrics=["accuracy", "f1_score"])
assert any("No best config metric was given" in s for s in [e.message.args[0] for e in w])

with warnings.catch_warnings(record=True) as w:
hyperpipe = Hyperpipe("hp_name", inner_cv=self.inner_cv_object, best_config_metric=["accuracy", "f1_score"])
assert len(w) == 1
Hyperpipe("hp_name", inner_cv=self.inner_cv_object, best_config_metric=["accuracy", "f1_score"])
assert any("Best Config Metric must be a single" in s for s in [e.message.args[0] for e in w])

with self.assertRaises(NotImplementedError):
hyperpipe = Hyperpipe("hp_name", inner_cv=self.inner_cv_object,
best_config_metric='accuracy', metrics=["accuracy"],
calculate_metrics_across_folds=False,
calculate_metrics_per_fold=False)
Hyperpipe("hp_name", inner_cv=self.inner_cv_object,
best_config_metric='accuracy', metrics=["accuracy"],
calculate_metrics_across_folds=False,
calculate_metrics_per_fold=False)

with self.assertRaises(AttributeError):
hyperpipe = Hyperpipe("hp_name",
best_config_metric='accuracy', metrics=["accuracy"])
Hyperpipe("hp_name", best_config_metric='accuracy', metrics=["accuracy"])

data = np.random.random((500, 50))

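This is the pattern that gives the commit its title: instead of counting recorded warnings (assert len(w) == 1), the tests now check the warning text, which stays robust when dependencies emit unrelated warnings into the same catch_warnings block. A minimal sketch of the idiom, with a hypothetical message:

import warnings

with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter('always')  # record every warning, including duplicates
    warnings.warn('No best config metric was given, using the first metric.')

# brittle: assert len(w) == 1 fails as soon as anything else warns
# robust: look for the expected message among the recorded warnings
assert any('No best config metric' in str(rec.message) for rec in w)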
4 changes: 2 additions & 2 deletions test/base_tests/test_photon_batch.py
@@ -78,7 +78,7 @@ def test_transform(self):

with warnings.catch_warnings(record=True) as w:
self.neuro_batch.transform('str')
assert len(w) == 1
assert any("Cannot do batching" in s for s in [e.message.args[0] for e in w])

def test_predict(self):
y_predicted = self.neuro_batch.predict(self.data, **self.kwargs)
@@ -87,4 +87,4 @@ def test_predict(self):
self.assertEqual(y_predicted[-1], (self.data.shape[0]/self.batch_size))
with warnings.catch_warnings(record=True) as w:
self.neuro_batch.predict('str')
assert len(w) == 1
assert any("Cannot do batching" in s for s in [e.message.args[0] for e in w])
6 changes: 3 additions & 3 deletions test/base_tests/test_photon_elements.py
@@ -638,7 +638,7 @@ def callback_func(X, y, **kwargs):
self.assertTrue(no_callback_pipe.elements[-1][1] is my_callback)
test_branch.sanity_check_pipeline(no_callback_pipe)
self.assertFalse(no_callback_pipe.elements)
assert len(w) == 1
assert any("Last element of pipeline cannot be callback" in s for s in [e.message.args[0] for e in w])

def test_prepare_pipeline(self):
self.assertEqual(len(self.transformer_branch.elements), 2)
@@ -911,7 +911,7 @@ def test_predict_warning(self):
pe.add(PipelineElement('SVC'))
with warnings.catch_warnings(record=True) as w:
pe.predict([0, 1, 2])
assert len(w) == 1
assert any("There is no predict function" in s for s in [e.message.args[0] for e in w])


class DataFilterTests(unittest.TestCase):
@@ -991,4 +991,4 @@ def test_callback(self):

with warnings.catch_warnings(record=True) as w:
self.callback_branch_pipeline_error.fit(self.X, self.y).predict(self.X)
assert len(w) == 2
assert any("Last element of pipeline cannot be callback" in s for s in [e.message.args[0] for e in w])
2 changes: 1 addition & 1 deletion test/optimization_tests/nevergrad/test_nevergrad.py
@@ -141,7 +141,7 @@ def test_other(self):
of = lambda x: x ** 2
with warnings.catch_warnings(record=True) as w:
opt.prepare(pipeline_elements=pipeline_elements, maximize_metric=True, objective_function=of)
assert len(w) == 1
assert any("PHOTONAI has detected some" in s for s in [e.message.args[0] for e in w])

pipeline_elements = [PipelineElement("SVC", hyperparameters={'C': FloatRange(0.1, 0.5,
range_type='geomspace')})]
@@ -46,7 +46,7 @@ def test_time_limit(self):
for config in self.optimizer.ask:
configs.append(config)
stop = time.time()
self.assertAlmostEqual(stop-start, 3, 2)
self.assertAlmostEqual(stop-start, 3, 1)

def test_run(self):
pass
4 changes: 2 additions & 2 deletions test/optimization_tests/sk_opt/test_sk_opt.py
@@ -26,7 +26,7 @@ def test_empty_hspace(self):
with warnings.catch_warnings(record=True) as w:
self.optimizer.prepare([], True)
self.assertIsNone(self.optimizer.optimizer)
assert len(w) == 1
assert any("Did not find any" in s for s in [e.message.args[0] for e in w])

def test_eliminate_one_value_hyperparams(self):
pipeline_elements = [PipelineElement('PCA', hyperparameters={'n_components': Categorical([5])}),
@@ -36,7 +36,7 @@ def test_eliminate_one_value_hyperparams(self):
'tol': FloatRange(0.1, 1, range_type='logspace')})]
with warnings.catch_warnings(record=True) as w:
self.optimizer.prepare(pipeline_elements, True)
assert len(w) == 1
assert any("PHOTONAI has detected some" in s for s in [e.message.args[0] for e in w])
self.assertIn('SVC__C', self.optimizer.hyperparameter_list)
self.assertIn('SVC__shrinking', self.optimizer.hyperparameter_list)
self.assertNotIn('PCA__n_components', self.optimizer.hyperparameter_list)
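For context on the message matched above (a hedged aside mirroring the test's setup): a hyperparameter with a single candidate value is effectively a constant, so the optimizer eliminates it from the search space and warns — hence the assertion that 'PCA__n_components' is absent while the SVC parameters remain.

from photonai.base import PipelineElement
from photonai.optimization import Categorical, FloatRange

elements = [
    PipelineElement('PCA', hyperparameters={'n_components': Categorical([5])}),  # one value: a constant
    PipelineElement('SVC', hyperparameters={'C': FloatRange(0.1, 1)}),           # a real range: kept
]
# optimizer.prepare(elements, True) would warn ("PHOTONAI has detected some ...")
# and keep only SVC__C in optimizer.hyperparameter_list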