Skip to content

Commit

Permalink
simplify cluster comb code
Browse files Browse the repository at this point in the history
  • Loading branch information
yzhao062 committed Jul 29, 2019
1 parent 99271aa commit ba632df
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 320 deletions.
243 changes: 33 additions & 210 deletions combo/models/cluster_comb.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,214 +15,20 @@
from sklearn.utils import column_or_1d
from sklearn.utils.testing import assert_equal

from .base import BaseAggregator
from .score_comb import majority_vote
from .sklearn_base import _pprint
from ..utils.utility import check_parameter
from ..utils.utility import _sklearn_version_21

if _sklearn_version_21():
from inspect import signature
else:
from sklearn.externals.funcsigs import signature

OFFSET_FACTOR = 1000000


class BaseClusteringAggregator(ABC):
    """Abstract base class for all clustering aggregation methods.

    Subclasses combine the cluster assignments of multiple base
    estimators into a single consensus clustering.

    Parameters
    ----------
    estimators : list of shape (n_estimators,)
        A list of base estimators.

    pre_fitted : bool, optional (default=False)
        Whether the base estimators are fitted. If True, the `fit`
        process may be skipped.
    """

    @abstractmethod
    def __init__(self, estimators, pre_fitted=False):
        # NOTE(review): isinstance(estimators, (list)) only accepts a plain
        # list -- tuples and numpy arrays are rejected by this assert.
        assert (isinstance(estimators, (list)))
        if len(estimators) < 2:
            raise ValueError('At least 2 estimators are required')
        self.estimators = estimators
        # Trailing underscore follows the sklearn convention for attributes
        # derived from constructor inputs.
        self.n_estimators_ = len(self.estimators)
        self.pre_fitted = pre_fitted

    @abstractmethod
    def fit(self, X):
        """Fit detector.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.
        """
        pass

    @abstractmethod
    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        labels : numpy array of shape (n_samples,)
            Class labels for each data sample.
        """
        pass

    def __len__(self):
        """Returns the number of estimators in the ensemble."""
        return len(self.estimators)

    def __getitem__(self, index):
        """Returns the index'th estimator in the ensemble."""
        return self.estimators[index]

    def __iter__(self):
        """Returns iterator over estimators in the ensemble."""
        return iter(self.estimators)

    # noinspection PyMethodParameters
    def _get_param_names(cls):
        # noinspection PyPep8
        """Get parameter names for the estimator.

        NOTE(review): declared with a ``cls`` first argument but NOT
        decorated with ``@classmethod``, so when called on an instance the
        instance itself is bound to ``cls``. The introspection below still
        works because ``cls.__init__`` resolves through the instance.

        See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
        and sklearn/base.py for more information.
        """

        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
        if init is object.__init__:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [p for p in init_signature.parameters.values()
                      if p.name != 'self' and p.kind != p.VAR_KEYWORD]
        for p in parameters:
            if p.kind == p.VAR_POSITIONAL:
                raise RuntimeError("scikit-learn estimators should always "
                                   "specify their parameters in the signature"
                                   " of their __init__ (no varargs)."
                                   " %s with constructor %s doesn't "
                                   " follow this convention."
                                   % (cls, init_signature))
        # Extract and sort argument names excluding 'self'
        return sorted([p.name for p in parameters])

    # noinspection PyPep8
    def get_params(self, deep=True):
        """Get parameters for this estimator.

        See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
        and sklearn/base.py for more information.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """

        out = dict()
        for key in self._get_param_names():
            # We need deprecation warnings to always be on in order to
            # catch deprecated param values.
            # This is set in utils/__init__.py but it gets overwritten
            # when running under python3 somehow.
            warnings.simplefilter("always", DeprecationWarning)
            try:
                with warnings.catch_warnings(record=True) as w:
                    value = getattr(self, key, None)
                if len(w) and w[0].category == DeprecationWarning:
                    # if the parameter is deprecated, don't show it
                    continue
            finally:
                # simplefilter() above pushed a filter onto the front of the
                # global warnings.filters list; remove it so repeated calls
                # do not leak filters.
                warnings.filters.pop(0)

            # XXX: should we rather test if instance of estimator?
            if deep and hasattr(value, 'get_params'):
                deep_items = value.get_params().items()
                # Flatten nested params as '<component>__<parameter>'.
                out.update((key + '__' + k, val) for k, val in deep_items)
            out[key] = value
        return out

    def set_params(self, **params):
        # noinspection PyPep8
        """Set the parameters of this estimator.

        The method works on simple estimators as well as on nested objects
        (such as pipelines). The latter have parameters of the form
        ``<component>__<parameter>`` so that it's possible to update each
        component of a nested object.

        See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
        and sklearn/base.py for more information.

        Returns
        -------
        self : object
        """

        if not params:
            # Simple optimization to gain speed (inspect is slow)
            return self
        valid_params = self.get_params(deep=True)

        nested_params = defaultdict(dict)  # grouped by prefix
        for key, value in params.items():
            # Split 'component__parameter' into its prefix and sub-key.
            key, delim, sub_key = key.partition('__')
            if key not in valid_params:
                raise ValueError('Invalid parameter %s for estimator %s. '
                                 'Check the list of available parameters '
                                 'with `estimator.get_params().keys()`.' %
                                 (key, self))

            if delim:
                nested_params[key][sub_key] = value
            else:
                setattr(self, key, value)

        # Delegate nested parameters to the corresponding sub-estimators.
        for key, sub_params in nested_params.items():
            valid_params[key].set_params(**sub_params)

        return self

    def __repr__(self):
        # noinspection PyPep8
        """Build a sklearn-style repr: ClassName(param=value, ...).

        See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
        and sklearn/base.py for more information.
        """

        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False),
                                               offset=len(class_name), ),)


class ClustererEnsemble(BaseClusteringAggregator):
class ClustererEnsemble(BaseAggregator):
"""Clusterer Ensemble combines multiple base clustering estimators by
alignment. See :cite:`zhou2006clusterer` for details.
Parameters
----------
estimators : list or numpy array of shape (n_estimators,)
base_estimators : list or numpy array of shape (n_estimators,)
A list of base estimators. Estimators must have a `labels_`
attribute once fitted. Sklearn clustering estimators are recommended.
Expand All @@ -245,25 +51,26 @@ class ClustererEnsemble(BaseClusteringAggregator):
The predicted label of the fitted data.
"""

def __init__(self, estimators, n_clusters, weights=None, reference_idx=0,
def __init__(self, base_estimators, n_clusters, weights=None,
reference_idx=0,
pre_fitted=False):

super(ClustererEnsemble, self).__init__(
estimators=estimators, pre_fitted=pre_fitted)
base_estimators=base_estimators, pre_fitted=pre_fitted)

check_parameter(n_clusters, low=2, param_name='n_clusters')
self.n_clusters = n_clusters

check_parameter(reference_idx, low=0, high=self.n_estimators_ - 1,
check_parameter(reference_idx, low=0, high=self.n_base_estimators_ - 1,
include_left=True, include_right=True)
self.reference_idx = reference_idx

if weights is None:
self.weights = np.ones([1, self.n_estimators_])
self.weights = np.ones([1, self.n_base_estimators_])
else:

self.weights = column_or_1d(weights).reshape(1, len(weights))
assert (self.weights.shape[1] == self.n_estimators_)
assert (self.weights.shape[1] == self.n_base_estimators_)

# adjust probability by a factor for integrity
adjust_factor = self.weights.shape[1] / np.sum(weights)
Expand All @@ -282,25 +89,25 @@ def fit(self, X):
X = check_array(X)

# initialize the score matrix to store the results
original_labels = np.zeros([X.shape[0], self.n_estimators_])
original_labels = np.zeros([X.shape[0], self.n_base_estimators_])

if self.pre_fitted:
print("Training Skipped")

else:
for clf in self.estimators:
for clf in self.base_estimators:
clf.fit(X)
clf.fitted_ = True

for i, estimator in enumerate(self.estimators):
for i, estimator in enumerate(self.base_estimators):
check_is_fitted(estimator, ['labels_'])
original_labels[:, i] = estimator.labels_
self.oiginal_labels_ = original_labels
self.original_labels_ = original_labels

# get the aligned result
self.labels_, self.aligned_labels_ = clusterer_ensemble_scores(
original_labels,
self.n_estimators_,
self.n_base_estimators_,
n_clusters=self.n_clusters,
weights=self.weights,
return_results=True,
Expand All @@ -327,23 +134,39 @@ def predict(self, X):
X = check_array(X)

# initialize the score matrix to store the results
original_labels = np.zeros([X.shape[0], self.n_estimators_])
original_labels = np.zeros([X.shape[0], self.n_base_estimators_])

for i, estimator in enumerate(self.estimators):
for i, estimator in enumerate(self.base_estimators):
check_is_fitted(estimator, ['labels_'])
original_labels[:, i] = estimator.predict(X)

# get the aligned result
predicted_labels = clusterer_ensemble_scores(
original_labels,
self.n_estimators_,
self.n_base_estimators_,
n_clusters=self.n_clusters,
weights=self.weights,
return_results=False,
reference_idx=self.reference_idx)

return predicted_labels

def predict_proba(self, X):
    """Probability estimates are not supported for clustering ensembles.

    Aligned cluster labels do not carry calibrated per-cluster
    probabilities, so this method unconditionally raises.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples (ignored).

    Raises
    ------
    NotImplementedError
        Always raised; probability output is disabled for clustering.
    """
    # Bug fix: the original `raise NotImplemented(...)` raised a TypeError,
    # because NotImplemented is a sentinel constant, not an exception class.
    # Also restore the missing space between "for" and "clustering" in the
    # concatenated message.
    raise NotImplementedError("predict_proba function is currently disabled "
                              "for clustering due to inconsistent "
                              "behaviours.")


def clusterer_ensemble_scores(original_labels, n_estimators, n_clusters,
weights=None, return_results=False,
Expand Down
Loading

0 comments on commit ba632df

Please sign in to comment.