Skip to content

Commit

Permalink
simplify cluster comb code
Browse files Browse the repository at this point in the history
  • Loading branch information
yzhao062 committed Jul 29, 2019
1 parent 99271aa commit ba632df
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 320 deletions.
243 changes: 33 additions & 210 deletions combo/models/cluster_comb.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,214 +15,20 @@
from sklearn.utils import column_or_1d
from sklearn.utils.testing import assert_equal

from .base import BaseAggregator
from .score_comb import majority_vote
from .sklearn_base import _pprint
from ..utils.utility import check_parameter
from ..utils.utility import _sklearn_version_21

if _sklearn_version_21():
from inspect import signature
else:
from sklearn.externals.funcsigs import signature

OFFSET_FACTOR = 1000000


class BaseClusteringAggregator(ABC):
    """Abstract base class for all clustering aggregation methods.

    Subclasses combine the cluster assignments of multiple base
    estimators into a single consensus clustering.

    Parameters
    ----------
    estimators : list of shape (n_estimators,)
        A list of base estimators.

    pre_fitted : bool, optional (default=False)
        Whether the base estimators are fitted. If True, the `fit`
        process may be skipped.
    """

    @abstractmethod
    def __init__(self, estimators, pre_fitted=False):
        # NOTE(review): isinstance(estimators, (list)) only accepts a plain
        # list -- tuples and numpy arrays are rejected by this assert.
        assert (isinstance(estimators, (list)))
        if len(estimators) < 2:
            raise ValueError('At least 2 estimators are required')
        self.estimators = estimators
        # Trailing underscore follows the sklearn convention for attributes
        # derived from constructor inputs.
        self.n_estimators_ = len(self.estimators)
        self.pre_fitted = pre_fitted

    @abstractmethod
    def fit(self, X):
        """Fit detector.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.
        """
        pass

    @abstractmethod
    def predict(self, X):
        """Predict the class labels for the provided data.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        labels : numpy array of shape (n_samples,)
            Class labels for each data sample.
        """
        pass

    def __len__(self):
        """Returns the number of estimators in the ensemble."""
        return len(self.estimators)

    def __getitem__(self, index):
        """Returns the index'th estimator in the ensemble."""
        return self.estimators[index]

    def __iter__(self):
        """Returns iterator over estimators in the ensemble."""
        return iter(self.estimators)

    # noinspection PyMethodParameters
    def _get_param_names(cls):
        # noinspection PyPep8
        """Get parameter names for the estimator.

        NOTE(review): declared with a ``cls`` first argument but NOT
        decorated with ``@classmethod``, so when called on an instance the
        instance itself is bound to ``cls``. The introspection below still
        works because ``cls.__init__`` resolves through the instance.

        See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
        and sklearn/base.py for more information.
        """

        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
        if init is object.__init__:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [p for p in init_signature.parameters.values()
                      if p.name != 'self' and p.kind != p.VAR_KEYWORD]
        for p in parameters:
            if p.kind == p.VAR_POSITIONAL:
                raise RuntimeError("scikit-learn estimators should always "
                                   "specify their parameters in the signature"
                                   " of their __init__ (no varargs)."
                                   " %s with constructor %s doesn't "
                                   " follow this convention."
                                   % (cls, init_signature))
        # Extract and sort argument names excluding 'self'
        return sorted([p.name for p in parameters])

    # noinspection PyPep8
    def get_params(self, deep=True):
        """Get parameters for this estimator.

        See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
        and sklearn/base.py for more information.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """

        out = dict()
        for key in self._get_param_names():
            # We need deprecation warnings to always be on in order to
            # catch deprecated param values.
            # This is set in utils/__init__.py but it gets overwritten
            # when running under python3 somehow.
            warnings.simplefilter("always", DeprecationWarning)
            try:
                with warnings.catch_warnings(record=True) as w:
                    value = getattr(self, key, None)
                if len(w) and w[0].category == DeprecationWarning:
                    # if the parameter is deprecated, don't show it
                    continue
            finally:
                # simplefilter() above pushed a filter onto the front of the
                # global warnings.filters list; remove it so repeated calls
                # do not leak filters.
                warnings.filters.pop(0)

            # XXX: should we rather test if instance of estimator?
            if deep and hasattr(value, 'get_params'):
                deep_items = value.get_params().items()
                # Flatten nested params as '<component>__<parameter>'.
                out.update((key + '__' + k, val) for k, val in deep_items)
            out[key] = value
        return out

    def set_params(self, **params):
        # noinspection PyPep8
        """Set the parameters of this estimator.

        The method works on simple estimators as well as on nested objects
        (such as pipelines). The latter have parameters of the form
        ``<component>__<parameter>`` so that it's possible to update each
        component of a nested object.

        See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
        and sklearn/base.py for more information.

        Returns
        -------
        self : object
        """

        if not params:
            # Simple optimization to gain speed (inspect is slow)
            return self
        valid_params = self.get_params(deep=True)

        nested_params = defaultdict(dict)  # grouped by prefix
        for key, value in params.items():
            # Split 'component__parameter' into its prefix and sub-key.
            key, delim, sub_key = key.partition('__')
            if key not in valid_params:
                raise ValueError('Invalid parameter %s for estimator %s. '
                                 'Check the list of available parameters '
                                 'with `estimator.get_params().keys()`.' %
                                 (key, self))

            if delim:
                nested_params[key][sub_key] = value
            else:
                setattr(self, key, value)

        # Delegate nested parameters to the corresponding sub-estimators.
        for key, sub_params in nested_params.items():
            valid_params[key].set_params(**sub_params)

        return self

    def __repr__(self):
        # noinspection PyPep8
        """Build a sklearn-style repr: ClassName(param=value, ...).

        See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
        and sklearn/base.py for more information.
        """

        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False),
                                               offset=len(class_name), ),)


class ClustererEnsemble(BaseClusteringAggregator):
class ClustererEnsemble(BaseAggregator):
"""Clusterer Ensemble combines multiple base clustering estimators by
alignment. See :cite:`zhou2006clusterer` for details.
Parameters
----------
estimators : list or numpy array of shape (n_estimators,)
base_estimators : list or numpy array of shape (n_estimators,)
A list of base estimators. Estimators must have a `labels_`
attribute once fitted. Sklearn clustering estimators are recommended.
Expand All @@ -245,25 +51,26 @@ class ClustererEnsemble(BaseClusteringAggregator):
The predicted label of the fitted data.
"""

def __init__(self, estimators, n_clusters, weights=None, reference_idx=0,
def __init__(self, base_estimators, n_clusters, weights=None,
reference_idx=0,
pre_fitted=False):

super(ClustererEnsemble, self).__init__(
estimators=estimators, pre_fitted=pre_fitted)
base_estimators=base_estimators, pre_fitted=pre_fitted)

check_parameter(n_clusters, low=2, param_name='n_clusters')
self.n_clusters = n_clusters

check_parameter(reference_idx, low=0, high=self.n_estimators_ - 1,
check_parameter(reference_idx, low=0, high=self.n_base_estimators_ - 1,
include_left=True, include_right=True)
self.reference_idx = reference_idx

if weights is None:
self.weights = np.ones([1, self.n_estimators_])
self.weights = np.ones([1, self.n_base_estimators_])
else:

self.weights = column_or_1d(weights).reshape(1, len(weights))
assert (self.weights.shape[1] == self.n_estimators_)
assert (self.weights.shape[1] == self.n_base_estimators_)

# adjust probability by a factor for integrity
adjust_factor = self.weights.shape[1] / np.sum(weights)
Expand All @@ -282,25 +89,25 @@ def fit(self, X):
X = check_array(X)

# initialize the score matrix to store the results
original_labels = np.zeros([X.shape[0], self.n_estimators_])
original_labels = np.zeros([X.shape[0], self.n_base_estimators_])

if self.pre_fitted:
print("Training Skipped")

else:
for clf in self.estimators:
for clf in self.base_estimators:
clf.fit(X)
clf.fitted_ = True

for i, estimator in enumerate(self.estimators):
for i, estimator in enumerate(self.base_estimators):
check_is_fitted(estimator, ['labels_'])
original_labels[:, i] = estimator.labels_
self.oiginal_labels_ = original_labels
self.original_labels_ = original_labels

# get the aligned result
self.labels_, self.aligned_labels_ = clusterer_ensemble_scores(
original_labels,
self.n_estimators_,
self.n_base_estimators_,
n_clusters=self.n_clusters,
weights=self.weights,
return_results=True,
Expand All @@ -327,23 +134,39 @@ def predict(self, X):
X = check_array(X)

# initialize the score matrix to store the results
original_labels = np.zeros([X.shape[0], self.n_estimators_])
original_labels = np.zeros([X.shape[0], self.n_base_estimators_])

for i, estimator in enumerate(self.estimators):
for i, estimator in enumerate(self.base_estimators):
check_is_fitted(estimator, ['labels_'])
original_labels[:, i] = estimator.predict(X)

# get the aligned result
predicted_labels = clusterer_ensemble_scores(
original_labels,
self.n_estimators_,
self.n_base_estimators_,
n_clusters=self.n_clusters,
weights=self.weights,
return_results=False,
reference_idx=self.reference_idx)

return predicted_labels

def predict_proba(self, X):
    """Probability estimates are not supported for clustering ensembles.

    Aligned cluster labels do not carry calibrated per-cluster
    probabilities, so this method unconditionally raises.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples (ignored).

    Raises
    ------
    NotImplementedError
        Always raised; probability output is disabled for clustering.
    """
    # Bug fix: the original `raise NotImplemented(...)` raised a TypeError,
    # because NotImplemented is a sentinel constant, not an exception class.
    # Also restore the missing space between "for" and "clustering" in the
    # concatenated message.
    raise NotImplementedError("predict_proba function is currently disabled "
                              "for clustering due to inconsistent "
                              "behaviours.")


def clusterer_ensemble_scores(original_labels, n_estimators, n_clusters,
weights=None, return_results=False,
Expand Down
Loading

0 comments on commit ba632df

Please sign in to comment.