update clustering algorithm

yzhao062 · Jul 18, 2019 · a4e7625 · a4e7625
1 parent a02fe15
commit a4e7625
Show file tree

Hide file tree

Showing 4 changed files with 173 additions and 41 deletions.
diff --git a/combo/models/cluster_comb.py b/combo/models/cluster_comb.py
@@ -50,7 +50,7 @@ def __init__(self, estimators, pre_fitted=False):
         if len(estimators) < 2:
             raise ValueError('At least 2 estimators are required')
         self.estimators = estimators
-        self.len_estimators_ = len(self.estimators)
+        self.n_estimators_ = len(self.estimators)
         self.pre_fitted = pre_fitted
 
     @abstractmethod
@@ -232,17 +232,26 @@ class ClustererEnsemble(BaseClusteringAggregator):
         A list of base estimators. Estimators must have a `labels_`
         attribute once fitted. Sklearn clustering estimators are recommended.
 
-    n_clusters :
+    n_clusters : int, optional (default=8)
+        The number of clusters.
 
     weights : numpy array of shape (n_estimators,)
         Estimator weights. May be used after the alignment.
 
+    reference_idx : int in range [0, n_estimators-1], optional (default=0)
+        The ith base estimator used as the reference for label alignment.
+
     pre_fitted : bool, optional (default=False)
         Whether the base estimators are trained. If True, `fit`
         process may be skipped.
+
+    Attributes
+    ----------
+    labels_ : numpy array of shape (n_samples,)
+        The predicted labels of the fitted data.
     """
 
-    def __init__(self, estimators, n_clusters, weights=None,
+    def __init__(self, estimators, n_clusters, weights=None, reference_idx=0,
                  pre_fitted=False):
 
         super(ClustererEnsemble, self).__init__(
@@ -251,12 +260,16 @@ def __init__(self, estimators, n_clusters, weights=None,
         check_parameter(n_clusters, low=2, param_name='n_clusters')
         self.n_clusters = n_clusters
 
+        check_parameter(reference_idx, low=0, high=self.n_estimators_ - 1,
+                        include_left=True, include_right=True)
+        self.reference_idx = reference_idx
+
         if weights is None:
-            self.weights = np.ones([1, self.len_estimators_])
+            self.weights = np.ones([1, self.n_estimators_])
         else:
 
             self.weights = column_or_1d(weights).reshape(1, len(weights))
-            assert (self.weights.shape[1] == self.len_estimators_)
+            assert (self.weights.shape[1] == self.n_estimators_)
 
             # adjust probability by a factor for integrity
             adjust_factor = self.weights.shape[1] / np.sum(weights)
@@ -271,12 +284,11 @@ def fit(self, X):
             The input samples.
         """
 
-        # Validate inputs X and y
-        # validate inputs X and y (optional)
+        # Validate inputs X
         X = check_array(X)
 
         # initialize the score matrix to store the results
-        original_labels = np.zeros([X.shape[0], self.len_estimators_])
+        original_labels = np.zeros([X.shape[0], self.n_estimators_])
 
         if self.pre_fitted:
             print("Training Skipped")
@@ -291,10 +303,57 @@ def fit(self, X):
             original_labels[:, i] = estimator.labels_
         self.oiginal_labels_ = original_labels
 
+        # get the aligned result
+        self.labels_, self.aligned_labels_ = clusterer_ensemble_scores(
+            original_labels,
+            self.n_estimators_,
+            n_clusters=self.n_clusters,
+            weights=self.weights,
+            return_results=True,
+            reference_idx=self.reference_idx)
+
+    def predict(self, X):
+        """Predict the class labels for the provided data (currently disabled).
+
+        Parameters
+        ----------
+        X : numpy array of shape (n_samples, n_features)
+            The input samples.
+
+        Returns
+        -------
+        labels : numpy array of shape (n_samples,)
+            Class labels for each data sample.
+        """
+        # TODO: decide whether to enable the predict function for clustering
+        raise NotImplementedError("predict function is currently disabled for "
+                                  "clustering due to inconsistent behaviours.")
+
+        # Validate inputs X
+        X = check_array(X)
+
+        # initialize the score matrix to store the results
+        original_labels = np.zeros([X.shape[0], self.n_estimators_])
+
+        for i, estimator in enumerate(self.estimators):
+            check_is_fitted(estimator, ['labels_'])
+            original_labels[:, i] = estimator.predict(X)
+
+        # get the aligned result
+        predicted_labels = clusterer_ensemble_scores(
+            original_labels,
+            self.n_estimators_,
+            n_clusters=self.n_clusters,
+            weights=self.weights,
+            return_results=False,
+            reference_idx=self.reference_idx)
+
+        return predicted_labels
+
 
 def clusterer_ensemble_scores(original_labels, n_estimators, n_clusters,
                               weights=None, return_results=False,
-                              selected_idx=0):
+                              reference_idx=0):
     """Function to align the raw clustering results from base estimators.
     Different from ClustererEnsemble class, this function takes in the output
     from base estimators directly without training and prediction.
@@ -316,13 +375,13 @@ def clusterer_ensemble_scores(original_labels, n_estimators, n_clusters,
     return_results : bool, optional (default=False)
         If True, also return the aligned label matrix.
 
-    selected_idx : int in range [0, n_estimators-1], optional (default=0)
+    reference_idx : int in range [0, n_estimators-1], optional (default=0)
         The ith base estimator used as the reference for label alignment.
 
     Returns
     -------
     aligned_labels : numpy array of shape (n_samples, n_estimators)
-        The aligned label results by using selected_idx estimator as the
+        The aligned label results by using reference_idx estimator as the
         reference.
 
     """
@@ -332,7 +391,7 @@ def clusterer_ensemble_scores(original_labels, n_estimators, n_clusters,
     aligned_labels = np.copy(original_labels)
 
     for i in range(n_estimators):
-        inter_mat = _intersection_mat(original_labels, selected_idx, i,
+        inter_mat = _intersection_mat(original_labels, reference_idx, i,
                                       n_clusters)
         index_mapping = _alignment(inter_mat, n_clusters, i, aligned_labels,
                                    OFFSET_FACTOR)

diff --git a/examples/classifier_comb_example.py b/examples/classifier_comb_example.py
@@ -19,7 +19,6 @@
     os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))
 
 import numpy as np
-from sklearn.model_selection import train_test_split
 
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.linear_model import LogisticRegression
@@ -28,6 +27,7 @@
 from sklearn.neighbors import KNeighborsClassifier
 
 from sklearn.datasets import load_breast_cancer
+from sklearn.model_selection import train_test_split
 from sklearn.metrics import roc_auc_score
 
 from combo.models.classifier_comb import SimpleClassifierAggregator

diff --git a/examples/cluster_comb_example.py b/examples/cluster_comb_example.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+"""Example of combining multiple clustering algorithms. The example uses
+Clusterer Ensemble by Zhi-hua Zhou, 2006.
+"""
+# Author: Yue Zhao <zhaoy@cmu.edu>
+# License: BSD 2 clause
+
+
+import os
+import sys
+
+# temporary solution for relative imports in case combo is not installed
+# if combo is installed, no need to use the following line
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))
+
+import numpy as np
+
+from sklearn.cluster import KMeans
+from sklearn.cluster import MiniBatchKMeans
+from sklearn.cluster import AgglomerativeClustering
+
+from sklearn.datasets import load_breast_cancer
+
+from combo.models.cluster_comb import clusterer_ensemble_scores
+from combo.models.cluster_comb import ClustererEnsemble
+
+import warnings
+
+warnings.filterwarnings("ignore")
+
+if __name__ == "__main__":
+    # Load the breast cancer dataset (y is not used by the clustering below)
+    X, y = load_breast_cancer(return_X_y=True)
+
+    n_clusters = 5
+    n_estimators = 3  # must equal the number of base estimators below
+
+    # Base clusterers to combine, all configured with the same n_clusters
+    estimators = [KMeans(n_clusters=n_clusters),
+                  MiniBatchKMeans(n_clusters=n_clusters),
+                  AgglomerativeClustering(n_clusters=n_clusters)]
+
+    clf = ClustererEnsemble(estimators, n_clusters=n_clusters)
+    clf.fit(X)
+
+    # Read the aligned per-estimator labels and the predicted labels for X
+    aligned_labels = clf.aligned_labels_
+    predicted_labels = clf.labels_
+
+    # Clusterer Ensemble without the class: collect each estimator's raw labels
+    original_labels = np.zeros([X.shape[0], n_estimators])
+
+    for i, estimator in enumerate(estimators):
+        estimator.fit(X)
+        original_labels[:, i] = estimator.labels_
+
+    # Invoke the function directly instead of instantiating the class
+    # Demo the effect of different parameters
+    labels_by_vote1 = clusterer_ensemble_scores(original_labels, n_estimators,
+                                                n_clusters)
+    # return_results=True also returns the aligned label matrix
+    labels_by_vote2, aligned_labels = clusterer_ensemble_scores(
+        original_labels, n_estimators, n_clusters, return_results=True)
+
+    # select a different reference base estimator (default is 0)
+    labels_by_vote3 = clusterer_ensemble_scores(original_labels, n_estimators,
+                                                n_clusters, reference_idx=1)
diff --git a/examples/temp_do_not_use.py b/examples/temp_do_not_use.py
@@ -20,12 +20,15 @@
 
 import numpy as np
 
-from sklearn.datasets import load_breast_cancer
 from sklearn.cluster import KMeans
+from sklearn.cluster import MiniBatchKMeans
+from sklearn.cluster import AgglomerativeClustering
+
+from sklearn.datasets import load_breast_cancer
 from sklearn.preprocessing import StandardScaler
 
-from combo.models.cluster_comb import clusterer_ensemble
-from combo.utils.data import evaluate_print
+from combo.models.cluster_comb import clusterer_ensemble_scores
+from combo.models.cluster_comb import ClustererEnsemble
 from combo.utils.utility import generate_bagging_indices
 
 import warnings
@@ -36,30 +39,32 @@
     # Define data file and read X and y
     random_state = 42
     X, y = load_breast_cancer(return_X_y=True)
-    X_norm = StandardScaler().fit_transform(X)
-    n_samples = X.shape[0]
-    n_features = X.shape[1]
 
     n_clusters = 5
-    n_ite = 10
-
-    original_results = np.zeros([n_samples, n_ite])
-
-    for ite in range(n_ite):
-        print("build cluster... ite", ite + 1, "...")
-        # random_state = np.random.RandomState(random_state_seed.tomaxint())
-        random_state = np.random.RandomState(ite)
-        # randomly generate feature subspaces
-        sub_features = generate_bagging_indices(
-            random_state=random_state,
-            bootstrap_features=False,
-            n_features=n_features,
-            min_features=n_features * 0.5,
-            max_features=n_features)
-
-        X_sub = X_norm[:, sub_features]
-        kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(
-            X_sub)
-        original_results[:, ite] = kmeans.labels_
-
-    aligned_results = clusterer_ensemble(original_results, n_clusters, n_ite)
+    n_estimators = 3
+
+    # Initialize a set of estimators
+    estimators = [KMeans(n_clusters=n_clusters),
+                  MiniBatchKMeans(n_clusters=n_clusters),
+                  AgglomerativeClustering(n_clusters=n_clusters)]
+
+    clf = ClustererEnsemble(estimators, n_clusters=n_clusters)
+    clf.fit(X)
+    predicted_labels = clf.labels_
+    aligned_labels = clf.aligned_labels_
+
+    # Clusterer Ensemble without initializing a new Class
+    original_labels = np.zeros([X.shape[0], n_estimators])
+
+    for i, estimator in enumerate(estimators):
+        estimator.fit(X)
+        original_labels[:, i] = estimator.labels_
+
+    # Invoke method directly without initializing a new Class
+    labels_by_vote1 = clusterer_ensemble_scores(original_labels, n_estimators,
+                                                n_clusters)
+    labels_by_vote2, aligned_labels = clusterer_ensemble_scores(
+        original_labels, n_estimators, n_clusters, return_results=True)
+
+    labels_by_vote3 = clusterer_ensemble_scores(original_labels, n_estimators,
+                                                n_clusters, reference_idx=1)