Skip to content

Commit

Permalink
update clustering algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
yzhao062 committed Jul 18, 2019
1 parent a02fe15 commit a4e7625
Show file tree
Hide file tree
Showing 4 changed files with 173 additions and 41 deletions.
83 changes: 71 additions & 12 deletions combo/models/cluster_comb.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def __init__(self, estimators, pre_fitted=False):
if len(estimators) < 2:
raise ValueError('At least 2 estimators are required')
self.estimators = estimators
self.len_estimators_ = len(self.estimators)
self.n_estimators_ = len(self.estimators)
self.pre_fitted = pre_fitted

@abstractmethod
Expand Down Expand Up @@ -232,17 +232,26 @@ class ClustererEnsemble(BaseClusteringAggregator):
A list of base estimators. Estimators must have a `labels_`
attribute once fitted. Sklearn clustering estimators are recommended.
n_clusters :
n_clusters : int, optional (default=8)
The number of clusters.
weights : numpy array of shape (n_estimators,)
Estimator weights. May be used after the alignment.
reference_idx : int in range [0, n_estimators-1], optional (default=0)
The ith base estimator used as the reference for label alignment.
pre_fitted : bool, optional (default=False)
Whether the base estimators are trained. If True, `fit`
process may be skipped.
Attributes
----------
labels_ : numpy array of shape (n_samples,)
The predicted cluster labels of the fitted data.
"""

def __init__(self, estimators, n_clusters, weights=None,
def __init__(self, estimators, n_clusters, weights=None, reference_idx=0,
pre_fitted=False):

super(ClustererEnsemble, self).__init__(
Expand All @@ -251,12 +260,16 @@ def __init__(self, estimators, n_clusters, weights=None,
check_parameter(n_clusters, low=2, param_name='n_clusters')
self.n_clusters = n_clusters

check_parameter(reference_idx, low=0, high=self.n_estimators_ - 1,
include_left=True, include_right=True)
self.reference_idx = reference_idx

if weights is None:
self.weights = np.ones([1, self.len_estimators_])
self.weights = np.ones([1, self.n_estimators_])
else:

self.weights = column_or_1d(weights).reshape(1, len(weights))
assert (self.weights.shape[1] == self.len_estimators_)
assert (self.weights.shape[1] == self.n_estimators_)

# adjust probability by a factor for integrity
adjust_factor = self.weights.shape[1] / np.sum(weights)
Expand All @@ -271,12 +284,11 @@ def fit(self, X):
The input samples.
"""

# Validate inputs X and y
# validate inputs X and y (optional)
# Validate inputs X
X = check_array(X)

# initialize the score matrix to store the results
original_labels = np.zeros([X.shape[0], self.len_estimators_])
original_labels = np.zeros([X.shape[0], self.n_estimators_])

if self.pre_fitted:
print("Training Skipped")
Expand All @@ -291,10 +303,57 @@ def fit(self, X):
original_labels[:, i] = estimator.labels_
self.oiginal_labels_ = original_labels

# get the aligned result
self.labels_, self.aligned_labels_ = clusterer_ensemble_scores(
original_labels,
self.n_estimators_,
n_clusters=self.n_clusters,
weights=self.weights,
return_results=True,
reference_idx=self.reference_idx)

def predict(self, X):
    """Predict the class labels for the provided data.

    Prediction is currently disabled for clustering, so this method always
    raises ``NotImplementedError``. The statements after the ``raise`` are
    an unreachable draft that is kept until the TODO below is resolved.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    labels : numpy array of shape (n_samples,)
        Class labels for each data sample.

    Raises
    ------
    NotImplementedError
        Always, while prediction is disabled for clustering.
    """
    # TODO: decide whether enable predict function for clustering
    # ``NotImplemented`` is a non-callable constant reserved for binary
    # operator fallbacks; calling it raises TypeError. The exception class
    # is ``NotImplementedError``.
    raise NotImplementedError("predict function is currently disabled for "
                              "clustering due to inconsistent behaviours.")

    # --- unreachable draft implementation (see TODO above) ---

    # Validate inputs X
    X = check_array(X)

    # initialize the score matrix to store the results
    original_labels = np.zeros([X.shape[0], self.n_estimators_])

    for i, estimator in enumerate(self.estimators):
        check_is_fitted(estimator, ['labels_'])
        original_labels[:, i] = estimator.predict(X)

    # get the aligned result
    predicted_labels = clusterer_ensemble_scores(
        original_labels,
        self.n_estimators_,
        n_clusters=self.n_clusters,
        weights=self.weights,
        return_results=False,
        reference_idx=self.reference_idx)

    return predicted_labels


def clusterer_ensemble_scores(original_labels, n_estimators, n_clusters,
weights=None, return_results=False,
selected_idx=0):
reference_idx=0):
"""Function to align the raw clustering results from base estimators.
Different from ClustererEnsemble class, this function takes in the output
from base estimators directly without training and prediction.
Expand All @@ -316,13 +375,13 @@ def clusterer_ensemble_scores(original_labels, n_estimators, n_clusters,
return_results : bool, optional (default=False)
If True, also return the aligned label matrix.
selected_idx : int in range [0, n_estimators-1], optional (default=0)
reference_idx : int in range [0, n_estimators-1], optional (default=0)
The ith base estimator used as the reference for label alignment.
Returns
-------
aligned_labels : numpy array of shape (n_samples, n_estimators)
The aligned label results by using selected_idx estimator as the
The aligned label results by using reference_idx estimator as the
reference.
"""
Expand All @@ -332,7 +391,7 @@ def clusterer_ensemble_scores(original_labels, n_estimators, n_clusters,
aligned_labels = np.copy(original_labels)

for i in range(n_estimators):
inter_mat = _intersection_mat(original_labels, selected_idx, i,
inter_mat = _intersection_mat(original_labels, reference_idx, i,
n_clusters)
index_mapping = _alignment(inter_mat, n_clusters, i, aligned_labels,
OFFSET_FACTOR)
Expand Down
2 changes: 1 addition & 1 deletion examples/classifier_comb_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
Expand All @@ -28,6 +27,7 @@
from sklearn.neighbors import KNeighborsClassifier

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from combo.models.classifier_comb import SimpleClassifierAggregator
Expand Down
68 changes: 68 additions & 0 deletions examples/cluster_comb_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
"""Example of combining multiple clustering algorithm. The example uses
Clusterer Ensemble by Zhi-hua Zhou, 2006.
"""
# Author: Yue Zhao <zhaoy@cmu.edu>
# License: BSD 2 clause


import os
import sys

# temporary solution for relative imports in case combo is not installed
# if combo is installed, no need to use the following line
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname("__file__"), '..')))

import numpy as np

from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import AgglomerativeClustering

from sklearn.datasets import load_breast_cancer

from combo.models.cluster_comb import clusterer_ensemble_scores
from combo.models.cluster_comb import ClustererEnsemble

import warnings

warnings.filterwarnings("ignore")

if __name__ == "__main__":
    # Define data file and read X and y.
    # y is loaded but not used below: clustering here is unsupervised.
    X, y = load_breast_cancer(return_X_y=True)

    n_clusters = 5

    # Initialize a set of estimators; each exposes `labels_` once fitted
    estimators = [KMeans(n_clusters=n_clusters),
                  MiniBatchKMeans(n_clusters=n_clusters),
                  AgglomerativeClustering(n_clusters=n_clusters)]

    # Derive the count from the list so the two cannot drift apart when an
    # estimator is added or removed.
    n_estimators = len(estimators)

    # --- Option 1: use the ClustererEnsemble class ---
    clf = ClustererEnsemble(estimators, n_clusters=n_clusters)
    clf.fit(X)

    # generate the labels on X
    aligned_labels = clf.aligned_labels_
    predicted_labels = clf.labels_

    # --- Option 2: Clusterer Ensemble without initializing a new Class ---
    # Collect the raw labels of every base estimator, one column each
    original_labels = np.zeros([X.shape[0], n_estimators])

    for i, estimator in enumerate(estimators):
        estimator.fit(X)
        original_labels[:, i] = estimator.labels_

    # Invoke method directly without initializing a new Class
    # Demo the effect of different parameters
    labels_by_vote1 = clusterer_ensemble_scores(original_labels, n_estimators,
                                                n_clusters)
    # return aligned_labels as well
    labels_by_vote2, aligned_labels = clusterer_ensemble_scores(
        original_labels, n_estimators, n_clusters, return_results=True)

    # select a different reference base estimator (default is 0)
    labels_by_vote3 = clusterer_ensemble_scores(original_labels, n_estimators,
                                                n_clusters, reference_idx=1)
61 changes: 33 additions & 28 deletions examples/temp_do_not_use.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,15 @@

import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import AgglomerativeClustering

from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

from combo.models.cluster_comb import clusterer_ensemble
from combo.utils.data import evaluate_print
from combo.models.cluster_comb import clusterer_ensemble_scores
from combo.models.cluster_comb import ClustererEnsemble
from combo.utils.utility import generate_bagging_indices

import warnings
Expand All @@ -36,30 +39,32 @@
# Define data file and read X and y
random_state = 42
X, y = load_breast_cancer(return_X_y=True)
X_norm = StandardScaler().fit_transform(X)
n_samples = X.shape[0]
n_features = X.shape[1]

n_clusters = 5
n_ite = 10

original_results = np.zeros([n_samples, n_ite])

for ite in range(n_ite):
print("build cluster... ite", ite + 1, "...")
# random_state = np.random.RandomState(random_state_seed.tomaxint())
random_state = np.random.RandomState(ite)
# randomly generate feature subspaces
sub_features = generate_bagging_indices(
random_state=random_state,
bootstrap_features=False,
n_features=n_features,
min_features=n_features * 0.5,
max_features=n_features)

X_sub = X_norm[:, sub_features]
kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(
X_sub)
original_results[:, ite] = kmeans.labels_

aligned_results = clusterer_ensemble(original_results, n_clusters, n_ite)
n_estimators = 3

# Initialize a set of estimators
estimators = [KMeans(n_clusters=n_clusters),
MiniBatchKMeans(n_clusters=n_clusters),
AgglomerativeClustering(n_clusters=n_clusters)]

clf = ClustererEnsemble(estimators, n_clusters=n_clusters)
clf.fit(X)
predicted_labels = clf.labels_
aligned_labels = clf.aligned_labels_

# Clusterer Ensemble without initializing a new Class
original_labels = np.zeros([X.shape[0], n_estimators])

for i, estimator in enumerate(estimators):
estimator.fit(X)
original_labels[:, i] = estimator.labels_

# Invoke method directly without initializing a new Class
labels_by_vote1 = clusterer_ensemble_scores(original_labels, n_estimators,
n_clusters)
labels_by_vote2, aligned_labels = clusterer_ensemble_scores(
original_labels, n_estimators, n_clusters, return_results=True)

labels_by_vote3 = clusterer_ensemble_scores(original_labels, n_estimators,
n_clusters, reference_idx=1)

0 comments on commit a4e7625

Please sign in to comment.