-
Notifications
You must be signed in to change notification settings - Fork 73
/
spectral_clusterer.py
196 lines (169 loc) · 7.7 KB
/
spectral_clusterer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
"""A spectral clusterer class to perform clustering."""
import numpy as np
from spectralcluster import constraint
from spectralcluster import custom_distance_kmeans
from spectralcluster import laplacian
from spectralcluster import refinement
from spectralcluster import utils
RefinementName = refinement.RefinementName
LaplacianType = laplacian.LaplacianType
ConstraintName = constraint.ConstraintName
EigenGapType = utils.EigenGapType
class SpectralClusterer:
"""Spectral clustering class."""
def __init__(self,
min_clusters=None,
max_clusters=None,
refinement_options=None,
autotune=None,
laplacian_type=None,
stop_eigenvalue=1e-2,
row_wise_renorm=False,
custom_dist="cosine",
max_iter=300,
constraint_options=None,
eigengap_type=EigenGapType.Ratio):
"""Constructor of the clusterer.
Args:
min_clusters: minimal number of clusters allowed (only effective if not
None)
max_clusters: maximal number of clusters allowed (only effective if not
None), can be used together with min_clusters to fix the number of
clusters
refinement_options: a RefinementOptions object that contains refinement
arguments for the affinity matrix. If None, we will not refine
autotune: an AutoTune object to automatically search p_percentile
laplacian_type: a LaplacianType. If None, we do not use a laplacian matrix
stop_eigenvalue: when computing the number of clusters using Eigen Gap, we
do not look at eigen values smaller than this value
row_wise_renorm: if True, perform row-wise re-normalization on the
spectral embeddings
custom_dist: str or callable. custom distance measure for k-means. If a
string, "cosine", "euclidean", "mahalanobis", or any other distance
functions defined in scipy.spatial.distance can be used
max_iter: the maximum number of iterations for the custom k-means
constraint_options: a ConstraintOptions object that contains constraint
arguments
eigengap_type: the type of the eigengap computation
"""
self.min_clusters = min_clusters
self.max_clusters = max_clusters
if not refinement_options:
self.refinement_options = refinement.RefinementOptions()
else:
self.refinement_options = refinement_options
self.autotune = autotune
self.laplacian_type = laplacian_type
self.row_wise_renorm = row_wise_renorm
self.stop_eigenvalue = stop_eigenvalue
self.custom_dist = custom_dist
self.max_iter = max_iter
self.constraint_options = constraint_options
self.eigengap_type = eigengap_type
def _compute_eigenvectors_ncluster(self, affinity, constraint_matrix=None):
"""Perform eigen decomposition and estiamte the number of clusters.
Perform affinity refinement, eigen decomposition and sort eigenvectors by
the real part of eigenvalues. Estimate the number of clusters using EigenGap
principle.
Args:
affinity: the affinity matrix of input data
constraint_matrix: numpy array of shape (n_samples, n_samples). The
constraint matrix with prior information
Returns:
eigenvectors: sorted eigenvectors. numpy array of shape
(n_samples, n_samples)
n_clusters: number of clusters as an integer
max_delta_norm: normalized maximum eigen gap
"""
if (self.constraint_options and
self.constraint_options.apply_before_refinement):
# Perform the constraint operation before refinement
affinity = self.constraint_options.constraint_operator.adjust_affinity(
affinity, constraint_matrix)
# Perform refinement operations on the affinity matrix.
for refinement_name in self.refinement_options.refinement_sequence:
refinement_operator = self.refinement_options.get_refinement_operator(
refinement_name)
affinity = refinement_operator.refine(affinity)
if (self.constraint_options and
not self.constraint_options.apply_before_refinement):
# Perform the constraint operation after refinement
affinity = self.constraint_options.constraint_operator.adjust_affinity(
affinity, constraint_matrix)
if not self.laplacian_type or self.laplacian_type == LaplacianType.Affinity:
# Perform eigen decomposion.
(eigenvalues, eigenvectors) = utils.compute_sorted_eigenvectors(affinity)
# Get number of clusters.
n_clusters, max_delta_norm = utils.compute_number_of_clusters(
eigenvalues,
self.max_clusters,
self.stop_eigenvalue,
self.eigengap_type,
descend=True)
else:
# Compute Laplacian matrix
laplacian_norm = laplacian.compute_laplacian(
affinity, laplacian_type=self.laplacian_type)
# Perform eigen decomposion. Eigen values are sorted in an ascending
# order
(eigenvalues, eigenvectors) = utils.compute_sorted_eigenvectors(
laplacian_norm, descend=False)
# Get number of clusters. Eigen values are sorted in an ascending order
n_clusters, max_delta_norm = utils.compute_number_of_clusters(
eigenvalues, self.max_clusters, self.eigengap_type, descend=False)
return eigenvectors, n_clusters, max_delta_norm
def predict(self, embeddings, constraint_matrix=None):
"""Perform spectral clustering on data embeddings.
The spectral clustering is performed on an affinity matrix.
Args:
embeddings: numpy array of shape (n_samples, n_features)
constraint_matrix: numpy array of shape (n_samples, n_samples). The
constraint matrix with prior information
Returns:
labels: numpy array of shape (n_samples,)
Raises:
TypeError: if embeddings has wrong type
ValueError: if embeddings has wrong shape
"""
if not isinstance(embeddings, np.ndarray):
raise TypeError("embeddings must be a numpy array")
if len(embeddings.shape) != 2:
raise ValueError("embeddings must be 2-dimensional")
# Compute affinity matrix.
affinity = utils.compute_affinity_matrix(embeddings)
if self.autotune:
# Use Auto-tuning method to find a good p_percentile.
if (RefinementName.RowWiseThreshold
not in self.refinement_options.refinement_sequence):
raise ValueError(
"AutoTune is only effective when the refinement sequence"
"contains RowWiseThreshold")
def p_percentile_to_ratio(p_percentile):
"""Compute the `ratio` given a `p_percentile` value."""
self.refinement_options.p_percentile = p_percentile
(eigenvectors, n_clusters,
max_delta_norm) = self._compute_eigenvectors_ncluster(
affinity, constraint_matrix)
ratio = np.sqrt(1 - p_percentile) / max_delta_norm
return ratio, eigenvectors, n_clusters
eigenvectors, n_clusters, _ = self.autotune.tune(p_percentile_to_ratio)
else:
# Do not use Auto-tune.
eigenvectors, n_clusters, _ = self._compute_eigenvectors_ncluster(
affinity, constraint_matrix)
if self.min_clusters is not None:
n_clusters = max(n_clusters, self.min_clusters)
# Get spectral embeddings.
spectral_embeddings = eigenvectors[:, :n_clusters]
if self.row_wise_renorm:
# Perfrom row wise re-normalization.
rows_norm = np.linalg.norm(spectral_embeddings, axis=1, ord=2)
spectral_embeddings = spectral_embeddings / np.reshape(
rows_norm, (spectral_embeddings.shape[0], 1))
# Run K-means on spectral embeddings.
labels = custom_distance_kmeans.run_kmeans(
spectral_embeddings,
n_clusters=n_clusters,
custom_dist=self.custom_dist,
max_iter=self.max_iter)
return labels