diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index c523236a11348..70a3c40fad0f1 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -114,7 +114,6 @@ Functions cluster.affinity_propagation cluster.cluster_optics_dbscan - cluster.cluster_optics_xi cluster.compute_optics_graph cluster.dbscan cluster.estimate_bandwidth diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 0043ce5c4e36c..711283b6f8790 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -91,12 +91,6 @@ Overview of clustering methods - Non-flat geometry, uneven cluster sizes - Distances between nearest points - * - :ref:`OPTICS ` - - minimum cluster membership - - Very large ``n_samples``, large ``n_clusters`` - - Non-flat geometry, uneven cluster sizes, variable cluster density - - Distances between points - * - :ref:`Gaussian mixtures ` - many - Not scalable @@ -812,11 +806,6 @@ by black points below. be used (e.g., with sparse matrices). This matrix will consume n^2 floats. A couple of mechanisms for getting around this are: - - Use :ref:`OPTICS ` clustering in conjunction with the - `extract_dbscan` method. OPTICS clustering also calculates the full - pairwise matrix, but only keeps one row in memory at a time (memory - complexity n). - - A sparse radius neighborhood graph (where missing entries are presumed to be out of eps) can be precomputed in a memory-efficient way and dbscan can be run over this with ``metric='precomputed'``. See @@ -839,92 +828,6 @@ by black points below. Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017). In ACM Transactions on Database Systems (TODS), 42(3), 19. -.. _optics: - -OPTICS -====== - -The :class:`OPTICS` algorithm shares many similarities with the :class:`DBSCAN` -algorithm, and can be considered a generalization of DBSCAN that relaxes the -``eps`` requirement from a single value to a value range. The key difference -between DBSCAN and OPTICS is that the OPTICS algorithm builds a *reachability* -graph, which assigns each sample both a ``reachability_`` distance, and a spot -within the cluster ``ordering_`` attribute; these two attributes are assigned -when the model is fitted, and are used to determine cluster membership. If -OPTICS is run with the default value of *inf* set for ``max_eps``, then DBSCAN -style cluster extraction can be performed repeatedly in linear time for any -given ``eps`` value using the ``cluster_optics_dbscan`` method. Setting -``max_eps`` to a lower value will result in shorter run times, and can be -thought of as the maximum neighborhood radius from each point to find other -potential reachable points. - -.. |optics_results| image:: ../auto_examples/cluster/images/sphx_glr_plot_optics_001.png - :target: ../auto_examples/cluster/plot_optics.html - :scale: 50 - -.. centered:: |optics_results| - -The *reachability* distances generated by OPTICS allow for variable density -extraction of clusters within a single data set. As shown in the above plot, -combining *reachability* distances and data set ``ordering_`` produces a -*reachability plot*, where point density is represented on the Y-axis, and -points are ordered such that nearby points are adjacent. 'Cutting' the -reachability plot at a single value produces DBSCAN like results; all points -above the 'cut' are classified as noise, and each time that there is a break -when reading from left to right signifies a new cluster. The default cluster -extraction with OPTICS looks at the steep slopes within the graph to find -clusters, and the user can define what counts as a steep slope using the -parameter ``xi``. There are also other possibilities for analysis on the graph -itself, such as generating hierarchical representations of the data through -reachability-plot dendrograms, and the hierarchy of clusters detected by the -algorithm can be accessed through the ``cluster_hierarchy_`` parameter. The -plot above has been color-coded so that cluster colors in planar space match -the linear segment clusters of the reachability plot. Note that the blue and -red clusters are adjacent in the reachability plot, and can be hierarchically -represented as children of a larger parent cluster. - -.. topic:: Examples: - - * :ref:`sphx_glr_auto_examples_cluster_plot_optics.py` - - -.. topic:: Comparison with DBSCAN - - The results from OPTICS ``cluster_optics_dbscan`` method and DBSCAN are - very similar, but not always identical; specifically, labeling of periphery - and noise points. This is in part because the first samples of each dense - area processed by OPTICS have a large reachability value while being close - to other points in their area, and will thus sometimes be marked as noise - rather than periphery. This affects adjacent points when they are - considered as candidates for being marked as either periphery or noise. - - Note that for any single value of ``eps``, DBSCAN will tend to have a - shorter run time than OPTICS; however, for repeated runs at varying ``eps`` - values, a single run of OPTICS may require less cumulative runtime than - DBSCAN. It is also important to note that OPTICS' output is close to - DBSCAN's only if ``eps`` and ``max_eps`` are close. - -.. topic:: Computational Complexity - - Spatial indexing trees are used to avoid calculating the full distance - matrix, and allow for efficient memory usage on large sets of samples. - Different distance metrics can be supplied via the ``metric`` keyword. - - For large datasets, similar (but not identical) results can be obtained via - `HDBSCAN `_. The HDBSCAN implementation is - multithreaded, and has better algorithmic runtime complexity than OPTICS, - at the cost of worse memory scaling. For extremely large datasets that - exhaust system memory using HDBSCAN, OPTICS will maintain *n* (as opposed - to *n^2*) memory scaling; however, tuning of the ``max_eps`` parameter - will likely need to be used to give a solution in a reasonable amount of - wall time. - -.. topic:: References: - - * "OPTICS: ordering points to identify the clustering structure." - Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel, and Jörg Sander. - In ACM Sigmod Record, vol. 28, no. 2, pp. 49-60. ACM, 1999. - .. _birch: Birch diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 14b0ccea56cfb..cc39d921724fe 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -89,8 +89,7 @@ Support for Python 3.4 and below has been officially dropped. - |MajorFeature| A new clustering algorithm: :class:`cluster.OPTICS`: an algoritm related to :class:`cluster.DBSCAN`, that has hyperparameters easier to set and that scales better, by :user:`Shane `, - `Adrin Jalali`_, :user:`Erich Schubert `, `Hanmin Qin`_, and - :user:`Assia Benbihi `. + :user:`Adrin Jalali `, and :user:`Erich Schubert `. - |Fix| Fixed a bug where :class:`cluster.Birch` could occasionally raise an AttributeError. :pr:`13651` by `Joel Nothman`_. diff --git a/examples/cluster/plot_cluster_comparison.py b/examples/cluster/plot_cluster_comparison.py index fbd1716e94588..39d8bca458cc2 100644 --- a/examples/cluster/plot_cluster_comparison.py +++ b/examples/cluster/plot_cluster_comparison.py @@ -74,20 +74,14 @@ 'damping': .9, 'preference': -200, 'n_neighbors': 10, - 'n_clusters': 3, - 'min_samples': 20, - 'xi': 0.05, - 'min_cluster_size': 0.1} + 'n_clusters': 3} datasets = [ (noisy_circles, {'damping': .77, 'preference': -240, - 'quantile': .2, 'n_clusters': 2, - 'min_samples': 20, 'xi': 0.25}), + 'quantile': .2, 'n_clusters': 2}), (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}), - (varied, {'eps': .18, 'n_neighbors': 2, - 'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2}), - (aniso, {'eps': .15, 'n_neighbors': 2, - 'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2}), + (varied, {'eps': .18, 'n_neighbors': 2}), + (aniso, {'eps': .15, 'n_neighbors': 2}), (blobs, {}), (no_structure, {})] @@ -122,9 +116,6 @@ n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors") dbscan = cluster.DBSCAN(eps=params['eps']) - optics = cluster.OPTICS(min_samples=params['min_samples'], - xi=params['xi'], - min_cluster_size=params['min_cluster_size']) affinity_propagation = cluster.AffinityPropagation( damping=params['damping'], preference=params['preference']) average_linkage = cluster.AgglomerativeClustering( @@ -142,7 +133,6 @@ ('Ward', ward), ('AgglomerativeClustering', average_linkage), ('DBSCAN', dbscan), - ('OPTICS', optics), ('Birch', birch), ('GaussianMixture', gmm) ) diff --git a/examples/cluster/plot_optics.py b/examples/cluster/plot_optics.py deleted file mode 100644 index 6209456af4161..0000000000000 --- a/examples/cluster/plot_optics.py +++ /dev/null @@ -1,98 +0,0 @@ -""" -=================================== -Demo of OPTICS clustering algorithm -=================================== -Finds core samples of high density and expands clusters from them. -This example uses data that is generated so that the clusters have -different densities. -The :class:`sklearn.cluster.OPTICS` is first used with its Xi cluster detection -method, and then setting specific thresholds on the reachability, which -corresponds to :class:`sklearn.cluster.DBSCAN`. We can see that the different -clusters of OPTICS's Xi method can be recovered with different choices of -thresholds in DBSCAN. -""" - -# Authors: Shane Grigsby -# Adrin Jalali -# License: BSD 3 clause - - -from sklearn.cluster import OPTICS, cluster_optics_dbscan -import matplotlib.gridspec as gridspec -import matplotlib.pyplot as plt -import numpy as np - -# Generate sample data - -np.random.seed(0) -n_points_per_cluster = 250 - -C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2) -C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2) -C3 = [1, -2] + .2 * np.random.randn(n_points_per_cluster, 2) -C4 = [-2, 3] + .3 * np.random.randn(n_points_per_cluster, 2) -C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2) -C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2) -X = np.vstack((C1, C2, C3, C4, C5, C6)) - -clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05) - -# Run the fit -clust.fit(X) - -labels_050 = cluster_optics_dbscan(reachability=clust.reachability_, - core_distances=clust.core_distances_, - ordering=clust.ordering_, eps=0.5) -labels_200 = cluster_optics_dbscan(reachability=clust.reachability_, - core_distances=clust.core_distances_, - ordering=clust.ordering_, eps=2) - -space = np.arange(len(X)) -reachability = clust.reachability_[clust.ordering_] -labels = clust.labels_[clust.ordering_] - -plt.figure(figsize=(10, 7)) -G = gridspec.GridSpec(2, 3) -ax1 = plt.subplot(G[0, :]) -ax2 = plt.subplot(G[1, 0]) -ax3 = plt.subplot(G[1, 1]) -ax4 = plt.subplot(G[1, 2]) - -# Reachability plot -colors = ['g.', 'r.', 'b.', 'y.', 'c.'] -for klass, color in zip(range(0, 5), colors): - Xk = space[labels == klass] - Rk = reachability[labels == klass] - ax1.plot(Xk, Rk, color, alpha=0.3) -ax1.plot(space[labels == -1], reachability[labels == -1], 'k.', alpha=0.3) -ax1.plot(space, np.full_like(space, 2., dtype=float), 'k-', alpha=0.5) -ax1.plot(space, np.full_like(space, 0.5, dtype=float), 'k-.', alpha=0.5) -ax1.set_ylabel('Reachability (epsilon distance)') -ax1.set_title('Reachability Plot') - -# OPTICS -colors = ['g.', 'r.', 'b.', 'y.', 'c.'] -for klass, color in zip(range(0, 5), colors): - Xk = X[clust.labels_ == klass] - ax2.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3) -ax2.plot(X[clust.labels_ == -1, 0], X[clust.labels_ == -1, 1], 'k+', alpha=0.1) -ax2.set_title('Automatic Clustering\nOPTICS') - -# DBSCAN at 0.5 -colors = ['g', 'greenyellow', 'olive', 'r', 'b', 'c'] -for klass, color in zip(range(0, 6), colors): - Xk = X[labels_050 == klass] - ax3.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3, marker='.') -ax3.plot(X[labels_050 == -1, 0], X[labels_050 == -1, 1], 'k+', alpha=0.1) -ax3.set_title('Clustering at 0.5 epsilon cut\nDBSCAN') - -# DBSCAN at 2. -colors = ['g.', 'm.', 'y.', 'c.'] -for klass, color in zip(range(0, 4), colors): - Xk = X[labels_200 == klass] - ax4.plot(Xk[:, 0], Xk[:, 1], color, alpha=0.3) -ax4.plot(X[labels_200 == -1, 0], X[labels_200 == -1, 1], 'k+', alpha=0.1) -ax4.set_title('Clustering at 2.0 epsilon cut\nDBSCAN') - -plt.tight_layout() -plt.show() diff --git a/sklearn/cluster/__init__.py b/sklearn/cluster/__init__.py index da4cfdb6f0734..3f6f7d50d3a4c 100644 --- a/sklearn/cluster/__init__.py +++ b/sklearn/cluster/__init__.py @@ -11,8 +11,7 @@ FeatureAgglomeration) from .k_means_ import k_means, KMeans, MiniBatchKMeans from .dbscan_ import dbscan, DBSCAN -from .optics_ import (OPTICS, cluster_optics_dbscan, compute_optics_graph, - cluster_optics_xi) +from .optics_ import OPTICS, cluster_optics_dbscan, compute_optics_graph from .bicluster import SpectralBiclustering, SpectralCoclustering from .birch import Birch @@ -22,7 +21,6 @@ 'DBSCAN', 'OPTICS', 'cluster_optics_dbscan', - 'cluster_optics_xi', 'compute_optics_graph', 'KMeans', 'FeatureAgglomeration', diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 55f286404c9ee..c947d12fb8f8e 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -41,17 +41,14 @@ class OPTICS(BaseEstimator, ClusterMixin): Parameters ---------- - min_samples : int > 1 or float between 0 and 1 (default=None) - The number of samples in a neighborhood for a point to be considered as - a core point. Also, up and down steep regions can't have more then - ``min_samples`` consecutive non-steep points. Expressed as an absolute - number or a fraction of the number of samples (rounded to be at least - 2). + min_samples : int (default=5) + The number of samples in a neighborhood for a point to be considered + as a core point. max_eps : float, optional (default=np.inf) The maximum distance between two samples for them to be considered - as in the same neighborhood. Default value of ``np.inf`` will identify - clusters across all scales; reducing ``max_eps`` will result in + as in the same neighborhood. Default value of "np.inf" will identify + clusters across all scales; reducing `max_eps` will result in shorter run times. metric : string or callable, optional (default='minkowski') @@ -89,33 +86,19 @@ class OPTICS(BaseEstimator, ClusterMixin): metric_params : dict, optional (default=None) Additional keyword arguments for the metric function. - cluster_method : string, optional (default='xi') + cluster_method : string, optional (default='dbscan') The extraction method used to extract clusters using the calculated - reachability and ordering. Possible values are "xi" and "dbscan". + reachability and ordering. Possible values are "dbscan". - eps : float, optional (default=None) + eps : float, optional (default=0.5) The maximum distance between two samples for them to be considered - as in the same neighborhood. By default it assumes the same value as - ``max_eps``. - Used only when ``cluster_method='dbscan'``. - - xi : float, between 0 and 1, optional (default=0.05) - Determines the minimum steepness on the reachability plot that - constitutes a cluster boundary. For example, an upwards point in the - reachability plot is defined by the ratio from one point to its - successor being at most 1-xi. - Used only when ``cluster_method='xi'``. - - predecessor_correction : bool, optional (default=True) - Correct clusters according to the predecessors calculated by OPTICS - [2]_. This parameter has minimal effect on most datasets. - Used only when ``cluster_method='xi'``. - - min_cluster_size : int > 1 or float between 0 and 1 (default=None) + as in the same neighborhood. Used ony when `cluster_method='dbscan'`. + + min_cluster_size : int > 1 or float between 0 and 1 (default=0.005) Minimum number of samples in an OPTICS cluster, expressed as an - absolute number or a fraction of the number of samples (rounded to be - at least 2). If ``None``, the value of ``min_samples`` is used instead. - Used only when ``cluster_method='xi'``. + absolute number or a fraction of the number of samples (rounded + to be at least 2). + Used only when `extract_method='xi'`. algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: @@ -145,8 +128,7 @@ class OPTICS(BaseEstimator, ClusterMixin): ---------- labels_ : array, shape (n_samples,) Cluster labels for each point in the dataset given to fit(). - Noisy samples and points which are not included in a leaf cluster - of ``cluster_hierarchy_`` are labeled as -1. + Noisy samples are given the label -1. reachability_ : array, shape (n_samples,) Reachability distances per sample, indexed by object order. Use @@ -164,19 +146,9 @@ class OPTICS(BaseEstimator, ClusterMixin): Point that a sample was reached from, indexed by object order. Seed points have a predecessor of -1. - cluster_hierarchy_ : array, shape (n_clusters, 2) - The list of clusters in the form of ``[start, end]`` in each row, with - all indices inclusive. The clusters are ordered according to - ``(end, -start)`` (ascending) so that larger clusters encompassing - smaller clusters come after those smaller ones. Since ``labels_`` does - not reflect the hierarchy, usually - ``len(cluster_hierarchy_) > np.unique(optics.labels_)``. Please also - note that these indices are of the ``ordering_``, i.e. - ``X[ordering_][start:end + 1]`` form a cluster. - Only available when ``cluster_method='xi'``. - See also -------- + DBSCAN A similar clustering for a specified neighborhood radius (eps). Our implementation is optimized for runtime. @@ -190,12 +162,14 @@ class OPTICS(BaseEstimator, ClusterMixin): .. [2] Schubert, Erich, Michael Gertz. "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329. + """ def __init__(self, min_samples=5, max_eps=np.inf, metric='minkowski', p=2, - metric_params=None, cluster_method='xi', eps=None, xi=0.05, - predecessor_correction=True, min_cluster_size=None, - algorithm='auto', leaf_size=30, n_jobs=None): + metric_params=None, cluster_method='dbscan', eps=0.5, + min_cluster_size=.005, algorithm='auto', leaf_size=30, + n_jobs=None): + self.max_eps = max_eps self.min_samples = min_samples self.min_cluster_size = min_cluster_size @@ -206,15 +180,13 @@ def __init__(self, min_samples=5, max_eps=np.inf, metric='minkowski', p=2, self.leaf_size = leaf_size self.cluster_method = cluster_method self.eps = eps - self.xi = xi - self.predecessor_correction = predecessor_correction self.n_jobs = n_jobs def fit(self, X, y=None): """Perform OPTICS clustering Extracts an ordered list of points and reachability distances, and - performs initial clustering using ``max_eps`` distance specified at + performs initial clustering using `max_eps` distance specified at OPTICS object instantiation. Parameters @@ -231,11 +203,43 @@ def fit(self, X, y=None): """ X = check_array(X, dtype=np.float) - if self.cluster_method not in ['dbscan', 'xi']: + n_samples = len(X) + + if self.min_cluster_size <= 0 or (self.min_cluster_size != + int(self.min_cluster_size) + and self.min_cluster_size > 1): + raise ValueError('min_cluster_size must be a positive integer or ' + 'a float between 0 and 1. Got %r' % + self.min_cluster_size) + elif self.min_cluster_size > n_samples: + raise ValueError('min_cluster_size must be no greater than the ' + 'number of samples (%d). Got %d' % + (n_samples, self.min_cluster_size)) + + if self.min_samples > n_samples: + raise ValueError("Number of training samples (n_samples=%d) must " + "be greater than min_samples (min_samples=%d) " + "used for clustering." % + (n_samples, self.min_samples)) + + if self.cluster_method not in ['dbscan']: raise ValueError("cluster_method should be one of" - " 'dbscan' or 'xi' but is %s" % + " 'dbscan', but is %s" % self.cluster_method) + if self.cluster_method == 'dbscan': + if self.eps > self.max_eps: + raise ValueError('Specify an epsilon smaller than %s. Got %s.' + % (self.max_eps, self.eps)) + + if self.eps * 5.0 > self.max_eps * 1.05: + warnings.warn( + "Warning, max_eps (%s) is close to eps (%s): " + "Output may be unstable." % (self.max_eps, self.eps), + RuntimeWarning, stacklevel=2) + # Stability warning is documented in cluster_optics_dbscan + # method... + (self.ordering_, self.core_distances_, self.reachability_, self.predecessor_) = compute_optics_graph( X=X, min_samples=self.min_samples, algorithm=self.algorithm, @@ -244,48 +248,16 @@ def fit(self, X, y=None): max_eps=self.max_eps) # Extract clusters from the calculated orders and reachability - if self.cluster_method == 'xi': - labels_, clusters_ = cluster_optics_xi( - self.reachability_, - self.predecessor_, - self.ordering_, - self.min_samples, - self.min_cluster_size, - self.xi, - self.predecessor_correction) - self.cluster_hierarchy_ = clusters_ - elif self.cluster_method == 'dbscan': - if self.eps is None: - eps = self.max_eps - else: - eps = self.eps - - if eps > self.max_eps: - raise ValueError('Specify an epsilon smaller than %s. Got %s.' - % (self.max_eps, eps)) - + if self.cluster_method == 'dbscan': labels_ = cluster_optics_dbscan(self.reachability_, self.core_distances_, self.ordering_, - eps) + self.eps) self.labels_ = labels_ return self -def _validate_size(size, n_samples, param_name): - if size <= 0 or (size != - int(size) - and size > 1): - raise ValueError('%s must be a positive integer ' - 'or a float between 0 and 1. Got %r' % - (param_name, size)) - elif size > n_samples: - raise ValueError('%s must be no greater than the' - ' number of samples (%d). Got %d' % - (param_name, n_samples, size)) - - # OPTICS helper functions def _compute_core_distances_(X, neighbors, min_samples, working_memory): """Compute the k-th nearest neighbor of each sample @@ -310,7 +282,7 @@ def _compute_core_distances_(X, neighbors, min_samples, working_memory): Distance at which each sample becomes a core point. Points which will never be core have a distance of inf. """ - n_samples = X.shape[0] + n_samples = len(X) core_distances = np.empty(n_samples) core_distances.fill(np.nan) @@ -337,8 +309,7 @@ def compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params, min_samples : int (default=5) The number of samples in a neighborhood for a point to be considered - as a core point. Expressed as an absolute number or a fraction of the - number of samples (rounded to be at least 2). + as a core point. max_eps : float, optional (default=np.inf) The maximum distance between two samples for them to be considered @@ -429,11 +400,7 @@ def compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params, and Jörg Sander. "OPTICS: ordering points to identify the clustering structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60. """ - n_samples = X.shape[0] - _validate_size(min_samples, n_samples, 'min_samples') - if min_samples <= 1: - min_samples = max(2, min_samples * n_samples) - + n_samples = len(X) # Start all points as 'unprocessed' ## reachability_ = np.empty(n_samples) reachability_.fill(np.inf) @@ -522,28 +489,29 @@ def _set_reach_dist(core_distances_, reachability_, predecessor_, predecessor_[unproc[improved]] = point_index -def cluster_optics_dbscan(reachability, core_distances, ordering, eps): +def cluster_optics_dbscan(reachability, core_distances, ordering, eps=0.5): """Performs DBSCAN extraction for an arbitrary epsilon. - Extracting the clusters runs in linear time. Note that this results in - ``labels_`` which are close to a `DBSCAN` with similar settings and - ``eps``, only if ``eps`` is close to ``max_eps``. + Extracting the clusters runs in linear time. Note that if the `max_eps` + OPTICS parameter was set to < inf for extracting reachability and ordering + arrays, DBSCAN extractions will be unstable for `eps` values close to + `max_eps`. Setting `eps` < (`max_eps` / 5.0) will guarantee extraction + parity with DBSCAN. Parameters ---------- reachability : array, shape (n_samples,) - Reachability distances calculated by OPTICS (``reachability_``) + Reachability distances calculated by OPTICS (`reachability_`) core_distances : array, shape (n_samples,) - Distances at which points become core (``core_distances_``) + Distances at which points become core (`core_distances_`) ordering : array, shape (n_samples,) - OPTICS ordered point indices (``ordering_``) + OPTICS ordered point indices (`ordering_`) - eps : float - DBSCAN ``eps`` parameter. Must be set to < ``max_eps``. Results - will be close to DBSCAN algorithm if ``eps`` and ``max_eps`` are close - to one another. + eps : float, optional (default=0.5) + DBSCAN `eps` parameter. Must be set to < `max_eps`. Results + will be close to DBSCAN algorithm if `eps` is < (`max_eps` / 5) Returns ------- @@ -559,350 +527,3 @@ def cluster_optics_dbscan(reachability, core_distances, ordering, eps): labels[ordering] = np.cumsum(far_reach[ordering] & near_core[ordering]) - 1 labels[far_reach & ~near_core] = -1 return labels - - -def cluster_optics_xi(reachability, predecessor, ordering, min_samples, - min_cluster_size=None, xi=0.05, - predecessor_correction=True): - """Automatically extract clusters according to the Xi-steep method. - - Parameters - ---------- - reachability : array, shape (n_samples,) - Reachability distances calculated by OPTICS (`reachability_`) - - predecessor : array, shape (n_samples,) - Predecessors calculated by OPTICS. - - ordering : array, shape (n_samples,) - OPTICS ordered point indices (`ordering_`) - - min_samples : int > 1 or float between 0 and 1 (default=None) - The same as the min_samples given to OPTICS. Up and down steep regions - can't have more then ``min_samples`` consecutive non-steep points. - Expressed as an absolute number or a fraction of the number of samples - (rounded to be at least 2). - - min_cluster_size : int > 1 or float between 0 and 1 (default=None) - Minimum number of samples in an OPTICS cluster, expressed as an - absolute number or a fraction of the number of samples (rounded to be - at least 2). If ``None``, the value of ``min_samples`` is used instead. - - xi : float, between 0 and 1, optional (default=0.05) - Determines the minimum steepness on the reachability plot that - constitutes a cluster boundary. For example, an upwards point in the - reachability plot is defined by the ratio from one point to its - successor being at most 1-xi. - - predecessor_correction : bool, optional (default=True) - Correct clusters based on the calculated predecessors. - - Returns - ------- - labels : array, shape (n_samples) - The labels assigned to samples. Points which are not included - in any cluster are labeled as -1. - - clusters : array, shape (n_clusters, 2) - The list of clusters in the form of ``[start, end]`` in each row, with - all indices inclusive. The clusters are ordered according to ``(end, - -start)`` (ascending) so that larger clusters encompassing smaller - clusters come after such nested smaller clusters. Since ``labels`` does - not reflect the hierarchy, usually ``len(clusters) > - np.unique(labels)``. - """ - n_samples = len(reachability) - _validate_size(min_samples, n_samples, 'min_samples') - if min_samples <= 1: - min_samples = max(2, min_samples * n_samples) - if min_cluster_size is None: - min_cluster_size = min_samples - _validate_size(min_cluster_size, n_samples, 'min_cluster_size') - if min_cluster_size <= 1: - min_cluster_size = max(2, min_cluster_size * n_samples) - - clusters = _xi_cluster(reachability[ordering], predecessor[ordering], - ordering, xi, - min_samples, min_cluster_size, - predecessor_correction) - labels = _extract_xi_labels(ordering, clusters) - return labels, clusters - - -def _extend_region(steep_point, xward_point, start, min_samples): - """Extend the area until it's maximal. - - It's the same function for both upward and downward reagions, depending on - the given input parameters. Assuming: - - - steep_{upward/downward}: bool array indicating whether a point is a - steep {upward/downward}; - - upward/downward: bool array indicating whether a point is - upward/downward; - - To extend an upward reagion, ``steep_point=steep_upward`` and - ``xward_point=downward`` are expected, and to extend a downward region, - ``steep_point=steep_downward`` and ``xward_point=upward``. - - Parameters - ---------- - steep_point : bool array, shape (n_samples) - True if the point is steep downward (upward). - - xward_point : bool array, shape (n_samples) - True if the point is an upward (respectively downward) point. - - start : integer - The start of the xward region. - - min_samples : integer - The same as the min_samples given to OPTICS. Up and down steep - regions can't have more then ``min_samples`` consecutive non-steep - points. - - Returns - ------- - index : integer - The current index iterating over all the samples, i.e. where we are up - to in our search. - - end : integer - The end of the region, which can be behind the index. The region - includes the ``end`` index. - """ - n_samples = len(steep_point) - non_xward_points = 0 - index = start - end = start - # find a maximal area - while index < n_samples: - if steep_point[index]: - non_xward_points = 0 - end = index - elif not xward_point[index]: - # it's not a steep point, but still goes up. - non_xward_points += 1 - # region should include no more than min_samples consecutive - # non steep xward points. - if non_xward_points > min_samples: - break - else: - return end - index += 1 - return end - - -def _update_filter_sdas(sdas, mib, xi_complement, reachability_plot): - """Update steep down areas (SDAs) using the new maximum in between (mib) - value, and the given complement of xi, i.e. ``1 - xi``. - """ - if np.isinf(mib): - return [] - res = [sda for sda in sdas - if mib <= reachability_plot[sda['start']] * xi_complement] - for sda in res: - sda['mib'] = max(sda['mib'], mib) - return res - - -def _correct_predecessor(reachability_plot, predecessor_plot, ordering, s, e): - """Correct for predecessors. - - Applies Algorithm 2 of [1]_. - - Input parameters are ordered by the computer OPTICS ordering. - - .. [1] Schubert, Erich, Michael Gertz. - "Improving the Cluster Structure Extracted from OPTICS Plots." Proc. of - the Conference "Lernen, Wissen, Daten, Analysen" (LWDA) (2018): 318-329. - """ - while s < e: - if reachability_plot[s] > reachability_plot[e]: - return s, e - p_e = ordering[predecessor_plot[e]] - for i in range(s, e): - if p_e == ordering[i]: - return s, e - e -= 1 - return None, None - - -def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples, - min_cluster_size, predecessor_correction): - """Automatically extract clusters according to the Xi-steep method. - - This is rouphly an implementation of Figure 19 of the OPTICS paper. - - Parameters - ---------- - reachability_plot : array, shape (n_samples) - The reachability plot, i.e. reachability ordered according to - the calculated ordering, all computed by OPTICS. - - predecessor_plot : array, shape (n_samples) - Predecessors ordered according to the calculated ordering. - - xi : float, between 0 and 1 - Determines the minimum steepness on the reachability plot that - constitutes a cluster boundary. For example, an upwards point in the - reachability plot is defined by the ratio from one point to its - successor being at most 1-xi. - - min_samples : int > 1 or float between 0 and 1 (default=None) - The same as the min_samples given to OPTICS. Up and down steep regions - can't have more then ``min_samples`` consecutive non-steep points. - Expressed as an absolute number or a fraction of the number of samples - (rounded to be at least 2). - - min_cluster_size : int > 1 or float between 0 and 1 - Minimum number of samples in an OPTICS cluster, expressed as an - absolute number or a fraction of the number of samples (rounded - to be at least 2). - - predecessor_correction : bool - Correct clusters based on the calculated predecessors. - - Returns - ------- - clusters : array, shape (n_clusters, 2) - The list of clusters in the form of [start, end] in each row, with all - indices inclusive. The clusters are ordered in a way that larger - clusters encompassing smaller clusters come after those smaller - clusters. - """ - - # all indices are inclusive (specially at the end) - # add an inf to the end of reachability plot - # this helps to find potential clusters at the end of the - # reachability plot even if there's no upward region at the end of it. - reachability_plot = np.hstack((reachability_plot, np.inf)) - - xi_complement = 1 - xi - sdas = [] # steep down areas, introduced in section 4.3.2 of the paper - clusters = [] - index = 0 - mib = 0. # maximum in between, section 4.3.2 - - with np.errstate(invalid='ignore'): - ratio = reachability_plot[:-1] / reachability_plot[1:] - steep_upward = ratio <= xi_complement - steep_downward = ratio >= 1 / xi_complement - downward = ratio > 1 - upward = ratio < 1 - - # the following loop is is almost exactly as Figure 19 of the paper. - # it jumps over the areas which are not either steep down or up areas - for steep_index in iter(np.flatnonzero(steep_upward | steep_downward)): - # just continue if steep_index has been a part of a discovered xward - # area. - if steep_index < index: - continue - - mib = max(mib, np.max(reachability_plot[index:steep_index + 1])) - - # steep downward areas - if steep_downward[steep_index]: - sdas = _update_filter_sdas(sdas, mib, xi_complement, - reachability_plot) - D_start = steep_index - D_end = _extend_region(steep_downward, upward, - D_start, min_samples) - D = {'start': D_start, 'end': D_end, 'mib': 0.} - sdas.append(D) - index = D_end + 1 - mib = reachability_plot[index] - - # steep upward areas - else: - sdas = _update_filter_sdas(sdas, mib, xi_complement, - reachability_plot) - U_start = steep_index - U_end = _extend_region(steep_upward, downward, U_start, - min_samples) - index = U_end + 1 - mib = reachability_plot[index] - - U_clusters = [] - for D in sdas: - c_start = D['start'] - c_end = U_end - - # line (**), sc2* - if reachability_plot[c_end + 1] * xi_complement < D['mib']: - continue - - # Definition 11: criterion 4 - D_max = reachability_plot[D['start']] - if D_max * xi_complement >= reachability_plot[c_end + 1]: - # Find the first index from the left side which is almost - # at the same level as the end of the detected cluster. - while (reachability_plot[c_start + 1] > - reachability_plot[c_end + 1] - and c_start < D['end']): - c_start += 1 - elif reachability_plot[c_end + 1] * xi_complement >= D_max: - # Find the first index from the right side which is almost - # at the same level as the beginning of the detected - # cluster. - while (reachability_plot[c_end - 1] < D_max - and c_end > U_start): - c_end -= 1 - - # predecessor correction - if predecessor_correction: - c_start, c_end = _correct_predecessor(reachability_plot, - predecessor_plot, - ordering, - c_start, - c_end) - if c_start is None: - continue - - # Definition 11: criterion 3.a - if c_end - c_start + 1 < min_cluster_size: - continue - - # Definition 11: criterion 1 - if c_start > D['end']: - continue - - # Definition 11: criterion 2 - if c_end < U_start: - continue - - U_clusters.append((c_start, c_end)) - - # add smaller clusters first. - U_clusters.reverse() - clusters.extend(U_clusters) - - return np.array(clusters) - - -def _extract_xi_labels(ordering, clusters): - """Extracts the labels from the clusters returned by `_xi_cluster`. - We rely on the fact that clusters are stored - with the smaller clusters coming before the larger ones. - - Parameters - ---------- - ordering : array, shape (n_samples) - The ordering of points calculated by OPTICS - - clusters : array, shape (n_clusters, 2) - List of clusters i.e. (start, end) tuples, - as returned by `_xi_cluster`. - - Returns - ------- - labels : array, shape (n_samples) - """ - - labels = np.full(len(ordering), -1, dtype=int) - label = 0 - for c in clusters: - if not np.any(labels[c[0]:(c[1] + 1)] != -1): - labels[c[0]:(c[1] + 1)] = label - label += 1 - labels[ordering] = labels.copy() - return labels diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py old mode 100644 new mode 100755 index 6d23f0278df5a..37166e7f8dd20 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -1,19 +1,16 @@ # Authors: Shane Grigsby -# Adrin Jalali +# Amy X. Zhang # License: BSD 3 clause import numpy as np import pytest from sklearn.datasets.samples_generator import make_blobs -from sklearn.cluster.optics_ import (OPTICS, - _extend_region, - _extract_xi_labels) +from sklearn.cluster.optics_ import OPTICS from sklearn.metrics.cluster import contingency_matrix from sklearn.metrics.pairwise import pairwise_distances from sklearn.cluster.dbscan_ import DBSCAN -from sklearn.utils import shuffle -from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_equal, assert_warns from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_allclose @@ -32,113 +29,6 @@ X = np.vstack((C1, C2, C3, C4, C5, C6)) -@pytest.mark.parametrize( - ('r_plot', 'end'), - [[[10, 8.9, 8.8, 8.7, 7, 10], 3], - [[10, 8.9, 8.8, 8.7, 8.6, 7, 10], 0], - [[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4], - [[10, 8.9, 8.8, 8.7, 7, 6, np.inf], 4], - ]) -def test_extend_downward(r_plot, end): - r_plot = np.array(r_plot) - ratio = r_plot[:-1] / r_plot[1:] - steep_downward = ratio >= 1 / .9 - upward = ratio < 1 - - e = _extend_region(steep_downward, upward, 0, 2) - assert e == end - - -@pytest.mark.parametrize( - ('r_plot', 'end'), - [[[1, 2, 2.1, 2.2, 4, 8, 8, np.inf], 6], - [[1, 2, 2.1, 2.2, 2.3, 4, 8, 8, np.inf], 0], - [[1, 2, 2.1, 2, np.inf], 0], - [[1, 2, 2.1, np.inf], 2], - ]) -def test_extend_upward(r_plot, end): - r_plot = np.array(r_plot) - ratio = r_plot[:-1] / r_plot[1:] - steep_upward = ratio <= .9 - downward = ratio > 1 - - e = _extend_region(steep_upward, downward, 0, 2) - assert e == end - - -@pytest.mark.parametrize( - ('ordering', 'clusters', 'expected'), - [[[0, 1, 2, 3], [[0, 1], [2, 3]], [0, 0, 1, 1]], - [[0, 1, 2, 3], [[0, 1], [3, 3]], [0, 0, -1, 1]], - [[0, 1, 2, 3], [[0, 1], [3, 3], [0, 3]], [0, 0, -1, 1]], - [[3, 1, 2, 0], [[0, 1], [3, 3], [0, 3]], [1, 0, -1, 0]], - ]) -def test_the_extract_xi_labels(ordering, clusters, expected): - labels = _extract_xi_labels(ordering, clusters) - - assert_array_equal(labels, expected) - - -def test_extract_xi(): - # small and easy test (no clusters around other clusters) - # but with a clear noise data. - rng = np.random.RandomState(0) - n_points_per_cluster = 5 - - C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2) - C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2) - C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2) - C4 = [-2, 3] + .3 * rng.randn(n_points_per_cluster, 2) - C5 = [3, -2] + .6 * rng.randn(n_points_per_cluster, 2) - C6 = [5, 6] + .2 * rng.randn(n_points_per_cluster, 2) - - X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6)) - expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5, - -1, [4] * 5] - X, expected_labels = shuffle(X, expected_labels, random_state=rng) - - clust = OPTICS(min_samples=3, min_cluster_size=2, - max_eps=np.inf, cluster_method='xi', - xi=0.4).fit(X) - assert_array_equal(clust.labels_, expected_labels) - - X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6)) - expected_labels = np.r_[[1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5, - -1, -1, [4] * 5] - X, expected_labels = shuffle(X, expected_labels, random_state=rng) - - clust = OPTICS(min_samples=3, min_cluster_size=3, - max_eps=np.inf, cluster_method='xi', - xi=0.1).fit(X) - # this may fail if the predecessor correction is not at work! - assert_array_equal(clust.labels_, expected_labels) - - C1 = [[0, 0], [0, 0.1], [0, -.1], [0.1, 0]] - C2 = [[10, 10], [10, 9], [10, 11], [9, 10]] - C3 = [[100, 100], [100, 90], [100, 110], [90, 100]] - X = np.vstack((C1, C2, C3)) - expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4] - X, expected_labels = shuffle(X, expected_labels, random_state=rng) - - clust = OPTICS(min_samples=2, min_cluster_size=2, - max_eps=np.inf, cluster_method='xi', - xi=0.04).fit(X) - assert_array_equal(clust.labels_, expected_labels) - - -def test_cluster_hierarchy_(): - n_points_per_cluster = 100 - C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2) - C2 = [0, 0] + 10 * rng.randn(n_points_per_cluster, 2) - X = np.vstack((C1, C2)) - X = shuffle(X, random_state=0) - - clusters = OPTICS(min_samples=20, xi=.1).fit(X).cluster_hierarchy_ - assert clusters.shape == (2, 2) - diff = np.sum(clusters - np.array([[0, 99], [0, 199]])) - assert diff / len(X) < 0.05 - - def test_correct_number_of_clusters(): # in 'auto' mode @@ -146,7 +36,7 @@ def test_correct_number_of_clusters(): X = generate_clustered_data(n_clusters=n_clusters) # Parameters chosen specifically for this task. # Compute OPTICS - clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=.1) + clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4) clust.fit(X) # number of clusters, ignoring noise if present n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_) @@ -169,11 +59,12 @@ def test_correct_number_of_clusters(): def test_minimum_number_of_sample_check(): # test that we check a minimum number of samples - msg = "min_samples must be no greater than" + msg = ("Number of training samples (n_samples=1) must be greater than " + "min_samples (min_samples=10) used for clustering.") # Compute OPTICS X = [[1, 1]] - clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1) + clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10) # Run the fit assert_raise_message(ValueError, msg, clust.fit, X) @@ -205,7 +96,7 @@ def test_bad_reachability(): def test_close_extract(): - # Test extract where extraction eps is close to scaled max_eps + # Test extract where extraction eps is close to scaled epsPrime centers = [[1, 1], [-1, -1], [1, -1]] X, labels_true = make_blobs(n_samples=750, centers=centers, @@ -213,7 +104,9 @@ def test_close_extract(): # Compute OPTICS clust = OPTICS(max_eps=1.0, cluster_method='dbscan', - eps=0.3, min_samples=10).fit(X) + eps=0.3, min_samples=10) + # check warning when centers are passed + assert_warns(RuntimeWarning, clust.fit, X) # Cluster ordering starts at 0; max cluster label = 2 is 3 clusters assert_equal(max(clust.labels_), 2) @@ -245,32 +138,6 @@ def test_dbscan_optics_parity(eps, min_samples): assert percent_mismatch <= 0.05 -def test_min_samples_edge_case(): - C1 = [[0, 0], [0, 0.1], [0, -.1]] - C2 = [[10, 10], [10, 9], [10, 11]] - C3 = [[100, 100], [100, 96], [100, 106]] - X = np.vstack((C1, C2, C3)) - - expected_labels = np.r_[[0] * 3, [1] * 3, [2] * 3] - clust = OPTICS(min_samples=3, - max_eps=7, cluster_method='xi', - xi=0.04).fit(X) - assert_array_equal(clust.labels_, expected_labels) - - expected_labels = np.r_[[0] * 3, [1] * 3, [-1] * 3] - clust = OPTICS(min_samples=3, - max_eps=3, cluster_method='xi', - xi=0.04).fit(X) - assert_array_equal(clust.labels_, expected_labels) - - expected_labels = np.r_[[-1] * 9] - with pytest.warns(UserWarning, match="All reachability values"): - clust = OPTICS(min_samples=4, - max_eps=3, cluster_method='xi', - xi=0.04).fit(X) - assert_array_equal(clust.labels_, expected_labels) - - # try arbitrary minimum sizes @pytest.mark.parametrize('min_cluster_size', range(2, X.shape[0] // 10, 23)) def test_min_cluster_size(min_cluster_size): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index f79d082b7ec20..d5d59a041fdf4 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -873,10 +873,6 @@ def check_fit2d_1sample(name, estimator_orig): set_random_state(estimator, 1) - # min_cluster_size cannot be less than the data size for OPTICS. - if name == 'OPTICS': - estimator.set_params(min_samples=1) - msgs = ["1 sample", "n_samples = 1", "n_samples=1", "one sample", "1 class", "one class"]