-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathscore.py
104 lines (80 loc) · 3.62 KB
/
score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""
clustering-benchmarks Package
"""
# ############################################################################ #
# #
# Copyleft (C) 2020-2025, Marek Gagolewski <https://www.gagolewski.com> #
# #
# #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU Affero General Public License #
# Version 3, 19 November 2007, published by the Free Software Foundation. #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU Affero General Public License Version 3 for more details. #
# You should have received a copy of the License along with this program. #
# If this is not the case, refer to <https://www.gnu.org/licenses/>. #
# #
# ############################################################################ #
import numpy as np
import warnings
import genieclust
from .load_results import labels_list_to_dict
def get_score(
    labels,
    results,
    metric=genieclust.compare_partitions.normalized_clustering_accuracy,
    compute_max=True,
    warn_if_missing=True
):
    """
    Computes a similarity score between the reference and the predicted partitions

    Takes into account that there can be more than one ground truth partition
    and ignores the noise points (as explained in the Methodology section
    of the clustering benchmark framework's website).

    If ``labels`` is a single label vector, it will be wrapped inside
    a list. If ``results`` is not a dictionary (or an instance of
    a ``dict`` subclass), `labels_list_to_dict` will be called first.


    Parameters
    ----------

    labels
        A vector-like object or a list thereof.  Each vector is
        a reference partition; elements that are not greater than 0
        are treated as noise points and are excluded from the comparison.

    results
        A dictionary of clustering results, where
        ``results[K]`` gives a K-partition (a vector-like object with
        labels in ``1..K``).

    metric : function
        An external cluster validity measure; defaults to
        ``genieclust.compare_partitions.normalized_clustering_accuracy``.
        It will be called like ``metric(y_true, y_pred)``.

    compute_max : bool
        Whether to apply ``max`` on the particular similarity scores.

    warn_if_missing : bool
        Warn if some ``results[K]`` is required, but missing.


    Returns
    -------

    score : float or array thereof
        The computed similarity scores. Ultimately, it is a vector of
        ``metric(y_true[y_true>0], results[max(y_true)][y_true>0])``
        over all ``y_true`` in ``labels``
        or the maximum thereof if ``compute_max`` is ``True``.
        A missing ``results[K]`` contributes a NaN, which is ignored
        when taking the maximum; if all the scores are NaN,
        NaN is returned.


    Raises
    ------

    ValueError
        If some required ``results[K]`` contains labels outside
        the range ``1..K``.
    """
    # ndmin=2 turns a single label vector into a one-row matrix so that
    # both input forms can be iterated over row by row.
    labels = list(np.array(labels, ndmin=2))

    # isinstance (rather than an exact type comparison) lets dict subclasses
    # such as OrderedDict be used as-is instead of being re-converted.
    if not isinstance(results, dict):
        results = labels_list_to_dict(results)

    scores = []
    for y_true in labels:
        # The number of reference clusters selects which partition to score.
        k = int(y_true.max())

        if k not in results:
            if warn_if_missing:
                warnings.warn("`results[%d]` is not available." % k)
            scores.append(np.nan)
            continue

        # Accept list-like predictions too; boolean-mask indexing below
        # requires an array.
        y_pred = np.asarray(results[k])
        if np.min(y_pred) < 1 or np.max(y_pred) > k:
            raise ValueError("`results[k]` is not between 1 and k=%d." % k)

        # Noise points (non-positive reference labels) are left out.
        keep = y_true > 0
        scores.append(metric(y_true[keep], y_pred[keep]))

    if compute_max and scores:
        if np.all(np.isnan(scores)):
            # np.nanmax would emit an "All-NaN slice" RuntimeWarning here;
            # the caller has already been warned (or asked not to be).
            return np.float64(np.nan)
        return np.nanmax(scores)

    return np.array(scores)