-
Notifications
You must be signed in to change notification settings - Fork 4
/
user_interface.py
267 lines (203 loc) · 9.48 KB
/
user_interface.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
import numpy as np
from paretoset.algorithms_numpy import paretoset_efficient, pareto_rank_naive
from paretoset.utils import user_has_package, validate_inputs
import pandas as pd
if user_has_package("numba"):
from paretoset.algorithms_numba import BNL
def paretoset(costs, sense=None, distinct=True, use_numba=True):
"""Return boolean mask indicating the Pareto set of (non-NaN) numerical data.
The input data in `costs` can be either a pandas DataFrame or a NumPy ndarray
of shape (observations, objectives). The user is responsible for dealing with
NaN values *before* sending data to this function. Only numerical data is
allowed, with the exception of `diff` (different) columns.
Parameters
----------
costs : np.ndarray or pd.DataFrame
Array or DataFrame of shape (observations, objectives).
sense : list
List with strings for each column (objective). The value `min` (default)
indicates minimization, `max` indicates maximization and `diff` indicates
different values. Using `diff` is equivalent to a group-by operation
over the columns marked with `diff`. If None, minimization is assumed.
distinct : bool
How to treat duplicate rows. If `True`, only the first duplicate is returned.
If `False`, every identical observation is returned instead.
use_numba : bool
If True, numba will be used if it is installed by the user.
Returns
-------
mask : np.ndarray
Boolean mask with `True` for observations in the Pareto set.
Examples
--------
>>> from paretoset import paretoset
>>> import numpy as np
>>> costs = np.array([[2, 0], [1, 1], [0, 2], [3, 3]])
>>> paretoset(costs)
array([ True, True, True, False])
>>> paretoset(costs, sense=["min", "max"])
array([False, False, True, True])
The `distinct` parameter:
>>> paretoset([0, 0], distinct=True)
array([ True, False])
>>> paretoset([0, 0], distinct=False)
array([ True, True])
"""
if user_has_package("numba") and use_numba:
paretoset_algorithm = BNL
else:
paretoset_algorithm = paretoset_efficient
costs, sense = validate_inputs(costs=costs, sense=sense)
assert isinstance(sense, list)
n_costs, n_objectives = costs.shape
diff_cols = [i for i in range(n_objectives) if sense[i] == "diff"]
max_cols = [i for i in range(n_objectives) if sense[i] == "max"]
min_cols = [i for i in range(n_objectives) if sense[i] == "min"]
# Check data types (MIN and MAX must be numerical)
message = "Data must be numerical. Please convert it. Data has type: {}"
if isinstance(costs, pd.DataFrame):
data_types = [costs.dtypes.values[i] for i in (max_cols + min_cols)]
if any(d == np.dtype("O") for d in data_types):
raise TypeError(message.format(data_types))
else:
if costs.dtype == np.dtype("O"):
raise TypeError(message.format(costs.dtype))
# CASE 1: THE ONLY SENSE IS MINIMIZATION
# ---------------------------------------
if all(s == "min" for s in sense):
if isinstance(costs, pd.DataFrame):
costs = costs.to_numpy(copy=True)
return paretoset_algorithm(costs, distinct=distinct)
n_costs, n_objectives = costs.shape
if not diff_cols:
# Its an array
if not isinstance(costs, np.ndarray):
costs = costs.to_numpy(copy=True)
for col in max_cols:
costs[:, col] = -costs[:, col]
return paretoset_algorithm(costs, distinct=distinct)
if isinstance(costs, pd.DataFrame):
df = costs.copy() # Copy to avoid mutating inputs
df.columns = np.arange(n_objectives)
else:
df = pd.DataFrame(costs)
assert isinstance(df, pd.DataFrame)
assert np.all(df.columns == np.arange(n_objectives))
# If `object` columns are present and they can be converted, do it.
for col in max_cols + min_cols:
df[col] = pd.to_numeric(df[col], errors="coerce")
is_efficient = np.zeros(n_costs, dtype=np.bool_)
# Create the groupby object
# We could've implemented our own groupby, but choose to use pandas since
# it's likely better than what we can come up with on our own.
groupby = df.groupby(diff_cols)
# Iteration through the groups
for key, data in groupby:
# Get the relevant data for the group and compute the efficient points
relevant_data = data[max_cols + min_cols].to_numpy(copy=True)
efficient_mask = paretoset_algorithm(relevant_data.copy(), distinct=distinct)
# The `pd.DataFrame.groupby.indices` dict holds the row indices of the group
data_mask = groupby.indices[key]
is_efficient[data_mask] = efficient_mask
return is_efficient
def paretorank(costs, sense=None, distinct=True, use_numba=True):
"""Return integer array with Pareto ranks of (non-NaN) numerical data.
Observations in the Pareto set are assigned rank 1. After removing the Pareto
set, the Pareto set of the remaining data is assigned rank 2, and so forth.
The input data in `costs` can be either a pandas DataFrame or a NumPy ndarray
of shape (observations, objectives). The user is responsible for dealing with
NaN values *before* sending data to this function. Only numerical data is
allowed, with the exception of `diff` (different) columns.
Parameters
----------
costs : np.ndarray or pd.DataFrame
Array or DataFrame of shape (observations, objectives).
sense : list
List with strings for each column (objective). The value `min` (default)
indicates minimization, `max` indicates maximization and `diff` indicates
different values. Using `diff` is equivalent to a group-by operation
over the columns marked with `diff`. If None, minimization is assumed.
distinct : bool
How to treat duplicate rows. If `True`, only the first duplicate is returned.
If `False`, every identical observation is returned instead.
use_numba : bool
If True, numba will be used if it is installed by the user.
Returns
-------
ranks : np.ndarray
Integer array with Pareto ranks of the observations.
Examples
--------
>>> from paretoset import paretoset
>>> import numpy as np
>>> costs = np.array([[2, 0], [1, 1], [0, 2], [3, 3]])
>>> paretorank(costs)
array([1, 1, 1, 2])
>>> paretorank(costs, sense=["min", "max"])
array([3, 2, 1, 1])
The `distinct` parameter:
>>> paretorank([0, 0], distinct=True)
array([1, 2])
>>> paretorank([0, 0], distinct=False)
array([1, 1])
"""
if user_has_package("numba") and use_numba:
paretorank_algorithm = pareto_rank_naive
else:
paretorank_algorithm = pareto_rank_naive
costs, sense = validate_inputs(costs=costs, sense=sense)
assert isinstance(sense, list)
n_costs, n_objectives = costs.shape
diff_cols = [i for i in range(n_objectives) if sense[i] == "diff"]
max_cols = [i for i in range(n_objectives) if sense[i] == "max"]
min_cols = [i for i in range(n_objectives) if sense[i] == "min"]
# Check data types (MIN and MAX must be numerical)
message = "Data must be numerical. Please convert it. Data has type: {}"
if isinstance(costs, pd.DataFrame):
data_types = [costs.dtypes.values[i] for i in (max_cols + min_cols)]
if any(d == np.dtype("O") for d in data_types):
raise TypeError(message.format(data_types))
else:
if costs.dtype == np.dtype("O"):
raise TypeError(message.format(costs.dtype))
# CASE 1: THE ONLY SENSE IS MINIMIZATION
# ---------------------------------------
if all(s == "min" for s in sense):
if isinstance(costs, pd.DataFrame):
costs = costs.to_numpy(copy=True)
return paretorank_algorithm(costs, distinct=distinct, use_numba=use_numba)
n_costs, n_objectives = costs.shape
if not diff_cols:
# Its an array
if not isinstance(costs, np.ndarray):
costs = costs.to_numpy(copy=True)
for col in max_cols:
costs[:, col] = -costs[:, col]
return paretorank_algorithm(costs, distinct=distinct, use_numba=use_numba)
if isinstance(costs, pd.DataFrame):
df = costs.copy() # Copy to avoid mutating inputs
df.columns = np.arange(n_objectives)
else:
df = pd.DataFrame(costs)
assert isinstance(df, pd.DataFrame)
assert np.all(df.columns == np.arange(n_objectives))
# If `object` columns are present and they can be converted, do it.
for col in max_cols + min_cols:
df[col] = pd.to_numeric(df[col], errors="coerce")
all_ranks = np.zeros(n_costs, dtype=np.int_)
# Create the groupby object
# We could've implemented our own groupby, but choose to use pandas since
# it's likely better than what we can come up with on our own.
groupby = df.groupby(diff_cols)
# Iteration through the groups
for key, data in groupby:
# Get the relevant data for the group and compute the efficient points
relevant_data = data[max_cols + min_cols].to_numpy(copy=True)
ranks = paretorank_algorithm(relevant_data.copy(), distinct=distinct, use_numba=use_numba)
# The `pd.DataFrame.groupby.indices` dict holds the row indices of the group
data_mask = groupby.indices[key]
all_ranks[data_mask] = ranks
return all_ranks
if __name__ == "__main__":
import pytest
pytest.main(args=[".", "--doctest-modules", "--maxfail=5", "--cache-clear", "--color", "yes", ""])