/
active_learner.py
403 lines (327 loc) · 15.9 KB
/
active_learner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
import numpy as np
from abc import ABC, abstractmethod
from pathlib import Path
from scipy.sparse import csr_matrix
from small_text.base import LABEL_IGNORED
from small_text.exceptions import LearnerNotInitializedException
from small_text.utils.data import list_length
from small_text.utils.labels import concatenate, get_ignored_labels_mask, remove_by_index
from small_text.version import __version__ as version
class ActiveLearner(ABC):
"""Abstract base class for Active Learners."""
@abstractmethod
def query(self, num_samples=10):
pass
@abstractmethod
def update(self, y):
pass
class AbstractPoolBasedActiveLearner(ActiveLearner):
def query(self, num_samples=10):
pass
def update(self, y):
pass
@abstractmethod
def initialize_data(self, indices_initial, y_initial, *args, **kwargs):
"""(Re-)Initializes the current labeled pool.
This methods needs to be called whenever the underlying data changes, in particularly
before the first loop.
Parameters
----------
indices_initial : np.ndarray[int]
Positional indices pointing at training examples. This is the intially labelled set
for training an initial classifier.
y_initial : numpy.ndarray[int] or scipy.sparse.csr_matrix
The respective labels belonging to the examples referenced by `x_indices_initial`.
"""
pass
@property
@abstractmethod
def classifier(self):
pass
@property
@abstractmethod
def query_strategy(self):
pass
class PoolBasedActiveLearner(AbstractPoolBasedActiveLearner):
"""A pool-based active learner in which a pool holds all available unlabeled data.
It uses a classifier, a query strategy and manages the mutually exclusive partition over the
whole training data into labeled and unlabeled.
Parameters
----------
clf_factory : small_text.classifiers.factories.AbstractClassifierFactory
A factory responsible for creating new classifier instances.
query_strategy : small_text.query_strategies.QueryStrategy
Query strategy which is responsible for selecting instances during a `query()` call.
dataset : ~small_text.data.datasets.Dataset
A training dataset that is supported by the underlying classifier.
reuse_model : bool, default=False
Reuses the previous model during retraining (if a previous model exists),
otherwise creates a new model for each retraining.
Attributes
----------
indices_labeled : numpy.ndarray
Indices of instances (relative to `self.x_train`) constituting the labeled pool.
indices_ignored : numpy.ndarray or scipy.sparse.csr_matrix
Indices of instances (relative to `self.x_train`) which have been ignored,
i.e. which will never be returned by a query.
y : numpy.ndarray or scipy.sparse.csr_matrix
Labels for the the current labeled pool. Each tuple `(x_indices_labeled[i], y[i])`
represents one labeled sample.
indices_queried : numpy.ndarray or None
Queried indices returned by the last `query()` call, or `None` if no query has been
executed yet.
"""
def __init__(self, clf_factory, query_strategy, dataset, reuse_model=False):
self._clf = None
self._clf_factory = clf_factory
self._query_strategy = query_strategy
self._index_to_position = None
self.dataset = dataset
self.reuse_model = reuse_model
self.indices_labeled = np.empty(shape=0, dtype=int)
self.indices_ignored = np.empty(shape=0, dtype=int)
self.y = None
self.indices_queried = None
def initialize_data(self, indices_initial, y_initial, indices_ignored=None,
indices_validation=None, retrain=True):
"""(Re-)Initializes the current labeled pool.
This is required once before the first `query()` call, and whenever the labeled pool
is changed from the outside, i.e. when `self.x_train` changes.
Parameters
----------
indices_initial : numpy.ndarray
A list of indices (relative to `self.x_train`) of initially labeled samples.
y_initial : numpy.ndarray or or scipy.sparse.csr_matrix
Label matrix. One row corresponds to an index in `x_indices_initial`. If the
passed type is numpy.ndarray (dense) all further label-based operations assume dense
labels, otherwise sparse labels for scipy.sparse.csr_matrix.
indices_ignored : numpy.ndarray
List of ignored samples which will be invisible to the query strategy.
indices_validation : numpy.ndarray, default=None
The given indices (relative to `self.x_indices_labeled`) define a custom validation set
if provided. Otherwise each classifier that uses a validation set will be responsible
for creating a validation set. Only used if `retrain=True`.
retrain : bool, default=True
Retrains the model after the update if True.
"""
self.indices_labeled = indices_initial
self._index_to_position = self._build_index_to_position_dict()
self.y = y_initial
if isinstance(self.y, csr_matrix):
self.multi_label = True
else:
self.multi_label = False
if indices_ignored is not None:
self.indices_ignored = indices_ignored
else:
self.indices_ignored = np.empty(shape=(0), dtype=int)
if retrain:
self._retrain(indices_validation=indices_validation)
def query(self, num_samples=10, representation=None, query_strategy_kwargs=dict()):
"""Performs a query step, which selects a number of samples from the unlabeled pool.
A query step must be followed by an update step.
Parameters
----------
num_samples : int, default=10
Number of samples to query.
representation : numpy.ndarray, default=None
Alternative representation for the samples in the unlabeled pool. his can be used
if you want to rely pre-computed fixed representations instead of embeddings that
change during each active learning iteration.
query_strategy_kwargs : dict, default=dict()
Returns
-------
queried_indices : numpy.ndarray[int]
List of queried indices (relative to the current unlabeled pool).
Raises
------
LearnerNotInitializedException
Thrown when the active learner was not initialized via `initialize_data(...)`.
ValueError
Raised when args or kwargs are not used and consumed.
"""
if self._index_to_position is None:
raise LearnerNotInitializedException()
size = list_length(self.dataset)
if representation is not None and size != list_length(representation):
raise ValueError('Number of rows of alternative representation x must match the train '
'set (dim 0).')
self.mask = np.ones(size, bool)
self.mask[np.concatenate([self.indices_labeled, self.indices_ignored])] = False
indices = np.arange(size)
representation = self.dataset if representation is None else representation
self.indices_queried = self.query_strategy.query(self._clf,
representation,
indices[self.mask],
self.indices_labeled,
self.y,
n=num_samples,
**query_strategy_kwargs)
return self.indices_queried
def update(self, y, indices_validation=None):
"""Performs an update step, which passes the label for each of the
previously queried indices.
An update step must be preceded by a query step. At the end of the update step the
current model is retrained using all available labels.
Parameters
----------
y : numpy.ndarray or scipy.sparse.csr_matrix
Labels provided in response to the previous query. Each label at index i corresponds
to the sample x[i] for single-label data (ndarray) and each row of labels at index i
corresponds to the sample x[i] for multi-label data (csr_matrix). Setting the label /
row of labels to ` small_text.base import LABEL_IGNORED` will ignore the respective
sample.
indices_validation : numpy.ndarray, default=None
The given indices (relative to `self.x_indices_labeled`) define a custom validation set
if provided. Otherwise each classifier that uses a validation set will be responsible
for creating a validation set.
"""
if self.indices_queried.shape[0] != y.shape[0]:
raise ValueError('Query-update mismatch: indices queried - {} / labels provided - {}'
.format(self.indices_queried.shape[0], y.shape[0])
)
ignored = get_ignored_labels_mask(y, LABEL_IGNORED)
if ignored.any():
y = remove_by_index(y, np.arange(y.shape[0])[ignored])
self.indices_ignored = np.concatenate([self.indices_ignored, self.indices_queried[ignored]])
self.indices_labeled = np.concatenate([self.indices_labeled, self.indices_queried[~ignored]])
self._index_to_position = self._build_index_to_position_dict()
if self.indices_labeled.shape[0] != np.unique(self.indices_labeled).shape[0]:
raise ValueError('Duplicate indices detected in the labeled pool! '
'Please re-examine your query strategy.')
if not ignored.all():
self.y = concatenate(self.y, y)
self._retrain(indices_validation=indices_validation)
self.indices_queried = None
self.mask = None
def update_label_at(self, index, y, retrain=False, indices_validation=None):
"""Updates the label for the given x_index (with regard to `self.x_train`).
Notes
-----
After adding labels the current model might not reflect the labeled data anymore.
You should consider if a retraining is necessary when using this operation.
Since retraining is often time-consuming, `retrain` is set to `False` by default.
Parameters
----------
index : int
Data index (relative to `self.x_train`) for which the label should be updated.
y : int or numpy.ndarray
The new label(s) to be assigned for the sample at `self.x_indices_labeled[x_index]`.
retrain : bool, default=False
Retrains the model after the update if True.
indices_validation : numpy.ndarray, default=None
The given indices (relative to `self.x_indices_labeled`) define a custom validation set
if provided. This is only used if `retrain` is `True`.
"""
position = self._index_to_position[index]
self.y[position] = y
if retrain:
self._retrain(indices_validation=indices_validation)
def remove_label_at(self, x_index, retrain=False, x_indices_validation=None):
"""Removes the labeling for the given x_index (with regard to `self.x_train`).
Notes
-----
After removing labels the current model might not reflect the labeled data anymore.
You should consider if a retraining is necessary when using this operation.
Since retraining is often time-consuming, `retrain` is set to `False` by default.
Parameters
----------
x_index : int
Data index (relative to `self.x_train`) for which the label should be removed.
retrain : bool, default=None
Retrains the model after removal if True.
x_indices_validation : numpy.ndarray, default=None
The given indices (relative to `self.x_indices_labeled`) define a custom validation set
if provided. This is only used if `retrain` is `True`.
"""
position = self._index_to_position[x_index]
self.y = remove_by_index(self.y, position)
self.indices_labeled = np.delete(self.indices_labeled, position)
if retrain:
self._retrain(indices_validation=x_indices_validation)
def ignore_sample_at(self, index, retrain=False, indices_validation=None):
"""Ignores the sample at the given `x_index.
Any labels which had previously been assigned to this sample will be removed.
Notes
-----
If ignoring a sample incurs the removal of a label label, the current model might not
reflect the labeled data anymore. You should consider if a retraining is necessary when
using this operation. Since retraining is often time-consuming, `retrain` is set to
`False` by default.
Parameters
----------
index : int
Data index (relative to `self.x_train`) for which the label should be ignored.
retrain : bool, default=False
Retrains the model after the removal if True.
indices_validation : numpy.ndarray, default=None
The given indices (relative to `self.x_indices_labeled`) define a custom validation set
if provided. This is only used if `retrain` is `True`.
"""
labeling_exists = index in self._index_to_position
if labeling_exists:
position = self._index_to_position[index]
self.y = remove_by_index(self.y, position)
self.indices_labeled = np.delete(self.indices_labeled, position)
self.indices_ignored = np.concatenate([self.indices_ignored, [index]])
if labeling_exists and retrain:
self._retrain(indices_validation=indices_validation)
def save(self, file):
"""Serializes the current active learner object into a single file for later re-use.
Parameters
----------
file : str or path or file
Serialized output file to be written.
"""
if isinstance(file, (str, Path)):
with open(file, 'wb+') as f:
self._save(f)
else:
self._save(file)
def _save(self, file_handle):
import dill as pickle
pickle.dump(version, file_handle)
pickle.dump(self, file_handle)
@classmethod
def load(cls, file):
"""Deserializes a serialized active learner.
Parameters
----------
file : str or path or file
File to be loaded.
"""
if isinstance(file, (str, Path)):
with open(file, 'rb') as f:
return cls._load(f)
else:
return cls._load(file)
@classmethod
def _load(cls, file_handle):
import dill as pickle
_ = pickle.load(file_handle) # version, will be used in the future
return pickle.load(file_handle)
@property
def classifier(self):
return self._clf
@property
def query_strategy(self):
return self._query_strategy
def _retrain(self, indices_validation=None):
if self._clf is None or not self.reuse_model:
if hasattr(self, '_clf'):
del self._clf
self._clf = self._clf_factory.new()
dataset = self.dataset[self.indices_labeled].clone()
dataset.y = self.y
if indices_validation is None:
self._clf.fit(dataset)
else:
indices = np.arange(self.indices_labeled.shape[0])
mask = np.isin(indices, indices_validation)
train = dataset[indices[~mask]]
valid = dataset[indices[mask]]
self._clf.fit(train, validation_set=valid)
def _build_index_to_position_dict(self):
return dict({
x_index: position for position, x_index in enumerate(self.indices_labeled)
})