/
select.py
502 lines (407 loc) · 18.2 KB
/
select.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
import numpy as np
import pandas as pd
from .base import BaseFeatureSelector
from ..utils.validation import check_dataframe, validate_multiple_cols
from ..utils.metaestimators import timed_instance_method
__all__ = [
'FeatureFilter',
'MultiCorrFilter',
'NearZeroVarianceFilter',
'SparseFeatureFilter'
]
# TODO: add functionality for a dummy column indicating present value?
class SparseFeatureFilter(BaseFeatureSelector):
    """Drop overly sparse features.

    Retains features that are less sparse (NaN) than the provided
    threshold. Useful in situations where matrices are too sparse to
    impute reliably.

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation.

    threshold : float, optional (default=0.5)
        The threshold of sparsity above which features will be
        deemed "too sparse" and will be dropped. Must be a float in
        the half-open interval [0, 1). A feature is dropped only when
        its sparsity is strictly greater than the threshold.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skoot transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>>
    >>> nan = np.nan
    >>> X = np.array([
    ...     [1.0, 2.0, nan],
    ...     [2.0, 3.0, nan],
    ...     [3.0, nan, 1.0],
    ...     [4.0, 5.0, nan]
    ... ])
    >>>
    >>> X = pd.DataFrame.from_records(data=X, columns=['a','b','c'])
    >>> dropper = SparseFeatureFilter(threshold=0.5)
    >>> X_transform = dropper.fit_transform(X)
    >>> assert X_transform.shape[1] == 2  # drop out last column

    Attributes
    ----------
    sparsity_ : array-like, shape=(n_features,)
        The fraction of null values in each fitted column.

    drop_ : array-like, shape=(n_features,)
        Assigned after calling ``fit``. These are the features that
        are designated as "bad" and will be dropped in the ``transform``
        method.

    Notes
    -----
    Sometimes the presence of a value in an overly sparse column can be highly
    informative. If you're using the sparse filter, consider creating a new
    (dummy) feature indicating whether there was a value present.
    """
    def __init__(self, cols=None, threshold=0.5, as_df=True):
        super(SparseFeatureFilter, self).__init__(
            cols=cols, as_df=as_df)

        self.threshold = threshold

    @timed_instance_method(attribute_name="fit_time_")
    def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self : SparseFeatureFilter
            The fitted transformer.

        Raises
        ------
        ValueError
            If ``threshold`` is not a float in [0, 1).
        """
        X, cols = check_dataframe(X, cols=self.cols)
        thresh = self.threshold

        # validate the threshold
        if not (isinstance(thresh, float) and (0.0 <= thresh < 1.0)):
            raise ValueError('thresh must be a float between '
                             '0 (inclusive) and 1. Got %s' % str(thresh))

        # Assess sparsity as the mean of the boolean null-mask. This is
        # vectorized over all columns and always yields a true (float)
        # ratio, whereas "count / n_rows" on integers floor-divides to 0
        # under Python 2.
        subset = X[cols]
        self.sparsity_ = subset.isnull().mean().values  # np.ndarray

        mask = self.sparsity_ > thresh  # numpy boolean array
        self.drop_ = subset.columns[mask].tolist()
        return self
class FeatureFilter(BaseFeatureSelector):
    """A simple feature-dropping transformer class.

    A very simple class to be used at the beginning or any stage of a
    Pipeline that will drop the given features from the remainder of the pipe.
    This is useful if a transformer or encoder creates variables that you're
    disinterested in and would like to exclude from your modeling process.

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The features to drop. Note that ``FeatureFilter`` behaves slightly
        differently from all other ``BaseFeatureSelector`` classes in the sense
        that it will drop all of the features prescribed in this parameter.
        However, if ``cols`` is None (or empty), it will not drop any (which
        is counter to other classes, which will operate on all columns in
        the absence of an explicit ``cols`` parameter).

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skoot transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    Examples
    --------
    An example using the FeatureFilter:

    >>> import numpy as np
    >>> import pandas as pd
    >>>
    >>> X = pd.DataFrame.from_records(data=np.random.rand(3,3),
    ...                               columns=['a','b','c'])
    >>> dropper = FeatureFilter(cols=['a','b'])
    >>> X_transform = dropper.fit_transform(X)
    >>> assert X_transform.shape[1] == 1  # drop out first two columns

    Attributes
    ----------
    drop_ : array-like, shape=(n_features,)
        Assigned after calling ``fit``. These are the features that
        are designated as "bad" and will be dropped in the ``transform``
        method.
    """
    def __init__(self, cols=None, as_df=True):
        # just a pass-through for super constructor
        super(FeatureFilter, self).__init__(
            cols=cols, as_df=as_df)

    def fit(self, X, y=None):
        """Fit the feature filter.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self : FeatureFilter
            The fitted transformer.
        """
        # check on state of X and cols
        X, cols = check_dataframe(X, cols=self.cols)

        # If the provided self.cols was None (or empty), we drop nothing;
        # otherwise we drop the specified columns. The emptiness test is
        # explicit rather than ``not self.cols`` because the truth value
        # of a multi-element numpy array or pandas Index is ambiguous and
        # raises a ValueError.
        requested = self.cols
        nothing_requested = requested is None or (
            hasattr(requested, '__len__') and len(requested) == 0)

        self.drop_ = [] if nothing_requested else cols
        return self
class MultiCorrFilter(BaseFeatureSelector):
    """Remove highly correlated features.

    Multi-collinear data (features which are not independent from one another)
    can pose problems in coefficient stability for parametric models, or
    feature importance scores for non-parametric models.

    This class filters out features with a correlation greater than the
    provided threshold. When a pair of correlated features is identified, the
    mean absolute correlation (MAC) of each feature is considered, and the
    feature with the highest MAC is discarded.

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation.

    threshold : float, optional (default=0.85)
        The threshold above which to filter correlated features

    method : str, optional (default='pearson')
        The method used to compute the correlation,
        one of ('pearson', 'kendall', 'spearman').

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skoot transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    Examples
    --------
    The following demonstrates a simple multi-correlation filter
    applied to the iris dataset.

    >>> from skoot.datasets import load_iris_df
    >>>
    >>> X = load_iris_df(include_tgt=False)
    >>> mcf = MultiCorrFilter(threshold=0.85)
    >>> mcf.fit_transform(X).head()
       sepal length (cm)  sepal width (cm)  petal width (cm)
    0                5.1               3.5               0.2
    1                4.9               3.0               0.2
    2                4.7               3.2               0.2
    3                4.6               3.1               0.2
    4                5.0               3.6               0.2

    Attributes
    ----------
    drop_ : array-like, shape=(n_features,)
        Assigned after calling ``fit``. These are the features that
        are designated as "bad" and will be dropped in the ``transform``
        method.

    mean_abs_correlations_ : array-like, shape=(n_features,)
        The mean absolute correlation of every fitted column (in the
        column order of the correlation matrix), not only of the
        columns named in ``drop_``.
    """
    def __init__(self, cols=None, threshold=0.85,
                 method='pearson', as_df=True):

        super(MultiCorrFilter, self).__init__(
            cols=cols, as_df=as_df)

        self.threshold = threshold
        self.method = method

    def fit(self, X, y=None):
        """Fit the multi-collinearity filter.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self : MultiCorrFilter
            The fitted transformer.
        """
        # check on state of X and cols. Also need all columns to be finite!
        X, cols = check_dataframe(X, cols=self.cols, assert_all_finite=True)

        # we need to make sure there's more than 1 column!
        validate_multiple_cols(self.__class__.__name__, cols)

        # Generate correlation matrix
        c = X[cols].corr(method=self.method)

        # get drops list
        # TODO: write a _find_correlations_exact for smaller matrices
        self.drop_, self.mean_abs_correlations_ = \
            self._find_correlations_fast(c, self.threshold)

        return self

    @staticmethod
    def _find_correlations_fast(c, threshold):
        """Filter highly correlated features.

        This function identifies the correlations between features that
        are greater than the provided threshold, and identifies which
        to drop on the basis of mean absolute correlation. This function is
        based on Caret's ``findCorrelation_fast`` method [1], which is very
        efficient for large matrices.

        Parameters
        ----------
        c : pd.DataFrame
            The pre-computed (square) correlation matrix. It is not
            modified by this function.

        threshold : float
            The threshold above which to filter features which
            are multi-collinear in nature.

        Returns
        -------
        drop_names : list
            The feature names that should be dropped, in the column
            order of ``c``.

        average_corr : np.ndarray, shape=(n_features,)
            The mean absolute correlation of each feature in ``c``.

        References
        ----------
        .. [1] Caret findCorrelations.R (findCorrelation_fast)
               https://bit.ly/2E1AMcJ
        """
        # get the average absolute column correlations
        abs_corr = c.abs()  # pd.DataFrame
        average_corr = abs_corr.mean().values  # np.ndarray

        # Convert each feature's MAC into its dense rank (ties share a
        # rank), mirroring Caret's ``as.numeric(as.factor(averageCorr))``.
        # NOTE: ``np.argsort`` must NOT be used here: it yields the indices
        # that would sort the array, not the rank of each element, so
        # comparing its entries does not compare MACs.
        _, mac_rank = np.unique(average_corr, return_inverse=True)

        # Only the strict upper triangle needs inspecting (the matrix is
        # symmetric and the diagonal is trivially 1). Work on a private
        # float copy and index the triangle directly rather than writing
        # NaNs into ``c.values``, which would mutate the input and fails
        # when pandas hands back a read-only array.
        abs_values = np.array(abs_corr.values, dtype=float)
        upper_rows, upper_cols = np.triu_indices(n=abs_values.shape[0], k=1)

        # NaN correlations compare as False; silence the warning that
        # some numpy versions emit for such comparisons
        with np.errstate(invalid='ignore'):
            above = abs_values[upper_rows, upper_cols] > threshold

        rows_to_check = upper_rows[above]
        cols_to_check = upper_cols[above]

        # within each offending pair, discard the member with the higher
        # MAC rank; on a tie the "row" member is discarded
        cols_to_discard = mac_rank[cols_to_check] > mac_rank[rows_to_check]
        rows_to_discard = ~cols_to_discard

        # append each set of discard rows/cols, get the distinct
        drop_cols = np.unique(
            np.concatenate([cols_to_check[cols_to_discard],
                            rows_to_check[rows_to_discard]]))

        # the names to drop
        drop_names = c.columns[drop_cols].tolist()
        return drop_names, average_corr
class NearZeroVarianceFilter(BaseFeatureSelector):
    r"""Identify near zero variance predictors.

    Diagnose and remove any features that have one unique value
    (i.e., are zero variance predictors) or in which the ratio of the
    frequency of the most common value to the frequency of the second most
    common value is large (at least ``freq_cut``). Note that only this
    frequency ratio is assessed; the number of unique values relative to
    the number of samples is not taken into account.

    A note of caution: if you attempt to run this over large, continuous data,
    it might take a long time. Since for each column in ``cols`` it will
    compute ``value_counts``, applying to continuous data could be a bad idea.

    Parameters
    ----------
    cols : array-like, shape=(n_features,), optional (default=None)
        The names of the columns on which to apply the transformation.
        If no column names are provided, the transformer will be ``fit``
        on the entire frame. Note that the transformation will also only
        apply to the specified columns, and any other non-specified
        columns will still be present after transformation.

    freq_cut : float, optional (default=95/5)
        The cutoff for the ratio of the most common value to the second most
        common value. That is, if the frequency of the most common value is
        >= ``freq_cut`` times the frequency of the second most, the feature
        will be dropped. Must be greater than 1.

    as_df : bool, optional (default=True)
        Whether to return a Pandas ``DataFrame`` in the ``transform``
        method. If False, will return a Numpy ``ndarray`` instead.
        Since most skoot transformers depend on explicitly-named
        ``DataFrame`` features, the ``as_df`` parameter is True by default.

    Examples
    --------
    An example of the near zero variance filter on a completely
    constant column:

    >>> import pandas as pd
    >>> import numpy as np
    >>>
    >>> X = pd.DataFrame.from_records(
    ...     data=np.array([[1,2,3], [4,5,3], [6,7,3], [8,9,3]]),
    ...     columns=['a','b','c'])
    >>> flt = NearZeroVarianceFilter(freq_cut=25)
    >>> flt.fit_transform(X)
       a  b
    0  1  2
    1  4  5
    2  6  7
    3  8  9

    An example on a column with two unique values represented at 2:1. Also
    shows how we can extract the fitted ratios and drop names:

    >>> X = pd.DataFrame.from_records(
    ...     data=np.array([[1,2,3], [4,5,3], [6,7,5]]),
    ...     columns=['a','b','c'])
    >>> nzv = NearZeroVarianceFilter(freq_cut=2.)
    >>> nzv.fit_transform(X)
       a  b
    0  1  2
    1  4  5
    2  6  7
    >>> nzv.ratios_
    array([1., 1., 2.])
    >>> nzv.drop_
    ['c']

    Attributes
    ----------
    drop_ : array-like, shape=(n_features,)
        Assigned after calling ``fit``. These are the features that
        are designated as "bad" and will be dropped in the ``transform``
        method.

    ratios_ : array-like, shape=(n_features,)
        The ratios of the counts of the most populous classes to the second
        most populated classes for each column in ``cols``.

    References
    ----------
    .. [1] Kuhn, M. & Johnson, K. "Applied Predictive
           Modeling" (2013). New York, NY: Springer.

    .. [2] Caret (R package) nearZeroVariance R code
           https://bit.ly/2J0ozbM
    """
    def __init__(self, cols=None, freq_cut=95. / 5., as_df=True):
        super(NearZeroVarianceFilter, self).__init__(
            cols=cols, as_df=as_df)

        self.freq_cut = freq_cut

    def fit(self, X, y=None):
        """Fit the near-zero variance filter.

        Parameters
        ----------
        X : pd.DataFrame, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None.

        y : array-like or None, shape=(n_samples,), optional (default=None)
            Pass-through for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------
        self : NearZeroVarianceFilter
            The fitted transformer.

        Raises
        ------
        ValueError
            If ``freq_cut`` is not an int or float greater than 1.
        """
        # check on state of X and cols
        X, cols = check_dataframe(X, self.cols)

        # get the freq cut and validate it is an appropriate value...
        freq_cut = self.freq_cut
        if not (isinstance(freq_cut, (int, float)) and 1. < freq_cut):
            raise ValueError("freq_cut must be a float > 1.0")

        # make sure it's cast to a float if not already
        freq_cut = float(freq_cut)

        # compute each column's frequency ratio, and drop the columns
        # whose ratio meets or exceeds the cut
        subset = X[cols]
        ratios = subset.apply(self._filter_freq_cut).values
        self.drop_ = subset.columns[ratios >= freq_cut].tolist()
        self.ratios_ = ratios

        return self

    @staticmethod
    def _filter_freq_cut(series):
        """Compute the frequency ratio of a single feature.

        Parameters
        ----------
        series : pd.Series
            One series in the specified column list.

        Returns
        -------
        ratio : float
            The ratio of the count of the most populated class to the second
            most populated class. If there are fewer than two classes
            (a constant column, or a column with no non-null values at
            all), will return infinity.
        """
        # value_counts sorts by descending frequency and (by default)
        # excludes null values
        vc = series.value_counts()
        n_levels = vc.shape[0]

        # base case: zero or one level means there is no variance at all.
        # The zero-level case (an all-null or empty column) would
        # otherwise fail with an IndexError below.
        if n_levels <= 1:
            return np.inf

        # get the first two levels and counts
        first_two = vc.values[:2].astype(float)
        return first_two[0] / first_two[1]