Skip to content

Commit

Permalink
Added custom _indexable in fixes to try to fix sklearn's
Browse files (browse the repository at this point in the history)
  • Loading branch information
tgsmith61591 committed Nov 12, 2016
1 parent e4dc29a commit 512263a
Show file tree
Hide file tree
Showing 7 changed files with 3,412 additions and 3,119 deletions.
4 changes: 2 additions & 2 deletions skutil/feature_selection/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,14 @@ def transform(self, X):
# check on state of X and cols
X, _ = validate_is_pd(X, self.cols)

if self.drop_ is None:
if not self.drop_: # empty or None
return X if self.as_df else X.as_matrix()
else:
# We don't want to raise a KeyError for a non-existent column that we
# intended to drop anyway, so restrict the drop list to the columns that
# are actually present in X. We still need to at least inform the user.
drops = [x for x in self.drop_ if x in X.columns]
if not len(drops) == len(self.drop_):
if len(drops) != len(self.drop_):
warnings.warn('one or more features to drop not contained '
'in input data feature names', UserWarning)

Expand Down
16 changes: 3 additions & 13 deletions skutil/feature_selection/select.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def fit(self, X, y=None):
# assess sparsity
self.sparsity_ = X[cols].apply(lambda x: x.isnull().sum() / x.shape[0]).values # numpy array
mask = self.sparsity_ > thresh # numpy boolean array
self.drop_ = X.columns[mask].tolist() if mask.sum() > 0 else None
self.drop_ = X.columns[mask].tolist()
return self


Expand Down Expand Up @@ -289,7 +289,7 @@ def transform(self, X):
X, _ = validate_is_pd(X, self.cols) # copy X
cols = X.columns if self.cols is None else self.cols

retained = X[cols] # if cols is None, returns all
retained = X[cols] # if not cols, returns all
return retained if self.as_df else retained.as_matrix()


Expand Down Expand Up @@ -521,10 +521,7 @@ def fit(self, X, y=None):
c = X[cols].corr(method=self.method).apply(lambda x: np.abs(x))

# get drops list
d, mac, crz = filter_collinearity(c, self.threshold)
self.drop_ = d if d else None
self.mean_abs_correlations_ = mac if mac else None
self.correlations_ = crz if crz else None
self.drop_, self.mean_abs_correlations_, self.correlations_ = filter_collinearity(c, self.threshold)

return self

Expand Down Expand Up @@ -702,11 +699,4 @@ def fit(self, X, y=None):
self.drop_ = np.asarray(cols)[drop_mask].tolist()
self.var_ = dict(zip(self.drop_, matrix[drop_mask, 0].tolist())) # just retain the variances

# I don't like making this None; it opens up bugs in pd.drop,
# but it was the precedent the API set from early on, so don't
# want to change it without a warning. TODO: in future versions,
# don't do this...
if not self.drop_:
self.drop_ = None

return self
2 changes: 1 addition & 1 deletion skutil/feature_selection/tests/test_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def test_multi_collinearity():

def test_nzv_filterer():
transformer = NearZeroVarianceFilterer().fit(X)
assert transformer.drop_ is None
assert not transformer.drop_

y = X.copy()
y['zeros'] = np.zeros(150)
Expand Down

0 comments on commit 512263a

Please sign in to comment.