Added custom _indexable in fixes to try to fix sklearn's…

tgsmith61591 · Nov 12, 2016 · 512263a · 512263a
1 parent e4dc29a
commit 512263a
Show file tree

Hide file tree

Showing 7 changed files with 3,412 additions and 3,119 deletions.
diff --git a/skutil/feature_selection/base.py b/skutil/feature_selection/base.py
@@ -80,14 +80,14 @@ def transform(self, X):
         # check on state of X and cols
         X, _ = validate_is_pd(X, self.cols)
 
-        if self.drop_ is None:
+        if not self.drop_: # empty or None
             return X if self.as_df else X.as_matrix()
         else:
             # what if we don't want to throw this key error for a non-existent
             # column that we hope to drop anyways? We need to at least inform the
             # user...
             drops = [x for x in self.drop_ if x in X.columns]
-            if not len(drops) == len(self.drop_):
+            if len(drops) != len(self.drop_):
                 warnings.warn('one or more features to drop not contained '
                               'in input data feature names', UserWarning)
 

diff --git a/skutil/feature_selection/select.py b/skutil/feature_selection/select.py
@@ -134,7 +134,7 @@ def fit(self, X, y=None):
         # assess sparsity
         self.sparsity_ = X[cols].apply(lambda x: x.isnull().sum() / x.shape[0]).values  # numpy array
         mask = self.sparsity_ > thresh  # numpy boolean array
-        self.drop_ = X.columns[mask].tolist() if mask.sum() > 0 else None
+        self.drop_ = X.columns[mask].tolist()
         return self
 
 
@@ -289,7 +289,7 @@ def transform(self, X):
         X, _ = validate_is_pd(X, self.cols)  # copy X
         cols = X.columns if self.cols is None else self.cols
 
-        retained = X[cols]  # if cols is None, returns all
+        retained = X[cols]  # if not cols, returns all
         return retained if self.as_df else retained.as_matrix()
 
 
@@ -521,10 +521,7 @@ def fit(self, X, y=None):
         c = X[cols].corr(method=self.method).apply(lambda x: np.abs(x))
 
         # get drops list
-        d, mac, crz = filter_collinearity(c, self.threshold)
-        self.drop_ = d if d else None
-        self.mean_abs_correlations_ = mac if mac else None
-        self.correlations_ = crz if crz else None
+        self.drop_, self.mean_abs_correlations_, self.correlations_ = filter_collinearity(c, self.threshold)
 
         return self
 
@@ -702,11 +699,4 @@ def fit(self, X, y=None):
             self.drop_ = np.asarray(cols)[drop_mask].tolist()
             self.var_ = dict(zip(self.drop_, matrix[drop_mask, 0].tolist())) # just retain the variances
 
-        # I don't like making this None; it opens up bugs in pd.drop,
-        # but it was the precedent the API set from early on, so don't
-        # want to change it without a warning. TODO: in future versions,
-        # don't do this...
-        if not self.drop_:
-            self.drop_ = None
-
         return self
diff --git a/skutil/feature_selection/tests/test_select.py b/skutil/feature_selection/tests/test_select.py
@@ -232,7 +232,7 @@ def test_multi_collinearity():
 
 def test_nzv_filterer():
     transformer = NearZeroVarianceFilterer().fit(X)
-    assert transformer.drop_ is None
+    assert not transformer.drop_
 
     y = X.copy()
     y['zeros'] = np.zeros(150)