Added code for sklearn.preprocessing.RankScaler
turian committed Jul 21, 2013
1 parent 93deda5 commit c6a8954
Showing 2 changed files with 116 additions and 0 deletions.
93 changes: 93 additions & 0 deletions sklearn/preprocessing.py
@@ -294,6 +294,9 @@ class StandardScaler(BaseEstimator, TransformerMixin):
    :func:`sklearn.preprocessing.scale` to perform centering and
        scaling without using the ``Transformer`` object oriented API

    :class:`sklearn.preprocessing.RankScaler` to perform standardization
        that is more robust to outliers, but slower and more memory-intensive.

    :class:`sklearn.decomposition.RandomizedPCA` with `whiten=True`
        to further remove the linear correlation across features.
    """
@@ -399,6 +402,96 @@ def __init__(self, copy=True, with_mean=True, with_std=True):
        super(Scaler, self).__init__(copy, with_mean, with_std)


class RankScaler(BaseEstimator, TransformerMixin):
    """Rank-standardize features to a percentile, in the range [0, 1].

    Rank-scaling happens independently on each feature, by determining
    the percentile of the feature value.
    A feature value that is smaller than any value observed during fitting
    will scale to 0.
    A feature value that is larger than any value observed during fitting
    will scale to 1.
    A feature value at the median will scale to 0.5.

    Standardization of a dataset is a common requirement for many
    machine learning estimators. Rank-scaling is useful when
    estimators perform badly on StandardScaler-scaled features. Rank-scaling
    is more robust than StandardScaler, because outliers cannot produce
    extreme values after scaling. It is an empirical question whether
    you want outliers to be given high importance (StandardScaler)
    or not (RankScaler).

    Memory used will be equivalent to the size of the initial fit X.
    (An approximation could be made by downsampling, which would
    improve memory and speed.)

    TODO: min and max parameters?

    Attributes
    ----------
    `sort_X_` : array with shape [n_samples, n_features]
        The sorted values of each feature column from the fit X.

    See also
    --------
    :class:`sklearn.preprocessing.StandardScaler` to perform standardization
        that is faster, but less robust to outliers.
    """

    def __init__(self):
        """
        TODO: Add min and max parameters? Default = [0, 1]
        """
        self.copy = True  # We don't have self.copy=False implemented

    def fit(self, X, y=None):
        """Compute the feature ranks for later scaling.

        fit will take time O(n_features * n_samples * log(n_samples)),
        and use memory O(n_samples * n_features).

        Parameters
        ----------
        X : array-like or CSR matrix with shape [n_samples, n_features]
            The data used to compute feature ranks.
        """
        X = check_arrays(X, copy=self.copy, sparse_format="csr")[0]
        if sp.issparse(X):
            raise ValueError("Cannot rank-standardize sparse matrices.")
        if X.ndim != 2:
            raise ValueError("Rank-standardization only tested on 2-D matrices.")
        else:
            self.sort_X_ = np.sort(X, axis=0)
        return self

    def transform(self, X):
        """Perform rank-standardization.

        transform will take O(n_features * n_samples * log(n_fit_samples)),
        where `n_fit_samples` is the number of samples used during `fit`.

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data used to scale along the features axis.
        """
        # copy = copy if copy is not None else self.copy
        # X = check_arrays(X, copy=copy, sparse_format="csr")[0]
        X = check_arrays(X, copy=self.copy, sparse_format="csr")[0]
        if sp.issparse(X):
            raise ValueError("Cannot rank-standardize sparse matrices.")
        if X.ndim != 2:
            raise ValueError("Rank-standardization only tested on 2-D matrices.")
        else:
            warn_if_not_float(X, estimator=self)
            newX = []
            for j in range(X.shape[1]):
                # Average the left and right insertion ranks of each value
                # within the sorted fit column, then scale into [0, 1].
                left = np.searchsorted(self.sort_X_[:, j], X[:, j], side='left')
                right = np.searchsorted(self.sort_X_[:, j], X[:, j], side='right')
                newX.append(1. * (left + right) / (2 * self.sort_X_.shape[0]))
            X = np.vstack(newX).T
        return X

    # def inverse_transform(self, X, copy=None):
    #     Not implemented

def normalize(X, norm='l2', axis=1, copy=True):
"""Normalize a dataset along any axis
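(Not part of the commit: a minimal standalone sketch of the percentile formula that `transform` applies to each feature column, for readers following the diff. The names `sort_x` and `x_new` are illustrative only; the computation mirrors the averaged left/right `np.searchsorted` ranks used above.)

import numpy as np

# Sorted values of one feature column from the fit data, as RankScaler.fit stores them.
sort_x = np.sort(np.array([3., 1., 2., 3.]))      # -> [1., 2., 3., 3.]

# New values of the same feature to be transformed.
x_new = np.array([0., 2., 3., 10.])

# Average the left and right insertion ranks, then scale into [0, 1].
left = np.searchsorted(sort_x, x_new, side='left')
right = np.searchsorted(sort_x, x_new, side='right')
percentile = 1. * (left + right) / (2 * sort_x.shape[0])

print(percentile)    # [ 0.     0.375  0.75   1.   ]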
23 changes: 23 additions & 0 deletions sklearn/tests/test_preprocessing.py
@@ -23,6 +23,7 @@
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RankScaler
from sklearn.preprocessing import add_dummy_feature

from sklearn import datasets
@@ -63,6 +64,9 @@ def test_scaler_1d():
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # rank_scaler = RankScaler()
    # X_rank_scaled = rank_scaler.fit(X).transform(X)


def test_scaler_2d_arrays():
"""Test scaling of 2d array along first axis"""
@@ -113,6 +117,25 @@ def test_scaler_2d_arrays():
    assert_true(X_scaled is not X)


    X = np.array([[1, 0, 0, 0, 1],
                  [2, 1, 4, 1, 1],
                  [3, 2, 3, 1, 0],
                  [3, 0, 0, 4, 1]])

    rank_scaler = RankScaler()
    rank_scaler.fit(X)
    X_scaled = rank_scaler.transform(X)
    assert_array_almost_equal(X_scaled,
                              [[0.125, 0.25, 0.25, 0.125, 0.625],
                               [0.375, 0.625, 0.875, 0.5, 0.625],
                               [0.75, 0.875, 0.625, 0.5, 0.125],
                               [0.75, 0.25, 0.25, 0.875, 0.625]])

    X2 = np.array([[0, 1.5, 0, 5, 10]])
    X2_scaled = rank_scaler.transform(X2)
    assert_array_almost_equal(X2_scaled, [[0., 0.75, 0.25, 1., 1.]])



def test_min_max_scaler_iris():
    X = iris.data
    scaler = MinMaxScaler()
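(Also not part of the commit: a short usage sketch of the API this diff adds, assuming the branch is installed so that `RankScaler` is importable from `sklearn.preprocessing`.)

import numpy as np
from sklearn.preprocessing import RankScaler  # available only with this commit applied

X = np.array([[1., 0., 0., 0., 1.],
              [2., 1., 4., 1., 1.],
              [3., 2., 3., 1., 0.],
              [3., 0., 0., 4., 1.]])

scaler = RankScaler().fit(X)

# Values below the fitted range map to 0, values above it map to 1,
# and in-range values map to their averaged rank percentile.
print(scaler.transform(np.array([[0., 1.5, 0., 5., 10.]])))
# expected: [[ 0.    0.75  0.25  1.    1.  ]]

For example, in the first column the fitted values sort to [1, 2, 3, 3]; the value 3 has left insertion rank 2 and right insertion rank 4, so it scales to (2 + 4) / (2 * 4) = 0.75, matching the expected array in the test above.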
