Added code for sklearn.preprocessing.RankScaler
turian committed Jul 21, 2013
1 parent 93deda5 commit c6a8954
Showing 2 changed files with 116 additions and 0 deletions.
93 changes: 93 additions & 0 deletions sklearn/preprocessing.py
@@ -294,6 +294,9 @@ class StandardScaler(BaseEstimator, TransformerMixin):
    :func:`sklearn.preprocessing.scale` to perform centering and
        scaling without using the ``Transformer`` object oriented API

    :class:`sklearn.preprocessing.RankScaler` to perform standardization
        that is more robust to outliers, but slower and more memory-intensive.

    :class:`sklearn.decomposition.RandomizedPCA` with `whiten=True`
        to further remove the linear correlation across features.
    """
@@ -399,6 +402,96 @@ def __init__(self, copy=True, with_mean=True, with_std=True):
        super(Scaler, self).__init__(copy, with_mean, with_std)


class RankScaler(BaseEstimator, TransformerMixin):
    """Rank-standardize features to a percentile, in the range [0, 1].

    Rank-scaling happens independently on each feature, by determining
    the percentile of the feature value.
    A feature value that is smaller than any value observed during fitting
    will scale to 0.
    A feature value that is larger than any value observed during fitting
    will scale to 1.
    A feature value at the median will scale to 0.5.

    Standardization of a dataset is a common requirement for many
    machine learning estimators. Rank-scaling is useful when
    estimators perform badly on StandardScaler-scaled features. Rank-scaling
    is more robust than StandardScaler, because outliers cannot produce
    extreme values after scaling. It is an empirical question whether
    you want outliers to be given high importance (StandardScaler)
    or not (RankScaler).

    Memory used will be equivalent to the size of the initial fit X.
    (An approximation could be made by downsampling, which would
    improve memory and speed.)

    TODO: min and max parameters?

    Attributes
    ----------
    `sort_X_` : array with shape [n_samples, n_features]
        The sorted values of each feature column from the fit X.

    See also
    --------
    :class:`sklearn.preprocessing.StandardScaler` to perform standardization
        that is faster, but less robust to outliers.
    """

    def __init__(self):
        """
        TODO: Add min and max parameters? Default = [0, 1]
        """
        self.copy = True  # We don't have self.copy=False implemented

    def fit(self, X, y=None):
        """Compute the feature ranks for later scaling.

        fit will take time O(n_features * n_samples * log(n_samples)),
        and use memory O(n_samples * n_features).

        Parameters
        ----------
        X : array-like or CSR matrix with shape [n_samples, n_features]
            The data used to compute feature ranks.
        """
        X = check_arrays(X, copy=self.copy, sparse_format="csr")[0]
        if sp.issparse(X):
            raise ValueError("Cannot rank-standardize sparse matrices.")
        if X.ndim != 2:
            raise ValueError("Rank-standardization only tested on 2-D matrices.")
        else:
            self.sort_X_ = np.sort(X, axis=0)
        return self

    def transform(self, X):
        """Perform rank-standardization.

        transform will take O(n_features * n_samples * log(n_fit_samples)),
        where `n_fit_samples` is the number of samples used during `fit`.

        Parameters
        ----------
        X : array-like with shape [n_samples, n_features]
            The data used to scale along the features axis.
        """
        # copy = copy if copy is not None else self.copy
        # X = check_arrays(X, copy=copy, sparse_format="csr")[0]
        X = check_arrays(X, copy=self.copy, sparse_format="csr")[0]
        if sp.issparse(X):
            raise ValueError("Cannot rank-standardize sparse matrices.")
        if X.ndim != 2:
            raise ValueError("Rank-standardization only tested on 2-D matrices.")
        else:
            warn_if_not_float(X, estimator=self)
            newX = []
            for j in range(X.shape[1]):
                # Average the left and right insertion ranks of each value
                # within the sorted fit column, then scale into [0, 1].
                left = np.searchsorted(self.sort_X_[:, j], X[:, j], side='left')
                right = np.searchsorted(self.sort_X_[:, j], X[:, j], side='right')
                newX.append(1. * (left + right) / (2 * self.sort_X_.shape[0]))
            X = np.vstack(newX).T
        return X

    # def inverse_transform(self, X, copy=None):
    #     Not implemented

def normalize(X, norm='l2', axis=1, copy=True):
"""Normalize a dataset along any axis
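(Not part of the commit: a minimal standalone sketch of the percentile formula that `transform` applies to each feature column, for readers following the diff. The names `sort_x` and `x_new` are illustrative only; the computation mirrors the averaged left/right `np.searchsorted` ranks used above.)

import numpy as np

# Sorted values of one feature column from the fit data, as RankScaler.fit stores them.
sort_x = np.sort(np.array([3., 1., 2., 3.]))      # -> [1., 2., 3., 3.]

# New values of the same feature to be transformed.
x_new = np.array([0., 2., 3., 10.])

# Average the left and right insertion ranks, then scale into [0, 1].
left = np.searchsorted(sort_x, x_new, side='left')
right = np.searchsorted(sort_x, x_new, side='right')
percentile = 1. * (left + right) / (2 * sort_x.shape[0])

print(percentile)    # [ 0.     0.375  0.75   1.   ]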
23 changes: 23 additions & 0 deletions sklearn/tests/test_preprocessing.py
@@ -23,6 +23,7 @@
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RankScaler
from sklearn.preprocessing import add_dummy_feature

from sklearn import datasets
@@ -63,6 +64,9 @@ def test_scaler_1d():
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # rank_scaler = RankScaler()
    # X_rank_scaled = rank_scaler.fit(X).transform(X)


def test_scaler_2d_arrays():
"""Test scaling of 2d array along first axis"""
@@ -113,6 +117,25 @@ def test_scaler_2d_arrays():
    assert_true(X_scaled is not X)


    X = np.array([[1, 0, 0, 0, 1],
                  [2, 1, 4, 1, 1],
                  [3, 2, 3, 1, 0],
                  [3, 0, 0, 4, 1]])

    rank_scaler = RankScaler()
    rank_scaler.fit(X)
    X_scaled = rank_scaler.transform(X)
    assert_array_almost_equal(X_scaled,
                              [[0.125, 0.25, 0.25, 0.125, 0.625],
                               [0.375, 0.625, 0.875, 0.5, 0.625],
                               [0.75, 0.875, 0.625, 0.5, 0.125],
                               [0.75, 0.25, 0.25, 0.875, 0.625]])

    X2 = np.array([[0, 1.5, 0, 5, 10]])
    X2_scaled = rank_scaler.transform(X2)
    assert_array_almost_equal(X2_scaled, [[0., 0.75, 0.25, 1., 1.]])



def test_min_max_scaler_iris():
    X = iris.data
    scaler = MinMaxScaler()
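(Also not part of the commit: a short usage sketch of the API this diff adds, assuming the branch is installed so that `RankScaler` is importable from `sklearn.preprocessing`.)

import numpy as np
from sklearn.preprocessing import RankScaler  # available only with this commit applied

X = np.array([[1., 0., 0., 0., 1.],
              [2., 1., 4., 1., 1.],
              [3., 2., 3., 1., 0.],
              [3., 0., 0., 4., 1.]])

scaler = RankScaler().fit(X)

# Values below the fitted range map to 0, values above it map to 1,
# and in-range values map to their averaged rank percentile.
print(scaler.transform(np.array([[0., 1.5, 0., 5., 10.]])))
# expected: [[ 0.    0.75  0.25  1.    1.  ]]

For example, in the first column the fitted values sort to [1, 2, 3, 3]; the value 3 has left insertion rank 2 and right insertion rank 4, so it scales to (2 + 4) / (2 * 4) = 0.75, matching the expected array in the test above.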
