## Preprocessing data - Standardization, or mean removal and variance scaling
(http://scikit-learn.org/stable/modules/preprocessing.html)

### Gaussian with zero mean and unit variance

In [20]:
from sklearn import preprocessing
import numpy as np

X = np.array([[ 1., -1.,  2.],
               [ 2.,  0.,  0.],
               [ 0.,  1., -1.]])
X_scaled = preprocessing.scale(X)

X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [3]:
X_scaled.mean(axis=0)

array([ 0.,  0.,  0.])

In [4]:
X_scaled.std(axis=0)

array([ 1.,  1.,  1.])

### the mean and standard deviation on a training set (X에 대해 구한 값을 transform을 통해 다른 dataset에 대해서도 적용 가능)

In [5]:
scaler = preprocessing.StandardScaler().fit(X)
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [6]:
scaler.mean_

array([ 1.        ,  0.        ,  0.33333333])

In [7]:
scaler.scale_

array([ 0.81649658,  0.81649658,  1.24721913])

In [8]:
scaler.transform(X)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [9]:
scaler.transform([[-1.,  1., 0.]])    # test dataset에 대해서도 적용

array([[-2.44948974,  1.22474487, -0.26726124]])

### Scaling features to a range

In [10]:
X_train = np.array([[ 1., -1.,  2.],
                     [ 2.,  0.,  0.],
                     [ 0.,  1., -1.]])

min_max_scaler = preprocessing.MinMaxScaler()    #  to scale a toy data matrix to the [0, 1] range
X_train_minmax = min_max_scaler.fit_transform(X_train)
X_train_minmax

array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  0.5       ,  0.33333333],
       [ 0.        ,  1.        ,  0.        ]])

In [11]:
X_test = np.array([[ -3., -1.,  4.]])
X_test_minmax = min_max_scaler.transform(X_test)
X_test_minmax

array([[-1.5       ,  0.        ,  1.66666667]])

#### MinMaxScaler() full formula feature_range=(min, max)

In [19]:
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
print(X_std)
X_scaled = X_std / (X.max(axis=0) - X.min(axis=0)) + X.min(axis=0)
print(X_scaled)

[[ 0.5         0.          1.        ]
 [ 1.          0.5         0.33333333]
 [ 0.          1.          0.        ]]
[[ 0.25       -1.         -0.66666667]
 [ 0.5        -0.75       -0.88888889]
 [ 0.         -0.5        -1.        ]]


#### MaxAbsScaler

In [15]:
X_train = np.array([[ 1., -1.,  2.],
                     [ 2.,  0.,  0.],
                     [ 0.,  1., -1.]])

max_abs_scaler = preprocessing.MaxAbsScaler()    # within the range [-1, 1]
X_train_maxabs = max_abs_scaler.fit_transform(X_train)
X_train_maxabs                # doctest +NORMALIZE_WHITESPACE^

array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])

In [16]:
X_test = np.array([[ -3., -1.,  4.]])
X_test_maxabs = max_abs_scaler.transform(X_test)
X_test_maxabs

array([[-1.5, -1. ,  2. ]])

In [17]:
print(max_abs_scaler.scale_)

[ 2.  1.  2.]


#### Scaling sparse data

#### Scaling data with outliers

#### Centering kernel matrices