## 1. Standardization, or mean removal and variance scaling

In practice we often ignore the shape of the distribution and just transform the data to center it by removing the mean value of each feature, then scale it by dividing non-constant features by their standard deviation.

In [1]:
from sklearn import preprocessing

In [2]:
import numpy as np

In [3]:
# Toy training set: 3 samples (rows) x 3 features (columns).
X_train = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])

In [4]:
X_train

array([[ 1., -1.,  2.],
       [ 2.,  0.,  0.],
       [ 0.,  1., -1.]])

In [5]:
# Standardize column-wise: subtract each feature's mean, then divide by its
# standard deviation (the keyword values below are the function's defaults).
X_scaled = preprocessing.scale(
    X_train,
    axis=0,
    with_mean=True,
    with_std=True,
)
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [6]:

# Sanity check: after standardization every column (feature) has mean 0.
X_scaled.mean(axis=0)

array([0., 0., 0.])

In [7]:
# Sanity check: after standardization every column (feature) has standard deviation 1.
X_scaled.std(axis=0)

array([1., 1., 1.])

The standard score of a sample x is calculated as:

    z = (x - u) / s

where u is the mean of the training samples and s is their standard deviation.

In [8]:
from sklearn.preprocessing import StandardScaler

# Toy dataset: 5 samples x 2 features.
data = [[0, 0], [0, 0], [1, 1], [1, 1], [1, 2]]

scaler = StandardScaler(copy=False)
# fit() returns the fitted scaler itself, so printing it shows the estimator's repr.
print(scaler.fit(data))
# Per-feature means learned from `data`.
print(scaler.mean_)
# Standardized data: z = (x - mean) / std, computed column by column.
print(scaler.transform(data))


StandardScaler(copy=False, with_mean=True, with_std=True)
[0.6 0.8]
[[-1.22474487 -1.06904497]
 [-1.22474487 -1.06904497]
 [ 0.81649658  0.26726124]
 [ 0.81649658  0.26726124]
 [ 0.81649658  1.60356745]]


### Scaling features to a range

Transforms features by scaling each feature to a given range.

In [9]:
# Target interval for the rescaled features: each column's minimum maps to 2
# and its maximum maps to 15.
feature_range = (2, 15)
min_max_scaler = preprocessing.MinMaxScaler(feature_range=feature_range)

In [10]:
# Fit the scaler on X_train and rescale X_train in the same call.
X_train_minmax = min_max_scaler.fit_transform(X_train)

In [11]:
X_train_minmax

array([[ 8.5       ,  2.        , 15.        ],
       [15.        ,  8.5       ,  6.33333333],
       [ 2.        , 15.        ,  2.        ]])

Scale each feature by its maximum absolute value, so that every feature lies in the range [-1, 1].

In [12]:
# MaxAbsScaler divides each column by that column's largest absolute value.
# NOTE: despite its name, `transformer` holds the scaled array, not the scaler object.
max_abs_scaler = preprocessing.MaxAbsScaler()
transformer = max_abs_scaler.fit_transform(X_train)

In [13]:
transformer

array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])

#### Mapping to a Uniform distribution

In [24]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris data and hold out a test split (fixed seed for reproducibility).
iris = load_iris()
X = iris.data
y = iris.target
# NOTE: this rebinds X_train, replacing the 3x3 toy array defined earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit the quantile mapping on the training split only, then apply it to both splits.
quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
X_train_trans = quantile_transformer.fit_transform(X_train)
X_test_trans = quantile_transformer.transform(X_test)

# First training sample, before transformation.
X_train[0]

array([5.9, 3. , 4.2, 1.5])

In [25]:
X_train_trans[0]

array([0.54954955, 0.43693694, 0.48198198, 0.59009009])

#### Mapping to a Gaussian distribution

In [16]:
# Default PowerTransformer: Yeo-Johnson method with standardization of the output.
pt = preprocessing.PowerTransformer()
data = [[1, 2], [3, 2], [4, 5]]

# fit() returns the estimator itself, so fitting first and printing `pt`
# shows the same repr as printing the return value of fit().
pt.fit(data)
print(pt)
# One fitted lambda per feature (column).
print(pt.lambdas_)
print(pt.transform(data))


PowerTransformer(copy=True, method='yeo-johnson', standardize=True)
[ 1.38668178 -3.10053309]
[[-1.31616039 -0.70710678]
 [ 0.20998268 -0.70710678]
 [ 1.1061777   1.41421356]]


In [19]:
# Box-Cox only accepts strictly positive data; log-normal samples satisfy that.
# standardize=False keeps the raw power-transformed values (no centering/scaling).
pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)

# Fixed seed so the sampled 3x3 matrix is reproducible.
rng = np.random.RandomState(616)
X_lognormal = rng.lognormal(size=(3, 3))
print(X_lognormal)

ss = pt.fit_transform(X_lognormal)
ss

[[1.28331718 1.18092228 0.84160269]
 [0.94293279 1.60960836 0.3879099 ]
 [1.35235668 0.21715673 1.09977091]]


array([[ 0.49024349,  0.17881995, -0.1563781 ],
       [-0.05102892,  0.58863196, -0.57612415],
       [ 0.69420008, -0.84857822,  0.10051454]])

In [23]:
# Averages the third row of the Box-Cox output (one sample across its three features).
# NOTE(review): this is a row mean, not a per-feature mean, and the transformer was
# built with standardize=False, so a value near 0 here is not a standardization
# check -- confirm what this cell is meant to demonstrate.
np.mean(ss[2])

-0.01795453104331375

### Encoding categorical features

In [20]:
# Encode every categorical column as integer codes; the code of a value is its
# index in that column's entry of enc.categories_.
enc = preprocessing.OrdinalEncoder()
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]
enc.fit(X)

# Encode one new sample built from categories seen during fit.
ordinal_query = [['female', 'from US', 'uses Safari']]
enc.transform(ordinal_query)


array([[0., 1., 1.]])

In [21]:
enc.categories_

[array(['female', 'male'], dtype=object),
 array(['from Europe', 'from US'], dtype=object),
 array(['uses Firefox', 'uses Safari'], dtype=object)]

#### OneHotEncoder

In [22]:
enc = preprocessing.OneHotEncoder()
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)  
enc.transform([['female', 'from US', 'uses Safari'],['male', 'from Europe', 'uses Safari']]).toarray()


array([[1., 0., 0., 1., 0., 1.],
       [0., 1., 1., 0., 0., 1.]])

In [23]:
enc.categories_

[array(['female', 'male'], dtype=object),
 array(['from Europe', 'from US'], dtype=object),
 array(['uses Firefox', 'uses Safari'], dtype=object)]

In [None]:
# LabelEncoder maps target labels to integer codes 0..n_classes-1.
le = preprocessing.LabelEncoder()
le.fit(["paris", "paris", "tokyo", "amsterdam"])

# Distinct labels seen during fit; a label's position is its integer code.
print(list(le.classes_))                          # ['amsterdam', 'paris', 'tokyo']
# Labels -> integer codes.
print(le.transform(["tokyo", "tokyo", "paris"]))  # codes 2, 2, 1
# Integer codes -> original labels (tokyo, tokyo, paris).
list(le.inverse_transform([2, 2, 1]))