# sklearn - preprocessing general concepts

- Scalers (standard or min-max) only shift and rescale each feature linearly; they do not change the shape of its distribution!
- Normalization (L1 or L2; L2 is the default) rescales each row to unit norm.

In [1]:
from sklearn import preprocessing
import numpy as np

In [24]:
# Toy dataset: a 4 x 3 array (4 rows, 3 columns).
#   axis=0 -> reduce down the rows, giving one value per column
#   axis=1 -> reduce across the columns, giving one value per row
data = np.array([
    [2.1, -1.9, 5.5],
    [-1.5, 2.4, 3.5],
    [0.5, -7.9, 5.6],
    [5.9, 2.3, -5.8],
])

# Number of dimensions, shape, and total element count.
data.ndim, data.shape, data.size

(2, (4, 3), 12)

In [25]:
# Mean of all elements, then per column (axis=0), then per row (axis=1).
np.mean(data), np.mean(data, axis=0), np.mean(data, axis=1)

(0.8916666666666666,
 array([ 1.75 , -1.275,  2.2  ]),
 array([ 1.9       ,  1.46666667, -0.6       ,  0.8       ]))

## standard scaler
After scaling, each column has μ = 0 and σ = 1.

In [41]:
# Two ways to standardize:
#   - preprocessing.scale(X=data, axis=0): a plain function for one-off use.
#   - preprocessing.StandardScaler(): a transformer (fit/transform API), so it
#     can be used in pipelines.
# With the transformer, fit on the training data only and then call
# transform() (not fit_transform()) on the test data, so the test set is
# scaled with the training mean and standard deviation.
#
# Result: every column has μ = 0 and σ = 1.

scaler = preprocessing.StandardScaler()
# fit_transform already returns a new NumPy array for array input, so no
# extra np.array(...) copy is needed.
scaledData = scaler.fit_transform(X=data)

# Per-column mean (0 up to floating-point error), per-column std, and the
# scaled values themselves.
scaledData.mean(axis=0), scaledData.std(axis=0), scaledData

(array([1.11022302e-16, 0.00000000e+00, 0.00000000e+00]),
 array([1., 1., 1.]),
 array([[ 0.12894603, -0.14880162,  0.70300338],
        [-1.19735598,  0.8749535 ,  0.27694073],
        [-0.46052153, -1.57729713,  0.72430651],
        [ 1.52893149,  0.85114524, -1.70425062]]))

## minmax scaler
After scaling, each column has min = 0 and max = 1.

In [38]:
# MinMaxScaler rescales each column linearly so that its minimum becomes 0
# and its maximum becomes 1.

minmax_scaler = preprocessing.MinMaxScaler()
# fit_transform already returns a new NumPy array for array input, so no
# extra np.array(...) copy is needed.
minMaxData = minmax_scaler.fit_transform(X=data)

print(minMaxData)

[[0.48648649 0.58252427 0.99122807]
 [0.         1.         0.81578947]
 [0.27027027 0.         1.        ]
 [1.         0.99029126 0.        ]]


## normalization
- Two types: L1 and L2 (the default).
- Works row by row; the `Normalizer` transformer has no `axis` option, so transpose the data first if you need to normalize columns instead!


In [39]:
# Normalizer rescales every row independently to unit norm.
#   L1: the absolute values in each row sum to 1
#   L2: the squares in each row sum to 1 (unit Euclidean length)

normalizer = preprocessing.Normalizer(norm="l2")
# fit_transform already returns a new NumPy array for array input, so no
# extra np.array(...) copy is needed.
normalizedL2Data = normalizer.fit_transform(X=data)

# Worked example for row 0, column 2:
# 5.5 / sqrt(2.1² + (-1.9)² + 5.5²) = 0.88906489

print(normalizedL2Data)

[[ 0.33946114 -0.30713151  0.88906489]
 [-0.33325106  0.53320169  0.7775858 ]
 [ 0.05156558 -0.81473612  0.57753446]
 [ 0.68706914  0.26784051 -0.6754239 ]]


In [40]:
# L1: divide each row by the sum of the absolute values in that row.
normalizer = preprocessing.Normalizer(norm="l1")
# fit_transform already returns a new NumPy array for array input, so no
# extra np.array(...) copy is needed.
normalizedL1Data = normalizer.fit_transform(X=data)

# Worked example for row 0, column 2 (note the absolute values):
# 5.5 / (|2.1| + |-1.9| + |5.5|) = 0.57894737

print(normalizedL1Data)

[[ 0.22105263 -0.2         0.57894737]
 [-0.2027027   0.32432432  0.47297297]
 [ 0.03571429 -0.56428571  0.4       ]
 [ 0.42142857  0.16428571 -0.41428571]]
