# sklearn - feature scaling

- Normalization
- Standardization (StandardScaler, MinMaxScaler)

In [6]:
import pandas as pd
from sklearn import preprocessing

abaloneFile = "./dataset/abalone.parquet"
abaloneStandardizedMinMaxFile =  "./dataset/abaloneStandardizedMinMax.parquet"
abaloneStandardizedFile =  "./dataset/abaloneStandardized.parquet"

In [7]:
dfAbalone = pd.read_parquet(abaloneFile)

print(dfAbalone.head())

   length  diameter  height  whole_weight  shucked_weight  viscera_weight  \
0   0.435     0.335   0.110         0.334          0.1355          0.0775   
1   0.585     0.450   0.125         0.874          0.3545          0.2075   
2   0.655     0.510   0.160         1.092          0.3960          0.2825   
3   0.545     0.425   0.125         0.768          0.2940          0.1495   
4   0.545     0.420   0.130         0.879          0.3740          0.1695   

   shell_weight  age  
0        0.0965    7  
1        0.2250    6  
2        0.3700   14  
3        0.2600   16  
4        0.2300   13  


In [8]:
# standardization

min_max_scaler = preprocessing.MinMaxScaler().fit_transform(X=dfAbalone)  #standardize to a range
scaler = preprocessing.StandardScaler().fit_transform(X=dfAbalone)        #standardize as z-score

dfAbaloneStandardizedMinMax = pd.DataFrame(min_max_scaler)        # (min=0, max=1)
dfAbaloneStandardized = pd.DataFrame(scaler)                      # (μ=0, σ=1)

print(dfAbaloneStandardized.describe())

                  0             1             2             3             4  \
count  3.320000e+03  3.320000e+03  3.320000e+03  3.320000e+03  3.320000e+03   
mean   2.776895e-16  4.654911e-17  2.504021e-16 -2.814348e-16  2.182993e-16   
std    1.000151e+00  1.000151e+00  1.000151e+00  1.000151e+00  1.000151e+00   
min   -3.695497e+00 -3.512070e+00 -3.261480e+00 -1.675262e+00 -1.605405e+00   
25%   -6.000452e-01 -6.151080e-01 -5.683854e-01 -7.907822e-01 -7.930368e-01   
50%    1.428632e-01  1.840540e-01  1.706990e-02 -6.003667e-02 -9.993561e-02   
75%    7.619535e-01  7.334779e-01  6.025252e-01  6.623095e-01  6.606755e-01   
max    2.412861e+00  2.431697e+00  2.320110e+01  4.073983e+00  5.087072e+00   

                  5             6             7  
count  3.320000e+03  3.320000e+03  3.320000e+03  
mean   1.771006e-16 -1.054043e-16 -2.204395e-16  
std    1.000151e+00  1.000151e+00  1.000151e+00  
min   -1.630840e+00 -1.685830e+00 -2.775823e+00  
25%   -8.002687e-01 -7.882611e-01 -5.9

In [9]:
# save to files

dfAbaloneStandardizedMinMax.to_parquet(abaloneStandardizedMinMaxFile)
dfAbaloneStandardized.to_parquet(abaloneStandardizedFile)