In [1]:
# Feature scaling walkthrough: bring all features onto comparable scales.

# Load the UCI wine dataset. The file is read with header=None (no header
# row is parsed), so the column names are supplied explicitly; the first
# column is named 'label'.
import numpy as np
import pandas as pd
df_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
    names=[
        'label', 'alcohol', 'malic acid', 'ash', 'alcalinity of ash',
        'magnesium', 'total phenols', 'flavanoids', 'nonflavanoid phenols',
        'proanthocyanins', 'color intensity', 'hue',
        'OD280/OD315 of diluted wines', 'proline',
    ],
)
# Show the distinct label values, then preview the first rows.
print(np.unique(df_wine['label'].values), '\n')
print(df_wine.head())

[1 2 3] 

   label  alcohol  malic acid   ash  alcalinity of ash  magnesium  \
0      1    14.23        1.71  2.43               15.6        127   
1      1    13.20        1.78  2.14               11.2        100   
2      1    13.16        2.36  2.67               18.6        101   
3      1    14.37        1.95  2.50               16.8        113   
4      1    13.24        2.59  2.87               21.0        118   

   total phenols  flavanoids  nonflavanoid phenols  proanthocyanins  \
0           2.80        3.06                  0.28             2.29   
1           2.65        2.76                  0.26             1.28   
2           2.80        3.24                  0.30             2.81   
3           3.85        3.49                  0.24             2.18   
4           2.80        2.69                  0.39             1.82   

   color intensity   hue  OD280/OD315 of diluted wines  proline  
0             5.64  1.04                          3.92     1065  
1             4.

In [2]:
# Split the data into training and test sets.
# Column 0 is used as the target; all remaining columns are the features.
from sklearn.model_selection import train_test_split
X = df_wine.iloc[:, 1:].values
y = df_wine.iloc[:, 0].values
# Hold out 30% of the rows for testing. stratify=y splits in a stratified
# fashion using y as the class labels; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)
# Preview the first rows of the training features and training labels.
print(pd.DataFrame(X_train).head())
print(pd.DataFrame(y_train).head())

      0     1     2     3      4     5     6     7     8    9     10    11  \
0  13.62  4.95  2.35  20.0   92.0  2.00  0.80  0.47  1.02  4.4  0.91  2.05   
1  13.76  1.53  2.70  19.5  132.0  2.95  2.74  0.50  1.35  5.4  1.25  3.00   
2  13.73  1.50  2.70  22.5  101.0  3.00  3.25  0.29  2.38  5.7  1.19  2.71   
3  13.51  1.80  2.65  19.0  110.0  2.35  2.53  0.29  1.54  4.2  1.10  2.87   
4  12.60  2.46  2.20  18.5   94.0  1.62  0.66  0.63  0.94  7.1  0.73  1.58   

       12  
0   550.0  
1  1235.0  
2  1285.0  
3  1095.0  
4   695.0  
   0
0  3
1  1
2  1
3  1
4  3


In [3]:
# Fit the scaler on the training data only, then reuse the fitted
# scaler to transform the test set (and any other data).

# Min-max scaler (normalization)
# ------------------------------
# Rescales each feature so that the training data spans 0.0 to 1.0.
# Test values outside the training min/max land outside that range.
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler().fit(X_train)
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)
print(pd.DataFrame(X_train_norm).head())

         0         1         2         3         4         5         6     7   \
0  0.646199  0.832016  0.424837  0.462366  0.271605  0.351724  0.097046  0.68   
1  0.687135  0.156126  0.653595  0.435484  0.765432  0.679310  0.506329  0.74   
2  0.678363  0.150198  0.653595  0.596774  0.382716  0.696552  0.613924  0.32   
3  0.614035  0.209486  0.620915  0.408602  0.493827  0.472414  0.462025  0.32   
4  0.347953  0.339921  0.326797  0.381720  0.296296  0.220690  0.067511  1.00   

         8         9         10        11        12  
0  0.189873  0.236234  0.457447  0.285714  0.194009  
1  0.294304  0.325044  0.819149  0.633700  0.682596  
2  0.620253  0.351687  0.755319  0.527473  0.718260  
3  0.354430  0.218472  0.659574  0.586081  0.582739  
4  0.164557  0.476021  0.265957  0.113553  0.297432  


In [4]:
# Standard scaler (standardization)
# ---------------------------------
# Centers each feature to zero mean and scales it to unit variance, using
# the mean and standard deviation learned from the training data.
# The result is NOT confined to [-1.0, 1.0]: standardized values are
# unbounded, and magnitudes above 1 (or 2) are normal.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler().fit(X_train)      # learn mean / std from training data only
X_train_std = ss.transform(X_train)
X_test_std = ss.transform(X_test)       # reuse the training statistics
print(pd.DataFrame(X_train_std).head())

         0         1         2         3         4         5         6   \
0  0.712259  2.220487 -0.130259  0.059629 -0.504327 -0.528316 -1.240000   
1  0.882292 -0.704572  1.175336 -0.090655  2.341479  1.016759  0.662995   
2  0.845856 -0.730230  1.175336  0.811048  0.135979  1.098079  1.163267   
3  0.578661 -0.473646  0.988823 -0.240939  0.776285  0.040922  0.457000   
4 -0.526554  0.090839 -0.689799 -0.391223 -0.362037 -1.146346 -1.377330   

         7         8         9         10        11        12  
0  0.841180 -1.052151 -0.292189 -0.200170 -0.821641 -0.629464  
1  1.088743 -0.492935  0.131521  1.339826  0.549313  1.475688  
2 -0.644195  1.252496  0.258634  1.068062  0.130811  1.629349  
3 -0.644195 -0.170963 -0.376931  0.660416  0.361708  1.045438  
4  2.161513 -1.187719  0.851827 -1.015462 -1.499903 -0.183848  


In [5]:
# Manually standardize and normalize
# ----------------------------------
# standardize: subtract the mean, divide by the standard deviation
#              (numpy's default std is the population form, ddof=0)
# normalize:   subtract the minimum, divide by the range (max - min)
ex = np.arange(6)
print('standardized: \n', pd.DataFrame((ex - ex.mean()) / ex.std()))
print('normalize: \n', pd.DataFrame((ex - np.min(ex)) / np.ptp(ex)))

standardized: 
          0
0 -1.46385
1 -0.87831
2 -0.29277
3  0.29277
4  0.87831
5  1.46385
normalize: 
      0
0  0.0
1  0.2
2  0.4
3  0.6
4  0.8
5  1.0


In [6]:
# Robust scaler
# -------------
# With its default settings the robust scaler removes each feature's
# median and scales by its interquartile range, so outliers affect the
# scaling far less than with mean/std or min/max based scalers.
# A reasonable choice when the data contain many abnormal values.
from sklearn.preprocessing import RobustScaler
rs = RobustScaler().fit(X_train)
X_train_rs = rs.transform(X_train)
X_test_rs = rs.transform(X_test)
print(pd.DataFrame(X_train_rs).head())

         0         1         2         3         4         5         6   \
0  0.416510  2.003226 -0.057971  0.119048 -0.301370 -0.396135 -0.821053   
1  0.521576 -0.203226  0.956522  0.000000  1.890411  0.521739  0.345865   
2  0.499062 -0.222581  0.956522  0.714286  0.191781  0.570048  0.652632   
3  0.333959 -0.029032  0.811594 -0.119048  0.684932 -0.057971  0.219549   
4 -0.348968  0.396774 -0.492754 -0.238095 -0.191781 -0.763285 -0.905263   

         7         8         9         10        11        12  
0  0.753623 -0.804196 -0.116667 -0.166667 -0.652452 -0.230949  
1  0.927536 -0.342657  0.216667  0.863636  0.157783  1.050023  
2 -0.289855  1.097902  0.316667  0.681818 -0.089552  1.143525  
3 -0.289855 -0.076923 -0.183333  0.409091  0.046908  0.788219  
4  1.681159 -0.916084  0.783333 -0.712121 -1.053305  0.040206  
