In [11]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [9]:
# The file is read with header=None, so pandas would label the columns 0..13;
# they are replaced with descriptive names ('Label' first, then the features).
column_names = [
    'Label',
    'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium',
    'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
    'Color intensity', 'Hue', 'OD280/OD315', 'Proline',
]
df = pd.read_csv('../data/wine/wine.data', header=None)
# Plain assignment raises if the number of names does not match the columns.
df.columns = column_names

In [10]:
# Preview the first five rows to check that the names line up with the data.
df.head(n=5)

Unnamed: 0,Label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [27]:
# Column 0 is the class label ('Label'); every other column is a feature.
y = df.iloc[:, 0].values
X = df.iloc[:, 1:].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

## Нормализация

$$x^{(i)}_{norm} = \frac{x^{(i)}-x_{min}}{x_{max}-x_{min}}$$

In [20]:
from sklearn.preprocessing import MinMaxScaler

In [28]:
# Fit the scaler on the training split only, then apply that same scaling to
# both splits so that no information from the test split reaches the scaler.
mms = MinMaxScaler().fit(X_train)
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)

In [47]:
# Per-feature mean and standard deviation (ddof=0) of the min-max scaled training data.
pd.DataFrame(
    np.vstack((X_train_norm.mean(axis=0), X_train_norm.std(axis=0))),
    index=['Mean', 'Std'],
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
Mean,0.525017,0.313805,0.539325,0.460093,0.327051,0.416686,0.458568,0.431297,0.377373,0.355293,0.355431,0.487888,0.352037
Std,0.214543,0.237837,0.14799,0.183768,0.158696,0.215026,0.297584,0.237949,0.188552,0.22367,0.20051,0.263494,0.249077


In [48]:
# First training sample after min-max scaling.
X_train_norm[0, :]

array([ 0.72043011,  0.20378151,  0.53763441,  0.30927835,  0.33695652,
        0.54316547,  0.73700306,  0.25      ,  0.40189873,  0.24068768,
        0.48717949,  1.        ,  0.5854251 ])

## Стандартизация
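Часто является более предпочтительной, т.к. менее чувствительна к выбросам, чем min-max нормализация, и приводит каждый признак к нулевому среднему и единичному стандартному отклонению. Форму распределения она при этом не меняет (нормальным оно не становится).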

$$x^{(i)}_{std} = \frac{x^{(i)}-\mu_x}{\sigma_x}$$
где $\mu_x$ - эмпирическое среднее отдельно взятого столбца, <br/>
а $\sigma_x$ - соответствующее стандартное отклонение

In [29]:
from sklearn.preprocessing import StandardScaler

In [30]:
# Fit the scaler on the training split only, then apply that same transform to
# both splits so that no information from the test split reaches the scaler.
ss = StandardScaler().fit(X_train)
X_train_std = ss.transform(X_train)
X_test_std = ss.transform(X_test)

In [49]:
# First training sample after standardization.
X_train_std[0, :]

array([ 0.91083058, -0.46259897, -0.01142613, -0.82067872,  0.06241693,
        0.58820446,  0.93565436, -0.7619138 ,  0.13007174, -0.51238741,
        0.65706596,  1.94354495,  0.93700997])

In [63]:
# Show floats with one decimal place (and a thousands separator) in all later
# DataFrame output, which hides floating-point noise around zero.
# To restore the default: pd.reset_option('display.float_format')
pd.set_option('display.float_format', '{:,.1f}'.format)

In [64]:
# Per-feature mean and standard deviation (ddof=0) of the standardized training
# data; by the formula above they should be approximately 0 and 1.
pd.DataFrame(
    np.vstack((X_train_std.mean(axis=0), X_train_std.std(axis=0))),
    index=['Mean', 'Std'],
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
Mean,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0
Std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
