## Scaling


In [3]:
import pandas as pd
from sklearn.datasets import load_iris

# Snake-case labels for the four measurement columns of the iris feature matrix.
column_names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]

# Load the dataset bunch once; features and labels are both read from it.
iris = load_iris()

# Class labels as a Series, features as a labelled DataFrame.
iris_y = pd.Series(data=iris.target)
iris_x = pd.DataFrame(data=iris.data, columns=column_names)

display(iris_x)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [29]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test split. With shuffle=False the rows keep
# their original order, so the test split is the tail of the frame.
# NOTE(review): the iris rows appear to be grouped by class, so an unshuffled
# tail split would contain a single class only. That exaggerates the train/test
# distribution gap this notebook discusses -- confirm it is intentional before
# reusing this split for model evaluation.
train_x, test_x, train_y, test_y = train_test_split(
    iris_x,
    iris_y,
    test_size=0.1,
    shuffle=False,
)

## how to scale train and test data?
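* Don't fit a separate scaler on each split! The train and test distributions differ, so two independently fitted scalers would map the same raw value to different scaled values. The snippet below shows what **not** to do.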

```
train_scaler = preprocessing.StandardScaler()
train_x[column_names] = train_scaler.fit_transform(train_x[column_names])

test_scaler = preprocessing.StandardScaler()
test_x[column_names] = test_scaler.fit_transform(test_x[column_names])
display(test_x) 
```

In [37]:
from sklearn import preprocessing

# Method 1: learn the scaling statistics from the training split only, then
# reuse that single fitted scaler for both splits.
scaler = preprocessing.StandardScaler().fit(train_x[column_names])

# The two transforms are independent of each other; both overwrite the feature
# columns in place with their standardized values.
test_x[column_names] = scaler.transform(test_x[column_names])
train_x[column_names] = scaler.transform(train_x[column_names])

display(test_x)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
135,2.34921,-0.126303,1.432195,1.662477
136,0.646753,0.759459,1.14828,1.80042
137,0.768357,0.095137,1.091497,0.972758
138,0.281941,-0.126303,0.694015,0.972758
139,1.376378,0.095137,1.034714,1.386589
140,1.13317,0.095137,1.14828,1.80042
141,1.376378,0.095137,0.864364,1.662477
142,0.038733,-0.790625,0.864364,1.110702
143,1.254774,0.316578,1.318629,1.662477
144,1.13317,0.538019,1.205063,1.938364


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
135,2.249683,-0.131979,1.331133,1.448832
136,0.553333,0.788808,1.046945,1.580464
137,0.674501,0.098217,0.990108,0.790671
138,0.18983,-0.131979,0.592246,0.790671
139,1.28034,0.098217,0.933271,1.185567
140,1.038005,0.098217,1.046945,1.580464
141,1.28034,0.098217,0.762758,1.448832
142,-0.052506,-0.82257,0.762758,0.922303
143,1.159173,0.328414,1.217458,1.448832
144,1.038005,0.558611,1.103783,1.712096


In [None]:
# Method 2: learn the scaling statistics from the train and test rows stacked
# together, then apply that single fitted scaler to each split.
# NOTE(review): the previous cell overwrites train_x/test_x in place, so when the
# notebook is run top to bottom this fit sees already-standardized values rather
# than the raw measurements.
# NOTE(review): fitting on test rows lets test-set statistics influence
# preprocessing -- confirm that is acceptable for the intended use.
scaler = preprocessing.StandardScaler().fit(
    pd.concat(
        [
            train_x[column_names],
            test_x[column_names],
        ]
    )
)

# Overwrite the feature columns of both splits with their scaled values.
test_x[column_names] = scaler.transform(test_x[column_names])
train_x[column_names] = scaler.transform(train_x[column_names])

display(test_x)

## standardization
* Rescales each feature to mean = 0 and standard deviation = 1.

In [38]:
# Standardize the feature columns using statistics learned from the training
# split only.
scaler = preprocessing.StandardScaler()

# fit_transform learns the statistics from the training rows and scales them in
# one step; the test rows are then scaled with those same statistics.
train_x[column_names] = scaler.fit_transform(train_x[column_names])
test_x[column_names] = scaler.transform(test_x[column_names])

display(test_x)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
135,2.34921,-0.126303,1.432195,1.662477
136,0.646753,0.759459,1.14828,1.80042
137,0.768357,0.095137,1.091497,0.972758
138,0.281941,-0.126303,0.694015,0.972758
139,1.376378,0.095137,1.034714,1.386589
140,1.13317,0.095137,1.14828,1.80042
141,1.376378,0.095137,0.864364,1.662477
142,0.038733,-0.790625,0.864364,1.110702
143,1.254774,0.316578,1.318629,1.662477
144,1.13317,0.538019,1.205063,1.938364


## min-max scaling
* Rescales each feature linearly so that its training values span the range 0 to 1.
* Image data is a good fit for min-max scaling because pixel values have a fixed, known range (0-255).

In [39]:
from sklearn import preprocessing
# Min-max scale the feature columns; the target range is spelled out explicitly
# (it matches the scaler's default).
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Learn each column's min and max from the training rows while scaling them,
# then apply the same mapping to the test rows.
train_x[column_names] = scaler.fit_transform(train_x[column_names])
test_x[column_names] = scaler.transform(test_x[column_names])

display(test_x)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
135,0.944444,0.416667,0.864407,0.916667
136,0.555556,0.583333,0.779661,0.958333
137,0.583333,0.458333,0.762712,0.708333
138,0.472222,0.416667,0.644068,0.708333
139,0.722222,0.458333,0.745763,0.833333
140,0.666667,0.458333,0.779661,0.958333
141,0.722222,0.458333,0.694915,0.916667
142,0.416667,0.291667,0.694915,0.75
143,0.694444,0.5,0.830508,0.916667
144,0.666667,0.541667,0.79661,1.0


## log scaling
* Monetary amounts are a good use case for log scaling, since such values tend to be heavily right-skewed.