# Feature Engineering - Feature Scaling
#### 작성: 고우주 | kubwa 쿱와

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the pre-processed spaceship dataset and preview its first rows.
csv_path = '../dataset/spaceship-preprocessing.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Work on the first 500 rows only: every column except 'Transported' is a
# feature, and 'Transported' itself is the target.
first_rows = df[:500]
X = first_rows.drop(columns='Transported')
y = first_rows['Transported']

In [None]:
# Hold out 30 % of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_options = {'test_size': 0.3, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Show the shapes of both partitions.
X_train.shape, X_test.shape

## 1-Standardization
**z = (x - x_mean) /  std**

In [None]:
# Standardisation with sklearn's StandardScaler.
# The parameters are learned from the train set only (fit returns the
# fitted scaler itself).
scaler = StandardScaler().fit(X_train)

# Apply the same learned parameters to both partitions.
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [None]:
# Per-feature means that the fitted scaler learned from the train set
scaler.mean_

In [None]:
# Per-feature standard deviations that the fitted scaler learned
# from the train set
scaler.scale_

In [None]:
# The scaler returns plain NumPy arrays; wrap them in DataFrames again so
# that columns can be selected by name.
X_train_scaled, X_test_scaled = (
    pd.DataFrame(values, columns=reference.columns)
    for values, reference in ((X_train_scaled, X_train),
                              (X_test_scaled, X_test))
)

In [None]:
# Compare the distributions of three features before and after
# standardisation, side by side.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))
features = ['Age', 'Spa', 'TotalPayment']

# Left panel: raw train set.
ax1.set_title('Before Scaling')
for feature in features:
    # label each curve so the overlaid densities can be told apart
    sns.kdeplot(X_train[feature], ax=ax1, label=feature)
ax1.legend()

# Right panel: the same features after scaling.
ax2.set_title('After Standard Scaling')
for feature in features:
    sns.kdeplot(X_train_scaled[feature], ax=ax2, label=feature)
ax2.legend()

plt.show()

## 2-Mean Normalization
**x_scaled = (x - x_mean) / ( x_max - x_min)**

In [None]:
# the scalers - for mean normalisation
from sklearn.preprocessing import StandardScaler, RobustScaler

In [None]:
# Column-wise means of the train set (one value per feature).
means = X_train.mean(axis='index')
means

In [None]:
# Value range (max - min) of every feature, computed on the train set only.
feature_max = X_train.max(axis=0)
feature_min = X_train.min(axis=0)
ranges = feature_max - feature_min
ranges

In [None]:
# Manual mean normalisation: subtract the train-set mean, then divide by the
# train-set range. The test set reuses the statistics learned from train.
X_train_scaled = X_train.sub(means).div(ranges)
X_test_scaled = X_test.sub(means).div(ranges)

In [None]:
# Mean normalisation assembled from two sklearn transformers:
#   1. StandardScaler with with_std=False only subtracts the per-feature mean.
#   2. RobustScaler with centering disabled and quantile_range=(0, 100)
#      divides by the spread between the 0th and 100th percentiles.
# Both are fitted on the (uncentred) train set.
scaler_mean = StandardScaler(with_mean=True, with_std=False).fit(X_train)
scaler_minmax = RobustScaler(
    with_centering=False,
    with_scaling=True,
    quantile_range=(0, 100),
).fit(X_train)

# Centre first, then rescale, for both partitions.
X_train_scaled, X_test_scaled = (
    scaler_minmax.transform(scaler_mean.transform(part))
    for part in (X_train, X_test)
)

In [None]:
# Turn the NumPy arrays returned by the scalers back into DataFrames,
# restoring the original column names.
train_columns = X_train.columns
test_columns = X_test.columns
X_train_scaled = pd.DataFrame(data=X_train_scaled, columns=train_columns)
X_test_scaled = pd.DataFrame(data=X_test_scaled, columns=test_columns)

In [None]:
# Compare the distributions of three features before and after mean
# normalisation, side by side.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))
features = ['Age', 'Spa', 'TotalPayment']

# Left panel: raw train set.
ax1.set_title('Before Scaling')
for feature in features:
    # label each curve so the overlaid densities can be told apart
    sns.kdeplot(X_train[feature], ax=ax1, label=feature)
ax1.legend()

# Right panel: the same features after scaling. The title names the
# technique of this section (it previously said 'Standard Scaling').
ax2.set_title('After Mean Normalization')
for feature in features:
    sns.kdeplot(X_train_scaled[feature], ax=ax2, label=feature)
ax2.legend()

plt.show()

## 3-MinMaxScaling
**X_scaled = (X - X.min) / (X.max - X.min)**

In [None]:
# the scaler - for min-max scaling
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Create the min-max scaler and learn its parameters from the train set
# (fit returns the fitted scaler itself).
scaler = MinMaxScaler().fit(X_train)

# Rescale both partitions with the parameters learned from train.
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [None]:
# Per-feature maximum values that the fitted scaler saw in the train set
scaler.data_max_

In [None]:
# Per-feature minimum values that the fitted scaler saw in the train set.
# NOTE: `data_min_` holds the observed minima; `min_` is a different
# attribute (the additive offset applied by the transform), so it is not
# what we want to inspect here.
scaler.data_min_

In [None]:
# Per-feature value range (max - min) that the fitted scaler learned
# from the train set
scaler.data_range_

In [None]:
# The scaler returns plain NumPy arrays; wrap them in DataFrames again so
# that columns can be selected by name.
X_train_scaled, X_test_scaled = (
    pd.DataFrame(values, columns=reference.columns)
    for values, reference in ((X_train_scaled, X_train),
                              (X_test_scaled, X_test))
)

In [None]:
# Compare the distributions of three features before and after min-max
# scaling, side by side.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))
features = ['Age', 'Spa', 'TotalPayment']

# Left panel: raw train set.
ax1.set_title('Before Scaling')
for feature in features:
    # label each curve so the overlaid densities can be told apart
    sns.kdeplot(X_train[feature], ax=ax1, label=feature)
ax1.legend()

# Right panel: the same features after scaling. The title names the
# technique of this section (it previously said 'Standard Scaling').
ax2.set_title('After Min-Max Scaling')
for feature in features:
    sns.kdeplot(X_train_scaled[feature], ax=ax2, label=feature)
ax2.legend()

plt.show()

## 4-MaxAbsScaling
**X_scaled = X / max(|X|)**

In [None]:
# the scaler - for MaxAbsScaling, with centering
from sklearn.preprocessing import MaxAbsScaler, StandardScaler

In [None]:
# Create the max-abs scaler and learn its parameters from the train set
# (fit returns the fitted scaler itself).
scaler = MaxAbsScaler().fit(X_train)

# Rescale both partitions with the parameters learned from train.
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [None]:
# Per-feature maximum absolute values that the fitted scaler learned
# from the train set
scaler.max_abs_

In [None]:
# Turn the NumPy arrays returned by the scaler back into DataFrames,
# restoring the original column names.
train_columns = X_train.columns
test_columns = X_test.columns
X_train_scaled = pd.DataFrame(data=X_train_scaled, columns=train_columns)
X_test_scaled = pd.DataFrame(data=X_test_scaled, columns=test_columns)

In [None]:
# Compare the distributions of three features before and after max-abs
# scaling, side by side.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))
features = ['Age', 'Spa', 'TotalPayment']

# Left panel: raw train set.
ax1.set_title('Before Scaling')
for feature in features:
    # label each curve so the overlaid densities can be told apart
    sns.kdeplot(X_train[feature], ax=ax1, label=feature)
ax1.legend()

# Right panel: the same features after scaling. The title names the
# technique of this section (it previously said 'Standard Scaling').
ax2.set_title('After MaxAbs Scaling')
for feature in features:
    sns.kdeplot(X_train_scaled[feature], ax=ax2, label=feature)
ax2.legend()

plt.show()

## 5-Centering + MaxAbsScaling
- 2개의 Transform을 결합하여, 분포의 중심을 0으로 옮긴 다음 절대 최대값으로 스케일링합니다.

In [None]:
# Centering + MaxAbsScaling: first shift every feature so its train-set mean
# is 0, then divide by the largest absolute value of the centred feature.

# with_std=False makes StandardScaler subtract the mean only
scaler_mean = StandardScaler(with_mean=True, with_std=False)

# set up the MaxAbsScaler normally
scaler_maxabs = MaxAbsScaler()

# learn the per-feature means from the train set and centre it
X_train_centered = scaler_mean.fit_transform(X_train)

# Fit MaxAbsScaler on the *centred* train data. It is applied to centred
# values below, so the maximum absolute value has to be learned from centred
# values too; fitting it on the raw data would use the wrong divisor and the
# scaled train set would not be bounded to [-1, 1].
scaler_maxabs.fit(X_train_centered)

# transform train and test sets (test reuses the parameters learned on train)
X_train_scaled = scaler_maxabs.transform(X_train_centered)
X_test_scaled = scaler_maxabs.transform(scaler_mean.transform(X_test))

In [None]:
# The scalers return plain NumPy arrays; wrap them in DataFrames again so
# that columns can be selected by name.
X_train_scaled, X_test_scaled = (
    pd.DataFrame(values, columns=reference.columns)
    for values, reference in ((X_train_scaled, X_train),
                              (X_test_scaled, X_test))
)

In [None]:
# Compare the distributions of three features before and after centering
# followed by max-abs scaling, side by side.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))
features = ['Age', 'Spa', 'TotalPayment']

# Left panel: raw train set.
ax1.set_title('Before Scaling')
for feature in features:
    # label each curve so the overlaid densities can be told apart
    sns.kdeplot(X_train[feature], ax=ax1, label=feature)
ax1.legend()

# Right panel: the same features after scaling. The title names the
# technique of this section (it previously said 'Standard Scaling').
ax2.set_title('After Centering + MaxAbs Scaling')
for feature in features:
    sns.kdeplot(X_train_scaled[feature], ax=ax2, label=feature)
ax2.legend()

plt.show()

## 6-RobustScaling: quantiles and median
**X_scaled = (X - X_median) / ( X.quantile(0.75) - X.quantile(0.25) )**

In [None]:
from sklearn.preprocessing import RobustScaler

In [None]:
# Create the robust scaler and learn its parameters from the train set
# (fit returns the fitted scaler itself).
scaler = RobustScaler().fit(X_train)

# Rescale both partitions with the parameters learned from train.
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [None]:
# Per-feature medians that the fitted scaler learned from the train set
scaler.center_

In [None]:
# Per-feature inter-quartile ranges (IQR) that the fitted scaler learned
# from the train set
scaler.scale_

In [None]:
# Turn the NumPy arrays returned by the scaler back into DataFrames,
# restoring the original column names.
train_columns = X_train.columns
test_columns = X_test.columns
X_train_scaled = pd.DataFrame(data=X_train_scaled, columns=train_columns)
X_test_scaled = pd.DataFrame(data=X_test_scaled, columns=test_columns)

In [None]:
# Compare the distributions of three features before and after robust
# scaling, side by side.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 5))
features = ['Age', 'Spa', 'TotalPayment']

# Left panel: raw train set.
ax1.set_title('Before Scaling')
for feature in features:
    # label each curve so the overlaid densities can be told apart
    sns.kdeplot(X_train[feature], ax=ax1, label=feature)
ax1.legend()

# Right panel: the same features after scaling. The title names the
# technique of this section (it previously said 'Standard Scaling').
ax2.set_title('After Robust Scaling')
for feature in features:
    sns.kdeplot(X_train_scaled[feature], ax=ax2, label=feature)
ax2.legend()

plt.show()

## 7-Normalizer: L1, L2
**X_scaled_l1 = X / l1(X)**<br>
**X_scaled_l2 = X / l2(X)**

### L1 Norm 공식
- `norm='l1'`으로 지정하면 절댓값의 합인 L1 노름을 사용합니다.
- 샘플 별로 L1 노름을 계산한 다음 각 샘플의 특성을 해당 L1 노름으로 나눕니다.

$ \lVert \boldsymbol{x} \rVert_1 = \lvert x_1 \rvert + \lvert x_2 \rvert + \cdots + \lvert x_n \rvert $

### L2 Norm 공식

$ \lVert \boldsymbol{x} \rVert_2 = \sqrt{x_1^2 + x_2^2 + \cdots + x_n^2} $

- 먼저 샘플 별로 특성의 제곱을 더하기 위해 `axis=1`을 사용합니다. 
- 이 값의 제곱근을 구하면 L2 노름입니다. 
- 그 다음 각 샘플의 특성을 해당 L2 노름으로 나눕니다.

In [None]:
from sklearn.preprocessing import Normalizer

In [None]:
# Row-wise normalisation with the L1 norm (use norm='l2' for the Euclidean
# norm instead). Fitting learns nothing for this transformer; the call is
# kept so the workflow matches the other scalers.
scaler = Normalizer(norm='l1').fit(X_train)

# Normalise every observation of both partitions.
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [None]:
# L1 norm (sum of absolute values) of each observation (feature vector) in
# the original, unscaled train set, rounded to one decimal
np.round(np.linalg.norm(X_train, ord=1, axis=1), 1)

In [None]:
# Row-wise normalisation with the L2 (Euclidean) norm. Fitting learns
# nothing for this transformer; the call is kept so the workflow matches
# the other scalers.
scaler = Normalizer(norm='l2').fit(X_train)

# Normalise every observation of both partitions.
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [None]:
# Scaled data: the rows were normalised with the L2 norm, so verify with the
# L2 (Euclidean) norm of each observation. Every non-zero row should now
# have a norm of 1.0; checking with ord=1 would not show that.
np.round(np.linalg.norm(X_train_scaled, ord=2, axis=1), 1)