# Feature Scaling

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [2]:
data = pd.read_csv('../pca/data/pulsar_stars.csv')
data.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [7]:
# the target column is removed to perform scaling
data_pca = data[[x for x in data.columns if x != 'target_class']]

## Scale data

In [10]:
scaler = MinMaxScaler()

scaler.fit(data_pca)

scaler.data_max_

array([ 192.6171875 ,   98.77891067,    8.06952205,   68.10162173,
        223.3921405 ,  110.6422106 ,   34.53984419, 1191.000837  ])

In [14]:
data_pca.columns = [i.strip() for i in data_pca.columns]

data_pca.columns

Index(['Mean of the integrated profile',
       'Standard deviation of the integrated profile',
       'Excess kurtosis of the integrated profile',
       'Skewness of the integrated profile', 'Mean of the DM-SNR curve',
       'Standard deviation of the DM-SNR curve',
       'Excess kurtosis of the DM-SNR curve', 'Skewness of the DM-SNR curve'],
      dtype='object')

In [13]:
data_pca['Skewness of the DM-SNR curve'].max()

1191.0008369999998

In [18]:
scaler.transform(data_pca)

array([[0.72134164, 0.41768745, 0.16504291, ..., 0.11368057, 0.29498574,
        0.06388987],
       [0.51762787, 0.46090841, 0.23541516, ..., 0.0725243 , 0.36401483,
        0.10844339],
       [0.52034628, 0.19686832, 0.22113842, ..., 0.13918843, 0.28862387,
        0.05461031],
       ...,
       [0.60771193, 0.4751437 , 0.2046521 , ..., 0.49869934, 0.14965285,
        0.00550903],
       [0.58186609, 0.39361695, 0.20885482, ..., 0.05820853, 0.34892638,
        0.11418141],
       [0.27435072, 0.82458965, 0.33003783, ..., 0.5552546 , 0.04091771,
        0.00285542]])

In [26]:
scaler = MinMaxScaler()
data_rescaled = scaler.fit_transform(data_pca)

data_rescaled_manually = (data_pca - data_pca.min()) / (data_pca.max() - data_pca.min())

np.abs(np.sum(data_rescaled - data_rescaled_manually.values))

3.7772011436860726e-13

In [27]:
scaler = RobustScaler()
data_rescaled = scaler.fit_transform(data_pca)

data_rescaled_manually = (data_pca - data_pca.quantile(0.5)) / (data_pca.quantile(0.75) - data_pca.quantile(0.25))

np.abs(np.sum(data_rescaled - data_rescaled_manually.values))

0.0

In [28]:
scaler = StandardScaler()
data_rescaled = scaler.fit_transform(data_pca)

data_rescaled_manually = (data_pca - data_pca.mean()) / data_pca.std()

np.abs(np.sum(data_rescaled - data_rescaled_manually.values))

1.9289436020697437e-10