# Preprocessing Dataset & Compare with Sklearn Methods

In [8]:
import numpy as np
import pandas as pd
from sklearn import datasets
from numpy import set_printoptions

### Import Dataset from sklearn

In [19]:
dataset = datasets.load_diabetes()
df = pd.DataFrame(data= dataset['data'])
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-3.634285e-16,1.308343e-16,-8.045349e-16,1.281655e-16,-8.835316000000001e-17,1.327024e-16,-4.574646e-16,3.777301e-16,-3.830854e-16,-3.412882e-16
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123996,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260974,-0.1377672
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665645,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324879,-0.03317903
50%,0.00538306,-0.04464164,-0.007283766,-0.005670611,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947634,-0.001077698
75%,0.03807591,0.05068012,0.03124802,0.03564384,0.02835801,0.02984439,0.0293115,0.03430886,0.03243323,0.02791705
max,0.1107267,0.05068012,0.1705552,0.1320442,0.1539137,0.198788,0.1811791,0.1852344,0.133599,0.1356118


In [20]:
X = dataset.data[:, 0:9]

### Standardization 

In [22]:
from sklearn.preprocessing import StandardScaler

def standardize_X(X):
    X_mean = X.mean(axis=0)
    X_std = X.std(axis=0)
    X_scaled = (X - X_mean) / X_std
    return X_scaled

scaler = StandardScaler().fit(X)
rescaledX_sk = scaler.transform(X)

set_printoptions(precision=3)
print(rescaledX_sk[0:5, :])

rescaledX = standardize_X(X)

set_printoptions(precision=3)
print(rescaledX[0:5, :])

[[ 0.801  1.065  1.297  0.46  -0.93  -0.732 -0.912 -0.054  0.419]
 [-0.04  -0.939 -1.082 -0.554 -0.178 -0.403  1.564 -0.83  -1.437]
 [ 1.793  1.065  0.935 -0.119 -0.959 -0.719 -0.68  -0.054  0.06 ]
 [-1.872 -0.939 -0.244 -0.771  0.256  0.525 -0.758  0.721  0.477]
 [ 0.113 -0.939 -0.765  0.46   0.083  0.328  0.171 -0.054 -0.673]]
[[ 0.801  1.065  1.297  0.46  -0.93  -0.732 -0.912 -0.054  0.419]
 [-0.04  -0.939 -1.082 -0.554 -0.178 -0.403  1.564 -0.83  -1.437]
 [ 1.793  1.065  0.935 -0.119 -0.959 -0.719 -0.68  -0.054  0.06 ]
 [-1.872 -0.939 -0.244 -0.771  0.256  0.525 -0.758  0.721  0.477]
 [ 0.113 -0.939 -0.765  0.46   0.083  0.328  0.171 -0.054 -0.673]]


### Sample Normalization

In [24]:
from sklearn.preprocessing import Normalizer

L1 Norm

In [25]:
scaler = Normalizer(norm='l1').fit(X)
X_l1_sk = scaler.transform(X)

set_printoptions(precision=3)
print(X_l1_sk[0:5, :])

norms = np.abs(X).sum(axis=1)

X_l1_norm = X / norms[:, np.newaxis]
print(X_l1_norm[0:5, :])

[[ 0.12   0.16   0.194  0.069 -0.139 -0.11  -0.137 -0.008  0.063]
 [-0.006 -0.134 -0.154 -0.079 -0.025 -0.057  0.223 -0.118 -0.204]
 [ 0.281  0.167  0.146 -0.019 -0.15  -0.113 -0.107 -0.009  0.009]
 [-0.285 -0.143 -0.037 -0.117  0.039  0.08  -0.115  0.11   0.073]
 [ 0.032 -0.262 -0.213  0.128  0.023  0.091  0.048 -0.015 -0.188]]
[[ 0.12   0.16   0.194  0.069 -0.139 -0.11  -0.137 -0.008  0.063]
 [-0.006 -0.134 -0.154 -0.079 -0.025 -0.057  0.223 -0.118 -0.204]
 [ 0.281  0.167  0.146 -0.019 -0.15  -0.113 -0.107 -0.009  0.009]
 [-0.285 -0.143 -0.037 -0.117  0.039  0.08  -0.115  0.11   0.073]
 [ 0.032 -0.262 -0.213  0.128  0.023  0.091  0.048 -0.015 -0.188]]


L2 Norm

In [26]:
scaler = Normalizer(norm='l2').fit(X)
X_l2_sk = scaler.transform(X)

set_printoptions(precision=3)
print(X_l2_sk[0:5, :])

norms = np.einsum('ij, ij->i', X, X)
np.sqrt(norms, norms)

X_l2_norm = X / norms[:, np.newaxis]
set_printoptions(precision=3)
print(X_l2_norm[0:5, :])

[[ 0.325  0.432  0.526  0.186 -0.377 -0.297 -0.37  -0.022  0.17 ]
 [-0.014 -0.337 -0.389 -0.199 -0.064 -0.145  0.562 -0.298 -0.516]
 [ 0.671  0.399  0.35  -0.045 -0.359 -0.269 -0.255 -0.02   0.023]
 [-0.723 -0.362 -0.094 -0.298  0.099  0.203 -0.293  0.279  0.184]
 [ 0.075 -0.62  -0.506  0.304  0.055  0.217  0.113 -0.036 -0.445]]
[[ 0.325  0.432  0.526  0.186 -0.377 -0.297 -0.37  -0.022  0.17 ]
 [-0.014 -0.337 -0.389 -0.199 -0.064 -0.145  0.562 -0.298 -0.516]
 [ 0.671  0.399  0.35  -0.045 -0.359 -0.269 -0.255 -0.02   0.023]
 [-0.723 -0.362 -0.094 -0.298  0.099  0.203 -0.293  0.279  0.184]
 [ 0.075 -0.62  -0.506  0.304  0.055  0.217  0.113 -0.036 -0.445]]


### Feature Binarization

In [34]:
from sklearn.preprocessing import Binarizer

X_bin = dataset.data[:, 0:1]
binarizer = Binarizer(threshold=0.0).fit(X_bin)
binaryX = binarizer.transform(X_bin)
set_printoptions(precision=0)
print(binaryX[0:5, :])

threshold = 0.0
cond = X_bin > threshold
not_cond = np.logical_not(cond)
X_binarized = X_bin
X_binarized[cond] = 1
X_binarized[not_cond] = 0
set_printoptions(precision=0)
print(X_binarized[0:5, :])

[[ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]]
[[ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]]


### Feature MinMax

In [37]:
from sklearn.preprocessing import MinMaxScaler

def MinMax(X, MIN, MAX):
    X_min = np.min(X, axis=0)
    X_max = np.max(X, axis=0)
    X_std =  (X - X_min) / (X_max - X_min)   
    X_scaled = X_std * (MAX - MIN) + MIN
    return X_scaled

scaler = MinMaxScaler(feature_range=(0,1))
rescaledX_sk = scaler.fit_transform(X)

set_printoptions(precision=3)
print(rescaledX_sk[0:5, :])

rescaledX = MinMax(X, 0.0, 1.0)
set_printoptions(precision=3)
print(rescaledX[0:5, :])

[[ 1.     1.     0.583  0.549  0.294  0.257  0.208  0.     0.562]
 [ 0.     0.     0.149  0.352  0.422  0.307  0.623  0.     0.222]
 [ 1.     1.     0.517  0.437  0.289  0.259  0.247  0.     0.497]
 [ 0.     0.     0.302  0.31   0.495  0.447  0.234  1.     0.573]
 [ 1.     0.     0.207  0.549  0.466  0.417  0.39   0.     0.362]]
[[ 1.     1.     0.583  0.549  0.294  0.257  0.208  0.     0.562]
 [ 0.     0.     0.149  0.352  0.422  0.307  0.623  0.     0.222]
 [ 1.     1.     0.517  0.437  0.289  0.259  0.247  0.     0.497]
 [ 0.     0.     0.302  0.31   0.495  0.447  0.234  1.     0.573]
 [ 1.     0.     0.207  0.549  0.466  0.417  0.39   0.     0.362]]
