## Transforming Numerical Data

### Part 1: Scaling Numerical Data

#### Rescaling a numerical feature to the range [0, 1] with MinMaxScaler

In [3]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Single feature laid out as a column vector (one row per observation)
feature = np.array([[value] for value in (-500.5, -100.1, 0, 100.1, 900.9)])

# Scaler that maps the smallest observed value to 0 and the largest to 1
minmax_scale = MinMaxScaler(feature_range=(0, 1))

# Learn the min/max from the data and rescale it in a single step
scaled_feature = minmax_scale.fit_transform(feature)

# Last expression of the cell: rendered as the cell output
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

#### Standardizing feature using mean and standard deviation

In [4]:
from sklearn.preprocessing import StandardScaler, RobustScaler

# Single feature laid out as a column vector (one row per observation)
x = np.array([[-1000.1],
              [-200.2],
              [500.5],
              [600.6],
              [9000.9]])

# Create the scalers
scaler = StandardScaler()       # classical standardizer (mean and standard deviation)
robust_scaler = RobustScaler()  # alternative for features with outliers; created here but not applied in this cell

# Fit on x and standardize it
standardized_features = scaler.fit_transform(x)

# Sanity check: after standardization the mean is ~0 (up to floating-point
# error) and the standard deviation is 1.
# NOTE: a bare `standardized_features` expression used to sit here; it was
# removed because Jupyter only renders the last expression of a cell, so it
# never produced any output.
print('Mean: ', standardized_features.mean())
print('Standard deviation: ', standardized_features.std())

Mean:  4.4408920985006264e-17
Standard deviation:  1.0


#### Normalizing observations to unit norm

In [5]:
from sklearn.preprocessing import Normalizer

# Feature matrix: one observation per row, two features per observation
features = np.array([[0.5, 0.5],
                     [1.1, 3.4],
                     [1.5, 20.2],
                     [1.63, 34.4],
                     [10.9, 3.3]])

# Create normalizer (no `norm` argument, so scikit-learn's default 'l2' is used)
normalizer = Normalizer()

# Rescale every observation (row) individually
features_normalized = normalizer.fit_transform(features)

# Same thing with the Euclidean norm (l2) spelled out explicitly; because
# 'l2' is the default, this matches `features_normalized`
features_l2_norm = Normalizer(norm='l2').transform(features)

# Alternative: Manhattan norm (l1), i.e. the absolute values of each row sum to 1
features_l1_norm = Normalizer(norm='l1').transform(features)

# Jupyter only renders the last expression of a cell, so only the l1 result
# is shown; inspect the other two variables in separate cells if needed.
features_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

#### Generating polynomial and interaction features: capturing nonlinear relationships between features and target

In [6]:
from sklearn.preprocessing import PolynomialFeatures

# Feature matrix: three identical observations with two features each
features = np.array([[2, 3]] * 3)

# Build the expander. With these settings the output keeps the two original
# features and adds their product (no constant column, no squared terms).
polynomial_interaction = PolynomialFeatures(
    degree=2,               # maximum degree of the generated terms
    interaction_only=True,
    include_bias=False,
)

# Fit on the matrix and return the expanded features (rendered as the cell output)
polynomial_interaction.fit_transform(features)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

#### Transforming Features

In [8]:
from sklearn.preprocessing import FunctionTransformer
import pandas as pd

# Feature matrix: three identical observations with two features each
features = np.array([[2, 3]] * 3)

# define function
def add_ten(x):
    """Return ``x`` increased by ten.

    The addition is delegated to ``x``'s own ``+`` operator, so the function
    works for plain numbers as well as for the NumPy array and pandas objects
    it is applied to in this notebook.
    """
    offset = 10
    return x + offset

# Wrap add_ten so it can be used through the scikit-learn transformer interface
ten_transformer = FunctionTransformer(func=add_ten)

# Apply the transformer to the feature matrix
# (the result is not rendered because this is not the cell's last expression)
ten_transformer.fit_transform(features)

# Alternative: apply the same function to a pandas DataFrame
df = pd.DataFrame(data=features, columns=['feat_1', 'feat_2'])
df.apply(add_ten)

# Note: this transformer is mainly used to apply a log transform to values

Unnamed: 0,feat_1,feat_2
0,12,13
1,12,13
2,12,13


### Part 2: Breaking Up Features into Bins (i.e., Discretizing Features)

#### Binarize features according to a threshold

In [10]:
from sklearn.preprocessing import Binarizer

# Create feature: ages laid out as a column vector (one row per observation)
age = np.array([[6],
                [12],
                [20],
                [36],
                [65]])

# Create binarizer: splits the data in two (values above the threshold become 1,
# all others 0). The threshold is passed by keyword because recent scikit-learn
# releases make Binarizer's constructor arguments keyword-only, so the
# positional form `Binarizer(25)` raises a TypeError there.
binarizer = Binarizer(threshold=25)

# Transform features (rendered as the cell output)
binarizer.fit_transform(age)

array([[0],
       [0],
       [0],
       [1],
       [1]])

#### Split Data into multiple bins

In [11]:
# Bin edges. right=True makes each threshold inclusive on its upper side:
# bin i holds values with edges[i-1] < value <= edges[i], so an age of
# exactly 20 lands in bin 0.
age_bin_edges = [20, 30, 64]
np.digitize(age, bins=age_bin_edges, right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]], dtype=int64)