# Feature scaling - Normalization and Standardization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Toy dataset: two numeric features on very different scales
# (ages in the tens, salaries in the tens of thousands).
data = {
    "Age": [25, 40, 65, 30, 55],
    "Salary": [50000, 80000, 150000, 60000, 120000],
}
df = pd.DataFrame(data)
df


Unnamed: 0,Age,Salary
0,25,50000
1,40,80000
2,65,150000
3,30,60000
4,55,120000


# Normalization
- Calculate the min and max value of each column
- norm = (value - min) / (max - min)

In [5]:
# Extremes of the Age column; these are the two constants the min-max formula needs.
age_min, age_max = df["Age"].min(), df["Age"].max()
age_max, age_min

(np.int64(65), np.int64(25))

In [6]:
# Normalize Age: shift by the minimum, then divide by the range (max - min),
# which maps the smallest age to 0 and the largest to 1.
df["Age_norm"] = df["Age"].sub(age_min).div(age_max - age_min)
df

Unnamed: 0,Age,Salary,Age_norm
0,25,50000,0.0
1,40,80000,0.375
2,65,150000,1.0
3,30,60000,0.125
4,55,120000,0.75


In [7]:
# Normalize Salary with the same min-max recipe that was applied to Age.
salary_min, salary_max = df["Salary"].min(), df["Salary"].max()
df["Salary_norm"] = df["Salary"].sub(salary_min).div(salary_max - salary_min)
df

Unnamed: 0,Age,Salary,Age_norm,Salary_norm
0,25,50000,0.0,0.0
1,40,80000,0.375,0.3
2,65,150000,1.0,1.0
3,30,60000,0.125,0.1
4,55,120000,0.75,0.7


In [9]:
from sklearn.preprocessing import MinMaxScaler

# Cross-check the hand-written formula against scikit-learn: fitting learns each
# column's min and max, transforming rescales every column to the [0, 1] range.
scaler = MinMaxScaler().fit(df[["Age", "Salary"]])
df[["Age_norm_sk", "Salary_norm_sk"]] = scaler.transform(df[["Age", "Salary"]])
df


Unnamed: 0,Age,Salary,Age_norm,Salary_norm,Age_norm_sk,Salary_norm_sk
0,25,50000,0.0,0.0,0.0,0.0
1,40,80000,0.375,0.3,0.375,0.3
2,65,150000,1.0,1.0,1.0,1.0
3,30,60000,0.125,0.1,0.125,0.1
4,55,120000,0.75,0.7,0.75,0.7


# Standardization
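standardized = (value - mean) / S.D

After standardization:
mean = 0,
S.D = 1

Note: pandas `.std()` uses the sample S.D (`ddof=1`) by default, while scikit-learn's `StandardScaler` uses the population S.D (`ddof=0`), so the two approaches give slightly different values on small datasets.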

In [10]:
# Rebuild the frame from the raw values so this section starts with only the
# original Age and Salary columns.
data = {
    "Age": [25, 40, 65, 30, 55],
    "Salary": [50000, 80000, 150000, 60000, 120000],
}
df = pd.DataFrame(data)
df

Unnamed: 0,Age,Salary
0,25,50000
1,40,80000
2,65,150000
3,30,60000
4,55,120000


In [11]:
# Mean and standard deviation of Age. Note that pandas' Series.std() defaults
# to the sample standard deviation (ddof=1).
age_mean, age_std = df["Age"].mean(), df["Age"].std()
age_mean, age_std

(np.float64(43.0), np.float64(16.80773631397161))

In [12]:
# z-score for Age: centre on the mean, then scale by the standard deviation.
df["Age_std"] = df["Age"].sub(age_mean).div(age_std)
df

Unnamed: 0,Age,Salary,Age_std
0,25,50000,-1.070935
1,40,80000,-0.178489
2,65,150000,1.308921
3,30,60000,-0.773453
4,55,120000,0.713957


In [13]:
# Sanity check: the raw column keeps its original mean, while the standardized
# column should have a mean of (approximately) zero.
df["Age"].mean(), df["Age_std"].mean()

(np.float64(43.0), np.float64(0.0))

In [14]:
# Standardize Salary the same way as Age: (value - mean) / standard deviation,
# using pandas' default sample standard deviation (ddof=1).
salary_mean, salary_std = df["Salary"].mean(), df["Salary"].std()
df["Salary_std"] = df["Salary"].sub(salary_mean).div(salary_std)
df

Unnamed: 0,Age,Salary,Age_std,Salary_std
0,25,50000,-1.070935,-0.998304
1,40,80000,-0.178489,-0.28523
2,65,150000,1.308921,1.37861
3,30,60000,-0.773453,-0.760612
4,55,120000,0.713957,0.665536


In [15]:
# Sanity check: the standardized mean is zero up to floating-point round-off
# (a value on the order of 1e-17 is numerically zero).
df["Salary"].mean(), df["Salary_std"].mean()

(np.float64(92000.0), np.float64(-2.2204460492503132e-17))

In [16]:
from sklearn.preprocessing import StandardScaler

# Standardize both columns with scikit-learn. StandardScaler divides by the
# population standard deviation (ddof=0), whereas the manual columns used
# pandas' default sample standard deviation (ddof=1); the *_sk values are
# therefore larger in magnitude by a factor of sqrt(n / (n - 1)), which is
# about 1.118 for these 5 rows.
scaler = StandardScaler().fit(df[["Age", "Salary"]])
df[["Age_std_sk", "Salary_std_sk"]] = scaler.transform(df[["Age", "Salary"]])
df

Unnamed: 0,Age,Salary,Age_std,Salary_std,Age_std_sk,Salary_std_sk
0,25,50000,-1.070935,-0.998304,-1.197342,-1.116137
1,40,80000,-0.178489,-0.28523,-0.199557,-0.318896
2,65,150000,1.308921,1.37861,1.463418,1.541333
3,30,60000,-0.773453,-0.760612,-0.864747,-0.85039
4,55,120000,0.713957,0.665536,0.798228,0.744092


In [17]:
# Sanity check on the scikit-learn column: its mean should also be
# (approximately) zero, while the raw Salary mean is unchanged.
df["Salary"].mean(), df["Salary_std_sk"].mean()

(np.float64(92000.0), np.float64(0.0))