# Feature Scaling

* StandardScaler
* RobustScaler
* MinMaxScaler

# Import Necessary Libraries

In [3]:
# Data-handling libraries, the example-dataset loader (seaborn) and the three
# scikit-learn scalers demonstrated in this notebook.
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
# Display options: never truncate columns or rows, allow wide output lines,
# and render every float with three decimal places.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", 500)
pd.set_option("display.float_format", lambda x: "%.3f" % x)
import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation notices from the libraries above).
warnings.filterwarnings("ignore")

# Import Dataset

In [2]:
# Load seaborn's "titanic" example dataset and work on a copy, so the
# originally loaded frame stays untouched.
titanic = sns.load_dataset("titanic")
df = titanic.copy()
# Preview the first rows.
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.283,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# StandardScaler

(Xi - Xmean) / Xstd

In [59]:
# Example of Formula: standardize the age in the row labelled 0 by hand.
# NOTE: Series.std() defaults to the sample standard deviation (ddof=1), while
# StandardScaler divides by the population standard deviation (ddof=0), so this
# hand-computed value differs slightly from the scaler's own result.
(df["age"][0] - df["age"].mean()) / (df["age"].std())

-0.5300050983330725

In [27]:
def standard_scaler(dataframe, num_col):
    """Return a copy of ``dataframe`` with one column standardized.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame; it is copied, not modified.
    num_col : column label
        Name of the column to pass through ``StandardScaler``.

    Returns
    -------
    pandas.DataFrame
        The copy, with ``num_col`` replaced by its scaled values.
    """
    scaled_df = dataframe.copy()
    # Double brackets keep the selection two-dimensional for the scaler.
    column_values = scaled_df[[num_col]]
    scaled_df[num_col] = StandardScaler().fit_transform(column_values)
    return scaled_df

In [28]:
# Standardize only the "age" column; df itself is left unchanged.
new_df = standard_scaler(df, "age")

In [29]:
# Preview the first rows to inspect the standardized "age" values.
new_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,-0.53,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,0.572,1,0,71.283,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,-0.255,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,0.365,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,0.365,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [30]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numeric and cardinal groups.

    Column kinds are detected with ``pandas.api.types`` predicates rather than
    a fixed list of dtype names, so numeric columns of any width (``int32``,
    ``float32``, nullable integers, ...) are classified instead of being
    silently left out of every returned list.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified. It is not modified.
    cat_th : int, default 10
        A numeric column with fewer than ``cat_th`` distinct values is
        treated as categorical.
    car_th : int, default 20
        A category/object/string column with more than ``car_th`` distinct
        values is treated as cardinal and excluded from ``cat_cols``.

    Returns
    -------
    cat_cols : list
        Category/object/string/bool columns plus ``num_but_cat``, without
        the cardinal columns.
    num_cols : list
        Numeric (non-bool) columns that are not in ``cat_cols``.
    cat_but_car : list
        Category/object/string columns with more than ``car_th`` distinct values.
    num_but_cat : list
        Numeric (non-bool) columns with fewer than ``cat_th`` distinct values.
    """
    types = pd.api.types

    def _is_text_like(dtype):
        # Categorical, object or string columns.
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
        )

    def _is_numeric(dtype):
        # pandas reports bool as numeric; here bool columns are categorical.
        return types.is_numeric_dtype(dtype) and not types.is_bool_dtype(dtype)

    columns = list(dataframe.columns)
    dtype_of = {col: dataframe[col].dtype for col in columns}
    # Count distinct values once per column instead of once per comprehension.
    n_unique = {col: dataframe[col].nunique() for col in columns}

    numeric = [col for col in columns if _is_numeric(dtype_of[col])]
    num_but_cat = [col for col in numeric if n_unique[col] < cat_th]
    cat_but_car = [
        col for col in columns
        if _is_text_like(dtype_of[col]) and n_unique[col] > car_th
    ]

    cat_cols = [
        col for col in columns
        if _is_text_like(dtype_of[col]) or types.is_bool_dtype(dtype_of[col])
    ]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    num_cols = [col for col in numeric if col not in cat_cols]

    return cat_cols, num_cols, cat_but_car, num_but_cat

In [31]:
# Classify the Titanic columns and print how many fall into each group.
cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(df)
# df.shape is (rows, columns).
print(f"observations: {df.shape[0]}")
print(f"variables: {df.shape[1]}")
print(f"cat_cols: {len(cat_cols)}")
print(f"num_cols: {len(num_cols)}")
print(f"cat_but_car: {len(cat_but_car)}")
# num_but_cat columns are already counted inside cat_cols.
print(f"num_but_cat: {len(num_but_cat)}")

observations: 891
variables: 15
cat_cols: 13
num_cols: 2
cat_but_car: 0
num_but_cat: 4


In [32]:
def standard_scaler_dataframe(dataframe):
    """Return a copy of ``dataframe`` with every numeric column standardized.

    The numeric columns are the ``num_cols`` list reported by
    ``grab_col_names``; each one is fitted and transformed on its own with
    ``StandardScaler``. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame.

    Returns
    -------
    pandas.DataFrame
        The copy, with the numeric columns replaced by their scaled values.
    """
    _, numeric_columns, _, _ = grab_col_names(dataframe)
    scaler = StandardScaler()
    scaled_df = dataframe.copy()
    for column in numeric_columns:
        # Refit per column so each column gets its own statistics.
        scaled_df[column] = scaler.fit_transform(scaled_df[[column]])
    return scaled_df

In [33]:
# Standardize every column that grab_col_names reports as numeric.
new_df = standard_scaler_dataframe(df)

In [34]:
# Preview the first rows to inspect the standardized numeric columns.
new_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,-0.53,1,0,-0.502,S,Third,man,True,,Southampton,no,False
1,1,1,female,0.572,1,0,0.787,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,-0.255,0,0,-0.489,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,0.365,1,0,0.421,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,0.365,0,0,-0.486,S,Third,man,True,,Southampton,no,True


# RobustScaler

(Xi - Xmedian) / Xiqr

Xiqr = X75 - X25

In [65]:
# Example of Formula: subtract the median from the age in the row labelled 0
# and divide by the interquartile range (75th minus 25th percentile).
(df["age"][0] - df["age"].median()) / (df["age"].quantile(0.75) - df["age"].quantile(0.25))

-0.3356643356643357

In [47]:
def robust_scaler(dataframe, num_col):
    """Return a copy of ``dataframe`` with one column scaled by ``RobustScaler``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame; it is copied, not modified.
    num_col : column label
        Name of the column to scale.

    Returns
    -------
    pandas.DataFrame
        The copy, with ``num_col`` replaced by its scaled values.
    """
    result = dataframe.copy()
    scaler = RobustScaler()
    # Double brackets keep the selection two-dimensional for the scaler.
    scaled_values = scaler.fit_transform(result[[num_col]])
    result[num_col] = scaled_values
    return result

In [48]:
# Apply RobustScaler to the "age" column only; df itself is left unchanged.
new_df = robust_scaler(df, "age")

In [49]:
# Preview the first rows to inspect the robust-scaled "age" values.
new_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,-0.336,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,0.559,1,0,71.283,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,-0.112,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,0.392,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,0.392,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [53]:
def robust_scaler_dataframe(dataframe):
    """Return a copy of ``dataframe`` with every numeric column robust-scaled.

    The numeric columns are the ``num_cols`` list reported by
    ``grab_col_names``; each one is fitted and transformed on its own with
    ``RobustScaler``. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame.

    Returns
    -------
    pandas.DataFrame
        The copy, with the numeric columns replaced by their scaled values.
    """
    numeric_columns = grab_col_names(dataframe)[1]
    scaler = RobustScaler()
    result = dataframe.copy()
    for name in numeric_columns:
        # Refit per column so each column gets its own median and IQR.
        result[name] = scaler.fit_transform(result[[name]])
    return result

In [54]:
# Apply RobustScaler to every column that grab_col_names reports as numeric.
new_df = robust_scaler_dataframe(df)

In [52]:
# Preview the first rows to inspect the robust-scaled numeric columns.
new_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,-0.336,1,0,-0.312,S,Third,man,True,,Southampton,no,False
1,1,1,female,0.559,1,0,2.461,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,-0.112,0,0,-0.283,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,0.392,1,0,1.674,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,0.392,0,0,-0.277,S,Third,man,True,,Southampton,no,True


# MinMaxScaler

y = (Xi - Xmin) / (Xmax - Xmin)

In [69]:
# Example of Formula: rescale the age in the row labelled 0 by hand, using the
# column's minimum and its range (max - min).
(df["age"][0] - df["age"].min()) / (df["age"].max() - df["age"].min())

0.2711736617240513

In [74]:
def minmax_scaler(dataframe, num_col):
    """Return a copy of ``dataframe`` with one column scaled by ``MinMaxScaler``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame; it is copied, not modified.
    num_col : column label
        Name of the column to scale.

    Returns
    -------
    pandas.DataFrame
        The copy, with ``num_col`` replaced by its scaled values.
    """
    output_df = dataframe.copy()
    # Double brackets keep the selection two-dimensional for the scaler.
    single_column = output_df[[num_col]]
    scaler = MinMaxScaler()
    output_df[num_col] = scaler.fit_transform(single_column)
    return output_df

In [75]:
# Apply MinMaxScaler to the "age" column only; df itself is left unchanged.
new_df = minmax_scaler(df, "age")

In [76]:
# Preview the first rows to inspect the min-max-scaled "age" values.
new_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,0.271,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,0.472,1,0,71.283,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,0.321,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,0.435,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,0.435,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [77]:
def minmax_scaler_dataframe(dataframe):
    """Return a copy of ``dataframe`` with every numeric column min-max scaled.

    The numeric columns are the ``num_cols`` list reported by
    ``grab_col_names``; each one is fitted and transformed on its own with
    ``MinMaxScaler``. The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame.

    Returns
    -------
    pandas.DataFrame
        The copy, with the numeric columns replaced by their scaled values.
    """
    column_groups = grab_col_names(dataframe)
    numeric_columns = column_groups[1]
    output_df = dataframe.copy()
    scaler = MinMaxScaler()
    for label in numeric_columns:
        # Refit per column so each column gets its own minimum and maximum.
        output_df[label] = scaler.fit_transform(output_df[[label]])
    return output_df

In [78]:
# Apply MinMaxScaler to every column that grab_col_names reports as numeric.
new_df = minmax_scaler_dataframe(df)

In [79]:
# Preview the first rows to inspect the min-max-scaled numeric columns.
new_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,0.271,1,0,0.014,S,Third,man,True,,Southampton,no,False
1,1,1,female,0.472,1,0,0.139,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,0.321,0,0,0.015,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,0.435,1,0,0.104,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,0.435,0,0,0.016,S,Third,man,True,,Southampton,no,True
