# One-Hot Encoding and Feature Scaling

In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

#### Loading the dataset

In [2]:
# Load seaborn's Titanic example dataset into a DataFrame and preview the first rows.
# NOTE(review): load_dataset presumably downloads the file on first use and caches it,
# so a fresh-kernel run may need network access — confirm for offline environments.
data = sns.load_dataset('titanic')
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Keep only fully populated, unique rows.
# dropna() with no arguments discards a row if ANY column is missing, so every
# passenger without a recorded `deck` (e.g. rows 0, 2 and 4 of the preview above)
# is removed — only 181 rows remain for the rest of the notebook.
# The result is rebound to `data` rather than mutated with inplace=True, which keeps
# the step a single chain.
data = data.dropna().drop_duplicates()

In [4]:
# Display the cleaned frame. The index is not reset, so the labels shown
# (1, 3, 6, ...) are the row labels that survived the cleaning step.
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


### One Hot Encoding

In [5]:
# One-hot encode `sex` into a single 0/1 indicator column.
# drop_first=True removes the first level ('female'), leaving only the 'male'
# indicator, so SEX is 1 for male and 0 for female.
# The Series is selected explicitly instead of assigning the whole dummy DataFrame
# to one column (that only works while exactly one dummy column exists), and
# dtype=int pins 0/1 integers regardless of the pandas version's default dummy dtype.
data['SEX'] = pd.get_dummies(data['sex'], drop_first=True, dtype=int)['male']

In [6]:
# Spot-check the new SEX indicator next to the original `sex` column
# (female -> 0, male -> 1).
data.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,SEX
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True,1


### One-hot encoding the entire DataFrame

In [7]:
# One-hot encode every text/categorical column of the frame in one call, keeping
# ALL levels of each column (no drop_first). Numeric and boolean columns pass
# through unchanged.
# NOTE(review): despite the name, this frame does contain dummy columns; the name
# presumably means "no level dropped" — confirm. The manually built SEX column is
# carried along too, so it duplicates sex_male.
data_No_Dummies = pd.get_dummies(data)

In [8]:
# Show the encoded frame: each text column is replaced by one
# <column>_<level> indicator column per level.
data_No_Dummies

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,SEX,sex_female,...,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes
1,1,1,38.0,1,0,71.2833,False,False,0,1,...,1,0,0,0,0,1,0,0,0,1
3,1,1,35.0,1,0,53.1000,False,False,0,1,...,1,0,0,0,0,0,0,1,0,1
6,0,1,54.0,0,0,51.8625,True,True,1,0,...,0,0,1,0,0,0,0,1,1,0
10,1,3,4.0,1,1,16.7000,False,False,0,1,...,0,0,0,0,1,0,0,1,0,1
11,1,1,58.0,0,0,26.5500,False,True,0,1,...,1,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,47.0,1,1,52.5542,False,False,0,1,...,0,1,0,0,0,0,0,1,0,1
872,0,1,33.0,0,0,5.0000,True,True,1,0,...,0,0,0,0,0,0,0,1,1,0
879,1,1,56.0,0,1,83.1583,False,False,0,1,...,1,0,0,0,0,1,0,0,0,1
887,1,1,19.0,0,0,30.0000,False,True,0,1,...,0,0,0,0,0,0,0,1,0,1


In [9]:
# (rows, columns) of the frame encoded with every level kept.
data_No_Dummies.shape

(181, 32)

In [10]:
# Same whole-frame encoding, but drop the first level of every encoded column so
# each variable contributes one indicator fewer (the dropped level is implied when
# all of its sibling indicators are 0).
# NOTE(review): both frames contain dummies; the With/No naming seems to distinguish
# drop_first on/off — confirm.
data_With_Dummies = pd.get_dummies(data, drop_first=True)
data_With_Dummies.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,SEX,sex_male,...,who_woman,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Queenstown,embark_town_Southampton,alive_yes
1,1,1,38.0,1,0,71.2833,False,False,0,0,...,1,0,1,0,0,0,0,0,0,1
3,1,1,35.0,1,0,53.1,False,False,0,0,...,1,0,1,0,0,0,0,0,1,1
6,0,1,54.0,0,0,51.8625,True,True,1,1,...,0,0,0,0,1,0,0,0,1,0
10,1,3,4.0,1,1,16.7,False,False,0,0,...,0,0,0,0,0,0,1,0,1,1
11,1,1,58.0,0,0,26.55,False,True,0,0,...,1,0,1,0,0,0,0,0,1,1


In [11]:
# (rows, columns) after dropping the first level of each encoded column.
data_With_Dummies.shape

(181, 25)

In [12]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, LabelEncoder

## Label encoder

In [13]:
# Encoder that maps each distinct label to an integer code.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target values (y);
# OrdinalEncoder / OneHotEncoder are the feature-side equivalents — consider them
# when encoding input columns.
le = LabelEncoder()

In [14]:
# Re-inspect the frame (now carrying the SEX column) before label encoding.
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,SEX
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,0
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True,1
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False,0
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False,0
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True,1
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False,0
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,0


In [15]:
# Integer-encode the port of embarkation into a new EMBARKED column.
# In the output that follows, 'C' maps to 0 and 'S' maps to 2.
# Caution: these integers suggest an ordering that the ports do not have, so
# one-hot encoding is usually the safer choice for model inputs.
data['EMBARKED'] = le.fit_transform(data['embarked'])

In [16]:
# Display the frame with the new integer EMBARKED column beside the original `embarked`.
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,SEX,EMBARKED
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,0
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,0,2
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True,1,2
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False,0,2
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False,0,2
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True,1,2
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False,0,0
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,0,2


In [17]:
# List the distinct port labels present in the cleaned data, in order of first appearance.
data.embarked.unique()

array(['C', 'S', 'Q'], dtype=object)

## Data Scaling or Data Normalising

## Minmax Scaler

In [18]:
# Scaler that linearly rescales a feature so its minimum maps to 0 and its
# maximum to 1 (the default feature range).
minmax = MinMaxScaler()

In [19]:
# Double brackets select a one-column DataFrame (2-D) rather than a Series (1-D);
# scikit-learn transformers expect 2-D input.
data[['age']]

Unnamed: 0,age
1,38.0
3,35.0
6,54.0
10,4.0
11,58.0
...,...
871,47.0
872,33.0
879,56.0
887,19.0


In [20]:
# The same column as a 2-D NumPy array with one value per row.
# NOTE(review): this echoes the whole array; slicing (e.g. [:5]) would keep the
# output short.
data[['age']].values

array([[38.  ],
       [35.  ],
       [54.  ],
       [ 4.  ],
       [58.  ],
       [34.  ],
       [28.  ],
       [19.  ],
       [49.  ],
       [65.  ],
       [45.  ],
       [29.  ],
       [25.  ],
       [23.  ],
       [46.  ],
       [71.  ],
       [23.  ],
       [21.  ],
       [47.  ],
       [24.  ],
       [32.5 ],
       [54.  ],
       [19.  ],
       [37.  ],
       [24.  ],
       [36.5 ],
       [22.  ],
       [61.  ],
       [56.  ],
       [50.  ],
       [ 1.  ],
       [ 3.  ],
       [44.  ],
       [58.  ],
       [ 2.  ],
       [40.  ],
       [31.  ],
       [32.  ],
       [38.  ],
       [35.  ],
       [44.  ],
       [37.  ],
       [29.  ],
       [62.  ],
       [30.  ],
       [52.  ],
       [40.  ],
       [58.  ],
       [35.  ],
       [37.  ],
       [63.  ],
       [19.  ],
       [36.  ],
       [ 2.  ],
       [50.  ],
       [ 0.92],
       [17.  ],
       [30.  ],
       [24.  ],
       [18.  ],
       [31.  ],
       [40.  ],
       [

In [21]:
# Capture the age column as a 2-D (n_rows, 1) NumPy array — the layout the
# scalers below take as input.
age = data[['age']].to_numpy()

In [22]:
# Echo the captured array to confirm it matches the column values shown earlier.
age

array([[38.  ],
       [35.  ],
       [54.  ],
       [ 4.  ],
       [58.  ],
       [34.  ],
       [28.  ],
       [19.  ],
       [49.  ],
       [65.  ],
       [45.  ],
       [29.  ],
       [25.  ],
       [23.  ],
       [46.  ],
       [71.  ],
       [23.  ],
       [21.  ],
       [47.  ],
       [24.  ],
       [32.5 ],
       [54.  ],
       [19.  ],
       [37.  ],
       [24.  ],
       [36.5 ],
       [22.  ],
       [61.  ],
       [56.  ],
       [50.  ],
       [ 1.  ],
       [ 3.  ],
       [44.  ],
       [58.  ],
       [ 2.  ],
       [40.  ],
       [31.  ],
       [32.  ],
       [38.  ],
       [35.  ],
       [44.  ],
       [37.  ],
       [29.  ],
       [62.  ],
       [30.  ],
       [52.  ],
       [40.  ],
       [58.  ],
       [35.  ],
       [37.  ],
       [63.  ],
       [19.  ],
       [36.  ],
       [ 2.  ],
       [50.  ],
       [ 0.92],
       [17.  ],
       [30.  ],
       [24.  ],
       [18.  ],
       [31.  ],
       [40.  ],
       [

In [23]:
# Fit the min-max scaler on the ages and store the rescaled values as AGE.
# fit_transform returns an (n_rows, 1) array; it is flattened to 1-D so a plain
# vector is assigned to the column.
data['AGE'] = minmax.fit_transform(age).ravel()

In [24]:
# Compare the raw `age` values with their min-max scaled counterpart in AGE.
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,SEX,EMBARKED,AGE
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,0,0.468892
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0,2,0.430956
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True,1,2,0.671219
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False,0,2,0.038948
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True,0,2,0.721801


In [25]:
# Sanity check: the smallest scaled value should be exactly 0.
data.AGE.min()

0.0

In [26]:
# Sanity check: the largest scaled value should be exactly 1.
data.AGE.max()

1.0

## Standard Scaler

In [27]:
# Scaler that standardises a feature by subtracting its mean and dividing by its
# standard deviation (z-scores).
sc = StandardScaler()

In [28]:
# Fit the standard scaler on the same raw ages and store the z-scores as AGE_SC.
# As with AGE, the (n_rows, 1) result is flattened to 1-D before assignment.
data['AGE_SC'] = sc.fit_transform(age).ravel()

In [29]:
# Display the frame with both scaled versions of age (AGE and AGE_SC) side by side.
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,SEX,EMBARKED,AGE,AGE_SC
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,0,0.468892,0.147792
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,0,2,0.430956,-0.043930
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True,1,2,0.671219,1.170310
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False,0,2,0.038948,-2.025058
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True,0,2,0.721801,1.425939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False,0,2,0.582701,0.722958
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True,1,2,0.405665,-0.171745
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False,0,0,0.696510,1.298124
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,0,2,0.228629,-1.066448


In [30]:
# Unlike min-max scaling, standardised values are not confined to [0, 1];
# this is the lowest z-score.
data.AGE_SC.min()

-2.221892714597594

In [31]:
# Highest z-score of the standardised ages.
data.AGE_SC.max()

2.83190075622482

### Tasks
1. Try other scaling methods (for example `RobustScaler`, which is imported above but not yet used).
2. Apply all of these methods to the car sales dataset.