### Encoding

In [None]:
Data:
    1. Continuous Data
    2. Categorical Data
    
Categorical Data:
    1. Ordinal Data:
        High >> Medium >> Low
    2. Nominal Data (Data Without Order or Rank)
    
Ordinal Data >> Label Encoding 
Nominal Data >> One Hot Encoding

In [None]:
Nominal Data:
    1. Male - Female
    2. Movie Genre > Action, Comedy, Animation, Rom-Com, Drama
    
Ordinal Data:
    1. Person's Qualification >> SSC, HSC, Graduation, PG, PHD

In [1]:
import pandas as pd
import numpy as np

In [19]:
# Toy dataset: one entry per person, with an ordinal column (Qualification)
# and two nominal columns (Location, Genre) to practise encoding on.
data = dict(
    Name=[
        "Ramesh", "Suresh", "Dinesh", "Jignesh", "Mayuresh", "Rupesh",
        "Mahesh", "Sukesh", "Naresh", "Rajesh", "Kalpesh",
    ],
    Qualification=[
        "SSC", "PG", "GRAD", "PG", "HSC", "PHD",
        "PG", "GRAD", "HSC", "GRAD", "SSC",
    ],
    Location=[
        "Pune", "Mumbai", "Satara", "Pune", "Kolhapur", "Mumbai",
        "Nashik", "Thane", "Satara", "Mumbai", "Igatpuri",
    ],
    Genre=[
        "Action", "Drama", "Comedy", "Action", "Animation", "Adventure",
        "Comedy", "Crime", "Drama", "Horror", "Animation",
    ],
)

# Each key becomes a column; rows get the default 0..10 integer index.
df = pd.DataFrame(data=data)

In [20]:
# Display the freshly built frame to check its contents.
df

Unnamed: 0,Name,Qualification,Location,Genre
0,Ramesh,SSC,Pune,Action
1,Suresh,PG,Mumbai,Drama
2,Dinesh,GRAD,Satara,Comedy
3,Jignesh,PG,Pune,Action
4,Mayuresh,HSC,Kolhapur,Animation
5,Rupesh,PHD,Mumbai,Adventure
6,Mahesh,PG,Nashik,Comedy
7,Sukesh,GRAD,Thane,Crime
8,Naresh,HSC,Satara,Drama
9,Rajesh,GRAD,Mumbai,Horror


Qualification is ordinal data (SSC < HSC < GRAD < PG < PHD), so we will use label encoding.

In [22]:
# Count how often each qualification level occurs and return the counts as a
# plain dict (level -> number of rows); handy for listing the distinct levels.
df["Qualification"].value_counts().to_dict()

{'PG': 3, 'GRAD': 3, 'SSC': 2, 'HSC': 2, 'PHD': 1}

In [23]:
# Ordinal rank of each qualification level, written from lowest (SSC) to
# highest (PHD) so the ordering the encoding relies on is visible at a glance.
label = {"SSC": 0, "HSC": 1, "GRAD": 2, "PG": 3, "PHD": 4}

# Apply the mapping to the Qualification column only. Calling
# df.replace(label) on the whole frame would also rewrite any Name, Location
# or Genre cell that happens to equal one of the keys (e.g. "PG").
# Assigning the result back (instead of inplace=True) keeps the update explicit.
df["Qualification"] = df["Qualification"].replace(label)

In [24]:
# Display df again to check the Qualification column after the label replacement.
df

Unnamed: 0,Name,Qualification,Location,Genre
0,Ramesh,0,Pune,Action
1,Suresh,3,Mumbai,Drama
2,Dinesh,2,Satara,Comedy
3,Jignesh,3,Pune,Action
4,Mayuresh,1,Kolhapur,Animation
5,Rupesh,4,Mumbai,Adventure
6,Mahesh,3,Nashik,Comedy
7,Sukesh,2,Thane,Crime
8,Naresh,1,Satara,Drama
9,Rajesh,2,Mumbai,Horror


In [25]:
## Location is nominal data (it has no inherent order), so we will use one-hot encoding.

In [26]:
## For one-hot encoding we will use pd.get_dummies.

In [27]:
# One-hot encode Location: one indicator column per distinct city.
# dtype=int pins the indicators to 0/1; without it pandas >= 2.0 returns
# True/False columns, which would not match the 0/1 output shown here.
location_dummies = pd.get_dummies(df["Location"], dtype=int)
location_dummies

Unnamed: 0,Igatpuri,Kolhapur,Mumbai,Nashik,Pune,Satara,Thane
0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0
3,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0
5,0,0,1,0,0,0,0
6,0,0,0,1,0,0,0
7,0,0,0,0,0,0,1
8,0,0,0,0,0,1,0
9,0,0,1,0,0,0,0


In [28]:
# One-hot encode Location again, this time with a "Location_" prefix so the
# indicator columns stay identifiable once they are joined to other columns.
# dtype=int pins the indicators to 0/1; without it pandas >= 2.0 returns
# True/False columns, which would not match the 0/1 output shown here.
location_dummies = pd.get_dummies(df["Location"], prefix="Location", dtype=int)
location_dummies

Unnamed: 0,Location_Igatpuri,Location_Kolhapur,Location_Mumbai,Location_Nashik,Location_Pune,Location_Satara,Location_Thane
0,0,0,0,0,1,0,0
1,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0
3,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0
5,0,0,1,0,0,0,0
6,0,0,0,1,0,0,0
7,0,0,0,0,0,0,1
8,0,0,0,0,0,1,0
9,0,0,1,0,0,0,0


In [29]:
# One-hot encode Genre with a "Genre_" prefix on every indicator column.
# dtype=int pins the indicators to 0/1; without it pandas >= 2.0 returns
# True/False columns, which would not match the 0/1 output shown here.
genre_dummies = pd.get_dummies(df["Genre"], prefix="Genre", dtype=int)
genre_dummies

Unnamed: 0,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Comedy,Genre_Crime,Genre_Drama,Genre_Horror
0,1,0,0,0,0,0,0
1,0,0,0,0,0,1,0
2,0,0,0,1,0,0,0
3,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0
5,0,1,0,0,0,0,0
6,0,0,0,1,0,0,0
7,0,0,0,0,1,0,0
8,0,0,0,0,0,1,0
9,0,0,0,0,0,0,1


In [30]:
# Place the original columns and both sets of indicator columns side by side
# (column-wise concat, aligned on the row index). The source Location and
# Genre columns are kept alongside their encoded versions.
df_encoded = pd.concat(
    objs=[df, location_dummies, genre_dummies],
    axis="columns",
)

In [31]:
# Display the combined frame: original columns followed by the Location_* and Genre_* indicators.
df_encoded

Unnamed: 0,Name,Qualification,Location,Genre,Location_Igatpuri,Location_Kolhapur,Location_Mumbai,Location_Nashik,Location_Pune,Location_Satara,Location_Thane,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Comedy,Genre_Crime,Genre_Drama,Genre_Horror
0,Ramesh,0,Pune,Action,0,0,0,0,1,0,0,1,0,0,0,0,0,0
1,Suresh,3,Mumbai,Drama,0,0,1,0,0,0,0,0,0,0,0,0,1,0
2,Dinesh,2,Satara,Comedy,0,0,0,0,0,1,0,0,0,0,1,0,0,0
3,Jignesh,3,Pune,Action,0,0,0,0,1,0,0,1,0,0,0,0,0,0
4,Mayuresh,1,Kolhapur,Animation,0,1,0,0,0,0,0,0,0,1,0,0,0,0
5,Rupesh,4,Mumbai,Adventure,0,0,1,0,0,0,0,0,1,0,0,0,0,0
6,Mahesh,3,Nashik,Comedy,0,0,0,1,0,0,0,0,0,0,1,0,0,0
7,Sukesh,2,Thane,Crime,0,0,0,0,0,0,1,0,0,0,0,1,0,0
8,Naresh,1,Satara,Drama,0,0,0,0,0,1,0,0,0,0,0,0,1,0
9,Rajesh,2,Mumbai,Horror,0,0,1,0,0,0,0,0,0,0,0,0,0,1


In [17]:
# Load the Titanic passenger data. The path is relative, so titanic.csv must
# be in the kernel's current working directory.
df_titanic = pd.read_csv("titanic.csv")

In [18]:
# Display the loaded frame to see which columns need encoding.
df_titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
## Gender is nominal data, so it would normally be one-hot encoded; but since it has only two classes, a single 0/1 column is enough and we do not need two separate columns for it.

In [32]:
# Preview the binary encoding (male -> 1, female -> 0). replace() returns a
# new Series here; df_titanic itself is not modified by this cell.
df_titanic["Gender"].replace({"male":1,"female":0})

0      1
1      0
2      0
3      0
4      1
      ..
886    1
887    0
888    0
889    1
890    1
Name: Gender, Length: 891, dtype: int64

In [34]:
# Binary-encode Gender (male -> 1, female -> 0) and store it back in the frame.
# The result is assigned to the column instead of calling
# replace(..., inplace=True) on df_titanic["Gender"]: that form mutates a
# temporary selection, which pandas warns about and which does not update
# df_titanic at all once Copy-on-Write is enabled (the default in pandas 3.0).
df_titanic["Gender"] = df_titanic["Gender"].replace({"male": 1, "female": 0})

In [35]:
# Display the frame to confirm the Gender column now holds 1/0.
df_titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,C


In [38]:
# One-hot encode the port of embarkation with an "Embarked_" prefix.
# dtype=int pins the indicators to 0/1; without it pandas >= 2.0 returns
# True/False columns, which would not match the 0/1 output shown in this notebook.
df_embarked = pd.get_dummies(df_titanic["Embarked"], prefix="Embarked", dtype=int)

In [39]:
# Display the Embarked_* indicator columns.
df_embarked

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [40]:
# Attach the Embarked_* indicator columns to the passenger frame (column-wise
# concat, aligned on the row index). Note: df_titanic is rebound to the result,
# so running this cell a second time appends the indicator columns again.
df_titanic = pd.concat(
    objs=[df_titanic, df_embarked],
    axis="columns",
)

In [41]:
# Display the frame with the Embarked_* columns appended after the original ones.
df_titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,S,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,S,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,S,0,0,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,S,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,S,0,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,S,0,0,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,S,0,0,1
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,C,1,0,0


In [43]:
# Remove the raw Embarked column now that its one-hot columns exist.
# drop() returns a new frame and leaves df_titanic untouched, so the result is
# kept under its own name instead of being displayed and then thrown away.
# Using a separate name also keeps this cell safe to re-run.
df_titanic_encoded = df_titanic.drop(columns="Embarked")
df_titanic_encoded

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,0,0,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,0,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,0,0,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,0,0,1
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,1,0,0
