# Dealing With Categorical Values

## Importing Libraries

In [34]:
import pandas as pd
import numpy as np

## Importing Dataset

In [35]:
# Load the salary data from the CSV file that sits next to this notebook.
dataset = pd.read_csv(filepath_or_buffer="Salary_Dataset.csv")

In [36]:
# Preview up to the first 40 rows (fewer are shown when the frame is shorter).
dataset.head(n=40)

Unnamed: 0,country,Salary,YearsExperience,Purchased
0,Dubai,39343.0,1.1,No
1,Canada,46205.0,1.3,Yes
2,Canada,37731.0,1.5,No
3,Canada,43525.0,2.0,No
4,USA,39891.0,2.2,No
5,Dubai,56642.0,2.9,No
6,Canada,60150.0,3.0,Yes
7,Australia,54445.0,3.2,No
8,Dubai,64445.0,3.2,Yes
9,Dubai,57189.0,3.7,No


# Let's Perform Encoding

## Applying One-Hot Encoding

<h3> First, do it with pandas </h3>

In [37]:
# One-hot encode the 'country' column with pandas: one indicator column per
# distinct country, stored as integers (0/1) rather than booleans.
country_dummies = pd.get_dummies(dataset['country'], dtype='int')
country_dummies

Unnamed: 0,Australia,Canada,Dubai,USA
0,0,0,1,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,0,0,0,1
5,0,0,1,0
6,0,1,0,0
7,1,0,0,0
8,0,0,1,0
9,0,0,1,0


In [38]:
# Append the indicator columns to the right of the original frame.
# Rows are aligned on the index that both frames share.
dataset = pd.concat(objs=[dataset, country_dummies], axis='columns')
dataset

Unnamed: 0,country,Salary,YearsExperience,Purchased,Australia,Canada,Dubai,USA
0,Dubai,39343.0,1.1,No,0,0,1,0
1,Canada,46205.0,1.3,Yes,0,1,0,0
2,Canada,37731.0,1.5,No,0,1,0,0
3,Canada,43525.0,2.0,No,0,1,0,0
4,USA,39891.0,2.2,No,0,0,0,1
5,Dubai,56642.0,2.9,No,0,0,1,0
6,Canada,60150.0,3.0,Yes,0,1,0,0
7,Australia,54445.0,3.2,No,1,0,0,0
8,Dubai,64445.0,3.2,Yes,0,0,1,0
9,Dubai,57189.0,3.7,No,0,0,1,0


In [39]:
# Remove the original 'country' column; the indicator columns added above
# now carry the same information.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run finds no 'country' column and is a
# no-op instead of raising KeyError.
dataset = dataset.drop(columns=['country'], errors='ignore')
dataset.head()

Unnamed: 0,Salary,YearsExperience,Purchased,Australia,Canada,Dubai,USA
0,39343.0,1.1,No,0,0,1,0
1,46205.0,1.3,Yes,0,1,0,0
2,37731.0,1.5,No,0,1,0,0
3,43525.0,2.0,No,0,1,0,0
4,39891.0,2.2,No,0,0,0,1


<h3> Let's do it with Scikit-Learn </h3>
Also, restart your kernel if you are using the same notebook.


In [53]:
# Label encoding with scikit-learn.
# 'Purchased' holds only two distinct values, so each value is simply mapped
# to an integer code.
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv("Salary_Dataset.csv")
purchased_encoder = LabelEncoder()
df['Purchased'] = purchased_encoder.fit_transform(df['Purchased'])
df.head()

Unnamed: 0,country,Salary,YearsExperience,Purchased
0,Dubai,39343.0,1.1,0
1,Canada,46205.0,1.3,1
2,Canada,37731.0,1.5,0
3,Canada,43525.0,2.0,0
4,USA,39891.0,2.2,0


In [65]:
# One-hot encode the 'country' column with scikit-learn.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Reload the raw CSV so this cell can be run on its own. Note that this
# discards the label-encoded 'Purchased' column produced in the previous cell.
salary_raw = pd.read_csv("Salary_Dataset.csv")

# transformers: apply OneHotEncoder to the 'country' column, selected by name
#               rather than by position so a column reorder cannot break it.
# remainder:    'drop' would discard every column that is not encoded;
#               'passthrough' keeps them, appended after the encoded columns.
# sparse_threshold=0: always return a dense NumPy array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ['country'])],
    remainder='passthrough',
    sparse_threshold=0,
)

# fit_transform returns a NumPy array, not a DataFrame, so the column labels
# have to be rebuilt.
encoded_values = ct.fit_transform(salary_raw)

# Take the labels from the fitted encoder and the source frame instead of
# typing them by hand; this avoids misspellings and stays correct if the set
# of countries changes.
country_labels = list(ct.named_transformers_['encoder'].categories_[0])
passthrough_labels = [col for col in salary_raw.columns if col != 'country']

df = pd.DataFrame(encoded_values, columns=country_labels + passthrough_labels)
df.head(10)

Unnamed: 0,Australia,Canada,Dubai,USA,Salary,YearExpierence,Purchased
0,0.0,0.0,1.0,0.0,39343.0,1.1,No
1,0.0,1.0,0.0,0.0,46205.0,1.3,Yes
2,0.0,1.0,0.0,0.0,37731.0,1.5,No
3,0.0,1.0,0.0,0.0,43525.0,2.0,No
4,0.0,0.0,0.0,1.0,39891.0,2.2,No
5,0.0,0.0,1.0,0.0,56642.0,2.9,No
6,0.0,1.0,0.0,0.0,60150.0,3.0,Yes
7,1.0,0.0,0.0,0.0,54445.0,3.2,No
8,0.0,0.0,1.0,0.0,64445.0,3.2,Yes
9,0.0,0.0,1.0,0.0,57189.0,3.7,No
