# Dealing With Categorical Values

## Importing Libraries

In [2]:
import pandas as pd
import numpy as np

## Importing Dataset

In [3]:
# Load the raw salary data (the path is resolved relative to the current working directory).
df = pd.read_csv(filepath_or_buffer="Salary_Dataset.csv")

In [4]:
# Preview the first ten rows of the raw data.
df.head(n=10)

Unnamed: 0,country,Salary,YearsExperience,Purchased
0,Dubai,39343.0,1.1,No
1,Canada,46205.0,1.3,Yes
2,Canada,37731.0,1.5,No
3,Canada,43525.0,2.0,No
4,USA,39891.0,2.2,No
5,Dubai,56642.0,2.9,No
6,Canada,60150.0,3.0,Yes
7,Australia,54445.0,3.2,No
8,Dubai,64445.0,3.2,Yes
9,Dubai,57189.0,3.7,No


# Let's Perform Encoding

## Applying One-Hot Encoding

<h3> First, do it with pandas </h3>

In [5]:
# One-hot encode the 'country' column with pandas: get_dummies builds one
# integer 0/1 indicator column per distinct value found in the column.
# (The variable name is kept as-is because later cells reference it.)
hot_enconder = pd.get_dummies(data=df['country'], dtype=int)
hot_enconder.head(n=3)


Unnamed: 0,Australia,Canada,Dubai,USA
0,0,0,1,0
1,0,1,0,0
2,0,1,0,0


In [6]:
# Append the indicator columns to the original DataFrame (column-wise).
# Indicator columns left over from an earlier run of this cell are dropped first,
# so re-running the cell does not add duplicate columns; on the first run nothing
# is dropped and the result is the plain concatenation.
df = pd.concat(
    [df.drop(columns=hot_enconder.columns, errors="ignore"), hot_enconder],
    axis=1,
)
df.head(3)

Unnamed: 0,country,Salary,YearsExperience,Purchased,Australia,Canada,Dubai,USA
0,Dubai,39343.0,1.1,No,0,0,1,0
1,Canada,46205.0,1.3,Yes,0,1,0,0
2,Canada,37731.0,1.5,No,0,1,0,0


In [7]:
# Keep only the listed columns (which drops the original 'country' column) and
# reorder them so that the indicator columns come first.
column_order = ['Australia', 'Canada', 'Dubai', 'USA', 'Salary', 'YearsExperience', 'Purchased']
df = df.loc[:, column_order]

In [8]:
# Preview the first ten rows of the one-hot encoded frame.
df.head(n=10)

Unnamed: 0,Australia,Canada,Dubai,USA,Salary,YearsExperience,Purchased
0,0,0,1,0,39343.0,1.1,No
1,0,1,0,0,46205.0,1.3,Yes
2,0,1,0,0,37731.0,1.5,No
3,0,1,0,0,43525.0,2.0,No
4,0,0,0,1,39891.0,2.2,No
5,0,0,1,0,56642.0,2.9,No
6,0,1,0,0,60150.0,3.0,Yes
7,1,0,0,0,54445.0,3.2,No
8,0,0,1,0,64445.0,3.2,Yes
9,0,0,1,0,57189.0,3.7,No


### Let's do it with Scikit-Learn
Restart your kernel before continuing if you are working in the same notebook.


In [9]:
# Reload the raw data into a fresh DataFrame for the scikit-learn walkthrough.
df1 = pd.read_csv(filepath_or_buffer="Salary_Dataset.csv")

In [10]:
# Preview the first three rows of the reloaded data.
df1.head(n=3)

Unnamed: 0,country,Salary,YearsExperience,Purchased
0,Dubai,39343.0,1.1,No
1,Canada,46205.0,1.3,Yes
2,Canada,37731.0,1.5,No


In [11]:
# Start with label encoding.
# It is applied to the 'Purchased' column because that column holds only two
# distinct values, so a single integer-coded column is enough.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Fit the encoder on the text labels and replace them with their integer codes.
purchased_codes = le.fit_transform(df1['Purchased'])
df1['Purchased'] = purchased_codes
df1.head(n=3)

Unnamed: 0,country,Salary,YearsExperience,Purchased
0,Dubai,39343.0,1.1,0
1,Canada,46205.0,1.3,1
2,Canada,37731.0,1.5,0


In [12]:
# One-hot encode the 'country' column and pass every other column through unchanged.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# The column is selected by name rather than by position, so the intended column
# is encoded even if the column order of the input changes.
ct = ColumnTransformer(
    [('hot_encoder', OneHotEncoder(), ['country'])],
    remainder='passthrough',
)

# In the output the indicator columns come first, followed by the passed-through
# columns in their original order.
encode_df = ct.fit_transform(df1)


In [13]:
# Wrap the encoded array in a DataFrame with readable column names.
# The indicator names are read from the fitted encoder (one per category, in the
# encoder's own order) instead of being typed by hand, so they cannot drift out
# of sync with the data. The remaining names are the passed-through columns of
# df1, i.e. every column except 'country', in their original order.
country_labels = list(ct.named_transformers_['hot_encoder'].categories_[0])
passthrough_labels = [col for col in df1.columns if col != 'country']
encode_df = pd.DataFrame(encode_df, columns=country_labels + passthrough_labels)
encode_df.head(4)

Unnamed: 0,Australia,Canada,Dubai,USA,Salary,YearsExperience,Purchased
0,0.0,0.0,1.0,0.0,39343.0,1.1,0.0
1,0.0,1.0,0.0,0.0,46205.0,1.3,1.0
2,0.0,1.0,0.0,0.0,37731.0,1.5,0.0
3,0.0,1.0,0.0,0.0,43525.0,2.0,0.0


In [17]:
# Save the encoded data to disk.
# A relative path is used so the notebook runs on any machine; change OUTPUT_CSV
# if the file should be written to a different folder.
# NOTE: to_csv also writes the row index as the first column by default; pass
# index=False if that column is not wanted in the file.
OUTPUT_CSV = "Salary_Dataset_02.csv"
encode_df.to_csv(OUTPUT_CSV)

In [15]:
# Let's link the header names.

Pandas
One-Hot Encoding
    pd.get_dummies(df[colname], dtype=int) - creates one 0/1 indicator (dummy) column for each category of a categorical column

scikit-learn
LabelEncoder (import LabelEncoder from sklearn.preprocessing)
    le = LabelEncoder()
    le.fit_transform(df[colname])

OneHotEncoder (import ColumnTransformer from sklearn.compose and OneHotEncoder from sklearn.preprocessing)
    ColumnTransformer([('anyname', OneHotEncoder(), [0])], remainder='passthrough')
