# Encoding Categorical Data 

# Types of encoder used are as follows:
    1. Label Encoder (for the categorical target column)
    2. Ordinal Encoder (for input ordinal categorical columns)
    3. One-hot Encoder (for input nominal categorical columns)

# Label encoder and Ordinal encoder

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the customer dataset and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file location exists.
df = pd.read_csv(
    filepath_or_buffer="//Users//udayladdha//Desktop//DataSets//customer.csv"
)
df.head(n=5)

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [3]:
# age is a numerical column(no encoding is required)
# review and education are ordinal categorical data (ordinal encoder)
# gender is a nominal categorical data(one-hot encoder)
# purchased is the nominal categorical target column (label encoder)

In [4]:
# Only the label and ordinal encoders are demonstrated first, so keep just the
# two ordinal feature columns and the target column.
# The columns are selected by name rather than by position (iloc[:, 2:]) so that
# this cell is idempotent: re-running it does not slice off further columns.
df = df[["review", "education", "purchased"]]

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Target is the "purchased" column; every other column is an input feature.
y = df["purchased"]
x = df.drop(columns="purchased")

In [7]:
# Use 80% of the rows for training and the rest for testing; the fixed
# random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=23,
)

In [8]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,review,education
29,Average,UG
10,Good,UG
44,Average,UG
3,Good,PG
22,Poor,PG


In [9]:
# Print how many rows fall into each category of the two ordinal columns.
for column in ("education", "review"):
    print(df[column].value_counts())

PG        18
School    16
UG        16
Name: education, dtype: int64
Poor       18
Good       18
Average    14
Name: review, dtype: int64


In [10]:
# Preview the first rows of the training target before encoding.
y_train.head()

29    Yes
10    Yes
44     No
3      No
22    Yes
Name: purchased, dtype: object

In [11]:
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder

In [12]:
# Ordinal encoder for the feature columns. One category list is given per
# column, in the order the encoder should number them (0, 1, 2).
oe = OrdinalEncoder(
    categories=[
        ["Poor", "Average", "Good"],
        ["School", "UG", "PG"],
    ]
)
# Label encoder for the target column.
lb = LabelEncoder()

In [13]:
# Learn the encoding from the training data only, then apply that same fitted
# encoder to both the training and the test features.
oe.fit(x_train)
x_train_encoded = oe.transform(x_train)
x_test_encoded = oe.transform(x_test)

In [14]:
# Display the ordinal-encoded test features.
x_test_encoded

array([[0., 2.],
       [2., 0.],
       [0., 1.],
       [1., 1.],
       [1., 0.],
       [2., 2.],
       [1., 0.],
       [1., 1.],
       [2., 1.],
       [0., 0.]])

In [15]:
# Display the original test features, for comparison with the encoded array.
x_test

Unnamed: 0,review,education
14,Poor,PG
18,Good,School
17,Poor,UG
32,Average,UG
13,Average,School
33,Good,PG
20,Average,School
8,Average,UG
36,Good,UG
28,Poor,School


In [16]:
# Show the category orderings that were passed to the ordinal encoder.
oe.categories

[['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']]

In [17]:
# Fit the label encoder on the training target only, then encode both the
# training and the test target with it.
lb.fit(y_train)
y_train_encoded = lb.transform(y_train)
y_test_encoded = lb.transform(y_test)

In [18]:
# Display the original test target, for comparison with its encoded form.
y_test

14    Yes
18     No
17    Yes
32    Yes
13     No
33    Yes
20    Yes
8      No
36    Yes
28     No
Name: purchased, dtype: object

In [19]:
# Display the label-encoded test target.
y_test_encoded

array([1, 0, 1, 1, 0, 1, 1, 0, 1, 0])

# One Hot encoder

In [20]:
# Load the cars dataset and preview its first rows. This rebinds `df`, which
# held the customer data until this point.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file location exists.
df = pd.read_csv(
    filepath_or_buffer="//Users//udayladdha//Desktop//DataSets//cars.csv"
)
df.head(n=5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC BSII,1992,50000,100000,Petrol,Individual,Manual,Fourth & Above Owner
1,Maruti Gypsy E MG410W ST,1995,95000,100000,Petrol,Individual,Manual,Second Owner
2,Mahindra Jeep CL 500 MDI,1996,250000,35000,Diesel,Individual,Manual,Second Owner
3,Mahindra Jeep MM 540,1996,200000,60000,Diesel,Individual,Manual,First Owner
4,Mahindra Jeep CL 500 MDI,1997,150000,120000,Diesel,Individual,Manual,Third Owner


In [21]:
# Remove the columns that are not used in the one-hot encoding example.
df = df.drop(columns=["transmission", "seller_type", "year"])


In [22]:
# Preview the frame after the columns were dropped.
df.head()

Unnamed: 0,name,selling_price,km_driven,fuel,owner
0,Maruti 800 AC BSII,50000,100000,Petrol,Fourth & Above Owner
1,Maruti Gypsy E MG410W ST,95000,100000,Petrol,Second Owner
2,Mahindra Jeep CL 500 MDI,250000,35000,Diesel,Second Owner
3,Mahindra Jeep MM 540,200000,60000,Diesel,First Owner
4,Mahindra Jeep CL 500 MDI,150000,120000,Diesel,Third Owner


In [23]:
# Count how many rows there are for each fuel category.
df["fuel"].value_counts()

Diesel      2153
Petrol      2123
CNG           40
LPG           23
Electric       1
Name: fuel, dtype: int64

In [24]:
# Count how many rows there are for each owner category.
df["owner"].value_counts()

First Owner             2832
Second Owner            1106
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: owner, dtype: int64

In [25]:
# fuel, owner, and name are nominal categorical data (their categories have no natural order), so we will use one-hot encoding on fuel and owner

# one hot encoding using pandas 

In [26]:
# One-hot encode the fuel and owner columns with pandas; the result is only
# displayed, `df` itself is not modified.
pd.get_dummies(data=df, columns=["fuel", "owner"])

Unnamed: 0,name,selling_price,km_driven,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti 800 AC BSII,50000,100000,0,0,0,0,1,0,1,0,0,0
1,Maruti Gypsy E MG410W ST,95000,100000,0,0,0,0,1,0,0,1,0,0
2,Mahindra Jeep CL 500 MDI,250000,35000,0,1,0,0,0,0,0,1,0,0
3,Mahindra Jeep MM 540,200000,60000,0,1,0,0,0,1,0,0,0,0
4,Mahindra Jeep CL 500 MDI,150000,120000,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4335,Hyundai Venue SX Opt Turbo BSIV,1050000,1100,0,0,0,0,1,1,0,0,0,0
4336,Hyundai Grand i10 1.2 Kappa Magna BSIV,545000,5000,0,0,0,0,1,1,0,0,0,0
4337,Ford Figo Aspire 1.5 TDCi Titanium,530000,45000,0,1,0,0,0,1,0,0,0,0
4338,Tata Harrier XE,426000,1000,0,1,0,0,0,1,0,0,0,0


# dropping first column using same code to reduce multicollinearity

In [27]:
# Same one-hot encoding, but with the first dummy column of each encoded
# feature dropped.
pd.get_dummies(data=df, columns=["fuel", "owner"], drop_first=True)

Unnamed: 0,name,selling_price,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti 800 AC BSII,50000,100000,0,0,0,1,1,0,0,0
1,Maruti Gypsy E MG410W ST,95000,100000,0,0,0,1,0,1,0,0
2,Mahindra Jeep CL 500 MDI,250000,35000,1,0,0,0,0,1,0,0
3,Mahindra Jeep MM 540,200000,60000,1,0,0,0,0,0,0,0
4,Mahindra Jeep CL 500 MDI,150000,120000,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
4335,Hyundai Venue SX Opt Turbo BSIV,1050000,1100,0,0,0,1,0,0,0,0
4336,Hyundai Grand i10 1.2 Kappa Magna BSIV,545000,5000,0,0,0,1,0,0,0,0
4337,Ford Figo Aspire 1.5 TDCi Titanium,530000,45000,1,0,0,0,0,0,0,0
4338,Tata Harrier XE,426000,1000,1,0,0,0,0,0,0,0


# one hot encoding using sklearn

In [28]:
# Preview the frame that the sklearn one-hot encoding example starts from.
df.head()

Unnamed: 0,name,selling_price,km_driven,fuel,owner
0,Maruti 800 AC BSII,50000,100000,Petrol,Fourth & Above Owner
1,Maruti Gypsy E MG410W ST,95000,100000,Petrol,Second Owner
2,Mahindra Jeep CL 500 MDI,250000,35000,Diesel,Second Owner
3,Mahindra Jeep MM 540,200000,60000,Diesel,First Owner
4,Mahindra Jeep CL 500 MDI,150000,120000,Diesel,Third Owner


In [29]:
# Target is the "selling_price" column; every other column is an input feature.
y = df["selling_price"]
x = df.drop(columns="selling_price")

In [30]:
# Use 80% of the rows for training and the rest for testing; the fixed
# random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=23,
)

In [31]:
from sklearn.preprocessing import OneHotEncoder

In [32]:
# drop="first" drops the first category of each encoded feature, and
# sparse_output=False makes the encoder return a dense array directly.
# Without sparse_output=False the result is a sparse matrix, and you would call
# its .toarray() method to view it as a dense array.
ohe = OneHotEncoder(
    drop="first",
    sparse_output=False,
    dtype=np.int32,
)

In [33]:
# when using ohe we first extract the columns we want to encode, apply ohe to them, and finally concatenate the encoded result back with the remaining columns

In [34]:
# Fit the one-hot encoder on the training rows of the two columns to encode,
# then encode those same rows. (By default the encoder would return a sparse
# matrix; `ohe` was created with sparse_output=False, so this is a dense array.)
ohe.fit(x_train[["fuel", "owner"]])
x_train_encoded = ohe.transform(x_train[["fuel", "owner"]])

In [35]:
# Encode the test rows with the encoder that was fitted on the training data.
x_test_encoded = ohe.transform(x_test.loc[:, ["fuel", "owner"]])

In [36]:
# Display the one-hot encoded training columns.
x_train_encoded

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [37]:
# The columns that were not one-hot encoded, converted to a NumPy array so they
# can be stacked next to the encoded array.
x_train[["name", "km_driven"]].to_numpy()

array([['Tata New Safari DICOR 2.2 GX 4x2 BS IV', 80000],
       ['Hyundai Grand i10 Magna AT', 10510],
       ['Maruti Ertiga VXI CNG', 56600],
       ...,
       ['Maruti Wagon R VXI BS IV', 62000],
       ['Mahindra Scorpio VLS AT 2.2 mHAWK', 70000],
       ['Maruti Zen Estilo VXI BSIII', 120000]], dtype=object)

In [38]:
# Concatenate column-wise: the untouched training columns on the left, the
# one-hot encoded columns on the right.
np.hstack(
    [
        x_train[["name", "km_driven"]].values,
        x_train_encoded,
    ]
)

array([['Tata New Safari DICOR 2.2 GX 4x2 BS IV', 80000, 1, ..., 0, 0, 0],
       ['Hyundai Grand i10 Magna AT', 10510, 0, ..., 0, 0, 0],
       ['Maruti Ertiga VXI CNG', 56600, 0, ..., 0, 0, 0],
       ...,
       ['Maruti Wagon R VXI BS IV', 62000, 0, ..., 1, 0, 0],
       ['Mahindra Scorpio VLS AT 2.2 mHAWK', 70000, 1, ..., 1, 0, 0],
       ['Maruti Zen Estilo VXI BSIII', 120000, 0, ..., 0, 0, 0]],
      dtype=object)

In [39]:
# Concatenate column-wise for the test set: the untouched columns on the left,
# the one-hot encoded columns on the right.
np.hstack(
    [
        x_test[["name", "km_driven"]].values,
        x_test_encoded,
    ]
)

array([['Skoda Rapid 1.6 MPI Elegance', 90000, 0, ..., 0, 0, 0],
       ['BMW 3 Series 320d Sport Line', 75800, 1, ..., 1, 0, 0],
       ['Hyundai Santro Xing XG AT', 70000, 0, ..., 1, 0, 0],
       ...,
       ['Mahindra Bolero DI DX 7 Seater', 120000, 1, ..., 0, 0, 0],
       ['Mercedes-Benz C-Class Progressive C 220d', 10000, 1, ..., 0, 0,
        0],
       ['Hyundai Verna CRDi ABS', 80000, 1, ..., 1, 0, 0]], dtype=object)

In [40]:
# Compare the shape of the original training features with the encoded array.
x_train.shape,x_train_encoded.shape

((3472, 4), (3472, 8))

In [41]:
# Compare the shape of the original test features with the encoded array.
x_test.shape,x_test_encoded.shape

((868, 4), (868, 8))

# one hot encoding when there are many categories  df["name"]

In [42]:
# above we did not encode the name column because it has too many categories, but now we will do it

In [43]:
# Number of rows for each distinct car name, stored for the frequency filter below.
counts=df["name"].value_counts()

In [44]:
# Cut-off on how often a car name occurs; it decides which names are frequent
# enough to keep their own one-hot column.
threshold = 15

# Number of distinct car names. This is kept as the LAST expression of the cell
# so that the notebook displays it; when it came before the assignment its
# value was computed and silently discarded.
df["name"].nunique()

In [45]:
# Index of the car names whose count is <= threshold, i.e. the INFREQUENT names
# (not the frequent ones).
# NOTE(review): because the comparison is `<=`, names that occur exactly
# `threshold` times are included here as well — confirm that is intended.
repl=counts[counts<=threshold].index

In [46]:
# One-hot encode the name column after collapsing every name listed in `repl`
# into a single "uncommon" category. Only the names that are not in `repl` keep
# a column of their own; all the others share the "uncommon" column.
pd.get_dummies(
    data=df["name"].replace(to_replace=repl, value="uncommon"),
)

Unnamed: 0,Chevrolet Beat Diesel LS,Chevrolet Beat Diesel LT,Hyundai Creta 1.6 CRDi SX,Hyundai EON Era Plus,Hyundai EON Magna Plus,Hyundai Santro Xing GLS,Hyundai i10 Magna,Mahindra XUV500 W8 2WD,Maruti 800 AC,Maruti Alto 800 LXI,...,Maruti Swift Dzire VDI,Maruti Swift VDI,Maruti Swift VDI BSIV,Maruti Wagon R LXI,Maruti Wagon R LXI Minor,Maruti Wagon R VXI BS IV,Renault Duster 85PS Diesel RxL,Renault KWID 1.0 RXT Optional,Renault KWID RXT,uncommon
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4335,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4336,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4337,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4338,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
