# Ordinal encoding

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('customer.csv')

In [3]:
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [4]:
#Gender is a nominal data, review and education is ordinal, purchased is output columns

In [5]:
# firstly we apply ordinal encoding on the review and education 
df = df.iloc[:, 2:]

In [6]:
df.head()

Unnamed: 0,review,education,purchased
0,Average,School,No
1,Poor,UG,No
2,Good,PG,No
3,Good,PG,No
4,Average,UG,No


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:, 0:2], df.iloc[:,-1:], test_size=0.2)

In [9]:
x_train.head()

Unnamed: 0,review,education
0,Average,School
27,Poor,PG
39,Poor,PG
33,Good,PG
10,Good,UG


In [10]:
#Apply ordinal econding
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder(categories=[['Poor','Average','Good'],['School','UG', 'PG']])


In [11]:
oe.fit(x_train)

In [12]:
x_train = oe.transform(x_train)
x_test = oe.transform(x_test)

In [13]:
x_train = pd.DataFrame(x_train, columns=['review','education'])
x_test = pd.DataFrame(x_test, columns=['review','education'])

In [14]:
x_train.info()
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   review     40 non-null     float64
 1   education  40 non-null     float64
dtypes: float64(2)
memory usage: 772.0 bytes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   review     10 non-null     float64
 1   education  10 non-null     float64
dtypes: float64(2)
memory usage: 292.0 bytes


In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
lb = LabelEncoder()

In [17]:
lb.fit(y_train)

  y = column_or_1d(y, warn=True)


In [18]:
lb.classes_

array(['No', 'Yes'], dtype=object)

In [19]:
y_test = lb.transform(y_test)

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [20]:
y_test

array([0, 0, 0, 1, 0, 1, 1, 1, 1, 0])

# One hot Encoding

In [21]:
df = pd.read_csv('cars.csv')
import matplotlib.pyplot as plt

In [22]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,NAN,Petrol,First Owner,130000


In [23]:
df['brand'].value_counts()
df['brand'].nunique

<bound method IndexOpsMixin.nunique of 0        Maruti
1         Skoda
2         Honda
3       Hyundai
4        Maruti
         ...   
8123    Hyundai
8124    Hyundai
8125     Maruti
8126       Tata
8127       Tata
Name: brand, Length: 8128, dtype: object>

In [24]:
df['fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

# 1. One hot encoding using python

In [25]:
pd.get_dummies(df, columns=['fuel','owner']).head()

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,NAN,130000,False,False,False,True,True,False,False,False,False


2. k-1 Encoding(solve multicolinarity)

In [26]:
pd.get_dummies(df, columns=['fuel','owner'], drop_first=True).head()

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,NAN,130000,False,False,True,False,False,False,False


In [27]:
#There are problem in this pandas that it's does not remeber means if we run code again and again it give different region

# OneHotEncoding using sklearn

In [28]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,0:4], df.iloc[:,-1], test_size=0.2)

In [29]:
from sklearn.preprocessing import OneHotEncoder

In [30]:
ohe = OneHotEncoder()
# ohe = OneHotEncoder(drop='first', sparse=False, dtype=np.int32) This is used of multicolinarity solve

In [31]:
# we don't apply onehotencoder in specific coulmns so we do seprate columns and then apply encoding then agin merge

In [32]:
x_train_new = ohe.fit_transform(x_train[['fuel','owner']]).toarray()

In [33]:
x_test_new = ohe.transform(x_test[['fuel','owner']]).toarray()

In [34]:
x_train_new

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [35]:
x_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
2720,Maruti,13000,Petrol,First Owner
4449,Mahindra,80000,Diesel,First Owner
6988,Mahindra,133000,Diesel,Second Owner
3056,Hyundai,40000,Diesel,First Owner
5966,Maruti,40000,Petrol,First Owner


In [36]:
x_train[['brand', 'km_driven']].values

array([['Maruti', '13000'],
       ['Mahindra', '80000'],
       ['Mahindra', '133000'],
       ...,
       ['Renault', '110000'],
       ['Maruti', '60000'],
       ['Honda', '72000']], dtype=object)

In [37]:
np.hstack((x_train[['brand', 'km_driven']].values, x_train_new))

array([['Maruti', '13000', 0.0, ..., 0.0, 0.0, 0.0],
       ['Mahindra', '80000', 0.0, ..., 0.0, 0.0, 0.0],
       ['Mahindra', '133000', 0.0, ..., 1.0, 0.0, 0.0],
       ...,
       ['Renault', '110000', 0.0, ..., 1.0, 0.0, 0.0],
       ['Maruti', '60000', 0.0, ..., 0.0, 0.0, 0.0],
       ['Honda', '72000', 0.0, ..., 0.0, 0.0, 0.0]], dtype=object)

# One Hot Encoding with top category

In [38]:
count = df['brand'].value_counts()

In [39]:
threshold = 100

In [40]:
count

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [41]:
repl = count[count < threshold].index

In [42]:
pd.get_dummies(df['brand'].replace(repl, 'uncommon'))

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,True,False,False,False


# Column transformer

In [43]:
# With the help of this method we can perform without spliting the data

In [48]:
df = pd.read_csv('cars.csv')
df.head(10)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,NAN,Petrol,First Owner,130000
5,Hyundai,45000,Petrol,First Owner,440000
6,Maruti,175000,LPG,First Owner,96000
7,Maruti,5000,Petrol,Second Owner,45000
8,Toyota,90000,Diesel,First Owner,350000
9,Ford,169000,Diesel,First Owner,200000


In [47]:
from sklearn.compose import ColumnTransformer

In [53]:
transform = ColumnTransformer(transformers=[
    ('tnf1',OrdinalEncoder(categories=[['First Owner','Second Owner','Third Owner']]),['owner']),
    ('tnf2',OneHotEncoder(sparse_output=False,drop='first'),['Diesel','Petrol','LPG','CNG'])
],remainder='passthrough')

In [54]:
transform.fit_transform(x_train)

ValueError: A given column is not a column of the dataframe