## **Feature Transformation in Nominal Categorical Data using One Hot Encoding**

In [2]:
import numpy as np
import pandas as pd

In [3]:
data=pd.read_csv('/content/cars.csv')

In [5]:
data.sample(10)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
3205,Hyundai,80000,Petrol,First Owner,380000
6199,Hyundai,75000,Petrol,Third Owner,300000
4201,Datsun,5000,Petrol,Second Owner,430000
7671,Hyundai,21000,Petrol,First Owner,420000
6323,Maruti,121000,Petrol,First Owner,200000
5228,Hyundai,200400,Petrol,Second Owner,187000
483,Maruti,80000,Diesel,First Owner,750000
3416,Honda,56494,Petrol,First Owner,550000
3347,Hyundai,42312,Diesel,First Owner,455000
7552,Hyundai,83000,Petrol,First Owner,475000


In [6]:
#To see the values and no. of values in each column we use 'value_counts()'
data['brand'].value_counts()

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

### **1. OneHotEncoding using Pandas**

In [7]:
pd.get_dummies(data, columns=['fuel', 'owner']) #It converts the string value into numeric but temporary

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


In [8]:
data #Heres the proof that get dummies converts the dataframe temporarily

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000
...,...,...,...,...,...
8123,Hyundai,110000,Petrol,First Owner,320000
8124,Hyundai,119000,Diesel,Fourth & Above Owner,135000
8125,Maruti,120000,Diesel,First Owner,382000
8126,Tata,25000,Diesel,First Owner,290000


### **2. K-1 OneHotEncoding**

In [9]:
pd.get_dummies(data, columns=['fuel', 'owner'], drop_first=True) #Here dropfirst drops the first column of each converted

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


### **3. OneHotEncoding using Sklearn**

In [11]:
#train_test_split step
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(data.iloc[:, 0:4], data.iloc[:, -1], test_size=0.2, random_state=2)

In [12]:
X_train.sample(10)

Unnamed: 0,brand,km_driven,fuel,owner
2748,Maruti,60000,Diesel,First Owner
4973,Volkswagen,120000,Diesel,Second Owner
7206,Maruti,59000,Petrol,Second Owner
2968,Renault,20000,Petrol,First Owner
4002,Maruti,10000,Petrol,First Owner
735,Mahindra,120000,Diesel,Third Owner
3738,Volkswagen,25000,Diesel,First Owner
6763,Maruti,5000,Petrol,First Owner
3877,Nissan,21500,Petrol,First Owner
4697,Hyundai,80000,Diesel,First Owner


In [22]:
data['fuel'].value_counts(), data['owner'].value_counts()


(Diesel    4402
 Petrol    3631
 CNG         57
 LPG         38
 Name: fuel, dtype: int64,
 First Owner             5289
 Second Owner            2105
 Third Owner              555
 Fourth & Above Owner     174
 Test Drive Car             5
 Name: owner, dtype: int64)

#### **Applying One Hot Encoding using Sklearn**

In [13]:
from sklearn.preprocessing import OneHotEncoder
ohe=OneHotEncoder(drop='first', sparse=False, dtype=np.int32)

In [14]:
X_train_new = ohe.fit_transform(X_train[['fuel','owner']])
X_test_new = ohe.transform(X_test[['fuel', 'owner']])



In [15]:
X_train_new

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [16]:
X_train_new.shape

(6502, 7)

In [18]:
data2=np.hstack((X_train[['brand','km_driven']].values,X_train_new))

In [19]:
data2

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

In [24]:
pd.DataFrame(data2)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Hyundai,35000,1,0,0,0,0,0,0
1,Jeep,60000,1,0,0,0,0,0,0
2,Hyundai,25000,0,0,1,0,0,0,0
3,Mahindra,130000,1,0,0,0,1,0,0
4,Hyundai,155000,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
6497,Ford,35000,1,0,0,0,0,0,0
6498,Maruti,120000,0,0,1,0,0,0,0
6499,Tata,15000,0,0,1,0,0,0,0
6500,Maruti,32500,1,0,0,0,1,0,0


### **4. OneHotEncoding with Top Categories**

In [26]:
counts = data['brand'].value_counts()

In [27]:
data['brand'].nunique()
threshold = 100

In [28]:
repl = counts[counts <= threshold].index

In [29]:
pd.get_dummies(data['brand'].replace(repl, 'uncommon')).sample(5)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
5130,0,0,0,0,1,0,0,0,0,0,0,0,0
4017,0,0,0,0,0,0,1,0,0,0,0,0,0
6307,0,0,0,0,0,0,1,0,0,0,0,0,0
6161,0,0,0,0,0,0,1,0,0,0,0,0,0
7505,0,0,0,0,0,1,0,0,0,0,0,0,0
