# Dealing with Nominal Categorical data 

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('cars.csv')
df.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
6251,Ford,69000,Diesel,Second Owner,459999
1804,Chevrolet,76000,Diesel,Second Owner,170000
6540,Ford,27000,Diesel,First Owner,950000
7171,Tata,90000,Diesel,First Owner,150000
1091,BMW,7500,Diesel,First Owner,5400000


### One Hot Encoding using Pandas

In [4]:
pd.get_dummies(data = df, columns= ['brand','fuel','owner'] )

Unnamed: 0,km_driven,selling_price,brand_Ambassador,brand_Ashok,brand_Audi,brand_BMW,brand_Chevrolet,brand_Daewoo,brand_Datsun,brand_Fiat,...,brand_Volvo,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,145500,450000,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,120000,370000,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,140000,158000,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,127000,225000,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
4,120000,130000,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,110000,320000,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
8124,119000,135000,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
8125,120000,382000,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
8126,25000,290000,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


### K-1 One hot Encoding

In [5]:
# Drop one column to avoid the problem of multicollinearity

In [6]:
pd.get_dummies(data = df, columns= ['brand','fuel','owner'],drop_first=True )

Unnamed: 0,km_driven,selling_price,brand_Ashok,brand_Audi,brand_BMW,brand_Chevrolet,brand_Daewoo,brand_Datsun,brand_Fiat,brand_Force,...,brand_Toyota,brand_Volkswagen,brand_Volvo,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,145500,450000,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,120000,370000,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,140000,158000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,127000,225000,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,120000,130000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,110000,320000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8124,119000,135000,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
8125,120000,382000,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
8126,25000,290000,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


### Sklearn One hot encoding 

In [9]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [8]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('selling_price',axis = 1), df['selling_price'], test_size=0.33, random_state=42)

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
ohe = OneHotEncoder(drop= 'first',sparse = False,dtype=np.int32)

In [19]:
X_train_new = ohe.fit_transform(X_train[['brand','fuel','owner']])

In [26]:
X_train_new

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

In [20]:
X_test_new = ohe.transform(X_train[['brand','fuel','owner']])

In [27]:
X_test_new

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

In [30]:
np.hstack((X_train[['km_driven']].values, X_train_new))

array([[40000,     0,     0, ...,     0,     0,     0],
       [82000,     0,     0, ...,     0,     0,     0],
       [81000,     0,     0, ...,     1,     0,     0],
       ...,
       [35000,     0,     0, ...,     0,     0,     0],
       [27000,     0,     0, ...,     0,     0,     0],
       [70000,     0,     0, ...,     1,     0,     0]], dtype=int64)