In [60]:

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns


In [61]:
df = pd.read_csv("dataset/cars.csv")

df.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
5066,Maruti,100000,Petrol,Second Owner,200000
3627,Maruti,10000,Petrol,First Owner,430000
4812,Chevrolet,80000,Petrol,Second Owner,80000
423,BMW,7500,Diesel,First Owner,5400000
1494,Maruti,68000,Petrol,Second Owner,405000


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   brand          8128 non-null   object
 1   km_driven      8128 non-null   int64 
 2   fuel           8128 non-null   object
 3   owner          8128 non-null   object
 4   selling_price  8128 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 317.6+ KB


In [63]:
df['brand'].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [64]:
df['brand'].unique() # total unique brand

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Fiat', 'Datsun', 'Jeep',
       'Mercedes-Benz', 'Mitsubishi', 'Audi', 'Volkswagen', 'BMW',
       'Nissan', 'Lexus', 'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo',
       'Kia', 'Force', 'Ambassador', 'Ashok', 'Isuzu', 'Opel', 'Peugeot'],
      dtype=object)

In [65]:
df['brand'].nunique() # in total we have total 32 brand: 

32

In [66]:
brand = df["brand"].nunique()
fuel = df["fuel"].nunique()
owner = df['owner'].nunique()

brand+fuel+owner  # in total we have 41 (IF we do OHE then (38) very very big)

41

In [67]:
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

<br>
<br>
 
# 1. one hot encoding with pandas: 

<br>
<br>

In [68]:

# we have 32 brand we handle it by using most frequent value:

pd.get_dummies(data=df,columns=["fuel","owner"])  # but in this we faces multi-colinearity problem:


Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


In [69]:

# (K-1) th One Hot Encoding:

# here solve the multi-colineary problem:

pd.get_dummies(data=df,columns=["fuel","owner"],drop_first=True)


Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


`When we data analysis with pandas we use pd.get_dummies() but in the case of machine learning we use sk-learn inbuilt function. Beacuse, padas does do remember the position of the column.`

<br>

# 3. OneHotEncoding Using Sk-learn:

<br>

In [70]:
df.sample(3)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
3447,Tata,140000,Diesel,First Owner,316000
5430,Mahindra,10000,Diesel,First Owner,1450000
2487,Maruti,48000,Petrol,Second Owner,160000


In [71]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=["selling_price"]),df["selling_price"],test_size=0.2,random_state=0)


<br>

`But prb is we can't implement OHE in (X_train or X_test) first, we need to seperate the categorical column.`

<br>

In [72]:

oht = OneHotEncoder()
oht.fit_transform(X_train[['fuel','owner']])

<6502x9 sparse matrix of type '<class 'numpy.float64'>'
	with 13004 stored elements in Compressed Sparse Row format>

In [73]:

# oht give us a sparse matrix convert it to array() and them with (numerial columns):

X_train_new = oht.fit_transform(X_train[['fuel','owner']]).toarray()
X_train_new

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [74]:
X_train[['brand','km_driven']].values  # we get two nparray (now join them)

array([['Hyundai', 60000],
       ['Tata', 150000],
       ['Hyundai', 110000],
       ...,
       ['Hyundai', 90000],
       ['Volkswagen', 90000],
       ['Hyundai', 110000]], dtype=object)

In [75]:

# use tuple inside hstack

np.hstack((X_train[["brand","km_driven"]].values,X_train_new)).shape # with multi-colineary : 


(6502, 11)

<br>
<br>

# 2. remove muli - colinearity:

<br>
<br>

In [78]:

ohe = OneHotEncoder(drop='first',sparse_output=False)  # it give us directly array

X_train_New = ohe.fit_transform(X_train[["fuel","owner"]]) # don't need to convert into array:

np.hstack((X_train_New,X_train[['brand',"km_driven"]].values)).shape


(6502, 9)

<br>
<br>

# 3. OHE for Brand:

<br>
<br>

In [83]:
count = df['brand'].value_counts()
theshold = 100

rep = count[count<= theshold].index # index for column:
rep

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [89]:
brand = pd.get_dummies(df["brand"].replace(rep,"uncommon"))

In [92]:
brand.replace({True:1,False:0})

  brand.replace({True:1,False:0})


Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0
