In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the cars dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="cars.csv")

In [3]:
# Peek at five random rows. The fixed seed keeps the displayed sample stable
# across kernel restarts, so the rendered output matches what a re-run produces.
df.sample(5, random_state=42)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
638,Hyundai,90000,Diesel,Third Owner,270000
2363,Hyundai,50000,Diesel,First Owner,550000
7122,Maruti,40000,Petrol,First Owner,254000
3382,Maruti,35000,Petrol,First Owner,140000
6113,Maruti,60000,Diesel,First Owner,720000


In [4]:
# Summary statistics for the numeric columns (km_driven, selling_price).
# NOTE(review): the km_driven maximum (2,360,457) is far above the 75th
# percentile (98,000) -- looks like an outlier worth checking before modelling.
df.describe()

Unnamed: 0,km_driven,selling_price
count,8128.0,8128.0
mean,69819.51,638271.8
std,56550.55,806253.4
min,1.0,29999.0
25%,35000.0,254999.0
50%,60000.0,450000.0
75%,98000.0,675000.0
max,2360457.0,10000000.0


In [5]:
# Count the missing values in each column.
df.isna().sum()

brand            0
km_driven        0
fuel             0
owner            0
selling_price    0
dtype: int64

In [6]:
# Total number of rows in the dataset.
# len(df) is used instead of df.value_counts().sum(): the latter groups on every
# column just to add the group sizes back up, and would silently leave out any
# row containing a NaN. The null check above shows no missing values, so the
# count is the same here.
len(df)

8128

In [7]:
# Distinct brand labels, in order of first appearance.
pd.unique(df["brand"])

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Fiat', 'Datsun', 'Jeep',
       'Mercedes-Benz', 'Mitsubishi', 'Audi', 'Volkswagen', 'BMW',
       'Nissan', 'Lexus', 'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo',
       'Kia', 'Force', 'Ambassador', 'Ashok', 'Isuzu', 'Opel', 'Peugeot'],
      dtype=object)

In [8]:
# Number of rows per brand, most frequent first.
# NOTE(review): the distribution has a long tail -- several brands have only a
# handful of rows and some just one -- so a random train/test split can leave
# brands that appear in only one of the two splits.
df["brand"].value_counts()

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64

In [9]:
# Hold out 30% of the rows for testing.
# Features and target are selected by column name rather than by position so the
# split does not silently change if the column order of the CSV changes.
# A fixed random_state makes the split -- and every output derived from it --
# reproducible under Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(
    df[["brand", "km_driven", "fuel", "owner"]],
    df["selling_price"],
    test_size=0.3,
    random_state=42,
)

In [11]:
# Inspect the training features (pandas truncates the display of large frames).
x_train

Unnamed: 0,brand,km_driven,fuel,owner
5076,Nissan,120000,Diesel,First Owner
2156,Hyundai,15000,Petrol,First Owner
5794,Hyundai,120000,Diesel,First Owner
6726,Maruti,40000,Petrol,Third Owner
8117,Maruti,50000,Diesel,First Owner
...,...,...,...,...
7204,Maruti,90000,Petrol,Second Owner
7883,Ford,50699,Diesel,First Owner
7952,Maruti,5000,Petrol,First Owner
3271,Hyundai,60000,Petrol,Second Owner


In [20]:
# One-hot encoder for the categorical features: the first category of each
# feature is dropped and the result is returned as a dense int32 array.
ohe = OneHotEncoder(
    dtype="int32",
    sparse_output=False,
    drop="first",
)

In [34]:
# Learn the fuel/owner categories from the training split and encode it.
x_train_new = ohe.fit_transform(x_train.loc[:, ["fuel", "owner"]])

In [35]:
# Encode the test split with the already-fitted encoder (transform only, no
# refitting on test data).
x_test_new = ohe.transform(x_test.loc[:, ["fuel", "owner"]])

In [36]:
# Encoded fuel/owner columns of the training split (dense int32 array).
x_train_new

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int32)

In [37]:
# The two columns that were not one-hot encoded, as a NumPy array.
x_train[["brand", "km_driven"]].to_numpy()

array([['Nissan', 120000],
       ['Hyundai', 15000],
       ['Hyundai', 120000],
       ...,
       ['Maruti', 5000],
       ['Hyundai', 60000],
       ['Volkswagen', 28100]], dtype=object)

In [38]:
# Shape check for the combined matrix: the brand/km_driven columns placed
# side by side with the one-hot encoded columns.
np.concatenate((x_train[["brand", "km_driven"]].values, x_train_new), axis=1).shape

(5689, 9)

In [39]:
# Assemble the feature matrices: the untouched brand/km_driven columns followed
# by the one-hot encoded fuel/owner columns. The results are stored (instead of
# being computed and thrown away) and the test split is built the same way as
# the training split so both have identical column layouts.
# NOTE(review): brand is still a raw string here, so the arrays have dtype
# object -- it likely needs its own encoding before being fed to a model.
x_train_transformed = np.hstack((x_train[["brand", "km_driven"]].values, x_train_new))
x_test_transformed = np.hstack((x_test[["brand", "km_driven"]].values, x_test_new))
x_train_transformed

array([['Nissan', 120000, 1, ..., 0, 0, 0],
       ['Hyundai', 15000, 0, ..., 0, 0, 0],
       ['Hyundai', 120000, 1, ..., 0, 0, 0],
       ...,
       ['Maruti', 5000, 0, ..., 0, 0, 0],
       ['Hyundai', 60000, 0, ..., 1, 0, 0],
       ['Volkswagen', 28100, 0, ..., 0, 0, 0]], dtype=object)