In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Load the raw listings from quikr.csv (path is relative to the notebook's working directory)
# and preview the first five rows
df = pd.read_csv("quikr.csv")
df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [3]:
# Summary of the raw frame: every column is still object dtype at this point,
# so describe() reports count / unique / top / freq instead of numeric statistics
df.describe()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
count,892,892,892,892,840,837
unique,525,48,61,274,258,3
top,Honda City,Maruti,2015,Ask For Price,"45,000 kms",Petrol
freq,13,235,117,35,30,440


In [4]:
# (rows, columns) of the raw frame
df.shape

(892, 6)

In [5]:
# Column dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        892 non-null    object
 1   company     892 non-null    object
 2   year        892 non-null    object
 3   Price       892 non-null    object
 4   kms_driven  840 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


In [6]:
# Number of missing values in each column
df.isnull().sum()

name           0
company        0
year           0
Price          0
kms_driven    52
fuel_type     55
dtype: int64

### Data Exploration
- `year` contains many non-numeric values that do not make sense as a year
- convert `year` from object to int
- remove the rows whose `Price` is "Ask For Price"
- remove commas from the `Price` column
- convert `Price` from object to int
- remove the "kms" suffix and the commas from the `kms_driven` column
- convert `kms_driven` from object to int
- deal with NaN values in the `kms_driven` column
- deal with NaN values in the `fuel_type` column
- keep only the first 3 words of the `name` column so that it can be used as a categorical variable

### Cleaning

In [7]:
# Keep an independent copy of the raw data before the cleaning cells start reassigning `df`
backup_df = df.copy()
backup_df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [8]:
# Keep only the rows whose year consists entirely of digits
year_is_numeric = df['year'].str.isnumeric()
df = df.loc[year_is_numeric]
df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [9]:
# Convert the year column from object to int.
# `df` is a filtered slice at this point, so assigning to a column in place
# (df.year = ...) triggers SettingWithCopyWarning; astype with a column mapping
# returns a new frame instead.
df = df.astype({'year': int})

In [10]:
# How many listings give 'Ask For Price' instead of an actual price (non-null count per column)
df.loc[df['Price'].eq('Ask For Price')].count()

name          23
company       23
year          23
Price         23
kms_driven    21
fuel_type     21
dtype: int64

In [11]:
# Discard every listing whose Price is the placeholder 'Ask For Price'
has_real_price = df['Price'].ne('Ask For Price')
df = df[has_real_price]

In [12]:
# Strip the thousands separators from Price and convert it from object to int.
# assign() returns a new frame, which avoids the SettingWithCopyWarning raised by
# writing to a column of the filtered slice; regex=False makes the comma a literal
# match regardless of the pandas version's default.
df = df.assign(Price=df['Price'].str.replace(',', '', regex=False).astype(int))

In [13]:
# Reduce kms_driven to its leading token (drops the ' kms' suffix) and strip the commas.
# The values stay strings here; the int conversion happens in a later cell.
# assign() returns a new frame, which avoids the SettingWithCopyWarning raised by
# writing to a column of the filtered slice.
df = df.assign(
    kms_driven=df['kms_driven'].str.split(' ').str.get(0).str.replace(',', '', regex=False)
)

In [14]:
# Keep only the rows whose kms_driven is made up entirely of digits.
# str.isnumeric() yields NaN for missing values, and a mask containing NaN cannot be
# used for boolean indexing; eq(True) maps those NaNs to False so such rows are dropped.
has_numeric_kms = df['kms_driven'].str.isnumeric().eq(True)
df = df[has_numeric_kms]
df.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel


In [15]:
# Convert kms_driven from object (digit strings) to int.
# astype with a column mapping returns a new frame, which avoids the
# SettingWithCopyWarning raised by writing to a column of the filtered slice.
df = df.astype({'kms_driven': int})

In [16]:
# Number of rows still missing a fuel_type
df['fuel_type'].isna().sum()

1

In [17]:
# Remove every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

In [18]:
# Shorten each name to its first 3 space-separated words.
# Bracket access plus assign() avoids both attribute-style column assignment
# (df.name = ...) and the SettingWithCopyWarning raised by writing to a filtered slice.
df = df.assign(name=df['name'].str.split(' ').str[:3].str.join(' '))

In [19]:
# Renumber the rows 0..n-1 after the row filtering; drop=True discards the old index labels
df = df.reset_index(drop=True)

In [20]:
# Preview 5 randomly chosen rows; no random_state is passed, so the rows differ between runs
df.sample(5)

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
526,Tata Indica V2,Tata,2005,35000,150000,Diesel
474,Hyundai Santro Xing,Hyundai,2003,120000,50000,Petrol
394,Mahindra Scorpio W,Mahindra,2012,165000,65000,Diesel
270,Renault Scala RxL,Renault,2014,349999,49000,Diesel
451,Honda City,Honda,2015,499999,55000,Petrol


In [21]:
# Check the result of the cleaning: dtypes and non-null counts per column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        816 non-null    object
 1   company     816 non-null    object
 2   year        816 non-null    int32 
 3   Price       816 non-null    int32 
 4   kms_driven  816 non-null    int32 
 5   fuel_type   816 non-null    object
dtypes: int32(3), object(3)
memory usage: 28.8+ KB


In [22]:
# Numeric summary of the cleaned columns (year, Price, kms_driven)
df.describe()

Unnamed: 0,year,Price,kms_driven
count,816.0,816.0,816.0
mean,2012.444853,411717.6,46275.531863
std,4.002992,475184.4,34297.428044
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,491250.0,56818.5
max,2019.0,8500003.0,400000.0


In [23]:
# describe() shows a maximum Price of 8,500,003 (about 85 lakh), far above the rest;
# treat it as an outlier by keeping only listings priced below 60 lakh
price_cap = 6e6
df = df.loc[df['Price'] < price_cap].reset_index(drop=True)

In [24]:
# Keep an independent copy of the cleaned data (later changes to `df` will not affect it)
# and show its (rows, columns)
cleaned_df = df.copy()
cleaned_df.shape

(815, 6)

### Model

In [25]:
# Separate the target (Price) from the predictors (every other column)
y = df['Price']
X = df.drop(columns=['Price'])

In [26]:
# Correlation of each numeric column with Price.
# The frame still holds text columns (name, company, fuel_type); selecting the numeric
# columns first keeps this working on pandas >= 2.0, where DataFrame.corr() no longer
# silently drops non-numeric columns and raises instead.
df.select_dtypes(include='number').corr()['Price']

year          0.347764
Price         1.000000
kms_driven   -0.149621
Name: Price, dtype: float64

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=np.argmax(scores),test_size=0.2)

In [29]:
ohe = OneHotEncoder()
ohe.fit(X[['name','company','fuel_type']])

OneHotEncoder()

In [30]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',OneHotEncoder(categories=ohe.categories_, drop='first', sparse=False), ['name','company','fuel_type'])
], remainder='passthrough')

In [31]:
lr = LinearRegression()

In [32]:
pipe = make_pipeline(transformer, lr)

In [37]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('tnf1',
                                                  OneHotEncoder(categories=[array(['Audi A3 Cabriolet', 'Audi A4 1.8', 'Audi A4 2.0', 'Audi A6 2.0',
       'Audi A8', 'Audi Q3 2.0', 'Audi Q5 2.0', 'Audi Q7', 'BMW 3 Series',
       'BMW 5 Series', 'BMW 7 Series', 'BMW X1', 'BMW X1 sDrive20d',
       'BMW X1 xDrive20d', 'Chevrolet Beat', 'Chevrolet Beat Diesel',
       '...
                                                                            array(['Audi', 'BMW', 'Chevrolet', 'Datsun', 'Fiat', 'Force', 'Ford',
       'Hindustan', 'Honda', 'Hyundai', 'Jaguar', 'Jeep', 'Land',
       'Mahindra', 'Maruti', 'Mercedes', 'Mini', 'Mitsubishi', 'Nissan',
       'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen', 'Volvo'],
      dtype=object),
                                                                            array(['Diesel', '

In [34]:
scores = []
for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=i,test_size=0.2)
    pipe = make_pipeline(transformer, lr)
    pipe.fit(X_train,y_train)
    y_pred = pipe.predict(X_test)
    scores.append(r2_score(y_test,y_pred))

In [35]:
# Finding the optimum value of random state so that we can get a higher accurate model
np.argmax(scores)

689

In [38]:
# Predict prices for the held-out test split with the fitted pipeline
y_pred = pipe.predict(X_test)

In [39]:
# R2 on the test split.
# NOTE(review): the split's random_state was chosen as the argmax of this same metric
# over 1000 candidate splits, so this figure is a best case rather than an unbiased estimate.
r2_score(y_test,y_pred)

0.8509672842437312

In [54]:
# Predict the price of one hand-written listing.
# The row is built from a plain list rather than np.array([...]): an array holding both
# strings and ints is coerced to all strings, so year and kms_driven would reach the
# pipeline as text instead of integers.
new_car = pd.DataFrame(
    [['Maruti Suzuki Swift', 'Maruti', 2019, 100, 'Petrol']],
    columns=['name', 'company', 'year', 'kms_driven', 'fuel_type'],
)
pipe.predict(new_car)

array([415854.9570993])

In [47]:
# Input column names the pipeline recorded when it was fitted; a frame passed to
# predict() needs these columns
pipe.feature_names_in_

array(['name', 'company', 'year', 'kms_driven', 'fuel_type'], dtype=object)