In [129]:
import pandas as pd 
import numpy as np 

In [130]:
car = pd.read_csv('car.csv')

In [131]:
car.shape

(892, 6)

## Quality 
-- Year has many non-year values
-- year object to int
-- price has ask for price 
-- price oject to int 
-- kmh driven object to int and nan value 
-- fuel_type has nan value 


## Cleaning


In [132]:
backup = car.copy()

In [133]:
car = car[car['year'].str.isnumeric()]

In [134]:
car['year'] = car['year'].astype(int)

In [135]:
car = car[car['Price'] != "Ask For Price"]

In [136]:
car['Price'] = car['Price'] .str.replace(',',"").astype(int)

In [137]:
car['kms_driven'] = car['kms_driven'].str.split(' ').str.get(0).str.replace(',','')


In [138]:
car['kms_driven']

0       45000
1          40
3       28000
4       36000
6       41000
        ...  
886    132000
888     27000
889     40000
890    Petrol
891    Petrol
Name: kms_driven, Length: 819, dtype: object

In [139]:
car=car[car['kms_driven'].str.isnumeric()]
car['kms_driven'] = car['kms_driven'].astype(int)

In [140]:
car.info()

<class 'pandas.core.frame.DataFrame'>
Index: 817 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        817 non-null    object
 1   company     817 non-null    object
 2   year        817 non-null    int32 
 3   Price       817 non-null    int32 
 4   kms_driven  817 non-null    int32 
 5   fuel_type   816 non-null    object
dtypes: int32(3), object(3)
memory usage: 35.1+ KB


In [141]:
car['name']=car['name'].str.split(' ').str.slice(0,3).str.join(' ')

In [142]:
car.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing,Hyundai,2007,80000,45000,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000,40,Diesel
3,Hyundai Grand i10,Hyundai,2014,325000,28000,Petrol
4,Ford EcoSport Titanium,Ford,2014,575000,36000,Diesel
6,Ford Figo,Ford,2012,175000,41000,Diesel


In [143]:
car = car.reset_index(drop=True)

In [144]:
car.describe()

Unnamed: 0,year,Price,kms_driven
count,817.0,817.0,817.0
mean,2012.440636,411550.3,46250.71481
std,4.002354,474917.3,34283.745254
min,1995.0,30000.0,0.0
25%,2010.0,175000.0,27000.0
50%,2013.0,299999.0,41000.0
75%,2015.0,490000.0,56758.0
max,2019.0,8500003.0,400000.0


In [145]:
car[car['Price']>6e6]

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
535,Mahindra XUV500 W6,Mahindra,2014,8500003,45000,Diesel


In [146]:
car.to_csv('cleaned_car.csv')

## Model

In [147]:
x = car.drop(columns='Price')
y = car['Price']

In [148]:
from sklearn.model_selection import train_test_split 
x_train , x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [149]:
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer 
from sklearn.pipeline import make_pipeline

In [150]:
ohe = OneHotEncoder()
ohe.fit(x[['name','company','fuel_type']])

In [151]:
column_trans = make_column_transformer((OneHotEncoder(categories=ohe.categories_),['name','company','fuel_type']), remainder='passthrough')

In [152]:
lr = LinearRegression()

In [153]:
pipe = make_pipeline(column_trans,lr)

In [154]:
pipe.fit(x_train,y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [155]:
y_pred = pipe.predict(x_test)

In [156]:
r2_score(y_test,y_pred)

0.8037868874501772

In [162]:
scores=[]
for i in range(1000):
    x_train , x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=i)
    lr = LinearRegression()
    pipe = make_pipeline(column_trans,lr)
    pipe.fit(x_train,y_train)
    y_pred = pipe.predict(x_test)
    scores.append(r2_score(y_test,y_pred))

In [163]:
np.argmax(scores)

653

In [165]:
scores[np.argmax(scores)]

0.8176307384150114

In [167]:
x_train , x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=np.argmax(scores))
lr = LinearRegression()
pipe = make_pipeline(column_trans,lr)
pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
r2_score(y_test,y_pred)

0.8176307384150114

In [173]:
import pickle
pickle.dump(pipe,open('car_model.pkl','wb'))

In [179]:
pipe.predict(pd.DataFrame([['Maruti Suzuki Swift','Maruti',2019,100,'Petrol']], columns =['name','company','year','kms_driven','fuel_type']))

array([436818.42201649])