In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
cars_data = pd.read_csv('quikr_car.csv')

In [3]:
cars_data.head()

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel


In [4]:
cars_data.shape

(892, 6)

In [5]:
cars_data.isnull().sum()

name           0
company        0
year           0
Price          0
kms_driven    52
fuel_type     55
dtype: int64

In [6]:
cars_data.dropna(inplace=True)

In [7]:
cars_data.shape

(837, 6)

In [8]:
cars_data

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel
...,...,...,...,...,...,...
883,Maruti Suzuki Ritz VXI ABS,Maruti,2011,270000,"50,000 kms",Petrol
885,Tata Indica V2 DLE BS III,Tata,2009,110000,"30,000 kms",Diesel
886,Toyota Corolla Altis,Toyota,2009,300000,"1,32,000 kms",Petrol
888,Tata Zest XM Diesel,Tata,2018,260000,"27,000 kms",Diesel


In [9]:
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 837 entries, 0 to 889
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        837 non-null    object
 1   company     837 non-null    object
 2   year        837 non-null    object
 3   Price       837 non-null    object
 4   kms_driven  837 non-null    object
 5   fuel_type   837 non-null    object
dtypes: object(6)
memory usage: 78.1+ KB


In [10]:
for col in cars_data.columns:
    print('Unique values of ' + col)
    print(cars_data[col].unique())
    print("======================")

Unique values of name
['Hyundai Santro Xing XO eRLX Euro III' 'Mahindra Jeep CL550 MDI'
 'Maruti Suzuki Alto 800 Vxi' 'Hyundai Grand i10 Magna 1.2 Kappa VTVT'
 'Ford EcoSport Titanium 1.5L TDCi' 'Ford Figo' 'Hyundai Eon'
 'Ford EcoSport Ambiente 1.5L TDCi' 'Maruti Suzuki Alto K10 VXi AMT'
 'Skoda Fabia Classic 1.2 MPI' 'Maruti Suzuki Stingray VXi'
 'Hyundai Elite i20 Magna 1.2' 'Mahindra Scorpio SLE BS IV' 'Audi A8'
 'Audi Q7' 'Mahindra Scorpio S10' 'Maruti Suzuki Alto 800'
 'Hyundai i20 Sportz 1.2' 'Maruti Suzuki Alto 800 Lx'
 'Maruti Suzuki Vitara Brezza ZDi' 'Maruti Suzuki Alto LX'
 'Mahindra Bolero DI' 'Maruti Suzuki Swift Dzire ZDi'
 'Mahindra Scorpio S10 4WD' 'Maruti Suzuki Swift Vdi BSIII'
 'Maruti Suzuki Wagon R VXi BS III' 'Maruti Suzuki Wagon R VXi Minor'
 'Toyota Innova 2.0 G 8 STR BS IV' 'Renault Lodgy 85 PS RXL'
 'Skoda Yeti Ambition 2.0 TDI CR 4x2' 'Maruti Suzuki Baleno Delta 1.2'
 'Renault Duster 110 PS RxZ Diesel Plus' 'Renault Duster 85 PS RxE Diesel'
 'Honda City 1.5 

In [11]:
def get_brand_name(car_name):
    car_name = car_name.split(' ')[0]
    return car_name.strip()

In [12]:
def clean_data(value):
    value = value.split(' ')[0]
    value = value.strip()
    if value == '':
        value = 0
    return float(value)

In [13]:
get_brand_name('Maruti  Swift Dzire VDI')

'Maruti'

In [14]:
cars_data['name'] = cars_data['name'].apply(get_brand_name)

In [15]:
cars_data['name'].unique()

array(['Hyundai', 'Mahindra', 'Maruti', 'Ford', 'Skoda', 'Audi', 'Toyota',
       'Renault', 'Honda', 'Datsun', 'Mitsubishi', 'Tata', 'Volkswagen',
       'Chevrolet', 'Mini', 'BMW', 'Nissan', 'Hindustan', 'Fiat', 'Force',
       'Mercedes', 'Land', 'Jaguar', 'Jeep', 'Volvo'], dtype=object)

In [17]:
for col in cars_data.columns:
    print('Unique values of ' + col)
    print(cars_data[col].unique())
    print("======================")

Unique values of name
['Hyundai' 'Mahindra' 'Maruti' 'Ford' 'Skoda' 'Audi' 'Toyota' 'Renault'
 'Honda' 'Datsun' 'Mitsubishi' 'Tata' 'Volkswagen' 'Chevrolet' 'Mini'
 'BMW' 'Nissan' 'Hindustan' 'Fiat' 'Force' 'Mercedes' 'Land' 'Jaguar'
 'Jeep' 'Volvo']
Unique values of company
['Hyundai' 'Mahindra' 'Maruti' 'Ford' 'Skoda' 'Audi' 'Toyota' 'Renault'
 'Honda' 'Datsun' 'Mitsubishi' 'Tata' 'Volkswagen' 'Chevrolet' 'Mini'
 'BMW' 'Nissan' 'Hindustan' 'Fiat' 'Force' 'Mercedes' 'Land' 'Jaguar'
 'Jeep' 'Volvo']
Unique values of year
['2007' '2006' '2018' '2014' '2015' '2012' '2013' '2016' '2010' '2017'
 '2008' '2011' '2019' '2009' '2005' '2000' '2003' '2004' '1995' '2002'
 '2001']
Unique values of Price
['80,000' '4,25,000' 'Ask For Price' '3,25,000' '5,75,000' '1,75,000'
 '1,90,000' '8,30,000' '2,50,000' '1,82,000' '3,15,000' '4,15,000'
 '3,20,000' '10,00,000' '5,00,000' '3,50,000' '1,60,000' '3,10,000'
 '75,000' '1,00,000' '2,90,000' '95,000' '1,80,000' '3,85,000' '1,05,000'
 '6,50,000' '6,89,99

In [47]:
cars_data['name'].replace(['Hyundai', 'Mahindra', 'Maruti', 'Ford', 'Skoda', 'Audi', 'Toyota',
       'Renault', 'Honda', 'Datsun', 'Mitsubishi', 'Tata', 'Volkswagen',
       'Chevrolet', 'Mini', 'BMW', 'Nissan', 'Hindustan', 'Fiat', 'Force',
       'Mercedes', 'Land', 'Jaguar', 'Jeep', 'Volvo'],
                          [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25]
                          ,inplace=True)

In [60]:
cars_data['company'].unique()

array(['Hyundai', 'Mahindra', 'Maruti', 'Ford', 'Skoda', 'Audi', 'Toyota',
       'Renault', 'Honda', 'Datsun', 'Mitsubishi', 'Tata', 'Volkswagen',
       'Chevrolet', 'Mini', 'BMW', 'Nissan', 'Hindustan', 'Fiat', 'Force',
       'Mercedes', 'Land', 'Jaguar', 'Jeep', 'Volvo'], dtype=object)

In [67]:
cars_data['company'].replace(['Hyundai', 'Mahindra', 'Maruti', 'Ford', 'Skoda', 'Audi', 'Toyota',
       'Renault', 'Honda', 'Datsun', 'Mitsubishi', 'Tata', 'Volkswagen',
       'Chevrolet', 'Mini', 'BMW', 'Nissan', 'Hindustan', 'Fiat', 'Force',
       'Mercedes', 'Land', 'Jaguar', 'Jeep', 'Volvo'],
                          [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25]
                          ,inplace=True)

In [81]:
cars_data['kms_driven']=cars_data['kms_driven'].str.split().str.get(0).str.replace(',','')

In [83]:
car=cars_data[cars_data['kms_driven'].str.isnumeric()]

In [84]:
cars_data['kms_driven']=cars_data['kms_driven'].astype(int)

In [85]:
cars_data=cars_data[cars_data['Price']!='Ask For Price']

In [98]:
cars_data['Price']=cars_data['Price'].str.replace(',','').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cars_data['Price']=cars_data['Price'].str.replace(',','').astype(int)


In [99]:
cars_data['fuel_type'].unique()

array([2, 1, 3])

In [100]:
cars_data['fuel_type'].replace(['Diesel', 'Petrol', 'LPG'],[1,2,3], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cars_data['fuel_type'].replace(['Diesel', 'Petrol', 'LPG'],[1,2,3], inplace=True)


In [101]:
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   level_0     816 non-null    int64 
 1   name        816 non-null    int64 
 2   company     816 non-null    int64 
 3   year        816 non-null    object
 4   Price       816 non-null    int64 
 5   kms_driven  816 non-null    int64 
 6   fuel_type   816 non-null    int64 
dtypes: int64(6), object(1)
memory usage: 44.8+ KB


In [102]:
cars_data.reset_index(inplace=True)

In [103]:
cars_data

Unnamed: 0,index,level_0,name,company,year,Price,kms_driven,fuel_type
0,0,0,4,1,2007,80000,45000,2
1,1,1,8,2,2006,425000,40,1
2,2,3,4,1,2014,325000,28000,2
3,3,4,6,4,2014,575000,36000,1
4,4,6,6,4,2012,175000,41000,1
...,...,...,...,...,...,...,...,...
811,811,832,1,3,2011,270000,50000,2
812,812,833,9,12,2009,110000,30000,1
813,813,834,5,7,2009,300000,132000,2
814,814,835,9,12,2018,260000,27000,1


In [104]:
cars_data.drop(columns=['index'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cars_data.drop(columns=['index'], inplace=True)


In [105]:
for col in cars_data.columns:
    print('------------')
    print(col)
    print(cars_data[col].unique())

------------
level_0
[  0   1   3   4   6   7   8   9  10  11  12  13  14  15  16  17  18  19
  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37
  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55
  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109
 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
 128 129 130 131 132 133 134 136 137 138 139 140 141 142 143 144 145 146
 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182
 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218
 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
 237 238 239 240 241 242 243 2

In [120]:
cars_data.drop(columns=['level_0'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cars_data.drop(columns=['level_0'], inplace=True)


In [136]:
cars_data.drop(columns=['name'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cars_data.drop(columns=['name'], inplace=True)


In [137]:
cars_data.isnull().sum()

company       0
year          0
Price         0
kms_driven    0
fuel_type     0
dtype: int64

In [138]:
input_data = cars_data.drop(columns=['Price'])
output_data =cars_data['Price']

In [139]:
x_train, x_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.2)

In [140]:
model = LinearRegression()

In [141]:
model.fit(x_train, y_train)

In [142]:
predict = model.predict(x_test)

In [143]:
predict

array([ 451950.90015632,  381386.5477662 ,  609968.32502687,
        384115.83550619,  471336.90005918,  602384.87399816,
        264150.75267489,  710311.90604243,  627274.44526546,
        131130.65163032,  294944.52129545,  614903.49531344,
        547639.08155147,  414026.92242276,  959354.9465423 ,
        626810.9114341 ,  495664.9322353 ,  444958.72203799,
        337640.64537805,  460354.44118764,  452779.00021942,
        452583.01042778,  221661.56823667,  511530.08486321,
        677684.67723438,  615500.4748703 ,  133604.84162072,
        558234.09208357,  584990.23347198,  443496.39527588,
        367030.52229933,  127494.67723806,  753146.69704314,
        442336.86068432,  586762.58756657,  514868.46671864,
        270384.24167657,  537908.2628844 ,  686901.45655679,
        576798.52707209,  211385.36999611,  567746.2141325 ,
        571066.32605666,  710854.47518916,  469321.18366309,
        498626.82563525,  547106.77227553,  599531.24572881,
         66335.22822501,

In [144]:
x_train.head(1)

Unnamed: 0,company,year,kms_driven,fuel_type
95,14,2012,45000,1


In [145]:
input_data_model = pd.DataFrame(
    [[14,2012,45000,1]],
    columns=['company','year','kms_driven','fuel_type'])

In [146]:
input_data_model

Unnamed: 0,company,year,kms_driven,fuel_type
0,14,2012,45000,1


In [147]:
model.predict(input_data_model)

array([627042.67834979])

In [148]:
import pickle as pk

In [149]:
pk.dump(model,open('model.pkl','wb'))