In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# Load the car listings from the CSV file and preview the first five rows.
cars_data = pd.read_csv('Cardetails.csv')
cars_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [None]:
# Discard the free-text 'torque' column. drop() returns a new frame without the
# column; re-binding cars_data to it means 'torque' is gone from here on.
cars_data = cars_data.drop(columns=['torque'])

In [None]:
# Preview again to confirm 'torque' is no longer among the columns.
cars_data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,5.0


In [None]:
# Column dtypes and non-null counts; columns with fewer non-null entries than rows contain missing values.
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(8)
memory usage: 762.1+ KB


In [None]:
# Number of missing values in each column.
cars_data.isnull().sum()

Unnamed: 0,0
name,0
year,0
selling_price,0
km_driven,0
fuel,0
seller_type,0
transmission,0
owner,0
mileage,221
engine,221


We will remove every row that contains a null value. The nulls occur in the mileage, engine, max_power and seats columns.

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
cars_data.describe()

Unnamed: 0,year,selling_price,km_driven,seats
count,8128.0,8128.0,8128.0,7907.0
mean,2013.804011,638271.8,69819.51,5.416719
std,4.044249,806253.4,56550.55,0.959588
min,1983.0,29999.0,1.0,2.0
25%,2011.0,254999.0,35000.0,5.0
50%,2015.0,450000.0,60000.0,5.0
75%,2017.0,675000.0,98000.0,5.0
max,2020.0,10000000.0,2360457.0,14.0


In [None]:
# (rows, columns) before rows with missing values are removed.
cars_data.shape

(8128, 12)

In [None]:
# Keep only the rows that have a value in every column.
cars_data = cars_data.dropna()

In [None]:
# (rows, columns) after rows with missing values were removed.
cars_data.shape

(7907, 12)

In [None]:
# Duplicate check: count the rows that exactly repeat an earlier row.
cars_data.duplicated().sum()

1189

In [None]:
# Keep the first occurrence of each row and discard its exact repeats.
cars_data = cars_data.drop_duplicates()

In [None]:
# (rows, columns) after duplicate rows were removed.
cars_data.shape


(6718, 12)

In [None]:
# Re-check dtypes and non-null counts now that null and duplicate rows are gone.
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6718 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6718 non-null   object 
 1   year           6718 non-null   int64  
 2   selling_price  6718 non-null   int64  
 3   km_driven      6718 non-null   int64  
 4   fuel           6718 non-null   object 
 5   seller_type    6718 non-null   object 
 6   transmission   6718 non-null   object 
 7   owner          6718 non-null   object 
 8   mileage        6718 non-null   object 
 9   engine         6718 non-null   object 
 10  max_power      6718 non-null   object 
 11  seats          6718 non-null   float64
dtypes: float64(1), int64(3), object(8)
memory usage: 682.3+ KB


**Starting the DATA ANALYSIS**

In [None]:
# For every column, print its name followed by the frequency of each value.
for column_label, column_values in cars_data.items():
    print(column_label)
    print(column_values.value_counts())

name
name
Maruti Swift Dzire VDI                          118
Maruti Alto 800 LXI                              76
Maruti Alto LXi                                  69
Maruti Swift VDI                                 60
Maruti Alto K10 VXI                              47
                                               ... 
BMW X7 xDrive 30d DPE                             1
Skoda Superb Elegance 1.8 TSI AT                  1
Skoda Fabia 1.2 TDI Ambition Plus                 1
Skoda Rapid 1.6 MPI AT Ambition BSIV              1
Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV      1
Name: count, Length: 1983, dtype: int64
year
year
2017    802
2016    691
2015    680
2018    607
2014    580
2012    576
2013    560
2011    535
2010    361
2019    347
2009    225
2008    192
2007    166
2006    102
2005     70
2020     63
2004     49
2003     36
2002     19
2000     15
1999     13
1998      9
1997      9
2001      6
1994      2
1996      2
1995      1
Name: count, dtype: int64
selling_price
sell

In [None]:
# List the distinct values of each column, with a ruler line between columns.
for column_name in cars_data.columns:
    distinct_values = cars_data[column_name].unique()
    print('Unique values of ' + column_name)
    print(distinct_values)
    print("--------------------------------------------------------------------")

Unique values of name
['Maruti Swift Dzire VDI' 'Skoda Rapid 1.5 TDI Ambition'
 'Honda City 2017-2020 EXi' ... 'Tata Nexon 1.5 Revotorq XT'
 'Ford Freestyle Titanium Plus Diesel BSIV'
 'Toyota Innova 2.5 GX (Diesel) 8 Seater BS IV']
--------------------------------------------------------------------
Unique values of year
[2014 2006 2010 2007 2017 2001 2011 2013 2005 2009 2016 2012 2002 2015
 2018 2019 2008 2020 1999 2000 2003 2004 1994 1998 1997 1995 1996]
--------------------------------------------------------------------
Unique values of selling_price
[  450000   370000   158000   225000   130000   440000    96000    45000
   350000   200000   500000    92000   280000   180000   400000   778000
   150000   680000   174000   950000   525000   600000   575000   275000
   300000   220000   254999   670000   730000   650000   330000   366000
  1149000   425000  2100000   925000   675000   819999   390000  1500000
   700000  1450000  1090000   850000  1650000  1750000  1590000  1689999


In [None]:
# Split the column labels by dtype: object/category columns vs. int64/float64 columns.
text_like_frame = cars_data.select_dtypes(include=['object', 'category'])
numeric_frame = cars_data.select_dtypes(include=['int64', 'float64'])
categorical_cols = text_like_frame.columns
numerical_cols = numeric_frame.columns

print(categorical_cols)
print("--------------------------------------------------------------------")
print(numerical_cols)
# len() of an Index is its number of labels; .count() on a DataFrame or Series
# counts non-null values instead, so len() is the right tool here.
print("Number of Categorical Data: ", len(categorical_cols))
print("Number of Numerical Data: ", len(numerical_cols))


Index(['name', 'fuel', 'seller_type', 'transmission', 'owner', 'mileage',
       'engine', 'max_power'],
      dtype='object')
--------------------------------------------------------------------
Index(['year', 'selling_price', 'km_driven', 'seats'], dtype='object')
Number of Categorical Data:  8
Number of Numerical Data:  4


In [None]:
def get_brand_name(car_name):
    """Return the first whitespace-separated word of ``car_name``.

    For a listing name such as 'Maruti Swift Dzire VDI' this is the brand
    ('Maruti'). Multi-word brands are cut to their first word
    (e.g. 'Land Rover ...' -> 'Land').

    Raises:
        IndexError: if ``car_name`` is empty or contains only whitespace.
    """
    words = car_name.split(maxsplit=1)
    return words[0]


In [None]:
def clean_data(value):
    """Convert a measurement such as ``'74 bhp'`` or ``'1248 CC'`` to a float.

    The text before the first space is taken as the number. When that part is
    empty (e.g. the value is ``' bhp'`` or ``''``) the result is ``0.0``.

    Values that are not strings are assumed to be numeric already and are
    passed through ``float()``, so applying this function a second time to a
    converted column is harmless.

    Args:
        value: a string of the form ``'<number> <unit>'``, or a number.

    Returns:
        float: the numeric part of ``value``.

    Raises:
        ValueError: if the leading token is non-empty but not a valid number.
    """
    if not isinstance(value, str):
        # Already converted (for example when the cell is re-run).
        return float(value)

    # split(' ') rather than split(): a leading space must yield an empty
    # first token so that values like ' bhp' map to 0.0 instead of failing.
    number_text = value.split(' ')[0].strip()
    if number_text == '':
        return 0.0
    return float(number_text)

In [None]:
# Quick check of the helper on a sample name.
get_brand_name('Maruti Suzuki Swift')

'Maruti'

In [None]:
# Apply it to the complete dataset

In [None]:
# Reduce every listing name to its first word (the brand).
brand_series = cars_data['name'].map(get_brand_name)
cars_data['name'] = brand_series

In [None]:
# Distinct brand values now stored in 'name', in order of first appearance.
cars_data['name'].unique()

array(['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
       'Mahindra', 'Tata', 'Chevrolet', 'Datsun', 'Jeep', 'Mercedes-Benz',
       'Mitsubishi', 'Audi', 'Volkswagen', 'BMW', 'Nissan', 'Lexus',
       'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo', 'Kia', 'Fiat', 'Force',
       'Ambassador', 'Ashok', 'Isuzu', 'Opel'], dtype=object)

In [None]:
# applying the same function on mileage,engine and max_power
cars_data['mileage'] = cars_data['mileage'].apply(get_brand_name)
cars_data['engine'] = cars_data['engine'].apply(get_brand_name)
cars_data['max_power'] = cars_data['max_power'].apply(clean_data)

In [None]:
# Show the distinct values of each column again, now that the text columns were cleaned.
for column_label, column_values in cars_data.items():
    print('Unique values of ' + column_label)
    print(column_values.unique())
    print("--------------------------------------------------------------------")

Unique values of name
['Maruti' 'Skoda' 'Honda' 'Hyundai' 'Toyota' 'Ford' 'Renault' 'Mahindra'
 'Tata' 'Chevrolet' 'Datsun' 'Jeep' 'Mercedes-Benz' 'Mitsubishi' 'Audi'
 'Volkswagen' 'BMW' 'Nissan' 'Lexus' 'Jaguar' 'Land' 'MG' 'Volvo' 'Daewoo'
 'Kia' 'Fiat' 'Force' 'Ambassador' 'Ashok' 'Isuzu' 'Opel']
--------------------------------------------------------------------
Unique values of year
[2014 2006 2010 2007 2017 2001 2011 2013 2005 2009 2016 2012 2002 2015
 2018 2019 2008 2020 1999 2000 2003 2004 1994 1998 1997 1995 1996]
--------------------------------------------------------------------
Unique values of selling_price
[  450000   370000   158000   225000   130000   440000    96000    45000
   350000   200000   500000    92000   280000   180000   400000   778000
   150000   680000   174000   950000   525000   600000   575000   275000
   300000   220000   254999   670000   730000   650000   330000   366000
  1149000   425000  2100000   925000   675000   819999   390000  1500000
   70

***Converting the Categorical Data to Numerical Data***

In [None]:
# Integer code for each brand: 1..31 in the order listed (Maruti=1 ... Opel=31).
brand_names = ['Maruti', 'Skoda', 'Honda', 'Hyundai', 'Toyota', 'Ford', 'Renault',
               'Mahindra', 'Tata', 'Chevrolet', 'Datsun', 'Jeep', 'Mercedes-Benz',
               'Mitsubishi', 'Audi', 'Volkswagen', 'BMW', 'Nissan', 'Lexus',
               'Jaguar', 'Land', 'MG', 'Volvo', 'Daewoo', 'Kia', 'Fiat', 'Force',
               'Ambassador', 'Ashok', 'Isuzu', 'Opel']
brand_codes = {brand: code for code, brand in enumerate(brand_names, start=1)}

# The encoded column is assigned back to the frame. Calling
# replace(..., inplace=True) on cars_data['name'] would modify a temporary
# Series under pandas Copy-on-Write and leave cars_data unchanged.
# map() yields NaN for a brand that is missing from brand_codes, so the integer
# cast fails loudly instead of letting an unencoded name reach the model.
cars_data['name'] = cars_data['name'].map(brand_codes).astype('int64')


In [None]:
# Check the dtypes after encoding the brand names as integers.
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6718 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           6718 non-null   int64  
 1   year           6718 non-null   int64  
 2   selling_price  6718 non-null   int64  
 3   km_driven      6718 non-null   int64  
 4   fuel           6718 non-null   object 
 5   seller_type    6718 non-null   object 
 6   transmission   6718 non-null   object 
 7   owner          6718 non-null   object 
 8   mileage        6718 non-null   object 
 9   engine         6718 non-null   object 
 10  max_power      6718 non-null   float64
 11  seats          6718 non-null   float64
dtypes: float64(2), int64(4), object(6)
memory usage: 682.3+ KB


In [None]:
# Preview one-hot (dummy) encoding of 'fuel': one indicator column per fuel type.
pd.get_dummies(cars_data['fuel'])

Unnamed: 0,CNG,Diesel,LPG,Petrol
0,False,True,False,False
1,False,True,False,False
2,False,False,False,True
3,False,True,False,False
4,False,False,False,True
...,...,...,...,...
8121,False,False,False,True
8122,False,True,False,False
8123,False,False,False,True
8124,False,True,False,False


In [None]:
# Same encoding as 0/1 integers; drop_first=True omits the first level's column.
pd.get_dummies(cars_data['fuel'],drop_first=True,dtype=int).head()

Unnamed: 0,Diesel,LPG,Petrol
0,1,0,0
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


In [None]:
# One-hot encode 'fuel' for every row, dropping the first level as the baseline.
# This kind of encoding applies to the categorical columns (fuel, seller_type,
# transmission, owner); mileage, engine and max_power are numeric measurements.
# No .head() here: it would keep the dummies of the first five rows only.
fuel = pd.get_dummies(cars_data['fuel'], drop_first=True, dtype=int)

In [None]:
# Inspect the frame; the row labels still have gaps left by the removed rows.
cars_data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,1,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248,74.00,5.0
1,2,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498,103.52,5.0
2,3,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497,78.00,5.0
3,4,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396,90.00,5.0
4,1,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298,88.20,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
8121,1,2013,260000,50000,Petrol,Individual,Manual,Second Owner,18.9,998,67.10,5.0
8122,4,2014,475000,80000,Diesel,Individual,Manual,Second Owner,22.54,1396,88.73,5.0
8123,4,2013,320000,110000,Petrol,Individual,Manual,First Owner,18.5,1197,82.85,5.0
8124,4,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.8,1493,110.00,5.0


In [None]:
# Renumber the rows 0..n-1. The previous row labels are kept as a new 'index'
# column, which is removed again further down.
cars_data = cars_data.reset_index()

In [None]:
# The old row labels now appear as a regular 'index' column.
cars_data

Unnamed: 0,index,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,0,1,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248,74.00,5.0
1,1,2,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498,103.52,5.0
2,2,3,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497,78.00,5.0
3,3,4,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396,90.00,5.0
4,4,1,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298,88.20,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6713,8121,1,2013,260000,50000,Petrol,Individual,Manual,Second Owner,18.9,998,67.10,5.0
6714,8122,4,2014,475000,80000,Diesel,Individual,Manual,Second Owner,22.54,1396,88.73,5.0
6715,8123,4,2013,320000,110000,Petrol,Individual,Manual,First Owner,18.5,1197,82.85,5.0
6716,8124,4,2007,135000,119000,Diesel,Individual,Manual,Fourth & Above Owner,16.8,1493,110.00,5.0


**Columns to convert to numeric values:**

- fuel
- seller_type
- transmission
- owner
- mileage
- engine
- max_power

In [None]:
# Integer codes for the remaining categorical columns.
category_codes = {
    'transmission': {'Manual': 0, 'Automatic': 1},
    'owner': {'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3,
              'Fourth & Above Owner': 4, 'Test Drive Car': 5},
    'seller_type': {'Individual': 1, 'Dealer': 2, 'Trustmark Dealer': 3},
    'fuel': {'Petrol': 1, 'Diesel': 2, 'CNG': 3, 'LPG': 4},
}

# Each encoded column is assigned back to the frame. An in-place replace() on
# cars_data[column] would act on a temporary Series under pandas Copy-on-Write
# and leave cars_data unchanged. map() yields NaN for a label missing from the
# table, so the integer cast fails loudly instead of passing text to the model.
for column, codes in category_codes.items():
    cars_data[column] = cars_data[column].map(codes).astype('int64')

# mileage, engine and max_power hold numbers; make sure all three are stored as
# float columns rather than as numeric strings with object dtype.
cars_data = cars_data.astype({'mileage': float, 'engine': float, 'max_power': float})

In [None]:
# Verify the column dtypes after encoding the categorical columns.
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6718 entries, 0 to 6717
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          6718 non-null   int64  
 1   name           6718 non-null   int64  
 2   year           6718 non-null   int64  
 3   selling_price  6718 non-null   int64  
 4   km_driven      6718 non-null   int64  
 5   fuel           6718 non-null   int64  
 6   seller_type    6718 non-null   int64  
 7   transmission   6718 non-null   int64  
 8   owner          6718 non-null   int64  
 9   mileage        6718 non-null   object 
 10  engine         6718 non-null   object 
 11  max_power      6718 non-null   float64
 12  seats          6718 non-null   float64
dtypes: float64(2), int64(9), object(2)
memory usage: 682.4+ KB


In [None]:
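# max_power is already float64. mileage and engine hold numeric values too, but they must be
# float columns (not object) before modelling -- check their dtype in the info() output above.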

In [None]:
# Inspect the frame after encoding; the helper 'index' column is still present.
cars_data

Unnamed: 0,index,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,0,1,2014,450000,145500,2,1,0,1,23.4,1248,74.00,5.0
1,1,2,2014,370000,120000,2,1,0,2,21.14,1498,103.52,5.0
2,2,3,2006,158000,140000,1,1,0,3,17.7,1497,78.00,5.0
3,3,4,2010,225000,127000,2,1,0,1,23.0,1396,90.00,5.0
4,4,1,2007,130000,120000,1,1,0,1,16.1,1298,88.20,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6713,8121,1,2013,260000,50000,1,1,0,2,18.9,998,67.10,5.0
6714,8122,4,2014,475000,80000,2,1,0,2,22.54,1396,88.73,5.0
6715,8123,4,2013,320000,110000,1,1,0,1,18.5,1197,82.85,5.0
6716,8124,4,2007,135000,119000,2,1,0,4,16.8,1493,110.00,5.0


In [None]:
# Remove the helper 'index' column (the old row labels); it is not a feature.
cars_data = cars_data.drop(columns=['index'])

In [None]:
# Final look at the frame that is used for modelling.
cars_data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,1,2014,450000,145500,2,1,0,1,23.4,1248,74.00,5.0
1,2,2014,370000,120000,2,1,0,2,21.14,1498,103.52,5.0
2,3,2006,158000,140000,1,1,0,3,17.7,1497,78.00,5.0
3,4,2010,225000,127000,2,1,0,1,23.0,1396,90.00,5.0
4,1,2007,130000,120000,1,1,0,1,16.1,1298,88.20,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6713,1,2013,260000,50000,1,1,0,2,18.9,998,67.10,5.0
6714,4,2014,475000,80000,2,1,0,2,22.54,1396,88.73,5.0
6715,4,2013,320000,110000,1,1,0,1,18.5,1197,82.85,5.0
6716,4,2007,135000,119000,2,1,0,4,16.8,1493,110.00,5.0


In [None]:
# Lets Split the input features and output features
input_data = cars_data.drop(columns=['selling_price'])
output_data = cars_data['selling_price']



In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    input_data,
    output_data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Linear regression estimator with scikit-learn's default settings.
model = LinearRegression()

In [None]:
# Fit the model on the training split.
model.fit(x_train, y_train)

In [None]:
# Predict selling prices for the held-out rows.
predict = model.predict(x_test)

# Score the predictions against the true prices of the test split, so the
# quality of the model is known before it is saved.
print("R2  :", r2_score(y_test, predict))
print("MAE :", mean_absolute_error(y_test, predict))
print("RMSE:", np.sqrt(mean_squared_error(y_test, predict)))

In [None]:
# Display the predicted selling prices for the test split.
predict

array([ 770846.62050249,  686491.5248367 ,  534669.93620495, ...,
       1069317.41773961, -247302.72143367,   43861.58898005])

In [None]:
# Show one training row to see the feature columns and their order.
x_train.head(1)

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
2481,3,2008,110000,1,1,0,2,17.7,1497,78.0,5.0


In [None]:
# A single hand-built car, one value per feature column, using the same integer
# codes as the encoded training data (e.g. name 5 = Toyota, fuel 1 = Petrol,
# seller_type 1 = Individual, transmission 1 = Automatic, owner 1 = First Owner).
sample_features = {
    'name': 5,
    'year': 2022,
    'km_driven': 1200,
    'fuel': 1,
    'seller_type': 1,
    'transmission': 1,
    'owner': 1,
    'mileage': 12.99,
    'engine': 2494.0,
    'max_power': 100.6,
    'seats': 4.0,
}
input_sample_model = pd.DataFrame([sample_features])



In [None]:
# Predict the selling price for the hand-built sample row.
model.predict(input_sample_model)

array([1301540.64495629])

In [None]:
import pickle as pk

# Save the fitted model to disk. The context manager closes the file even if
# dump() raises, so the pickle is always flushed and the handle never leaks.
with open('model.pkl', 'wb') as model_file:
    pk.dump(model, model_file)