In [290]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [291]:
# loading the data from csv file to pandas dataframe
car_dataset = pd.read_csv(r'D:\batch -3\Machine learning project\Project 8. Car Price Prediction\CAR DETAILS FROM CAR DEKHO.csv')

In [292]:
# inspecting the first 5 rows of the dataframe
car_dataset.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [293]:
# checking the number of rows and columns
car_dataset.shape

(4340, 8)

In [294]:
# getting some information about the dataset
car_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [295]:
# checking the number of missing values
car_dataset.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [296]:
# Drop rows with NaN values
car_dataset = car_dataset.dropna()

# Or fill missing values with a default value
car_dataset.fillna(0, inplace=True)

In [297]:
car_dataset.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [298]:
print(car_dataset.head())


                       name  year  selling_price  km_driven    fuel  \
0             Maruti 800 AC  2007          60000      70000  Petrol   
1  Maruti Wagon R LXI Minor  2007         135000      50000  Petrol   
2      Hyundai Verna 1.6 SX  2012         600000     100000  Diesel   
3    Datsun RediGO T Option  2017         250000      46000  Petrol   
4     Honda Amaze VX i-DTEC  2014         450000     141000  Diesel   

  seller_type transmission         owner  
0  Individual       Manual   First Owner  
1  Individual       Manual   First Owner  
2  Individual       Manual   First Owner  
3  Individual       Manual   First Owner  
4  Individual       Manual  Second Owner  


In [299]:
print(car_dataset.columns)


Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')


In [300]:
# checking the distribution of categorical data
car_dataset.describe()

Unnamed: 0,year,selling_price,km_driven
count,4340.0,4340.0,4340.0
mean,2013.090783,504127.3,66215.777419
std,4.215344,578548.7,46644.102194
min,1992.0,20000.0,1.0
25%,2011.0,208749.8,35000.0
50%,2014.0,350000.0,60000.0
75%,2016.0,600000.0,90000.0
max,2020.0,8900000.0,806599.0


In [301]:
# encoding the values
car_dataset.replace({"fuel":{"Petrol":0,"  Diesel":1,"CNG":2,"LPG":3}},inplace=True)
car_dataset.replace({"seller_type":{"Dealer":0,"Individual":1}},inplace=True)
car_dataset.replace({"transmission":{"Manual":0,"Automatic":1}},inplace=True)
car_dataset.head()

  car_dataset.replace({"transmission":{"Manual":0,"Automatic":1}},inplace=True)


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,0,1,0,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,0,1,0,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,1,0,First Owner
3,Datsun RediGO T Option,2017,250000,46000,0,1,0,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,1,0,Second Owner


In [302]:
car_dataset = car_dataset.drop(columns='owner', axis=1)

In [303]:
X = car_dataset.drop(["name","selling_price"],axis=1)
Y = car_dataset['selling_price']

In [304]:
car_dataset.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission'],
      dtype='object')

In [305]:
print(X)

      year  km_driven    fuel seller_type  transmission
0     2007      70000       0           1             0
1     2007      50000       0           1             0
2     2012     100000  Diesel           1             0
3     2017      46000       0           1             0
4     2014     141000  Diesel           1             0
...    ...        ...     ...         ...           ...
4335  2014      80000  Diesel           1             0
4336  2014      80000  Diesel           1             0
4337  2009      83000       0           1             0
4338  2016      90000  Diesel           1             0
4339  2016      40000       0           1             0

[4340 rows x 5 columns]


In [306]:
print(Y)

0        60000
1       135000
2       600000
3       250000
4       450000
         ...  
4335    409999
4336    409999
4337    110000
4338    865000
4339    225000
Name: selling_price, Length: 4340, dtype: int64


In [307]:
print(X.dtypes)


year             int64
km_driven        int64
fuel            object
seller_type     object
transmission     int64
dtype: object


In [308]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state=2)

In [309]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, Y_train)



ValueError: could not convert string to float: 'Diesel'

In [None]:
# R squared Error
pred = model.predict(X_train)
error_score = metrics.r2_score(Y_train, pred)
print("R squared Error : ", error_score)

ValueError: could not convert string to float: 'Diesel'