# Multiple Linear Regression- Car dataset

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the dataset

In [2]:
dataset=pd.read_csv('CAR.csv')

In [4]:
dataset.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4337 entries, 0 to 4336
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year           4337 non-null   int64 
 1   selling_price  4337 non-null   int64 
 2   km_driven      4337 non-null   int64 
 3   fuel           4337 non-null   object
 4   seller_type    4337 non-null   object
 5   transmission   4337 non-null   object
 6   owner          4337 non-null   object
dtypes: int64(3), object(4)
memory usage: 237.3+ KB


In [7]:
dataset['fuel'].value_counts()

Diesel      2151
Petrol      2122
CNG           40
LPG           23
Electric       1
Name: fuel, dtype: int64

In [8]:
dataset['seller_type'].value_counts()

Individual          3242
Dealer               993
Trustmark Dealer     102
Name: seller_type, dtype: int64

In [9]:
dataset['transmission'].value_counts()

Manual       3889
Automatic     448
Name: transmission, dtype: int64

In [10]:
dataset['owner'].value_counts()

First Owner             2831
Second Owner            1104
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: owner, dtype: int64

### Split the dataset into independent and Dependent variables

In [13]:
X=dataset.drop('selling_price',axis=1)
y=dataset[['selling_price']]

### Work with the catagorical data

In [14]:
X.columns

Index(['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner'], dtype='object')

In [27]:
X=pd.get_dummies(dataset[['year',
                          'km_driven',
                          'fuel', 
                          'seller_type', 
                          'transmission',
                          'owner']],
                drop_first=True)

In [28]:
X.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,141000,1,0,0,0,1,0,1,0,1,0,0


## Splitting the dataset into the Training set and Test set
- Random State 20

In [29]:
from sklearn.model_selection import train_test_split
X_train,X_rest,y_train,y_rest=train_test_split(X,y,
                                              test_size=.2,
                                              random_state=20)

In [30]:
X_test,X_val,y_test,y_val=train_test_split(X_rest,y_rest,
                                          test_size=.5,
                                          random_state=20)

## Training the Multiple Linear Regression model on the Training set

In [31]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(X_train.values,y_train)

LinearRegression()

## Intercept and Coefficient

In [32]:
print(f'The coefficients are: {regressor.coef_}')
print(f'The intercept is: {regressor.intercept_}')

The coefficients are: [[ 3.62245575e+04 -8.31584195e-01  2.88702246e+05  1.60071068e-10
   4.65464254e+04  2.64057958e+03 -6.04980455e+04  1.71882689e+05
  -8.64323880e+05 -3.50851884e+03 -4.04890692e+04  1.83178786e+05
  -2.83903020e+04]]
The intercept is: [-71683645.58006921]


In [33]:
X_train.columns

Index(['year', 'km_driven', 'fuel_Diesel', 'fuel_Electric', 'fuel_LPG',
       'fuel_Petrol', 'seller_type_Individual', 'seller_type_Trustmark Dealer',
       'transmission_Manual', 'owner_Fourth & Above Owner',
       'owner_Second Owner', 'owner_Test Drive Car', 'owner_Third Owner'],
      dtype='object')

### When we match the coefficients to the variables in order, we have the formula for predicting Selling Price based on these variables.
### The formula is: -71683645.6 + 3.6(Year) - 8.3(KM Driven) + 2.9(Fuel Diesel) + 1.6(Fuel Electric) - 4.7(Fuel LPG) - 2.6(Fuel Petrol) - 6.0(Seller:Individual) + 1.7(Seller:TrustmarkDealer) - 8.65(Transmission:Manual) - 3.5(Fourth&Above Owner) - 4.0(Second Owner) + 1.8(Test Drive Car) - 2.8(Third Owner)

## Validation Set

In [36]:
y_predval=regressor.predict(X_val.values)
y_validate=y_val.to_numpy()

In [39]:
np.hstack((y_predval,y_validate)).round()

array([[ 280944.,  350000.],
       [ 821100.,  780000.],
       [ 465916.,  434999.],
       [ 644135.,  425000.],
       [ 425422.,  280000.],
       [1317666.,  850000.],
       [ 163473.,  120000.],
       [ 357444.,  250000.],
       [ 465344.,  198000.],
       [ 182292.,  120000.],
       [ 432081.,  350000.],
       [ 464254.,  125000.],
       [ 146019.,  225000.],
       [ 236403.,  300000.],
       [ 582334.,  490000.],
       [1310282.,  750000.],
       [ 163473.,  175000.],
       [ 289260.,  164000.],
       [ 601309.,  525000.],
       [ 511497.,  550000.],
       [ 302981.,  229999.],
       [ 477193.,  350000.],
       [ 402091.,  270000.],
       [ 615704.,  250000.],
       [ -25484.,   75000.],
       [ 370909.,  220000.],
       [ 510765.,  550000.],
       [ 380528.,  160000.],
       [  37858.,   80000.],
       [1491394., 1750000.],
       [ 115149.,  210000.],
       [   4453.,   80000.],
       [ 288586.,  450000.],
       [ 629335.,  415000.],
       [ 48701

## Predicting the Test set results

In [40]:
y_pred=regressor.predict(X_test.values)

### Calculate RMSE, R-Square

In [46]:
from sklearn.metrics import mean_squared_error, r2_score
import math
print(f'Mean Squared Error (MSE): {mean_squared_error(y_pred,y_test):.2f}')
print(f'Root Mean Squared Error (RMSE): {math.sqrt(mean_squared_error(y_pred,y_test)):.2f}')
print(f"R-Squared: {r2_score(y_pred,y_test):.2f}")

Mean Squared Error (MSE): 154171100550.96
Root Mean Squared Error (RMSE): 392646.28
R-Squared: -0.05


## Validation case scenario:
#### 1. Predict how much will be the car selling price for a car of 
- year 2014 
- 70000 km driven 
- fuel type Diesel
- Seller type Dealer
- manual transmission
- first owner

** 465000 ** actual


In [47]:
X_test.head()

Unnamed: 0,year,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
2549,2018,26000,1,0,0,0,0,0,1,0,0,0,0
3500,2002,40000,0,0,0,1,1,0,1,1,0,0,0
1137,2013,71000,0,0,0,0,1,0,1,0,1,0,0
761,2019,8000,0,0,0,1,1,0,1,0,0,0,0
2516,2008,50000,0,0,0,1,1,0,1,0,0,0,0


In [49]:
regressor.predict([[2014,70000,1,0,0,0,0,0,1,0,0,0,0]])

array([[638780.60333154]])

### The predicted price of the described car is 638,780.60 dollars. The actual price of a car with these features in the dataset is 465,000 dollars.
#### This model is not very accurate - MSE and RMSE are very high which indicated there is a lot of error between the predicted values and actual values. Additionally, the R2 score is very low, meaning only about 5% of the data is behaving as this regression equation predicts.