### Linear Regression
#### Second Hand Car Price Prediction
Dataset Link: https://www.kaggle.com/datasets/sujithmandala/second-hand-car-price-prediction

In [20]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


In [21]:
# Load the dataset
car_df = pd.read_csv("dataset/cars.csv")
car_df.head()

Unnamed: 0,Car_ID,Brand,Model,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,1,Toyota,Corolla,2018,50000,Petrol,Manual,First,15,1498,108,5,800000
1,2,Honda,Civic,2019,40000,Petrol,Automatic,Second,17,1597,140,5,1000000
2,3,Ford,Mustang,2017,20000,Petrol,Automatic,First,10,4951,395,4,2500000
3,4,Maruti,Swift,2020,30000,Diesel,Manual,Third,23,1248,74,5,600000
4,5,Hyundai,Sonata,2016,60000,Diesel,Automatic,Second,18,1999,194,5,850000


In [22]:
# Do one-hot Encoding with the Brand, the Model columns
car_df_fe = pd.get_dummies(car_df, columns=["Brand","Model"])
car_df_fe.head()

Unnamed: 0,Car_ID,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,...,Model_Vento,Model_Venue,Model_Verna,Model_Vitara,Model_WR-V,Model_X1,Model_X3,Model_X5,Model_XUV300,Model_Yaris
0,1,2018,50000,Petrol,Manual,First,15,1498,108,5,...,False,False,False,False,False,False,False,False,False,False
1,2,2019,40000,Petrol,Automatic,Second,17,1597,140,5,...,False,False,False,False,False,False,False,False,False,False
2,3,2017,20000,Petrol,Automatic,First,10,4951,395,4,...,False,False,False,False,False,False,False,False,False,False
3,4,2020,30000,Diesel,Manual,Third,23,1248,74,5,...,False,False,False,False,False,False,False,False,False,False
4,5,2016,60000,Diesel,Automatic,Second,18,1999,194,5,...,False,False,False,False,False,False,False,False,False,False


In [23]:
# Do label encoding with Fuel_Type, Transmission, and Owner_Type
label_encoder = LabelEncoder()
car_df_fe["Fuel_Type"] =  label_encoder.fit_transform(car_df_fe["Fuel_Type"])
car_df_fe["Transmission"] =  label_encoder.fit_transform(car_df_fe["Transmission"])
car_df_fe["Owner_Type"] =  label_encoder.fit_transform(car_df_fe["Owner_Type"])
car_df_fe.head()

Unnamed: 0,Car_ID,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,...,Model_Vento,Model_Venue,Model_Verna,Model_Vitara,Model_WR-V,Model_X1,Model_X3,Model_X5,Model_XUV300,Model_Yaris
0,1,2018,50000,1,1,0,15,1498,108,5,...,False,False,False,False,False,False,False,False,False,False
1,2,2019,40000,1,0,1,17,1597,140,5,...,False,False,False,False,False,False,False,False,False,False
2,3,2017,20000,1,0,0,10,4951,395,4,...,False,False,False,False,False,False,False,False,False,False
3,4,2020,30000,0,1,2,23,1248,74,5,...,False,False,False,False,False,False,False,False,False,False
4,5,2016,60000,0,0,1,18,1999,194,5,...,False,False,False,False,False,False,False,False,False,False


In [24]:
# Scale the features with minmax scaler
scaler = MinMaxScaler()
car_df_fe[["Kilometers_Driven", "Mileage", "Engine", "Power", "Seats"]] = scaler.fit_transform(car_df_fe[["Kilometers_Driven", "Mileage", "Engine", "Power", "Seats"]])
car_df_fe.head()


Unnamed: 0,Car_ID,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,...,Model_Vento,Model_Venue,Model_Verna,Model_Vitara,Model_WR-V,Model_X1,Model_X3,Model_X5,Model_XUV300,Model_Yaris
0,1,2018,0.8,1,1,0,0.333333,0.126265,0.121951,0.333333,...,False,False,False,False,False,False,False,False,False,False
1,2,2019,0.6,1,0,1,0.466667,0.151316,0.219512,0.333333,...,False,False,False,False,False,False,False,False,False,False
2,3,2017,0.2,1,0,0,0.0,1.0,0.996951,0.0,...,False,False,False,False,False,False,False,False,False,False
3,4,2020,0.4,0,1,2,0.866667,0.063006,0.018293,0.333333,...,False,False,False,False,False,False,False,False,False,False
4,5,2016,1.0,0,0,1,0.533333,0.253036,0.384146,0.333333,...,False,False,False,False,False,False,False,False,False,False


In [25]:
# Split the data into input data and output data
X = car_df_fe.drop(columns=["Price"])
y = car_df["Price"]

In [26]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Training the linear regression model
LR = LinearRegression()
LR.fit(X_train, y_train)

In [28]:
# See the coefficient
for feature, coef in zip(LR.feature_names_in_,LR.coef_):
    print(f"{feature}: {coef}")

Car_ID: -1.0052898951877303e-09
Year: 47597.10707862848
Kilometers_Driven: 26702.783166348818
Fuel_Type: -240350.9892567462
Transmission: -428880.3851546823
Owner_Type: -146661.69948733645
Mileage: -450054.2592200559
Engine: -278647.0155219311
Power: 1339032.148311131
Seats: -8534.027782289078
Brand_Audi: 500586.0125822514
Brand_BMW: 743848.4284066339
Brand_Ford: -53028.76334711974
Brand_Honda: -274434.94302182674
Brand_Hyundai: -319646.11903066485
Brand_Mahindra: -423029.00165370526
Brand_Maruti: -291628.6808051828
Brand_Mercedes: 644291.8401839342
Brand_Tata: -340364.55403156456
Brand_Toyota: 136500.2097788176
Brand_Volkswagen: -321015.91653612186
Model_3 Series: -40484.09890423983
Model_5 Series: 0.28746703657816397
Model_7 Series: 374269.9933449429
Model_A3: -30160.597702998348
Model_A4: -271706.79800735414
Model_A5: 51760.58895935613
Model_A6: 612782.0545613277
Model_Altroz: -75714.47204725962
Model_Ameo: -314255.99576070666
Model_Aspire: -63496.477002172585
Model_BR-V: -38437.995

In [29]:

# Making predictions on the testing set
y_pred = LR.predict(X_test)

In [30]:
# Calculating the mean squared error
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

Mean Squared Error: 100312360634.17107
Mean Absolute Error: 201490.82791410983


In [34]:
car_df["Price"].max()

4000000