In [1]:
# importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import pickle

In [2]:
# configuring notebook
pd.set_option("display.max_columns", None)
sns.set_theme(style = "whitegrid")
warnings.filterwarnings("ignore")
plt.rcParams['figure.figsize'] = (20, 5)

In [3]:
df = pd.read_csv("cars24.csv")
df.head()

Unnamed: 0,selling_price,year,km_driven,mileage,engine,max_power,age,make,model,Individual,Trustmark Dealer,Diesel,Electric,LPG,Petrol,Manual,5,>5
0,1.2,2012.0,120000,19.7,796.0,46.3,11.0,MARUTI,ALTO STD,1,0,0,0,0,1,1,1,0
1,5.5,2016.0,20000,18.9,1197.0,82.0,7.0,HYUNDAI,GRAND I10 ASTA,1,0,0,0,0,1,1,1,0
2,2.15,2010.0,60000,17.0,1197.0,80.0,13.0,HYUNDAI,I20 ASTA,1,0,0,0,0,1,1,1,0
3,2.26,2012.0,37000,20.92,998.0,67.1,11.0,MARUTI,ALTO K10 2010-2014 VXI,1,0,0,0,0,1,1,1,0
4,5.7,2015.0,30000,22.77,1498.0,98.59,8.0,FORD,ECOSPORT 2015-2021 1.5 TDCI TITANIUM BSIV,0,0,1,0,0,0,1,1,0


In [4]:
# number of unique make types
df["make"].nunique()

41

In [5]:
# number of unique models
df["model"].nunique()

3233

In [6]:
# groupby "make" and find the average selling_price
df.groupby("make")["selling_price"].mean()

make
AMBASSADOR        1.452500
AUDI             17.177585
BENTLEY          20.902500
BMW              17.349017
CHEVROLET         2.723621
DAEWOO            0.780000
DATSUN            3.205176
DC               11.925000
FERRARI          20.902500
FIAT              3.152795
FORCE             5.520000
FORD              5.858258
HONDA             5.979902
HYUNDAI           5.458819
ISUZU            14.037708
JAGUAR           20.187500
JEEP             16.261890
KIA              15.731515
LAMBORGHINI      20.902500
LAND             20.560147
LEXUS            20.902500
MAHINDRA          7.315421
MARUTI            4.684721
MASERATI         20.902500
MERCEDES-AMG     20.902500
MERCEDES-BENZ    17.071479
MG               17.529474
MINI             18.529457
MITSUBISHI        8.631026
NISSAN            4.611877
OPEL              1.500000
OPELCORSA         1.516667
PORSCHE          20.866400
PREMIER           2.550000
RENAULT           4.606651
ROLLS-ROYCE      20.902500
SKODA             7.182

In [7]:
# transforming the DataFrame using the average selling_price
df.groupby("make")["selling_price"].transform("mean")

0         4.684721
1         5.458819
2         5.458819
3         4.684721
4         5.858258
           ...    
19815    10.532763
19816     4.684721
19817     7.182097
19818     7.315421
19819     5.979902
Name: selling_price, Length: 19820, dtype: float64

In [8]:
# using the above to perform target encoding
df["make"] = df.groupby("make")["selling_price"].transform("mean")
df["model"] = df.groupby("model")["selling_price"].transform("mean")
df.head()

Unnamed: 0,selling_price,year,km_driven,mileage,engine,max_power,age,make,model,Individual,Trustmark Dealer,Diesel,Electric,LPG,Petrol,Manual,5,>5
0,1.2,2012.0,120000,19.7,796.0,46.3,11.0,4.684721,1.18,1,0,0,0,0,1,1,1,0
1,5.5,2016.0,20000,18.9,1197.0,82.0,7.0,5.458819,4.81875,1,0,0,0,0,1,1,1,0
2,2.15,2010.0,60000,17.0,1197.0,80.0,13.0,5.458819,3.394,1,0,0,0,0,1,1,1,0
3,2.26,2012.0,37000,20.92,998.0,67.1,11.0,4.684721,2.242676,1,0,0,0,0,1,1,1,0
4,5.7,2015.0,30000,22.77,1498.0,98.59,8.0,5.858258,6.777576,0,0,1,0,0,0,1,1,0


In [9]:
# separating the features from the DataFrame
x = df.drop(columns = ["selling_price"])
x

Unnamed: 0,year,km_driven,mileage,engine,max_power,age,make,model,Individual,Trustmark Dealer,Diesel,Electric,LPG,Petrol,Manual,5,>5
0,2012.0,120000,19.70,796.0,46.30,11.0,4.684721,1.180000,1,0,0,0,0,1,1,1,0
1,2016.0,20000,18.90,1197.0,82.00,7.0,5.458819,4.818750,1,0,0,0,0,1,1,1,0
2,2010.0,60000,17.00,1197.0,80.00,13.0,5.458819,3.394000,1,0,0,0,0,1,1,1,0
3,2012.0,37000,20.92,998.0,67.10,11.0,4.684721,2.242676,1,0,0,0,0,1,1,1,0
4,2015.0,30000,22.77,1498.0,98.59,8.0,5.858258,6.777576,0,0,1,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19815,2017.0,69480,23.59,1364.0,67.05,6.0,10.532763,7.075000,0,0,1,0,0,0,1,1,0
19816,2019.0,18000,17.50,1373.0,91.10,4.0,4.684721,7.128571,0,0,0,0,0,1,1,0,1
19817,2015.0,67000,21.14,1498.0,103.52,8.0,7.182097,4.454000,0,0,1,0,0,0,1,1,0
19818,2016.0,3800000,16.00,2179.0,140.00,7.0,7.315421,8.096522,0,0,1,0,0,0,1,0,1


In [10]:
# separating the target variable from the DataFrame
y = df["selling_price"]
y

0         1.20
1         5.50
2         2.15
3         2.26
4         5.70
         ...  
19815     6.50
19816     9.25
19817     4.25
19818    12.25
19819    12.00
Name: selling_price, Length: 19820, dtype: float64

In [11]:
# performing train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((15856, 17), (3964, 17), (15856,), (3964,))

In [12]:
# scaling
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [13]:
# building the model
model = LinearRegression()
model

In [14]:
# training the model
model.fit(x_train, y_train)

In [15]:
# saving the model
with open("model.pkl", "wb") as file:
    pickle.dump(model, file)