# Importing the required libraries

In [2]:
import numpy as np, pandas as pd, plotly.express as px, joblib as jb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Loading the dataset

In [3]:
car_df = pd.read_csv("car data.csv")
car_df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


# Data Description

In [4]:
car_df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [5]:
car_df.shape

(301, 9)

In [6]:
for i in car_df.columns:
    print(f"{i}:\n", car_df[i].unique(), "\n\n")

Car_Name:
 ['ritz' 'sx4' 'ciaz' 'wagon r' 'swift' 'vitara brezza' 's cross'
 'alto 800' 'ertiga' 'dzire' 'alto k10' 'ignis' '800' 'baleno' 'omni'
 'fortuner' 'innova' 'corolla altis' 'etios cross' 'etios g' 'etios liva'
 'corolla' 'etios gd' 'camry' 'land cruiser' 'Royal Enfield Thunder 500'
 'UM Renegade Mojave' 'KTM RC200' 'Bajaj Dominar 400'
 'Royal Enfield Classic 350' 'KTM RC390' 'Hyosung GT250R'
 'Royal Enfield Thunder 350' 'KTM 390 Duke ' 'Mahindra Mojo XT300'
 'Bajaj Pulsar RS200' 'Royal Enfield Bullet 350'
 'Royal Enfield Classic 500' 'Bajaj Avenger 220' 'Bajaj Avenger 150'
 'Honda CB Hornet 160R' 'Yamaha FZ S V 2.0' 'Yamaha FZ 16'
 'TVS Apache RTR 160' 'Bajaj Pulsar 150' 'Honda CBR 150' 'Hero Extreme'
 'Bajaj Avenger 220 dtsi' 'Bajaj Avenger 150 street' 'Yamaha FZ  v 2.0'
 'Bajaj Pulsar  NS 200' 'Bajaj Pulsar 220 F' 'TVS Apache RTR 180'
 'Hero Passion X pro' 'Bajaj Pulsar NS 200' 'Yamaha Fazer '
 'Honda Activa 4G' 'TVS Sport ' 'Honda Dream Yuga '
 'Bajaj Avenger Street 220' '

In [7]:
car_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Driven_kms     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Selling_type   301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [8]:
car_df.describe(include="all")

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
count,301,301.0,301.0,301.0,301.0,301,301,301,301.0
unique,98,,,,,3,2,2,
top,city,,,,,Petrol,Dealer,Manual,
freq,26,,,,,239,195,261,
mean,,2013.627907,4.661296,7.628472,36947.20598,,,,0.043189
std,,2.891554,5.082812,8.642584,38886.883882,,,,0.247915
min,,2003.0,0.1,0.32,500.0,,,,0.0
25%,,2012.0,0.9,1.2,15000.0,,,,0.0
50%,,2014.0,3.6,6.4,32000.0,,,,0.0
75%,,2016.0,6.0,9.9,48767.0,,,,0.0


# Data Preprocessing

## Handling missing values

In [9]:
car_df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Driven_kms       0
Fuel_Type        0
Selling_type     0
Transmission     0
Owner            0
dtype: int64

## Handling duplicate values

In [10]:
car_df.duplicated().sum()

2

In [11]:
car_df.drop_duplicates(inplace=True)
car_df.duplicated().sum()
car_df.reset_index(drop=True, inplace=True)

In [12]:
car_df["Driven_kms"]

0      27000
1      43000
2       6900
3       5200
4      42450
       ...  
294    33988
295    60000
296    87934
297     9000
298     5464
Name: Driven_kms, Length: 299, dtype: int64

## Handling outliers

In [13]:
numerical_cols = ["Selling_Price", "Present_Price", "Driven_kms"]
for i in numerical_cols:
    px.box(car_df[i], x=i).show()

In [14]:
def Remove_outliers(data):
    sorted(data)
    q1, q3 = data.quantile([0.25, 0.75])
    IQR = q3 - q1
    l = q1 - (1.5 * IQR)
    u = q3 + (1.5 * IQR)
    return l, u

In [15]:
for i in numerical_cols:
    l, u = Remove_outliers(car_df[i])
    car_df[i] = np.where(car_df[i]>u, u, car_df[i])
    car_df[i] = np.where(car_df[i]<l, l, car_df[i])

In [16]:
for i in numerical_cols:
    px.box(car_df[i], x=i).show()

## Feature Scaling

In [17]:
from sklearn.preprocessing import MinMaxScaler
Driven_Mean = car_df["Driven_kms"].mean()
Driven_std = car_df["Driven_kms"].std()
Driven_min = car_df["Driven_kms"].min()
Driven_max = car_df["Driven_kms"].max()
Year_min = car_df["Year"].min()
Year_max = car_df["Year"].max()

non_boolean_numerical_features = ['Year', 'Driven_kms']
scaler = MinMaxScaler()
car_df[non_boolean_numerical_features] = scaler.fit_transform(car_df[non_boolean_numerical_features])
car_df

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,ritz,0.733333,3.35,5.59,0.267114,Petrol,Dealer,Manual,0
1,sx4,0.666667,4.75,9.54,0.428390,Diesel,Dealer,Manual,0
2,ciaz,0.933333,7.25,9.85,0.064510,Petrol,Dealer,Manual,0
3,wagon r,0.533333,2.85,4.15,0.047375,Petrol,Dealer,Manual,0
4,swift,0.733333,4.60,6.87,0.422846,Diesel,Dealer,Manual,0
...,...,...,...,...,...,...,...,...,...
294,city,0.866667,9.50,11.60,0.337551,Diesel,Dealer,Manual,0
295,brio,0.800000,4.00,5.90,0.599745,Petrol,Dealer,Manual,0
296,city,0.400000,3.35,11.00,0.881313,Petrol,Dealer,Manual,0
297,city,0.933333,11.50,12.50,0.085678,Diesel,Dealer,Manual,0


In [18]:
car_df.replace({"Fuel_Type":{"Petrol": 0, "Diesel": 1, "CNG": 2},
                "Selling_type":{"Dealer": 0, "Individual": 1},  
                "Transmission":{"Manual": 0, "Automatic": 1}}, inplace=True)

In [19]:
car_df.drop(columns=["Car_Name"], inplace=True)
car_df

Unnamed: 0,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,0.733333,3.35,5.59,0.267114,0,0,0,0
1,0.666667,4.75,9.54,0.428390,1,0,0,0
2,0.933333,7.25,9.85,0.064510,0,0,0,0
3,0.533333,2.85,4.15,0.047375,0,0,0,0
4,0.733333,4.60,6.87,0.422846,1,0,0,0
...,...,...,...,...,...,...,...,...
294,0.866667,9.50,11.60,0.337551,1,0,0,0
295,0.800000,4.00,5.90,0.599745,0,0,0,0
296,0.400000,3.35,11.00,0.881313,0,0,0,0
297,0.933333,11.50,12.50,0.085678,1,0,0,0


In [20]:
Year_max, "\n", Year_min, "\n", Driven_Mean, "\n", Driven_std

(2018, '\n', 2003, '\n', 34387.09698996656, '\n', 23819.91377279199)

# Spliting the target feature from the dataset for prediction purposes

In [21]:
x = car_df.drop(columns="Selling_Price", axis=1)
y = car_df["Selling_Price"].reset_index().drop(columns="index")

In [22]:
x

Unnamed: 0,Year,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,0.733333,5.59,0.267114,0,0,0,0
1,0.666667,9.54,0.428390,1,0,0,0
2,0.933333,9.85,0.064510,0,0,0,0
3,0.533333,4.15,0.047375,0,0,0,0
4,0.733333,6.87,0.422846,1,0,0,0
...,...,...,...,...,...,...,...
294,0.866667,11.60,0.337551,1,0,0,0
295,0.800000,5.90,0.599745,0,0,0,0
296,0.400000,11.00,0.881313,0,0,0,0
297,0.933333,12.50,0.085678,1,0,0,0


In [23]:
y

Unnamed: 0,Selling_Price
0,3.35
1,4.75
2,7.25
3,2.85
4,4.60
...,...
294,9.50
295,4.00
296,3.35
297,11.50


In [24]:
x_train, x_test, y_train, y_test = train_test_split(x.values, y.values, test_size=0.3, random_state= 42)

## Linear Regression

In [25]:
LR = LinearRegression()
LR.fit(x_train, y_train)

In [26]:
LR_pred = LR.predict(x_test)
lr_mse = mean_squared_error(y_test, LR_pred)
lr_r2 = r2_score(y_test, LR_pred)
print(lr_mse, "\n", lr_r2)

3.786895799900037 
 0.7433504787371993


## Lasso Regression

In [27]:
lass = Lasso(random_state=42)
lass.fit(x_train, y_train)
param_grid= {
    "alpha" : [0.1, 0.01, 0.001],
    "warm_start" : [True, False],
    "selection" : ['cyclic', 'random'],
    "max_iter" : [50, 100, 500, 1000, 5000, 10000]
}

lass_reg = RandomizedSearchCV(estimator=lass, param_distributions=param_grid, cv=5, n_jobs=8, random_state=1)
lass_reg.fit(x_train, y_train)

In [28]:
lass_reg.fit(x_train, y_train)
best_parameters = lass_reg.best_params_
print(best_parameters)
best_lass = Lasso(**best_parameters)
best_lass.fit(x_train,y_train)

{'warm_start': False, 'selection': 'random', 'max_iter': 100, 'alpha': 0.001}


In [29]:
lass_pred = lass_reg.predict(x_test)
lr_mse = mean_squared_error(y_test, lass_pred)
lr_r2 = r2_score(y_test, lass_pred)
print(lr_mse, "\n", lr_r2)

3.7775880596782345 
 0.7439812927860112


## Decision Tree

In [30]:
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

DT = DecisionTreeRegressor(random_state=42)
reg_rand = RandomizedSearchCV(estimator=DT,
                              param_distributions=param_grid,
                              cv=5,
                              n_jobs=8
)

In [31]:
reg_rand.fit(x_train, y_train)
best_parameters = reg_rand.best_params_
print(best_parameters)
best_DT = DecisionTreeRegressor(**best_parameters)
best_DT.fit(x_train,y_train)

{'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 9}


In [32]:
y_pred_dt_train = best_DT.predict(x_train)
y_pred_dt_test = best_DT.predict(x_test)

train_mse = mean_squared_error(y_train, y_pred_dt_train)
test_mse = mean_squared_error(y_test, y_pred_dt_test)
train_r2 = r2_score(y_train, y_pred_dt_train)
test_r2 = r2_score(y_test, y_pred_dt_test)

print(f"Training MSE: {train_mse}")
print(f"Testing MSE: {test_mse}")
print(f"Training R2 Score: {train_r2}")
print(f"Testing R2 Score: {test_r2}")

Training MSE: 0.11501336864889497
Testing MSE: 4.8293315791131235
Training R2 Score: 0.991448203330167
Testing R2 Score: 0.6727014147494034


## Random Forest Regressor

In [33]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_regressor = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(estimator=rf_regressor, param_distributions=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=8)
# random_search = BayesSearchCV(estimator=rf_regressor, search_spaces=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=8)
random_search.fit(x_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [34]:
best_parameters_rf = random_search.best_params_
best_RF = RandomForestRegressor(**best_parameters_rf)
best_RF.fit(x_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [35]:
y_pred_rf_train = random_search.predict(x_train)
y_pred_rf_test = random_search.predict(x_test)

train_mse = mean_squared_error(y_train, y_pred_rf_train)
test_mse = mean_squared_error(y_test, y_pred_rf_test)
train_r2 = r2_score(y_train, y_pred_rf_train)
test_r2 = r2_score(y_test, y_pred_rf_test)

print(f"Training MSE: {train_mse}")
print(f"Testing MSE: {test_mse}")
print(f"Training R2 Score: {train_r2}")
print(f"Testing R2 Score: {test_r2}")

Training MSE: 0.061797630430622114
Testing MSE: 4.436970279048565
Training R2 Score: 0.9954050492014239
Testing R2 Score: 0.6992929411986595


# Comparision of the models based on R2 and MSE training and testing scores

In [36]:
models = {
    'Linear Regression': LR,
    'Lasso': lass_reg,
    'Decision Tree': best_DT,
    'Random Forest': best_RF
}
results = []
for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)
    train_mse = mean_squared_error(y_train, y_pred_train)
    test_mse = mean_squared_error(y_test, y_pred_test)
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    r2_diff = train_r2 - test_r2
    mse_diff = train_mse - test_mse
    results.append({'Model': name,
                    'Training MSE': train_mse,
                    'Testing MSE': test_mse,
                    'Training R2 Score': train_r2,
                    'Testing R2 Score': test_r2,
                    'R2 score difference (Train - Test)': r2_diff,
                    'MSE score difference (Train - Test)': mse_diff})
results_df = pd.DataFrame(results)
px.line(results_df, x="Model", y=["Training MSE", "Testing MSE"], range_y=(0,7.5), title="Comparision of the models").show()
px.line(results_df, x="Model", y=["Training R2 Score", "Testing R2 Score"], range_y=(0,1.5), title="Comparision of the models").show()
results_df



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



Unnamed: 0,Model,Training MSE,Testing MSE,Training R2 Score,Testing R2 Score,R2 score difference (Train - Test),MSE score difference (Train - Test)
0,Linear Regression,1.138309,3.786896,0.915361,0.74335,0.172011,-2.648587
1,Lasso,1.138408,3.777588,0.915354,0.743981,0.171373,-2.63918
2,Decision Tree,0.115013,4.811764,0.991448,0.673892,0.317556,-4.69675
3,Random Forest,0.059035,4.475545,0.99561,0.696679,0.298932,-4.41651


# Prediction on new data

In [37]:
yr = int(input())
pp = float(input())
dkms = int(input())
ft = int(input())
st = int(input())
tran = int(input())
owner = int(input())
yr = (yr - Year_min) / (Year_max- Year_min)
dkms = (dkms - Driven_min) / (Driven_max- Driven_min)
new_car = [[yr, pp, dkms, ft, st, tran, owner]]
predicted_price_lr = LR.predict(new_car)
print('Predicted Selling Price by Linear Regression:', predicted_price_lr[0][0])
predicted_price_la = lass_reg.predict(new_car)
print('Predicted Selling Price by Lasso Regression:', predicted_price_la[0])
predicted_price_dt = best_DT.predict(new_car)
print('Predicted Selling Price by Decision Tree Regressor:', predicted_price_dt[0])
predicted_price_rf = best_RF.predict(new_car)
print('Predicted Selling Price by Random Forest Regressor:', predicted_price_rf[0])

Predicted Selling Price by Linear Regression: 4.33233261846174
Predicted Selling Price by Lasso Regression: 4.292474355266873
Predicted Selling Price by Decision Tree Regressor: 4.221428571428571
Predicted Selling Price by Random Forest Regressor: 4.763749999999991


# Storing the models in sav files

In [38]:
jb.dump(LR, "LR.sav")
jb.dump(lass_reg, "Lasso.sav")
jb.dump(best_DT, "DT.sav")
jb.dump(best_RF, "RF.sav")

['RF.sav']