### Importing Libraries

In [105]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import warnings
warnings.filterwarnings('ignore')

In [106]:
df = pd.read_csv("car_prediction_data.csv")
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [107]:
df.shape

(301, 9)

### Exploratory Data Analysis

In [108]:
df['Car_Name'].unique()

array(['ritz', 'sx4', 'ciaz', 'wagon r', 'swift', 'vitara brezza',
       's cross', 'alto 800', 'ertiga', 'dzire', 'alto k10', 'ignis',
       '800', 'baleno', 'omni', 'fortuner', 'innova', 'corolla altis',
       'etios cross', 'etios g', 'etios liva', 'corolla', 'etios gd',
       'camry', 'land cruiser', 'Royal Enfield Thunder 500',
       'UM Renegade Mojave', 'KTM RC200', 'Bajaj Dominar 400',
       'Royal Enfield Classic 350', 'KTM RC390', 'Hyosung GT250R',
       'Royal Enfield Thunder 350', 'KTM 390 Duke ',
       'Mahindra Mojo XT300', 'Bajaj Pulsar RS200',
       'Royal Enfield Bullet 350', 'Royal Enfield Classic 500',
       'Bajaj Avenger 220', 'Bajaj Avenger 150', 'Honda CB Hornet 160R',
       'Yamaha FZ S V 2.0', 'Yamaha FZ 16', 'TVS Apache RTR 160',
       'Bajaj Pulsar 150', 'Honda CBR 150', 'Hero Extreme',
       'Bajaj Avenger 220 dtsi', 'Bajaj Avenger 150 street',
       'Yamaha FZ  v 2.0', 'Bajaj Pulsar  NS 200', 'Bajaj Pulsar 220 F',
       'TVS Apache RTR 180', 

In [109]:
print(df['Fuel_Type'].unique())
print(df['Seller_Type'].unique())
print(df['Transmission'].unique())
print(df['Owner'].unique())

['Petrol' 'Diesel' 'CNG']
['Dealer' 'Individual']
['Manual' 'Automatic']
[0 1 3]


In [110]:
#checking for null values
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [111]:
df.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


In [112]:
data = df.drop(['Car_Name'], axis=1)
data['Current_year'] = 2023
data.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,Current_year
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0,2023
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0,2023
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0,2023
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0,2023
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0,2023


In [113]:
#creating a column of how old a car is
data['number_of_year'] = data['Current_year'] - data['Year']
data.drop(columns={'Year','Current_year'},axis=1,inplace=True)
data.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,number_of_year
0,3.35,5.59,27000,Petrol,Dealer,Manual,0,9
1,4.75,9.54,43000,Diesel,Dealer,Manual,0,10
2,7.25,9.85,6900,Petrol,Dealer,Manual,0,6
3,2.85,4.15,5200,Petrol,Dealer,Manual,0,12
4,4.6,6.87,42450,Diesel,Dealer,Manual,0,9


In [114]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

data['Fuel_Type']= label_encoder.fit_transform(data['Fuel_Type'])
data['Seller_Type']= label_encoder.fit_transform(data['Seller_Type'])
data['Transmission']= label_encoder.fit_transform(data['Transmission'])

data.head()

Unnamed: 0,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,number_of_year
0,3.35,5.59,27000,2,0,1,0,9
1,4.75,9.54,43000,1,0,1,0,10
2,7.25,9.85,6900,2,0,1,0,6
3,2.85,4.15,5200,2,0,1,0,12
4,4.6,6.87,42450,1,0,1,0,9


In [115]:
data.columns

Index(['Selling_Price', 'Present_Price', 'Kms_Driven', 'Fuel_Type',
       'Seller_Type', 'Transmission', 'Owner', 'number_of_year'],
      dtype='object')

### Feature Importance

In [116]:
X = data.iloc[:,1:]
y = data.iloc[:, 0]

In [117]:
from sklearn.ensemble import ExtraTreesRegressor

model = ExtraTreesRegressor()
model.fit(X, y)

In [118]:
print(model.feature_importances_)

[0.38821486 0.03781504 0.14760338 0.26085125 0.08693721 0.00114998
 0.07742827]


### Model Traning

In [119]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0)

In [120]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

model = RandomForestRegressor()

In [121]:
# random_grid = {'n_estimators': [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)],
#                'max_features': ['auto', 'sqrt'],
#                'max_depth': [int(x) for x in np.linspace(5, 30, num = 6)],
#                'min_samples_split': [2, 5, 10, 15, 100],
#                'min_samples_leaf': [1, 2, 5, 10]
#             }
# print(random_grid)

In [122]:
# model=RandomizedSearchCV(
#     estimator = model,
#     param_distributions = random_grid,
#     scoring='neg_mean_squared_error',
#     n_iter = 10, cv = 5, verbose=2,
#     random_state=42, n_jobs = 1
# )

In [123]:
model.fit(X_train, y_train)

In [124]:
prediction = model.predict(X_test)

In [125]:
data_frame = {
    'Actual_Value': y_test,
    'Prediction_Value':prediction
}

In [126]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def evaluate_metrics(y_test, prediction):
    print('MAE:', mean_absolute_error(y_test, prediction))
    print('MSE:', mean_squared_error(y_test, prediction))
    print('RMSE:', np.sqrt(mean_squared_error(y_test, prediction)))

evaluate_metrics(y_test, prediction)

MAE: 0.7554252747252748
MSE: 2.5643795409890093
RMSE: 1.6013680217204942


### Predicitve Model

In [128]:
X.head()

Unnamed: 0,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner,number_of_year
0,5.59,27000,2,0,1,0,9
1,9.54,43000,1,0,1,0,10
2,9.85,6900,2,0,1,0,6
3,4.15,5200,2,0,1,0,12
4,6.87,42450,1,0,1,0,9


In [129]:
custom_input = (5.59 , 27000, 2, 0, 1, 0, 9)
input = np.asarray(custom_input)
input = input.reshape(1,-1)

pred = model.predict(input)
print(pred)

[3.7115]


### Exporting Model

In [130]:
import pickle

filename = 'car_price.sav'
pickle.dump(model, open(filename, 'wb'))