In [None]:
# src: https://www.kaggle.com/code/venkatganesh98/ford-car-price-prediction-randomforestregr-acc-93

In [3]:
# common
import os
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# pre-processing

## encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

## scaling
from sklearn.preprocessing import StandardScaler

## splitting the data
from sklearn.model_selection import train_test_split

## model
from sklearn.ensemble import RandomForestRegressor

## evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# fine tuning
from sklearn.model_selection import GridSearchCV

# warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
data_dir = r"D:\KKU_World\1_2\DBMS\termProject\dataset\archive\ford.csv"
df = pd.read_csv(data_dir)
# view the data frame
# df.head()
# df.shape
# df.info()
df.isna().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [5]:
df['model'].value_counts()

 Fiesta                   6557
 Focus                    4588
 Kuga                     2225
 EcoSport                 1143
 C-MAX                     543
 Ka+                       531
 Mondeo                    526
 B-MAX                     355
 S-MAX                     296
 Grand C-MAX               247
 Galaxy                    228
 Edge                      208
 KA                        199
 Puma                       80
 Tourneo Custom             69
 Grand Tourneo Connect      59
 Mustang                    57
 Tourneo Connect            33
 Fusion                     16
 Streetka                    2
 Ranger                      1
 Escort                      1
 Transit Tourneo             1
Focus                        1
Name: model, dtype: int64

In [6]:
df['year'].value_counts()

2017    4888
2018    4014
2019    3194
2016    2331
2015    1368
2014     805
2013     609
2020     258
2012     115
2011      94
2009      91
2010      67
2008      57
2007      32
2005      16
2006      13
2004       4
2002       3
2003       3
1998       1
1996       1
2000       1
2060       1
Name: year, dtype: int64

In [7]:
df['transmission'].value_counts()

Manual       15518
Automatic     1361
Semi-Auto     1087
Name: transmission, dtype: int64

In [8]:
df['fuelType'].value_counts()

Petrol      12179
Diesel       5762
Hybrid         22
Electric        2
Other           1
Name: fuelType, dtype: int64

In [9]:
df['engineSize'].value_counts()

1.0    7765
1.5    3418
2.0    3311
1.2    1626
1.6     923
1.1     559
1.4     112
2.3      80
0.0      51
5.0      45
1.8      35
2.2      13
2.5      13
1.3      13
3.2       1
1.7       1
Name: engineSize, dtype: int64

In [3]:
labelencoder = LabelEncoder()

In [4]:
# encoding the car model with label encoder
df['model'] = labelencoder.fit_transform(df['model'])

# encoding the transmission with label encoder
df['transmission'] = labelencoder.fit_transform(df['transmission'])

# encoding the fuel type with label encoder
df['fuelType'] = labelencoder.fit_transform(df['fuelType'])

# view the encoded data frame
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,5,2017,12000,0,15944,4,150,57.7,1.0
1,6,2018,14000,1,9083,4,150,57.7,1.0
2,6,2017,13000,1,12456,4,150,57.7,1.0
3,5,2019,17500,1,10460,4,145,40.3,1.5
4,5,2019,16500,0,1482,4,145,48.7,1.0


In [5]:
# splitting into feature and label
x = df.drop('price', axis=1)
y = df['price']
x.head(), x.shape, y.head(), y.shape

(   model  year  transmission  mileage  fuelType  tax   mpg  engineSize
 0      5  2017             0    15944         4  150  57.7         1.0
 1      6  2018             1     9083         4  150  57.7         1.0
 2      6  2017             1    12456         4  150  57.7         1.0
 3      5  2019             1    10460         4  145  40.3         1.5
 4      5  2019             0     1482         4  145  48.7         1.0,
 (17966, 8),
 0    12000
 1    14000
 2    13000
 3    17500
 4    16500
 Name: price, dtype: int64,
 (17966,))

In [6]:
# splitting into train and test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((14372, 8), (3594, 8), (14372,), (3594,))

In [7]:
regr = RandomForestRegressor(n_estimators=1000, max_depth=20, random_state=42)
regr.fit(x_train, y_train.values.ravel())

In [17]:
# make prediction
predictions = regr.predict(x_test)

In [25]:
result = x_test
result['price'] = y_test
result['prediction'] = predictions.tolist()
result.head()

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,price,prediction
17610,5,2014,1,21793,4,0,65.7,1.0,6995,8045.992972
7076,5,2017,1,29020,4,150,65.7,1.0,8999,9000.140095
1713,6,2015,1,46155,0,20,67.3,1.6,7998,8005.422131
1611,0,2013,1,43036,4,30,55.4,1.0,5491,6019.281637
16830,14,2009,1,79000,0,200,48.7,1.8,3790,3247.95


In [27]:
x_test

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,price,prediction
17610,5,2014,1,21793,4,0,65.7,1.0,6995,8045.992972
7076,5,2017,1,29020,4,150,65.7,1.0,8999,9000.140095
1713,6,2015,1,46155,0,20,67.3,1.6,7998,8005.422131
1611,0,2013,1,43036,4,30,55.4,1.0,5491,6019.281637
16830,14,2009,1,79000,0,200,48.7,1.8,3790,3247.950000
...,...,...,...,...,...,...,...,...,...,...
7731,14,2019,0,8865,0,145,58.9,2.0,22000,20426.851771
11655,5,2018,1,14173,4,145,58.9,1.0,13699,12245.787963
10536,14,2020,0,2213,2,135,47.9,2.0,24979,24164.605381
11885,5,2017,1,22578,4,145,65.7,1.0,10500,9969.510341


In [12]:
import pickle

In [13]:
# pickle.dump(regr, open('randomForestModel.sav', 'wb'))
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(regr, f)

In [14]:
# load_model = pickle.load(open('randomForestModel.sav', 'rb'))
with open('random_forest_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [23]:
y_pred = regr.predict(x_test)
print(y_pred)


[ 8045.99297197  9000.14009533  8005.42213137 ... 24164.60538095
  9969.51034097 16709.10133333]


In [24]:
mse = mean_squared_error(y_test.values.ravel(), predictions)
mae = mean_absolute_error(y_test.values.ravel(), predictions)
r2 = r2_score(y_test.values.ravel(), predictions)

print(f"MSE: {round(mse, 2)}")
print(f"MAE: {round(mae, 2)}")
print(f"R2 Score: {round(r2, 2)}")

MSE: 1543204.62
MAE: 848.96
R2 Score: 0.93
