In [None]:
# src: https://www.kaggle.com/code/venkatganesh98/ford-car-price-prediction-randomforestregr-acc-93

In [2]:
# common
import os
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import seaborn as snsai

# pre-processing

## encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

## scaling
from sklearn.preprocessing import StandardScaler

## splitting the data
from sklearn.model_selection import train_test_split

## model
from sklearn.ensemble import RandomForestRegressor

## evaluation metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# fine tuning
from sklearn.model_selection import GridSearchCV

# warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
data_dir = r"D:\KKU_World\1_2\DBMS\termProject\python\Ai\dataset\archive\ford.csv"
df = pd.read_csv(data_dir)
# view the data frame
# df.head()
# df.shape
# df.info()
df.isna().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [35]:
df['model'].value_counts()

Fiesta                   6557
Focus                    4589
Kuga                     2225
EcoSport                 1143
C-MAX                     543
Ka+                       531
Mondeo                    526
B-MAX                     355
S-MAX                     296
Grand C-MAX               247
Galaxy                    228
Edge                      208
KA                        199
Puma                       80
Tourneo Custom             69
Grand Tourneo Connect      59
Mustang                    57
Tourneo Connect            33
Fusion                     16
Streetka                    2
Ranger                      1
Escort                      1
Transit Tourneo             1
Name: model, dtype: int64

In [6]:
df['year'].value_counts()

2017    4888
2018    4014
2019    3194
2016    2331
2015    1368
2014     805
2013     609
2020     258
2012     115
2011      94
2009      91
2010      67
2008      57
2007      32
2005      16
2006      13
2004       4
2002       3
2003       3
1998       1
1996       1
2000       1
2060       1
Name: year, dtype: int64

In [3]:
df['transmission'].value_counts()

Manual       15518
Automatic     1361
Semi-Auto     1087
Name: transmission, dtype: int64

In [4]:
df['fuelType'].value_counts()

Petrol      12179
Diesel       5762
Hybrid         22
Electric        2
Other           1
Name: fuelType, dtype: int64

In [9]:
df['engineSize'].value_counts()

1.0    7765
1.5    3418
2.0    3311
1.2    1626
1.6     923
1.1     559
1.4     112
2.3      80
0.0      51
5.0      45
1.8      35
2.2      13
2.5      13
1.3      13
3.2       1
1.7       1
Name: engineSize, dtype: int64

In [5]:
model_mapping = {
    'Fiesta': 0,
    'Focus': 1,
    'Kuga': 2,
    'EcoSport': 3,
    'C-MAX': 4,
    'Ka+': 5,
    'Mondeo': 6,
    'B-MAX': 7,
    'S-MAX': 8,
    'Grand C-MAX': 9,
    'Galaxy': 10,
    'Edge': 11,
    'KA': 12,
    'Puma': 13,
    'Tourneo Custom': 14,
    'Grand Tourneo Connect': 15,
    'Mustang': 16,
    'Tourneo Connect': 17,
    'Fusion': 18,
    'Streetka': 19,
    'Ranger': 20,
    'Escort': 21,
    'Transit Tourneo': 22
}

transmission_mapping = {
    'Manual': 0,
    'Automatic': 1,
    'Semi-Auto': 2
}

fuel_mapping = {
    'Petrol': 0,
    'Diesel': 1,
    'Hybrid': 2,
    'Electric': 3,
    'Other': 4
}


In [10]:
keys = tuple(model_mapping.keys())


In [6]:
for i in range(len(df['model'])):
    df['model'][i] = model_mapping[df['model'][i]]
    # print(i)
df['model'].value_counts()

0     6557
1     4589
2     2225
3     1143
4      543
5      531
6      526
7      355
8      296
9      247
10     228
11     208
12     199
13      80
14      69
15      59
16      57
17      33
18      16
19       2
20       1
21       1
22       1
Name: model, dtype: int64

In [7]:
for i in range(len(df['transmission'])):
    df['transmission'][i] = transmission_mapping[df['transmission'][i]]

df['transmission'].value_counts()

0    15518
1     1361
2     1087
Name: transmission, dtype: int64

In [8]:
for i in range(len(df['fuelType'])):
    df['fuelType'][i] = fuel_mapping[df['fuelType'][i]]

df['fuelType'].value_counts()

0    12179
1     5762
2       22
3        2
4        1
Name: fuelType, dtype: int64

In [41]:
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,0,2017,12000,1,15944,0,150,57.7,1.0
1,1,2018,14000,0,9083,0,150,57.7,1.0
2,1,2017,13000,0,12456,0,150,57.7,1.0
3,0,2019,17500,0,10460,0,145,40.3,1.5
4,0,2019,16500,1,1482,0,145,48.7,1.0
...,...,...,...,...,...,...,...,...,...
17961,7,2017,8999,0,16700,0,150,47.1,1.4
17962,7,2014,7499,0,40700,0,30,57.7,1.0
17963,1,2015,9999,0,7010,1,20,67.3,1.6
17964,12,2018,8299,0,5007,0,145,57.7,1.2


In [9]:
# splitting into feature and label
x = df.drop(['price', 'tax'], axis=1)
y = df['price']
x.head(), x.shape, y.head(), y.shape

(  model  year transmission  mileage fuelType   mpg  engineSize
 0     0  2017            1    15944        0  57.7         1.0
 1     1  2018            0     9083        0  57.7         1.0
 2     1  2017            0    12456        0  57.7         1.0
 3     0  2019            0    10460        0  40.3         1.5
 4     0  2019            1     1482        0  48.7         1.0,
 (17966, 7),
 0    12000
 1    14000
 2    13000
 3    17500
 4    16500
 Name: price, dtype: int64,
 (17966,))

In [10]:
# splitting into train and test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((14372, 7), (3594, 7), (14372,), (3594,))

In [10]:
regr = RandomForestRegressor(n_estimators=1000, max_depth=20, random_state=42)
regr.fit(x_train, y_train.values.ravel())

In [11]:
# make prediction
predictions = regr.predict(x_test)

In [12]:
result = x_test
result['price'] = y_test
result['prediction'] = predictions.tolist()
result.head()

Unnamed: 0,model,year,transmission,mileage,fuelType,mpg,engineSize,price,prediction
17610,0,2014,0,21793,0,65.7,1.0,6995,8011.494959
7076,0,2017,0,29020,0,65.7,1.0,8999,9144.228082
1713,1,2015,0,46155,1,67.3,1.6,7998,8048.740816
1611,7,2013,0,43036,0,55.4,1.0,5491,6008.28639
16830,6,2009,0,79000,1,48.7,1.8,3790,3172.297


In [13]:
x_test

Unnamed: 0,model,year,transmission,mileage,fuelType,mpg,engineSize,price,prediction
17610,0,2014,0,21793,0,65.7,1.0,6995,8011.494959
7076,0,2017,0,29020,0,65.7,1.0,8999,9144.228082
1713,1,2015,0,46155,1,67.3,1.6,7998,8048.740816
1611,7,2013,0,43036,0,55.4,1.0,5491,6008.286390
16830,6,2009,0,79000,1,48.7,1.8,3790,3172.297000
...,...,...,...,...,...,...,...,...,...
7731,6,2019,1,8865,1,58.9,2.0,22000,20014.604635
11655,0,2018,0,14173,0,58.9,1.0,13699,12213.051960
10536,6,2020,1,2213,2,47.9,2.0,24979,23990.146429
11885,0,2017,0,22578,0,65.7,1.0,10500,9872.165952


In [11]:
import pickle

In [15]:
# pickle.dump(regr, open('randomForestModel.sav', 'wb'))
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(regr, f)

In [12]:
# load_model = pickle.load(open('randomForestModel.sav', 'rb'))
with open('random_forest_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [14]:
y_pred = loaded_model.predict(x_test)
print(y_pred)


[ 8011.49495942  9144.22808226  8048.74081553 ... 23990.14642857
  9872.16595186 14392.728     ]


In [16]:
mse = mean_squared_error(y_test.values.ravel(), y_pred)
mae = mean_absolute_error(y_test.values.ravel(), y_pred)
r2 = r2_score(y_test.values.ravel(), y_pred)

print(f"MSE: {round(mse, 2)}")
print(f"MAE: {round(mae, 2)}")
print(f"R2 Score: {round(r2, 2)}")

MSE: 1634060.59
MAE: 870.13
R2 Score: 0.93


In [17]:
x_test

Unnamed: 0,model,year,transmission,mileage,fuelType,mpg,engineSize
17610,0,2014,0,21793,0,65.7,1.0
7076,0,2017,0,29020,0,65.7,1.0
1713,1,2015,0,46155,1,67.3,1.6
1611,7,2013,0,43036,0,55.4,1.0
16830,6,2009,0,79000,1,48.7,1.8
...,...,...,...,...,...,...,...
7731,6,2019,1,8865,1,58.9,2.0
11655,0,2018,0,14173,0,58.9,1.0
10536,6,2020,1,2213,2,47.9,2.0
11885,0,2017,0,22578,0,65.7,1.0
