In [1]:
import pandas as pd

## Loading data

In [2]:
# Read the raw listings; the path is relative to the current working directory.
df = pd.read_csv(
    'datasets/hyundi.csv',
)
# Bare expression so the notebook renders the (truncated) frame.
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax(£),mpg,engineSize
0,I20,2017,7999,Manual,17307,Petrol,145,58.9,1.2
1,Tucson,2016,14499,Automatic,25233,Diesel,235,43.5,2.0
2,Tucson,2016,11399,Manual,37877,Diesel,30,61.7,1.7
3,I10,2016,6499,Manual,23789,Petrol,20,60.1,1.0
4,IX35,2015,10199,Manual,33177,Diesel,160,51.4,2.0
...,...,...,...,...,...,...,...,...,...
4855,I30,2016,8680,Manual,25906,Diesel,0,78.4,1.6
4856,I40,2015,7830,Manual,59508,Diesel,30,65.7,1.7
4857,I10,2017,6830,Manual,13810,Petrol,20,60.1,1.0
4858,Tucson,2018,13994,Manual,23313,Petrol,145,44.8,1.6


In [3]:
# Trim surrounding whitespace from the model labels (non-mutating form).
df = df.assign(model=lambda frame: frame['model'].str.strip())

In [4]:
# Print column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4860 entries, 0 to 4859
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         4860 non-null   object 
 1   year          4860 non-null   int64  
 2   price         4860 non-null   int64  
 3   transmission  4860 non-null   object 
 4   mileage       4860 non-null   int64  
 5   fuelType      4860 non-null   object 
 6   tax(£)        4860 non-null   int64  
 7   mpg           4860 non-null   float64
 8   engineSize    4860 non-null   float64
dtypes: float64(2), int64(4), object(3)
memory usage: 341.8+ KB


## Importing ML libraries

In [5]:
import os

import numpy as np
import sklearn.metrics as m
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
# cross validation
from sklearn.model_selection import cross_val_score
# grid search
from sklearn.model_selection import GridSearchCV
# outlier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.compose import ColumnTransformer

## Feature Selection

In [6]:
# Target is the listing price; every other column is kept as a predictor.
y = df['price']
X = df.drop(columns='price')

In [7]:
# Give the tax column a plain name; rebinding X avoids mutating it in place.
X = X.rename(columns={'tax(£)': 'tax'})

## Column Extraction

In [8]:
# Column groups consumed by the preprocessing ColumnTransformer.
# Numeric predictors.
num_cols = [
    'year',
    'mileage',
    'tax',
    'mpg',
    'engineSize',
]
# Nominal predictor with many levels.
cat_cols = ['model']
# NOTE: despite the name these are not binary -- the unique() checks at the end
# of the notebook list three values for each of them.
bin_cols = [
    'transmission',
    'fuelType',
]

## Preprocessing Pipeline

In [9]:
# Preprocessing: one sub-pipeline per column group, combined below.

# Numeric columns are standardised (zero mean, unit variance).
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])

# transmission / fuelType are mapped to integer codes. The step is labelled for
# what it actually is (an OrdinalEncoder, not a one-hot encoder). Categories not
# seen during fit are encoded as -1 instead of raising at transform time.
# NOTE(review): ordinal codes impose an arbitrary order on nominal values;
# consider one-hot encoding these columns for the linear / SVR models.
binary_pipeline = Pipeline([
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Car model is one-hot encoded with the first level dropped. A model name that
# was not seen during fit becomes an all-zero row (scikit-learn emits a warning)
# rather than making transform()/predict() fail.
cat_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

# Route each column group through its own sub-pipeline; columns that are not
# listed are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('binary', binary_pipeline, bin_cols),
    ('cat', cat_pipeline, cat_cols)
])

preprocessor

## Outlier Removal
- This step helps to remove outliers from the dataset.

In [10]:
# Local Outlier Factor applied to the preprocessed features; contamination=0.1
# means roughly 10% of the rows are labelled as outliers.
outlier_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('outlier', LocalOutlierFactor(contamination=0.1, n_neighbors=20)),
    ]
)

outlier_pipeline

In [11]:
# fit_predict labels each row 1 (inlier) or -1 (outlier); keep only the inliers.
# NOTE(review): this filter runs before the train/test split further down, so
# the held-out rows are filtered as well. Re-running this cell filters X again.
yhat = outlier_pipeline.fit_predict(X)
inlier_mask = (yhat == 1)
X, y = X[inlier_mask].copy(), y[inlier_mask].copy()

In [12]:
# Rebuild df from the filtered features with the price column appended last.
df = X.assign(price=y)

In [13]:
X # feature frame after the outlier filter (rows labelled 1 by the LOF pipeline)

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,I20,2017,Manual,17307,Petrol,145,58.9,1.2
1,Tucson,2016,Automatic,25233,Diesel,235,43.5,2.0
2,Tucson,2016,Manual,37877,Diesel,30,61.7,1.7
3,I10,2016,Manual,23789,Petrol,20,60.1,1.0
4,IX35,2015,Manual,33177,Diesel,160,51.4,2.0
...,...,...,...,...,...,...,...,...
4854,Santa Fe,2019,Semi-Auto,1567,Diesel,145,39.8,2.2
4855,I30,2016,Manual,25906,Diesel,0,78.4,1.6
4856,I40,2015,Manual,59508,Diesel,30,65.7,1.7
4857,I10,2017,Manual,13810,Petrol,20,60.1,1.0


## Model Pipeline

In [14]:
# Decision-tree regressor on top of the shared preprocessing recipe.
# clone() gives this pipeline its own unfitted copy of the preprocessor, so
# fitting another pipeline cannot silently refit this one's transformer.
# random_state pins the tree's internal feature shuffling so results repeat.
pipeline1 = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', DecisionTreeRegressor(random_state=42))
])
pipeline1

In [15]:
# Random-forest regressor on top of the shared preprocessing recipe.
# clone() gives this pipeline its own unfitted copy of the preprocessor, so
# fitting another pipeline cannot silently refit this one's transformer.
# random_state pins bootstrap sampling / feature selection so results repeat.
pipeline2 = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', RandomForestRegressor(random_state=42))
])
pipeline2

In [16]:
# Ordinary least-squares baseline on top of the shared preprocessing recipe.
# clone() gives this pipeline its own unfitted copy of the preprocessor, so
# fitting another pipeline cannot silently refit this one's transformer.
pipeline3 = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', LinearRegression())
])
pipeline3

In [17]:
# Support-vector regressor (default hyper-parameters) on top of the shared
# preprocessing recipe.
# clone() gives this pipeline its own unfitted copy of the preprocessor, so
# fitting another pipeline cannot silently refit this one's transformer.
# NOTE(review): the recorded run of this model scores R2 of about 0.06, far
# below the other models. Presumably the default C / epsilon are badly scaled
# for a price target in the thousands -- consider scaling the target or tuning
# C. TODO confirm.
pipeline4 = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('model', SVR())
])
pipeline4

## Training Model and Evaluation

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the decision-tree pipeline and score it on the held-out rows.
model1 = pipeline1.fit(train_X, train_y)
yhat = model1.predict(test_X)

mse = m.mean_squared_error(test_y, yhat)
rmse = np.sqrt(mse)
mae = m.mean_absolute_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)

print("Decision Tree Regressor")
print(
    f'MAE: {mae} - tells us how far off our predictions are on average.',
    f'MSE: {mse} - tells us how far off our predictions are on average squared.',
    f'RMSE: {rmse} - tells us how far off our predictions are on average squared root.',
    f'R2: {r2} - tells us how much of the variance in the target variable is explained by the model.',
    sep='\n',
)

Decision Tree Regressor
MAE: 991.6858476190476 - tells us how far off our predictions are on average.
MSE: 2072835.1134085716 - tells us how far off our predictions are on average squared.
RMSE: 1439.7343898818879 - tells us how far off our predictions are on average squared root.
R2: 0.9343077851963322 - tells us how much of the variance in the target variable is explained by the model.


In [19]:
# Same 80/20 split as for the decision tree (identical seed, identical rows).
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the random-forest pipeline and score it on the held-out rows.
model2 = pipeline2.fit(train_X, train_y)
yhat = model2.predict(test_X)

mse = m.mean_squared_error(test_y, yhat)
rmse = np.sqrt(mse)
mae = m.mean_absolute_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)

print("Random Forest Regressor")
print(
    f'MAE: {mae} - tells us how far off our predictions are on average.',
    f'MSE: {mse} - tells us how far off our predictions are on average squared.',
    f'RMSE: {rmse} - tells us how far off our predictions are on average squared root.',
    f'R2: {r2} - tells us how much of the variance in the target variable is explained by the model.',
    sep='\n',
)

Random Forest Regressor
MAE: 801.1268724389392 - tells us how far off our predictions are on average.
MSE: 1313643.68411279 - tells us how far off our predictions are on average squared.
RMSE: 1146.1429597187212 - tells us how far off our predictions are on average squared root.
R2: 0.9583680522806691 - tells us how much of the variance in the target variable is explained by the model.


In [20]:
# Fit the linear-regression pipeline on the split created in the earlier
# evaluation cells and score it on the held-out rows.
model3 = pipeline3.fit(train_X, train_y)
yhat = model3.predict(test_X)

mse = m.mean_squared_error(test_y, yhat)
rmse = np.sqrt(mse)
mae = m.mean_absolute_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)

print("Linear Regression")
print(
    f'MAE: {mae}',
    f'MSE: {mse}',
    f'RMSE: {rmse}',
    f'R2: {r2}',
    sep='\n',
)

Linear Regression
MAE: 1412.3924414160706
MSE: 4113663.97286276
RMSE: 2028.2169442302666
R2: 0.8696299114254995


In [21]:
# Fit the SVR pipeline on the split created in the earlier evaluation cells
# and score it on the held-out rows.
model4 = pipeline4.fit(train_X, train_y)
yhat = model4.predict(test_X)

mse = m.mean_squared_error(test_y, yhat)
rmse = np.sqrt(mse)
mae = m.mean_absolute_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)

print("Support Vector Regression")
print(
    f'MAE: {mae}',
    f'MSE: {mse}',
    f'RMSE: {rmse}',
    f'R2: {r2}',
    sep='\n',
)

Support Vector Regression
MAE: 4185.244386739116
MSE: 29796364.419550024
RMSE: 5458.604622021092
R2: 0.05569470569286383


In [22]:
# Attach each fitted model's predictions to df as a new column.
# df holds every filtered row, so these predictions cover both the rows the
# models were trained on and the held-out rows.
fitted_models = {
    'dtree_pred': model1,
    'rf_pred': model2,
    'lr_pred': model3,
    'svr_pred': model4,
}

# Build the feature frame once, excluding the target and any prediction columns
# left over from a previous run of this cell, so no model is handed another
# model's predictions as extra input columns.
features = df.drop(columns=['price', *fitted_models], errors='ignore')

for pred_col, fitted in fitted_models.items():
    df[pred_col] = fitted.predict(features)

## Visualization

In [23]:
# histogram
fig = px.histogram(df, x='price', nbins=100, title='Price Distribution')
fig.show()

## Saving the models

In [24]:
import joblib

In [25]:
# Persist the four fitted pipelines (preprocessing + estimator) to disk.
# joblib.dump does not create missing directories, so make sure the target
# folder exists first; exist_ok keeps the cell safe to re-run.
os.makedirs('hyundi', exist_ok=True)

joblib.dump(model1, 'hyundi/dtree_model.pkl')
joblib.dump(model2, 'hyundi/rf_model.pkl')
joblib.dump(model3, 'hyundi/lr_model.pkl')
joblib.dump(model4, 'hyundi/svr_model.pkl')

['hyundi/svr_model.pkl']

In [26]:
# Distinct fuel types remaining after the outlier filter.
X['fuelType'].unique()

array(['Petrol', 'Diesel', 'Hybrid'], dtype=object)

In [27]:
# Display the filtered feature frame (same frame as shown after outlier removal).
X

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,I20,2017,Manual,17307,Petrol,145,58.9,1.2
1,Tucson,2016,Automatic,25233,Diesel,235,43.5,2.0
2,Tucson,2016,Manual,37877,Diesel,30,61.7,1.7
3,I10,2016,Manual,23789,Petrol,20,60.1,1.0
4,IX35,2015,Manual,33177,Diesel,160,51.4,2.0
...,...,...,...,...,...,...,...,...
4854,Santa Fe,2019,Semi-Auto,1567,Diesel,145,39.8,2.2
4855,I30,2016,Manual,25906,Diesel,0,78.4,1.6
4856,I40,2015,Manual,59508,Diesel,30,65.7,1.7
4857,I10,2017,Manual,13810,Petrol,20,60.1,1.0


In [28]:
# Distinct car models remaining after the outlier filter, as a plain list.
# Bracket access avoids any clash between a column name and a DataFrame attribute.
X['model'].unique().tolist()

['I20',
 'Tucson',
 'I10',
 'IX35',
 'I30',
 'I40',
 'Ioniq',
 'Kona',
 'I800',
 'IX20',
 'Veloster',
 'Santa Fe',
 'Getz',
 'Amica']

In [29]:
# Distinct transmission types remaining after the outlier filter.
X['transmission'].unique()

array(['Manual', 'Automatic', 'Semi-Auto'], dtype=object)