In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('datasets/merc.csv')
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,SLK,2005,5200,Automatic,63000,Petrol,325,32.1,1.8
1,S Class,2017,34948,Automatic,27000,Hybrid,20,61.4,2.1
2,SL CLASS,2016,49948,Automatic,6200,Petrol,555,28.0,5.5
3,G Class,2016,61948,Automatic,16000,Petrol,325,30.4,4.0
4,G Class,2016,73948,Automatic,4000,Petrol,325,30.1,4.0
...,...,...,...,...,...,...,...,...,...
13114,C Class,2020,35999,Automatic,500,Diesel,145,55.4,2.0
13115,B Class,2020,24699,Automatic,2500,Diesel,145,55.4,2.0
13116,GLC Class,2019,30999,Automatic,11612,Diesel,145,41.5,2.1
13117,CLS Class,2019,37990,Automatic,2426,Diesel,145,45.6,2.0


In [5]:
df['model'] = df['model'].str.strip()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13119 entries, 0 to 13118
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         13119 non-null  object 
 1   year          13119 non-null  int64  
 2   price         13119 non-null  int64  
 3   transmission  13119 non-null  object 
 4   mileage       13119 non-null  int64  
 5   fuelType      13119 non-null  object 
 6   tax           13119 non-null  int64  
 7   mpg           13119 non-null  float64
 8   engineSize    13119 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 922.6+ KB


In [7]:
df.isnull().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

## Importing ML Libraries

In [8]:
import numpy as np
import sklearn.metrics as m
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
# cross validation
from sklearn.model_selection import cross_val_score
# grid search
from sklearn.model_selection import GridSearchCV
# outlier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.compose import ColumnTransformer

## Feature Selection

In [9]:
X = df.drop(columns=['price'])
y = df['price']

In [10]:
X #cleaned data

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,SLK,2005,Automatic,63000,Petrol,325,32.1,1.8
1,S Class,2017,Automatic,27000,Hybrid,20,61.4,2.1
2,SL CLASS,2016,Automatic,6200,Petrol,555,28.0,5.5
3,G Class,2016,Automatic,16000,Petrol,325,30.4,4.0
4,G Class,2016,Automatic,4000,Petrol,325,30.1,4.0
...,...,...,...,...,...,...,...,...
13114,C Class,2020,Automatic,500,Diesel,145,55.4,2.0
13115,B Class,2020,Automatic,2500,Diesel,145,55.4,2.0
13116,GLC Class,2019,Automatic,11612,Diesel,145,41.5,2.1
13117,CLS Class,2019,Automatic,2426,Diesel,145,45.6,2.0


### Column Extraction

In [11]:
bin_cols = ['transmission', 'fuelType']
num_cols = ['year', 'mileage', 'tax', 'mpg', 'engineSize']
cat_cols = ['model']

## Preprocessing Pipeline

In [12]:
# create pipeline
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])
binary_pipeline = Pipeline([
    ('one_hot_encoder', OrdinalEncoder())
])
cat_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(drop='first'))
])

# create column transformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('binary', binary_pipeline, bin_cols),
    ('cat', cat_pipeline, cat_cols)
])

preprocessor

## Outlier Removal
- this help to remove the outliers from the dataset

In [13]:
outlier_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('outlier', LocalOutlierFactor(n_neighbors=20, contamination=0.1))
])

outlier_pipeline

In [14]:
yhat = outlier_pipeline.fit_predict(X)
X = X[yhat==1].copy()
y = y[yhat==1].copy()

In [15]:
df = pd.concat([X, y], axis=1)

In [16]:
X#clean data

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,SLK,2005,Automatic,63000,Petrol,325,32.1,1.8
2,SL CLASS,2016,Automatic,6200,Petrol,555,28.0,5.5
3,G Class,2016,Automatic,16000,Petrol,325,30.4,4.0
4,G Class,2016,Automatic,4000,Petrol,325,30.1,4.0
5,SL CLASS,2011,Automatic,3000,Petrol,570,21.4,6.2
...,...,...,...,...,...,...,...,...
13109,A Class,2017,Automatic,20477,Diesel,145,68.9,2.1
13111,B Class,2019,Automatic,15257,Petrol,145,45.6,1.3
13112,C Class,2019,Automatic,5000,Diesel,145,61.4,2.0
13116,GLC Class,2019,Automatic,11612,Diesel,145,41.5,2.1


## Model Pipeline

In [17]:
pipeline1 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor())
])
pipeline1

In [18]:
pipeline2 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])
pipeline2

In [19]:
pipeline3 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])
pipeline3

In [20]:
pipeline4 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVR())
])
pipeline4

## Training Model and Evaluation

In [21]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
model1 = pipeline1.fit(train_X, train_y)
yhat = model1.predict(test_X)
mae = m.mean_absolute_error(test_y, yhat)
mse = m.mean_squared_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)
rmse = np.sqrt(mse)
print("Decision Tree Regressor")
print(f'MAE: {mae} - tells us how far off our predictions are on average.')
print(f'MSE: {mse} - tells us how far off our predictions are on average squared.')
print(f'RMSE: {rmse} - tells us how far off our predictions are on average squared root.')
print(f'R2: {r2} - tells us how much of the variance in the target variable is explained by the model.')

Decision Tree Regressor
MAE: 1909.109048621432 - tells us how far off our predictions are on average.
MSE: 10370161.4794259 - tells us how far off our predictions are on average squared.
RMSE: 3220.273510033876 - tells us how far off our predictions are on average squared root.
R2: 0.9252863761564654 - tells us how much of the variance in the target variable is explained by the model.


In [22]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
model2 = pipeline2.fit(train_X, train_y)
yhat = model2.predict(test_X)
mae = m.mean_absolute_error(test_y, yhat)
mse = m.mean_squared_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)
rmse = np.sqrt(mse)
print("Random Forest Regressor")
print(f'MAE: {mae} - tells us how far off our predictions are on average.')
print(f'MSE: {mse} - tells us how far off our predictions are on average squared.')
print(f'RMSE: {rmse} - tells us how far off our predictions are on average squared root.')
print(f'R2: {r2} - tells us how much of the variance in the target variable is explained by the model.')


Random Forest Regressor
MAE: 1556.0319770274152 - tells us how far off our predictions are on average.
MSE: 7128741.265443634 - tells us how far off our predictions are on average squared.
RMSE: 2669.970274262175 - tells us how far off our predictions are on average squared root.
R2: 0.948639749299861 - tells us how much of the variance in the target variable is explained by the model.


In [23]:
model3 = pipeline3.fit(train_X, train_y)
yhat = model3.predict(test_X)
mae = m.mean_absolute_error(test_y, yhat)
mse = m.mean_squared_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)
rmse = np.sqrt(mse)
print("Linear Regression")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')

Linear Regression
MAE: 3528.334328734736
MSE: 30906946.252077095
RMSE: 5559.401609173157
R2: 0.777325554571994


In [24]:
model4 = pipeline4.fit(train_X, train_y)
yhat = model4.predict(test_X)
mae = m.mean_absolute_error(test_y, yhat)
mse = m.mean_squared_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)
rmse = np.sqrt(mse)
print("Support Vector Regression")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')

Support Vector Regression
MAE: 7568.611506277129
MSE: 137092080.9659243
RMSE: 11708.632753909582
R2: 0.012296366885262033


In [30]:
df['dtree_pred'] = model1.predict(df.drop(columns=['price']))
df['rf_pred'] = model2.predict(df.drop(columns=['price']))
df['lr_pred'] = model3.predict(df.drop(columns=['price']))
df['svr_pred'] = model4.predict(df.drop(columns=['price']))

## visualization

In [26]:
# histogram
fig = px.histogram(df, x='price', nbins=100, title='Price Distribution')
fig.show()

## saving the model

In [27]:
import joblib

In [29]:
joblib.dump(model1, 'merc/dtree_model.pkl')
joblib.dump(model2, 'merc/rf_model.pkl')
joblib.dump(model3, 'merc/lr_model.pkl')
joblib.dump(model4, 'merc/svr_model.pkl')

['merc/svr_model.pkl']

In [31]:
X.fuelType.unique()

array(['Petrol', 'Diesel', 'Hybrid', 'Other'], dtype=object)

In [33]:
X.model.unique()

array(['SLK', 'SL CLASS', 'G Class', 'GLE Class', 'S Class', 'GLA Class',
       'A Class', 'B Class', 'GLC Class', 'C Class', 'E Class',
       'GL Class', 'CLS Class', 'CLC Class', 'CLA Class', 'V Class',
       'M Class', 'CL Class', 'GLS Class', 'X-CLASS', 'GLB Class', 'CLK',
       'R Class'], dtype=object)

In [34]:
X.transmission.unique()

array(['Automatic', 'Manual', 'Semi-Auto'], dtype=object)

In [35]:
X.mileage.unique()

array([63000,  6200, 16000, ..., 20477, 11612,  2075], dtype=int64)

In [32]:
X

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,SLK,2005,Automatic,63000,Petrol,325,32.1,1.8
2,SL CLASS,2016,Automatic,6200,Petrol,555,28.0,5.5
3,G Class,2016,Automatic,16000,Petrol,325,30.4,4.0
4,G Class,2016,Automatic,4000,Petrol,325,30.1,4.0
5,SL CLASS,2011,Automatic,3000,Petrol,570,21.4,6.2
...,...,...,...,...,...,...,...,...
13109,A Class,2017,Automatic,20477,Diesel,145,68.9,2.1
13111,B Class,2019,Automatic,15257,Petrol,145,45.6,1.3
13112,C Class,2019,Automatic,5000,Diesel,145,61.4,2.0
13116,GLC Class,2019,Automatic,11612,Diesel,145,41.5,2.1
