In [58]:
import pandas as pd

In [59]:
df = pd.read_csv('datasets/volkswagen.csv')
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,T-Roc,2019,25000,Automatic,13904,Diesel,145,49.6,2.0
1,T-Roc,2019,26883,Automatic,4562,Diesel,145,49.6,2.0
2,T-Roc,2019,20000,Manual,7414,Diesel,145,50.4,2.0
3,T-Roc,2019,33492,Automatic,4825,Petrol,145,32.5,2.0
4,T-Roc,2019,22900,Semi-Auto,6500,Petrol,150,39.8,1.5
...,...,...,...,...,...,...,...,...,...
15152,Eos,2012,5990,Manual,74000,Diesel,125,58.9,2.0
15153,Fox,2008,1799,Manual,88102,Petrol,145,46.3,1.2
15154,Fox,2009,1590,Manual,70000,Petrol,200,42.0,1.4
15155,Fox,2006,1250,Manual,82704,Petrol,150,46.3,1.2


In [60]:
df['model'] = df['model'].str.strip()

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15157 entries, 0 to 15156
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         15157 non-null  object 
 1   year          15157 non-null  int64  
 2   price         15157 non-null  int64  
 3   transmission  15157 non-null  object 
 4   mileage       15157 non-null  int64  
 5   fuelType      15157 non-null  object 
 6   tax           15157 non-null  int64  
 7   mpg           15157 non-null  float64
 8   engineSize    15157 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 1.0+ MB


In [62]:
df.isnull().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

## Importing ML Libraries

In [63]:
import numpy as np
import sklearn.metrics as m
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
# cross validation
from sklearn.model_selection import cross_val_score
# grid search
from sklearn.model_selection import GridSearchCV
# outlier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.compose import ColumnTransformer

## Feature Selection

In [64]:
X = df.drop(columns=['price'])
y = df['price']

In [65]:
X

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,T-Roc,2019,Automatic,13904,Diesel,145,49.6,2.0
1,T-Roc,2019,Automatic,4562,Diesel,145,49.6,2.0
2,T-Roc,2019,Manual,7414,Diesel,145,50.4,2.0
3,T-Roc,2019,Automatic,4825,Petrol,145,32.5,2.0
4,T-Roc,2019,Semi-Auto,6500,Petrol,150,39.8,1.5
...,...,...,...,...,...,...,...,...
15152,Eos,2012,Manual,74000,Diesel,125,58.9,2.0
15153,Fox,2008,Manual,88102,Petrol,145,46.3,1.2
15154,Fox,2009,Manual,70000,Petrol,200,42.0,1.4
15155,Fox,2006,Manual,82704,Petrol,150,46.3,1.2


## Column Extraction

In [66]:
bin_cols = ['transmission', 'fuelType']
num_cols = ['year', 'mileage', 'tax', 'mpg', 'engineSize']
cat_cols = ['model']

## Preprocessing Pipeline

In [67]:
# create pipeline
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])
binary_pipeline = Pipeline([
    ('one_hot_encoder', OrdinalEncoder())
])
cat_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(drop='first'))
])

# create column transformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('binary', binary_pipeline, bin_cols),
    ('cat', cat_pipeline, cat_cols)
])

preprocessor

## Outlier Removal
- this help to remove the outliers from the dataset

In [68]:
outlier_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('outlier', LocalOutlierFactor(n_neighbors=20, contamination=0.1))
])

outlier_pipeline

In [69]:
yhat = outlier_pipeline.fit_predict(X)
X = X[yhat==1].copy()
y = y[yhat==1].copy()

In [70]:
df = pd.concat([X, y], axis=1)

In [71]:
X# cleane data

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,T-Roc,2019,Automatic,13904,Diesel,145,49.6,2.0
2,T-Roc,2019,Manual,7414,Diesel,145,50.4,2.0
3,T-Roc,2019,Automatic,4825,Petrol,145,32.5,2.0
4,T-Roc,2019,Semi-Auto,6500,Petrol,150,39.8,1.5
5,T-Roc,2020,Manual,10,Petrol,145,42.2,1.5
...,...,...,...,...,...,...,...,...
15152,Eos,2012,Manual,74000,Diesel,125,58.9,2.0
15153,Fox,2008,Manual,88102,Petrol,145,46.3,1.2
15154,Fox,2009,Manual,70000,Petrol,200,42.0,1.4
15155,Fox,2006,Manual,82704,Petrol,150,46.3,1.2


## Model Pipeline

In [72]:
pipeline1 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor())
])
pipeline1

In [73]:
pipeline2 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])
pipeline2

In [74]:
pipeline3 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])
pipeline3

In [75]:
pipeline4 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVR())
])
pipeline4

## Training Model and Evaluation

In [76]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
model1 = pipeline1.fit(train_X, train_y)
yhat = model1.predict(test_X)
mae = m.mean_absolute_error(test_y, yhat)
mse = m.mean_squared_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)
rmse = np.sqrt(mse)
print("Decision Tree Regressor")
print(f'MAE: {mae} - tells us how far off our predictions are on average.')
print(f'MSE: {mse} - tells us how far off our predictions are on average squared.')
print(f'RMSE: {rmse} - tells us how far off our predictions are on average squared root.')
print(f'R2: {r2} - tells us how much of the variance in the target variable is explained by the model.')

Decision Tree Regressor
MAE: 1277.8726151215342 - tells us how far off our predictions are on average.
MSE: 3818353.685591945 - tells us how far off our predictions are on average squared.
RMSE: 1954.0608193175424 - tells us how far off our predictions are on average squared root.
R2: 0.9287620851373488 - tells us how much of the variance in the target variable is explained by the model.


In [77]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
model2 = pipeline2.fit(train_X, train_y)
yhat = model2.predict(test_X)
mae = m.mean_absolute_error(test_y, yhat)
mse = m.mean_squared_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)
rmse = np.sqrt(mse)
print("Random Forest Regressor")
print(f'MAE: {mae} - tells us how far off our predictions are on average.')
print(f'MSE: {mse} - tells us how far off our predictions are on average squared.')
print(f'RMSE: {rmse} - tells us how far off our predictions are on average squared root.')
print(f'R2: {r2} - tells us how much of the variance in the target variable is explained by the model.')

Random Forest Regressor
MAE: 1046.764483449213 - tells us how far off our predictions are on average.
MSE: 2358459.555511657 - tells us how far off our predictions are on average squared.
RMSE: 1535.7276957558774 - tells us how far off our predictions are on average squared root.
R2: 0.9559989055868461 - tells us how much of the variance in the target variable is explained by the model.


In [78]:
model3 = pipeline3.fit(train_X, train_y)
yhat = model3.predict(test_X)
mae = m.mean_absolute_error(test_y, yhat)
mse = m.mean_squared_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)
rmse = np.sqrt(mse)
print("Linear Regression")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')

Linear Regression
MAE: 2006.5290137623658
MSE: 7740175.097906703
RMSE: 2782.117017292174
R2: 0.8555938029713426


In [79]:
model4 = pipeline4.fit(train_X, train_y)
yhat = model4.predict(test_X)
mae = m.mean_absolute_error(test_y, yhat)
mse = m.mean_squared_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)
rmse = np.sqrt(mse)
print("Support Vector Regression")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')

Support Vector Regression
MAE: 5142.263123716703
MSE: 45767777.55414468
RMSE: 6765.188656212381
R2: 0.14612387711551933


In [80]:
df['dtree_pred'] = model1.predict(df.drop(columns=['price']))
df['rf_pred'] = model2.predict(df.drop(columns=['price']))
df['lr_pred'] = model3.predict(df.drop(columns=['price']))
df['svr_pred'] = model4.predict(df.drop(columns=['price']))

## Visualization

In [81]:
# histogram
fig = px.histogram(df, x='price', nbins=100, title='Price Distribution')
fig.show()

## Saving the model

In [82]:
import joblib

In [88]:
joblib.dump(model1, 'volkswagen/dtree_model.pkl')
joblib.dump(model2, 'volkswagen/rf_model.pkl')
joblib.dump(model3, 'volkswagen/lr_model.pkl')
joblib.dump(model4, 'volkswagen/svr_model.pkl')

['volkswagen/svr_model.pkl']

In [None]:
X.fuelType.unique()

array(['Diesel', 'Petrol', 'Hybrid', 'Other'], dtype=object)

In [None]:
X.model.unique()

array(['T-Roc', 'Golf', 'Passat', 'T-Cross', 'Polo', 'Tiguan', 'Sharan',
       'Up', 'Scirocco', 'Beetle', 'Caddy Maxi Life', 'Caravelle',
       'Touareg', 'Arteon', 'Touran', 'Golf SV', 'Amarok',
       'Tiguan Allspace', 'Shuttle', 'Jetta', 'CC', 'California', 'Caddy',
       'Caddy Maxi', 'Eos', 'Fox'], dtype=object)

In [None]:
X.transmission.unique()

array(['Automatic', 'Manual', 'Semi-Auto'], dtype=object)

In [None]:
X

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,T-Roc,2019,Automatic,13904,Diesel,145,49.6,2.0
2,T-Roc,2019,Manual,7414,Diesel,145,50.4,2.0
3,T-Roc,2019,Automatic,4825,Petrol,145,32.5,2.0
4,T-Roc,2019,Semi-Auto,6500,Petrol,150,39.8,1.5
5,T-Roc,2020,Manual,10,Petrol,145,42.2,1.5
...,...,...,...,...,...,...,...,...
15152,Eos,2012,Manual,74000,Diesel,125,58.9,2.0
15153,Fox,2008,Manual,88102,Petrol,145,46.3,1.2
15154,Fox,2009,Manual,70000,Petrol,200,42.0,1.4
15155,Fox,2006,Manual,82704,Petrol,150,46.3,1.2
