In [1]:
import pandas as pd

# Loading data

In [2]:
df = pd.read_csv('datasets/audi.csv')
df

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0
...,...,...,...,...,...,...,...,...,...
10663,A3,2020,16999,Manual,4018,Petrol,145,49.6,1.0
10664,A3,2020,16999,Manual,1978,Petrol,150,49.6,1.0
10665,A3,2020,17199,Manual,609,Petrol,150,49.6,1.0
10666,Q3,2017,19499,Automatic,8646,Petrol,150,47.9,1.4


In [4]:
df['model'] = df['model'].str.strip()

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10668 entries, 0 to 10667
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         10668 non-null  object 
 1   year          10668 non-null  int64  
 2   price         10668 non-null  int64  
 3   transmission  10668 non-null  object 
 4   mileage       10668 non-null  int64  
 5   fuelType      10668 non-null  object 
 6   tax           10668 non-null  int64  
 7   mpg           10668 non-null  float64
 8   engineSize    10668 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 750.2+ KB


In [6]:
df.isnull().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

# Importing ML libraries

In [7]:
import numpy as np
import sklearn.metrics as m
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
# cross validation
from sklearn.model_selection import cross_val_score
# grid search
from sklearn.model_selection import GridSearchCV
# outlier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.compose import ColumnTransformer

# Feature Selection

In [17]:
X = df.drop(columns=['price'])
y = df['price']

In [18]:
X.model.unique()

array(['A1', 'A6', 'A4', 'A3', 'Q3', 'Q5', 'A5', 'S4', 'Q2', 'A7', 'TT',
       'Q7', 'RS6', 'RS3', 'A8', 'Q8', 'RS4', 'RS5', 'R8', 'SQ5', 'S8',
       'SQ7', 'S3', 'S5', 'A2', 'RS7'], dtype=object)

# Column Extraction

In [19]:
bin_cols = ['transmission', 'fuelType']
num_cols = ['year', 'mileage', 'tax', 'mpg', 'engineSize']
cat_cols = ['model']

# Preprocessing Pipeline

In [20]:
# create pipeline
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])
binary_pipeline = Pipeline([
    ('one_hot_encoder', OrdinalEncoder())
])
cat_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder(drop='first'))
])

# create column transformer
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('binary', binary_pipeline, bin_cols),
    ('cat', cat_pipeline, cat_cols)
])

preprocessor

# Outlier Removal
- this help to remove the outliers from the dataset

In [21]:
outlier_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('outlier', LocalOutlierFactor(n_neighbors=20, contamination=0.1))
])

outlier_pipeline

In [22]:
yhat = outlier_pipeline.fit_predict(X)
X = X[yhat==1].copy()
y = y[yhat==1].copy()

In [23]:
df = pd.concat([X, y], axis=1)

In [24]:
X # clean data

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,A1,2017,Manual,15735,Petrol,150,55.4,1.4
1,A6,2016,Automatic,36203,Diesel,20,64.2,2.0
2,A1,2016,Manual,29946,Petrol,30,55.4,1.4
3,A4,2017,Automatic,25952,Diesel,145,67.3,2.0
4,A3,2019,Manual,1998,Petrol,145,49.6,1.0
...,...,...,...,...,...,...,...,...
10660,A4,2011,Automatic,78000,Diesel,305,39.8,3.0
10661,A4,2011,Manual,95000,Diesel,145,53.3,2.0
10662,A3,2013,Manual,31500,Petrol,125,53.3,1.4
10663,A3,2020,Manual,4018,Petrol,145,49.6,1.0


# Model Pipeline

In [17]:
pipeline1 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor())
])
pipeline1

In [18]:
pipeline2 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])
pipeline2

In [19]:
pipeline3 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])
pipeline3

In [25]:
pipeline4 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVR())
])
pipeline4

# Training Model and Evaluation

In [27]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
model1 = pipeline1.fit(train_X, train_y)
yhat = model1.predict(test_X)
mae = m.mean_absolute_error(test_y, yhat)
mse = m.mean_squared_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)
rmse = np.sqrt(mse)
print("Decision Tree Regressor")
print(f'MAE: {mae} - tells us how far off our predictions are on average.')
print(f'MSE: {mse} - tells us how far off our predictions are on average squared.')
print(f'RMSE: {rmse} - tells us how far off our predictions are on average squared root.')
print(f'R2: {r2} - tells us how much of the variance in the target variable is explained by the model.')


NameError: name 'pipeline1' is not defined

In [22]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
model2 = pipeline2.fit(train_X, train_y)
yhat = model2.predict(test_X)
mae = m.mean_absolute_error(test_y, yhat)
mse = m.mean_squared_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)
rmse = np.sqrt(mse)
print("Random Forest Regressor")
print(f'MAE: {mae} - tells us how far off our predictions are on average.')
print(f'MSE: {mse} - tells us how far off our predictions are on average squared.')
print(f'RMSE: {rmse} - tells us how far off our predictions are on average squared root.')
print(f'R2: {r2} - tells us how much of the variance in the target variable is explained by the model.')


Random Forest Regressor
MAE: 1505.3998064111404 - tells us how far off our predictions are on average.
MSE: 5934427.400321242 - tells us how far off our predictions are on average squared.
RMSE: 2436.068020462738 - tells us how far off our predictions are on average squared root.
R2: 0.9588187831386212 - tells us how much of the variance in the target variable is explained by the model.


In [23]:
model3 = pipeline3.fit(train_X, train_y)
yhat = model3.predict(test_X)
mae = m.mean_absolute_error(test_y, yhat)
mse = m.mean_squared_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)
rmse = np.sqrt(mse)
print("Linear Regression")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')

Linear Regression
MAE: 2624.2085742603904
MSE: 16655107.150067251
RMSE: 4081.0669132063067
R2: 0.884423966605558


In [28]:
model4 = pipeline4.fit(train_X, train_y)
yhat = model4.predict(test_X)
mae = m.mean_absolute_error(test_y, yhat)
mse = m.mean_squared_error(test_y, yhat)
r2 = m.r2_score(test_y, yhat)
rmse = np.sqrt(mse)
print("Support Vector Regression")
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')
print(f'R2: {r2}')

Support Vector Regression
MAE: 7546.553371800069
MSE: 142253657.7393592
RMSE: 11927.013781301639
R2: 0.01284853053020929


In [25]:
df['dtree_pred'] = model1.predict(df.drop(columns=['price']))
df['rf_pred'] = model2.predict(df.drop(columns=['price']))
df['lr_pred'] = model3.predict(df.drop(columns=['price']))
df['svr_pred'] = model4.predict(df.drop(columns=['price']))

# visualization

In [26]:
#histogram
fig = px.histogram(df, x='price', nbins=100, title='Price Distribution')
fig.show()


# saving the model

In [33]:
import joblib

In [34]:
joblib.dump(model1, 'audi/dtree_model.pkl')
joblib.dump(model2, 'audi/rf_model.pkl')
joblib.dump(model3, 'audi/lr_model.pkl')
joblib.dump(model4, 'audi/svr_model.pkl')


['audi/svr_model.pkl']

In [29]:
X.fuelType.unique()

array(['Petrol', 'Diesel', 'Hybrid'], dtype=object)

In [30]:
X

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,A1,2017,Manual,15735,Petrol,150,55.4,1.4
1,A6,2016,Automatic,36203,Diesel,20,64.2,2.0
2,A1,2016,Manual,29946,Petrol,30,55.4,1.4
3,A4,2017,Automatic,25952,Diesel,145,67.3,2.0
4,A3,2019,Manual,1998,Petrol,145,49.6,1.0
...,...,...,...,...,...,...,...,...
10660,A4,2011,Automatic,78000,Diesel,305,39.8,3.0
10661,A4,2011,Manual,95000,Diesel,145,53.3,2.0
10662,A3,2013,Manual,31500,Petrol,125,53.3,1.4
10663,A3,2020,Manual,4018,Petrol,145,49.6,1.0


In [15]:
X = pd.DataFrame({'year': {0: '2001'},
 'transmission': {0: 'Manual'}, 
 'mileage': {0: '30'}, 
 'fuelType': {0: 'Petrol'}, 
 'tax': {0: '100'}, 
 'mpg': {0: '23'}, 
 'engineSize': {0: '12'}
})

In [30]:
X = pd.DataFrame({'model': {0: 'RS5'}, 'year': {0: '2002'}, 'transmission': {0: 'Automatic'}, 'mileage': {0: '39'}, 'fuelType': {0: 'Petrol'}, 'tax': {0: '29'}, 'mpg': {0: '29'}, 'engineSize': {0: '100'}})
X

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize
0,RS5,2002,Automatic,39,Petrol,29,29,100


In [31]:
model4.predict(X)

array([19900.39844757])