# Introduction

Our project aims to identify an optimal pricing model using regression techniques to quantify what the reasonable price range of a car would be, which Atlas Motors would use for acquiring used cars for their rental fleet. Since there is complexity in determining the prices of cars due to various factors like COE, OMV and Sales Upselling, our model will reduce the frustrations and time consumption of used car purchases. 

In this study, we would be utilizing data science processes from data collection (web-scraping: BeautifulSoup, Python), data cleaning, exploratory data analysis to the model training and testing stage. The source of data comes SgCarMart, an online car sales portal in Singapore. 

In [5]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV

import  scipy.signal.signaltools

def _centered(arr, newsize):
    # Return the center newsize portion of the array.
    newsize = np.asarray(newsize)
    currsize = np.array(arr.shape)
    startind = (currsize - newsize) // 2
    endind = startind + newsize
    myslice = [slice(startind[k], endind[k]) for k in range(len(endind))]
    return arr[tuple(myslice)]

scipy.signal.signaltools._centered = _centered

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import accuracy_score
%matplotlib inline
sns.set_style('whitegrid')

In [6]:
df_clean_log = pd.read_csv('clean_log_data.csv')

In [14]:
df_clean_log

Unnamed: 0,PRICE,DEPRE_VALUE_PER_YEAR,MILEAGE_KM,DEREG_VALUE_FROM_SCRAPE_DATE,OMV,COE_FROM_SCRAPE_DATE,DAYS_OF_COE_LEFT,ENGINE_CAPACITY_CC,CURB_WEIGHT_KG,NO_OF_OWNERS,...,VEHICLE_TYPE_Others,VEHICLE_TYPE_SUV,VEHICLE_TYPE_Sports Car,VEHICLE_TYPE_Stationwagon,CAR_CATEGORY_ECONOMY,CAR_CATEGORY_EXOTIC,CAR_CATEGORY_LUXURY,CAR_CATEGORY_MID_LEVEL,CAR_CATEGORY_OTHERS,CAR_CATEGORY_ULTRA_LUXURY
0,80800.0,10.022159,11.703546,10.856920,10.877273,10.913633,762.0,7.596392,7.426549,2.0,...,0,0,0,0,0,0,1,0,0,0
1,130800.0,9.947026,10.275051,10.732214,10.335270,10.488660,1976.0,7.494430,7.265430,2.0,...,0,0,1,0,0,0,0,1,0,0
2,222800.0,10.292146,10.325482,11.015690,10.534121,10.604132,2466.0,7.821242,7.560080,2.0,...,0,0,0,0,1,0,0,0,0,0
3,98800.0,9.626416,12.358794,9.992871,10.588628,10.421329,2375.0,7.592870,7.387090,3.0,...,0,0,0,0,0,0,1,0,0,0
4,88000.0,9.658865,11.695247,10.346120,9.853772,10.424927,1827.0,7.599902,7.323171,2.0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10306,106800.0,9.540363,11.898188,10.706296,10.893512,10.970334,2798.0,7.493317,7.408531,4.0,...,0,0,0,0,0,0,1,0,0,0
10307,135800.0,9.694000,11.374996,11.145031,9.897168,11.156279,2827.0,7.598399,7.370231,1.0,...,0,1,0,0,1,0,0,0,0,0
10308,162000.0,9.815749,12.013701,11.463483,11.347555,11.586659,2920.0,8.188133,7.652071,2.0,...,0,1,0,0,0,0,0,0,0,1
10309,98800.0,9.552511,11.599103,10.307218,10.725490,10.661556,2559.0,7.999343,7.293018,6.0,...,0,0,1,0,0,0,1,0,0,0


In [7]:
x_var=df_clean_log.drop('PRICE', axis=1)


y= df_clean_log['PRICE']

test_size = 0.20
seed = 7
X_train, X_test, y_train, y_test = train_test_split(x_var, y, test_size=test_size, random_state=seed)

In [8]:
num_folds = 10
seed = 7
scoring = 'neg_mean_squared_error'


In [9]:
import warnings
warnings.filterwarnings("ignore")
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = ExtraTreesRegressor(n_estimators=400)
model.fit(rescaledX, y_train)
# transform the validation dataset
rescaledValidationX = scaler.transform(X_test)
predictions = model.predict(rescaledValidationX)
print("MSE : {}".format(round(mean_squared_error(y_test, predictions), 3)))
print("RMSE : {}".format(round(np.sqrt(mean_squared_error(y_test, predictions)), 3)))
print("R squared error : {}".format(round(r2_score(y_test,predictions), 3)))

MSE : 1331152023.536
RMSE : 36484.956
R squared error : 0.912
