# Predicting used car prices in the Canadian market

- what problem are we solving?
- which dataset are we using? (min 50k row, 5 columns)
- EDA on the dataset 
- Train and evaluate ML algorithms  (min 2 ML algos, 2 hyperparams for each algo)

In [None]:
# prepare environment

!pip install opendatasets pandas numpy matplotlib seaborn tqdm sklearn --quiet
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import matplotlib
from tqdm.notebook import tqdm
import opendatasets as od
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 18
matplotlib.rcParams['figure.figsize'] = (18, 10)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

In [None]:
# download data
# The Kaggle download is skipped when the target directory is already present.
filepath = '.\\data\\marketcheck-automotive-data-us-canada'
url = 'https://www.kaggle.com/rupeshraundal/marketcheck-automotive-data-us-canada?select=ca-dealers-used.csv'
already_downloaded = os.path.exists(filepath)
if not already_downloaded:
    od.download_kaggle_dataset(url, filepath)

In [None]:
# read csv data
# low_memory=False makes pandas read the file in one pass instead of chunk by chunk.
csv_path = filepath + '\\ca-dealers-used.csv'
canada_ds = pd.read_csv(csv_path, low_memory=False)
canada_ds.head(3)

In [None]:
# drop useless columns
# Show the available columns first, then remove the ones that are not used below.
print(canada_ds.columns)
drop_columns = [
    'id', 'vin', 'stock_no', 'seller_name', 'street', 'trim', 'engine_size',
    'zip', 'body_type', 'fuel_type', 'engine_block', 'vehicle_type',
]
canada_ds.drop(columns=drop_columns, inplace=True)

In [None]:
# Summarize the remaining columns: dtypes and non-null counts.
canada_ds.info()

In [None]:
# Count the missing values in each column.
canada_ds.isna().sum()

In [None]:
# Remove every row that contains a missing value; canada_ds is modified in place.
canada_ds.dropna(inplace=True)

In [None]:
# Display the frame (pandas truncates the rendering of large frames).
canada_ds

In [None]:
#fix state information
# List the distinct values of the `state` column to spot unexpected codes.
canada_ds.state.unique()

In [None]:
# Clean up the codes in the `state` column.
# NOTE(review): 'NC', 'OH' and 'SC' are not Canadian province codes. Rows tagged
# 'NC' are discarded; 'OH' and 'SC' are presumably typos for 'ON' and 'QC'
# -- TODO confirm against the raw listings and explain why here.
canada_ds.drop(index=canada_ds[canada_ds.state == 'NC'].index, inplace=True)
# Assign the result back to the column. Calling `replace(..., inplace=True)` on
# `canada_ds['state']` is a chained in-place update: it acts on a temporary
# Series and leaves the frame unchanged once pandas Copy-on-Write is active.
canada_ds['state'] = canada_ds['state'].replace({'OH': 'ON', 'SC': 'QC'})

In [None]:
# create Age column 
# Age in years relative to the hard-coded reference year 2022.
# NOTE(review): presumably the year the listings were collected -- confirm.
canada_ds['age'] = 2022- canada_ds['year'] 

In [None]:
# filter very expensive, very old and very high-mileage cars (outliers)
# A row is kept only when it satisfies all three upper bounds.
keep_rows = (
    (canada_ds['age'] < 25)
    & (canada_ds['price'] < 100000)
    & (canada_ds['miles'] < 200000)
)
canada_ds = canada_ds[keep_rows]
canada_ds

In [None]:
#some graphs
# Price against mileage, one point per listing, coloured by vehicle age.
matplotlib.rcParams['figure.figsize'] = (15, 15)
fig = sns.scatterplot(data=canada_ds, x='miles', y='price', hue='age', s=10);

In [None]:
# The 30 makes with the highest mean price.
matplotlib.rcParams['figure.figsize'] = (15, 15)
column = 'make'
graph = (
    canada_ds.groupby(column)['price']
    .mean()
    .sort_values(ascending=False)
    .head(30)
)
fig = sns.barplot(x=graph.values, y=graph.index);

In [None]:
# The 30 models with the highest mean price.
matplotlib.rcParams['figure.figsize'] = (15, 15)
column = 'model'
graph = (
    canada_ds.groupby(column)['price']
    .mean()
    .sort_values(ascending=False)
    .head(30)
)
fig = sns.barplot(x=graph.values, y=graph.index);

In [None]:
# The 30 makes with the most listings (count of non-null prices per make).
matplotlib.rcParams['figure.figsize'] = (15, 15)
column = 'make'
graph = (
    canada_ds.groupby(column)['price']
    .count()
    .sort_values(ascending=False)
    .head(30)
)
fig = sns.barplot(x=graph.values, y=graph.index);

In [None]:
# The 30 models with the most listings (count of non-null prices per model).
matplotlib.rcParams['figure.figsize'] = (15, 15)
column = 'model'
graph = (
    canada_ds.groupby(column)['price']
    .count()
    .sort_values(ascending=False)
    .head(30)
)
fig = sns.barplot(x=graph.values, y=graph.index);

In [None]:
# Mean price for each drivetrain, highest first.
matplotlib.rcParams['figure.figsize'] = (10, 1)
column = 'drivetrain'
graph = (
    canada_ds.groupby(column)['price']
    .mean()
    .sort_values(ascending=False)
)
fig = sns.barplot(x=graph.values, y=graph.index);

In [None]:
# Number of listings for each drivetrain, most common first.
matplotlib.rcParams['figure.figsize'] = (10, 1)
column = 'drivetrain'
graph = (
    canada_ds.groupby(column)['price']
    .count()
    .sort_values(ascending=False)
)
fig = sns.barplot(x=graph.values, y=graph.index);

In [None]:
# Mean price for each transmission type, highest first.
matplotlib.rcParams['figure.figsize'] = (10, 1)
column = 'transmission'
graph = (
    canada_ds.groupby(column)['price']
    .mean()
    .sort_values(ascending=False)
)
fig = sns.barplot(x=graph.values, y=graph.index);

In [None]:
# Number of listings for each transmission type, most common first.
# No figsize is set in this cell, so whatever value is currently stored in
# matplotlib.rcParams applies.
column = 'transmission'
graph = (
    canada_ds.groupby(column)['price']
    .count()
    .sort_values(ascending=False)
)
fig = sns.barplot(x=graph.values, y=graph.index);

In [None]:
# Mean price for each value of `state`, highest first.
matplotlib.rcParams['figure.figsize'] = (10, 5)
column = 'state'
graph = (
    canada_ds.groupby(column)['price']
    .mean()
    .sort_values(ascending=False)
)
fig = sns.barplot(x=graph.values, y=graph.index);

In [None]:
# Number of listings for each value of `state`, most common first.
matplotlib.rcParams['figure.figsize'] = (10, 5)
column = 'state'
graph = (
    canada_ds.groupby(column)['price']
    .count()
    .sort_values(ascending=False)
)
fig = sns.barplot(x=graph.values, y=graph.index);

In [None]:
# The 15 makes with the lowest mean age.
matplotlib.rcParams['figure.figsize'] = (10, 5)
column = 'make'
graph = (
    canada_ds.groupby(column)['age']
    .mean()
    .sort_values(ascending=True)
    .head(15)
)
fig = sns.barplot(x=graph.values, y=graph.index);

In [None]:
# The 15 makes with the highest mean age.
matplotlib.rcParams['figure.figsize'] = (10, 5)
column = 'make'
graph = (
    canada_ds.groupby(column)['age']
    .mean()
    .sort_values(ascending=False)
    .head(15)
)
fig = sns.barplot(x=graph.values, y=graph.index);

In [None]:
# The 15 makes with the highest mean mileage.
matplotlib.rcParams['figure.figsize'] = (10, 5)
column = 'make'
graph = (
    canada_ds.groupby(column)['miles']
    .mean()
    .sort_values(ascending=False)
    .head(15)
)
fig = sns.barplot(x=graph.values, y=graph.index);

In [None]:
# Mean age for each transmission type, oldest first.
matplotlib.rcParams['figure.figsize'] = (10, 2)
column = 'transmission'
graph = (
    canada_ds.groupby(column)['age']
    .mean()
    .sort_values(ascending=False)
)
fig = sns.barplot(x=graph.values, y=graph.index);

## machine learning

In [None]:
# Available columns: price, miles, year, make, model, drivetrain, transmission, city, state, age
# Column roles for the models: one target, numeric features, categorical features.
target_cols = ['price']
features_num_cols = [
    'miles',
    'age',
]
features_cat_cols = [
    'make',
    'model',
    'drivetrain',
    'transmission',
    'state',
    'city',
]

In [None]:
# Keep only the model columns (numeric features, categorical features, target)
# in an independent copy, so later edits do not touch canada_ds.
raw_data = canada_ds[features_num_cols+features_cat_cols+target_cols].copy()

In [None]:
# split train, test and val
import warnings

from sklearn.model_selection import train_test_split

# remove annoying warnings from sklearn (and from later cells) through the
# supported filter API. Overwriting `warnings.warn` with a no-op would also
# break any library code that relies on capturing warnings.
warnings.filterwarnings('ignore')

# Fixed seed so the split, and therefore every score computed on it, can be
# reproduced after a kernel restart.
SPLIT_SEED = 42

# 20% is held out for test; 25% of the remaining 80% becomes validation,
# giving a 60 / 20 / 20 train / val / test split overall.
trainval_data, test_data = train_test_split(raw_data, test_size=0.2, random_state=SPLIT_SEED)
train_data, val_data = train_test_split(trainval_data, test_size=0.25, random_state=SPLIT_SEED)

In [None]:
# standardize numeric inputs
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# The scaler is fitted on the training split only; the fitted transform is then
# applied to every split.
scaler = StandardScaler().fit(train_data[features_num_cols])
for split in (train_data, test_data, val_data):
    split[features_num_cols] = scaler.transform(split[features_num_cols])


In [None]:
# one hot encoding categorical inputs
from sklearn.preprocessing import OneHotEncoder

# Dense output is required because the encoded matrix is written back into the
# DataFrames. The keyword was renamed from `sparse` to `sparse_output` and the
# old spelling was later removed, so try the current name first.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:  # older scikit-learn that only knows `sparse`
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Categories are learned from the training split only; with
# handle_unknown='ignore', categories unseen during fit encode as all zeros.
encoder.fit(train_data[features_cat_cols])

# `get_feature_names` was replaced by `get_feature_names_out`; use whichever
# the installed scikit-learn provides.
if hasattr(encoder, 'get_feature_names_out'):
    new_one_hot_cols = list(encoder.get_feature_names_out(features_cat_cols))
else:
    new_one_hot_cols = list(encoder.get_feature_names(features_cat_cols))

train_data[new_one_hot_cols] = encoder.transform(train_data[features_cat_cols])
test_data[new_one_hot_cols] = encoder.transform(test_data[features_cat_cols])
val_data[new_one_hot_cols] = encoder.transform(val_data[features_cat_cols])

In [None]:
# Total model input width: numeric columns plus one-hot columns.
n_features = len(features_num_cols + new_one_hot_cols)
print('number of features ={}'.format(n_features))

In [None]:
# create input and output dataframes
# Inputs hold the scaled numeric and one-hot columns; outputs hold the target column.
feature_cols = features_num_cols + new_one_hot_cols

train_inputs, train_output = train_data[feature_cols], train_data[target_cols]
test_inputs, test_output = test_data[feature_cols], test_data[target_cols]
val_inputs, val_output = val_data[feature_cols], val_data[target_cols]

In [None]:
# Mean of the target on the training split, as a reference point for the errors below.
train_target_mean = train_output.mean().values[0]
print('output average ={:.2f}$'.format(train_target_mean))

In [None]:
#first attempt XGBoost 

from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor 

In [None]:
#tune param max_depth
# Sweep the tree depth with a small, fixed number of boosting rounds and record
# the train / validation RMSE together with the fitted model for each setting.
n_init = 5
results=[]
for max_depth in tqdm(range(5,51,5)):
    model = XGBRegressor(n_estimators = n_init, max_depth = max_depth)
    model.fit(train_inputs, train_output)
    train_preds = model.predict(train_inputs)
    val_preds = model.predict(val_inputs)
    # RMSE = sqrt(MSE). Taking the root explicitly works on every scikit-learn
    # release, whereas the `squared=False` shortcut has been deprecated and removed.
    error_train = np.sqrt(mean_squared_error(train_output, train_preds))
    error_val = np.sqrt(mean_squared_error(val_output, val_preds))
    print('max_depth = {}'.format(max_depth),"error_train = {:.2f} *** error_val = {:.2f}".format(error_train,error_val))
    # Row layout read by the plotting cell: [parameter, train RMSE, val RMSE, model].
    results.append([max_depth,error_train, error_val, model])

In [None]:
# Unpack the sweep rows ([parameter, train error, val error, model]) into series.
params = [row[0] for row in results]
T_e = [row[1] for row in results]
V_e = [row[2] for row in results]
# Red: validation error, green: training error.
plt.plot(params, V_e, 'r');
plt.plot(params, T_e, 'g');
# 40 seems to be the sweet spot

In [None]:
# tune param n_estimators
# Sweep the number of boosting rounds at the tree depth chosen above and record
# the train / validation RMSE together with the fitted model for each setting.
max_depth_opt = 40
results =[]
for n_estimators in tqdm(range(10,61,5)):
    model = XGBRegressor(n_estimators = n_estimators, max_depth = max_depth_opt)
    model.fit(train_inputs, train_output)
    train_preds = model.predict(train_inputs)
    val_preds = model.predict(val_inputs)
    # RMSE = sqrt(MSE). Taking the root explicitly works on every scikit-learn
    # release, whereas the `squared=False` shortcut has been deprecated and removed.
    error_train = np.sqrt(mean_squared_error(train_output, train_preds))
    error_val = np.sqrt(mean_squared_error(val_output, val_preds))
    print('n_estimators = {}'.format(n_estimators),"error_train = {:.2f} *** error_val = {:.2f}".format(error_train,error_val))
    # Row layout read by the plotting cell: [parameter, train RMSE, val RMSE, model].
    results.append([n_estimators, error_train, error_val, model])

In [None]:
# Unpack the sweep rows ([parameter, train error, val error, model]) into series.
params = [row[0] for row in results]
T_e = [row[1] for row in results]
V_e = [row[2] for row in results]
# Red: validation error, green: training error.
plt.plot(params, V_e, 'r');
plt.plot(params, T_e, 'g');

In [None]:
# neural network
from keras import models
from keras import layers
from keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 10 epochs and restore the best weights.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=10,
    restore_best_weights=True,
)

# Two hidden ReLU layers of 2000 units each, followed by a single linear output.
input_width = len(features_num_cols + new_one_hot_cols)
network = models.Sequential([
    layers.Dense(2000, activation='relu', input_dim=input_width),
    layers.Dense(2000, activation='relu'),
    layers.Dense(1),
])

network.compile(
    optimizer='adam',
    loss='MeanSquaredError',
    metrics=['RootMeanSquaredError'],
)

# Training
# n_epochs is only an upper bound; the early-stopping callback can end the run sooner.
n_epochs = 2000
result = network.fit(
    train_inputs,
    train_output,
    epochs=n_epochs,
    verbose=1,
    batch_size=10000,
    validation_data=(val_inputs, val_output),
    callbacks=[es],
)