In [125]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import xgboost as xg 
from sklearn.compose  import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [126]:
# Load ./car_final.csv, using the file's first column as the DataFrame index.
data = pd.read_csv(filepath_or_buffer='./car_final.csv', index_col=0)

In [127]:
# Show (n_rows, n_columns) of the loaded frame as the cell output.
data.shape

(81308, 8)

In [128]:
# 'price' is the regression target; every other column is a predictor.
y = data['price']
X = data.drop(columns='price')

# Hold out 10% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=40,
)

### Encode categorical features

In [129]:
# Fit an encoder on the categorical columns of the full feature frame `X`
# (train and test rows alike) so its category lists are available as
# `ohe.categories_`.
ohe = OneHotEncoder()
ohe.fit(X.loc[:, ['model', 'brand', 'type_car', 'fuel', 'condition']])

In [130]:
# One-hot encode the five categorical columns using the category lists held by
# `ohe`; all remaining columns are passed through untouched.
column_trans = make_column_transformer(
    (
        OneHotEncoder(categories=ohe.categories_),
        ['model', 'brand', 'type_car', 'fuel', 'condition'],
    ),
    remainder='passthrough',
)
column_trans

### Scale the mileage feature and the price target

In [131]:
from sklearn.preprocessing import MinMaxScaler

# Scale the mileage feature with its own scaler: fit on the training rows only,
# then apply the same min/max to the test rows. A dedicated scaler keeps the
# mileage parameters available after the target is scaled below.
# `.assign` returns new frames instead of writing into the objects returned by
# train_test_split, which avoids pandas' chained-assignment warning.
mileage_scaler = MinMaxScaler()
X_train = X_train.assign(
    mileage_v2=mileage_scaler.fit_transform(X_train[['mileage_v2']]).ravel()
)
X_test = X_test.assign(
    mileage_v2=mileage_scaler.transform(X_test[['mileage_v2']]).ravel()
)

# Scale the target with a separate scaler. `mns` ends up fitted on the training
# prices, so `mns.inverse_transform(pred.reshape(-1, 1))` maps scaled
# predictions back to the original price unit.
# The scaled targets are flattened to 1-D because scikit-learn regressors expect
# a 1-D `y` and warn when handed a column vector.
mns = MinMaxScaler()
y_train = mns.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_test = mns.transform(y_test.values.reshape(-1, 1)).ravel()

### Training

In [132]:
def training(model, column_trans, X_train, y_train, X_test, y_test):
    """Fit ``column_trans`` + ``model`` as one pipeline and print hold-out metrics.

    The pipeline is fitted on ``(X_train, y_train)``. R2, MAE, MSE and RMSE of its
    predictions on ``X_test`` versus ``y_test`` are printed, one per line, each
    prefixed with the string form of ``model``.

    Returns:
        The fitted pipeline.
    """
    fitted = make_pipeline(column_trans, model)
    fitted.fit(X_train, y_train)
    predictions = fitted.predict(X_test)

    # (label, metric) pairs; each metric is evaluated right before it is printed.
    reports = (
        ('R2 Score', lambda: r2_score(y_test, predictions)),
        ('MAE', lambda: mean_absolute_error(y_test, predictions)),
        ('MSE', lambda: mean_squared_error(y_test, predictions)),
        ('RMSE', lambda: np.sqrt(mean_squared_error(y_test, predictions))),
    )
    for label, compute in reports:
        print(f'{model} reach {label}: {compute()}')
    return fitted

In [133]:
def training_with_gridsearch(model, column_trans, X_train, X_test, y_test, y_train, param_grid,
                             n_iter=10, random_state=None):
    """Tune ``column_trans`` + ``model`` with a randomized search and print hold-out metrics.

    Despite the name, this runs ``RandomizedSearchCV`` (5-fold, all cores), not an
    exhaustive grid search.

    NOTE: the positional order is ``X_train, X_test, y_test, y_train`` -- the two
    ``y`` arguments come in the opposite order from ``training()``. Passing them
    by keyword avoids mixing them up.

    Args:
        model: Unfitted estimator placed after ``column_trans`` in the pipeline.
        column_trans: Column transformer used as the first pipeline step.
        X_train, y_train: Data the search is fitted on.
        X_test, y_test: Hold-out data used only for the printed metrics.
        param_grid: Parameter distributions handed to ``RandomizedSearchCV``.
            Because the search wraps the whole pipeline, keys presumably need the
            pipeline step prefix (e.g. ``'ridge__alpha'``) -- verify against the
            step names of the pipeline.
        n_iter: Number of sampled parameter settings (default 10, which is also
            ``RandomizedSearchCV``'s own default).
        random_state: Seed for the parameter sampling. ``None`` (default) keeps
            the sampling unseeded; pass an int for reproducible searches.

    Returns:
        The fitted ``RandomizedSearchCV`` object.
    """
    pipe = make_pipeline(column_trans, model)
    grid = RandomizedSearchCV(
        pipe,
        param_grid,
        n_iter=n_iter,
        cv=5,
        n_jobs=-1,
        verbose=1,
        random_state=random_state,
    )
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    print(f'{model} reach R2 Score: {r2_score(y_test, y_pred)}')
    # Evaluate with MAE
    print(f'{model} reach MAE: {mean_absolute_error(y_test, y_pred)}')
    # Evaluate with MSE
    print(f'{model} reach MSE: {mean_squared_error(y_test, y_pred)}')
    # Evaluate with RMSE
    print(f'{model} reach RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')

    return grid

In [134]:
def cross_validation_model(model, X, y, num_folds=5):
    """Return the per-fold R2 scores of ``model`` under shuffled K-fold CV.

    ``X`` and ``y`` are sliced positionally with ``.iloc``. ``model`` itself is
    re-fitted on every fold (no copy is made), so after the call it holds the
    fit from the last fold's training rows.

    Args:
        model: Estimator (or pipeline) exposing ``fit`` and ``predict``.
        X: Feature frame supporting ``.iloc``.
        y: Target supporting ``.iloc``.
        num_folds: Number of folds passed to ``KFold`` (default 5).

    Returns:
        A list with one R2 score per fold.
    """
    splitter = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    def _fold_r2(fit_rows, eval_rows):
        # Fit on this fold's training rows, then score on its held-out rows.
        model.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
        fold_pred = model.predict(X.iloc[eval_rows, :])
        return r2_score(y.iloc[eval_rows], fold_pred)

    return [_fold_r2(fit_rows, eval_rows) for fit_rows, eval_rows in splitter.split(X)]

### Lasso Regression

In [135]:
# Lasso with scikit-learn's default hyper-parameters, scored on the hold-out split.
# NOTE(review): the recorded hold-out R2 is ~0, which looks like the default
# alpha shrinking every coefficient to zero -- consider a smaller alpha; confirm.
lasso = Lasso()
lasso_model = training(
    model=lasso,
    column_trans=column_trans,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Lasso() reach R2 Score: -2.7355995769529073e-05
Lasso() reach MAE: 0.009126703700453707
Lasso() reach MSE: 0.00042766641569480594
Lasso() reach RMSE: 0.02068009709103915


In [136]:
# K-fold R2 of the Lasso pipeline on the full X / y; the mean over folds is printed.
scores = cross_validation_model(model=lasso_model, X=X, y=y)
print('Lasso reach R2 Score: {}'.format(np.mean(scores)))

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


Lasso reach R2 Score: 0.6942392146711217


  model = cd_fast.sparse_enet_coordinate_descent(


### Ridge Regression

In [137]:
# Ridge with alpha=1.0; `ridge_model` is bound to the fitted pipeline that
# training() returns.
ridge_model = training(
    model=Ridge(alpha=1.0),
    column_trans=column_trans,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Ridge() reach R2 Score: 0.6777521044300312
Ridge() reach MAE: 0.003830993626823712
Ridge() reach MSE: 0.0001378108325110515
Ridge() reach RMSE: 0.011739285860351622


In [138]:
# K-fold R2 of the Ridge pipeline on the full X / y; the mean over folds is printed.
scores = cross_validation_model(model=ridge_model, X=X, y=y)
print('Ridge model reach R2 Score: {}'.format(np.mean(scores)))

Ridge model reach R2 Score: 0.09976705530288366


### Linear Regression

In [139]:
# Ordinary least squares; `lr` is bound to the fitted pipeline that training() returns.
lr = training(
    model=LinearRegression(),
    column_trans=column_trans,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

LinearRegression() reach R2 Score: 0.6421728796663231
LinearRegression() reach MAE: 0.003791516863578661
LinearRegression() reach MSE: 0.00015302645580042012
LinearRegression() reach RMSE: 0.012370386242976414


In [140]:
# K-fold R2 of the linear-regression pipeline on the full X / y; the mean is printed.
scores = cross_validation_model(model=lr, X=X, y=y)
print('Linear Regression model reach R2 Score: {}'.format(np.mean(scores)))

Linear Regression model reach R2 Score: 0.213934236107872


### Random Forest Regressor

In [141]:
# Shallow random forest (max_depth=2, seeded); `regr` is bound to the fitted
# pipeline that training() returns.
regr = training(
    model=RandomForestRegressor(max_depth=2, random_state=0),
    column_trans=column_trans,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

  return fit_method(estimator, *args, **kwargs)


RandomForestRegressor(max_depth=2, random_state=0) reach R2 Score: 0.30434292357780157
RandomForestRegressor(max_depth=2, random_state=0) reach MAE: 0.008009103543744425
RandomForestRegressor(max_depth=2, random_state=0) reach MSE: 0.0002975010299892914
RandomForestRegressor(max_depth=2, random_state=0) reach RMSE: 0.017248218168532406


In [142]:
# K-fold R2 of the random-forest pipeline on the full X / y; the mean is printed.
scores = cross_validation_model(model=regr, X=X, y=y)
print('Random Forest Regressor model reach R2 Score: {}'.format(np.mean(scores)))

Random Forest Regressor model reach R2 Score: 0.22725499895917717


### Decision Tree Regressor (max_depth = 2 and max_depth = 5)

In [143]:
# Two decision trees that differ only in depth; each name is bound to the fitted
# pipeline that training() returns. A dashed line separates the two reports.
dtr1 = training(
    model=DecisionTreeRegressor(max_depth=2),
    column_trans=column_trans,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
print('--------------------------------------------')
dtr2 = training(
    model=DecisionTreeRegressor(max_depth=5),
    column_trans=column_trans,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

DecisionTreeRegressor(max_depth=2) reach R2 Score: 0.2691105243382357
DecisionTreeRegressor(max_depth=2) reach MAE: 0.008161428963112493
DecisionTreeRegressor(max_depth=2) reach MSE: 0.0003125683317073629
DecisionTreeRegressor(max_depth=2) reach RMSE: 0.017679602136568655
--------------------------------------------
DecisionTreeRegressor(max_depth=5) reach R2 Score: 0.2827237353427968
DecisionTreeRegressor(max_depth=5) reach MAE: 0.006721054125303226
DecisionTreeRegressor(max_depth=5) reach MSE: 0.0003067465778108202
DecisionTreeRegressor(max_depth=5) reach RMSE: 0.01751418219075102


In [144]:
# K-fold R2 for both decision-tree pipelines on the full X / y; the mean over
# folds is printed for each depth.
scores_dtr1 = cross_validation_model(model=dtr1, X=X, y=y)
scores_dtr2 = cross_validation_model(model=dtr2, X=X, y=y)
print('Decision Tree Regressor model reach R2 Score (depth=2): {}'.format(np.mean(scores_dtr1)))
print('Decision Tree Regressor model reach R2 Score (depth=5): {}'.format(np.mean(scores_dtr2)))

Decision Tree Regressor model reach R2 Score (depth=2): 0.16127126420379195
Decision Tree Regressor model reach R2 Score (depth=5): 0.44967877680206864


### XGBoost Regressor

In [145]:
# Gradient-boosted trees with 10 boosting rounds and a fixed seed.
# 'reg:squarederror' is the current name of the squared-error objective; the
# former alias 'reg:linear' is deprecated in XGBoost. `random_state` is the
# scikit-learn-style seed argument and replaces the legacy `seed` keyword.
xgb_r = xg.XGBRegressor(objective='reg:squarederror',
                        n_estimators=10, random_state=123)
# `xgb_r` is rebound to the fitted pipeline that training() returns.
xgb_r = training(xgb_r, column_trans, X_train, y_train, X_test, y_test)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=10, n_jobs=None,
             num_parallel_tree=None, objective='reg:linear', ...) reach R2 Score: 0.7118756966122687
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enabl



In [146]:
# K-fold R2 of the XGBoost pipeline on the full X / y; the mean over folds is printed.
scores = cross_validation_model(model=xgb_r, X=X, y=y)
print('XGBoost Regressor model reach R2 Score: {}'.format(np.mean(scores)))



XGBoost Regressor model reach R2 Score: 0.705841782817547




### Save model

In [157]:
# Saving model
import pickle

# Persist the XGBoost pipeline -- the object the file name refers to and the one
# the prediction cells below call. The `with` block guarantees the file handle
# is flushed and closed.
with open('xgb_r.pkl', 'wb') as model_file:
    pickle.dump(xgb_r, model_file)

### Test some samples

In [158]:
# Hand-written listing used as a spot check of `xgb_r`.
manufacture_date = 2021
brand = 'Honda'
model = 'City'
type_car = 'Sedan'
fuel = 'petrol'
condition = 'used'
mileage_v2 = 23000

# One-row frame whose columns appear in the same order as in the original call.
price = xgb_r.predict(pd.DataFrame({
    'manufacture_date': [manufacture_date],
    'brand': [brand],
    'model': [model],
    'type_car': [type_car],
    'fuel': [fuel],
    'condition': [condition],
    'mileage_v2': [mileage_v2],
}))
print('Predicted Price: ', f'{price[0]:,.0f}', 'VNĐ')
print("True price: 455.000.000 VND")

Predicted Price:  597,988,672 VNĐ
True price: 455.000.000 VND


In [159]:
# Hand-written listing used as a spot check of `xgb_r`.
manufacture_date = 2016
brand = 'Kia'
model = 'Rio'
type_car = 'Hatchback'
fuel = 'petrol'
condition = 'used'
mileage_v2 = 78545

# One-row frame whose columns appear in the same order as in the original call.
price = xgb_r.predict(pd.DataFrame({
    'manufacture_date': [manufacture_date],
    'brand': [brand],
    'model': [model],
    'type_car': [type_car],
    'fuel': [fuel],
    'condition': [condition],
    'mileage_v2': [mileage_v2],
}))
print('Predicted Price: ', f'{price[0]:,.0f}', 'VNĐ')
print("True price: 295.000.000 VND")

Predicted Price:  354,087,040 VNĐ
True price: 295.000.000 VND


In [160]:
# Hand-written listing used as a spot check of `xgb_r`.
manufacture_date = 2020
brand = 'Toyota'
model = 'Vios'
type_car = 'Sedan'
fuel = 'petrol'
condition = 'used'
mileage_v2 = 99999

# One-row frame whose columns appear in the same order as in the original call.
price = xgb_r.predict(pd.DataFrame({
    'manufacture_date': [manufacture_date],
    'brand': [brand],
    'model': [model],
    'type_car': [type_car],
    'fuel': [fuel],
    'condition': [condition],
    'mileage_v2': [mileage_v2],
}))
print('Predicted Price: ', f'{price[0]:,.0f}', 'VNĐ')
print("True price: 368.000.000 VND")

Predicted Price:  553,130,560 VNĐ
True price: 368.000.000 VND


### Predict

In [156]:
# Listing to price with `xgb_r`; no reference price is printed for this one.
manufacture_date = 2023
brand = 'Hyundai'
model = 'Creta'
type_car = 'SUV / Cross over'
fuel = 'petrol'
condition = 'used'
mileage_v2 = 1000

# One-row frame whose columns appear in the same order as in the original call.
price = xgb_r.predict(pd.DataFrame({
    'manufacture_date': [manufacture_date],
    'brand': [brand],
    'model': [model],
    'type_car': [type_car],
    'fuel': [fuel],
    'condition': [condition],
    'mileage_v2': [mileage_v2],
}))
print('Price: ', f'{price[0]:,.0f}', 'VNĐ')

Price:  753,084,992 VNĐ
