In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import xgboost as xg 
from sklearn.compose  import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
data = pd.read_csv('./car_final.csv', index_col=0)

In [3]:
data.shape

(81436, 8)

In [4]:
X = data.drop('price', axis=1)
y = data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=40)

In [5]:
ohe = OneHotEncoder()
ohe.fit(X[['brand', 'model', 'type_car', 'fuel', 'condition']])

In [6]:
column_trans = make_column_transformer((OneHotEncoder(categories = ohe.categories_), ['brand', 'model', 'type_car', 'fuel', 'condition']), remainder = 'passthrough')
column_trans

In [7]:
def training(model, column_trans, X_train, y_train, X_test, y_test):
    pipe = make_pipeline(column_trans, model)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(f'{model} reach R2 Score: {r2_score(y_test, y_pred)}')
    # Evaluate with MAE
    print(f'{model} reach MAE: {mean_absolute_error(y_test, y_pred)}')
    # Evaluate with MSE
    print(f'{model} reach MSE: {mean_squared_error(y_test, y_pred)}')
    # Evaluate with RMSE
    print(f'{model} reach RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
    return pipe

In [8]:
def training_with_gridsearch(model, column_trans, X_train, X_test, y_test, y_train, param_grid):
    pipe = make_pipeline(column_trans, model)
    param_grid = param_grid
    grid = RandomizedSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    print(f'{model} reach R2 Score: {r2_score(y_test, y_pred)}')
    # Evaluate with MAE
    print(f'{model} reach MAE: {mean_absolute_error(y_test, y_pred)}')
    # Evaluate with MSE
    print(f'{model} reach MSE: {mean_squared_error(y_test, y_pred)}')
    # Evaluate with RMSE
    print(f'{model} reach RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}')
    
    return grid

In [9]:
def cross_validation_model(model, X, y, num_folds=5):
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        scores.append(r2_score(y_test, y_pred))
    return scores

### Lasso Regression

In [10]:
lasso = Lasso()
lasso_model = training(lasso, column_trans, X_train, y_train, X_test, y_test)

Lasso() reach R2 Score: 0.6972321655471048
Lasso() reach MAE: 140728028.42231783
Lasso() reach MSE: 1.3199270284093376e+17
Lasso() reach RMSE: 363307999.9682553


  model = cd_fast.sparse_enet_coordinate_descent(


In [11]:
scores = cross_validation_model(lasso_model, X, y)
print(f'Lasso reach R2 Score: {np.mean(scores)}')

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


Lasso reach R2 Score: 0.696035143605582


  model = cd_fast.sparse_enet_coordinate_descent(


### Ridge Regression

In [12]:
ridge_model = Ridge(alpha=1.0)
ridge_model = training(ridge_model, column_trans, X_train, y_train, X_test, y_test)

Ridge() reach R2 Score: 0.10921733946121515
Ridge() reach MAE: 294754698.653512
Ridge() reach MSE: 3.883398354412871e+17
Ridge() reach RMSE: 623169186.8516022


In [13]:
scores = cross_validation_model(ridge_model, X, y)
print(f'Ridge model reach R2 Score: {np.mean(scores)}')

Ridge model reach R2 Score: 0.09956785828206696


### Linear Regression

In [14]:
lr = LinearRegression()
lr = training(lr, column_trans, X_train, y_train, X_test, y_test)

LinearRegression() reach R2 Score: 0.2285301096452068
LinearRegression() reach MAE: 250971476.78092113
LinearRegression() reach MSE: 3.3632501342929306e+17
LinearRegression() reach RMSE: 579935352.8017524


In [15]:
scores = cross_validation_model(lr, X, y)
print(f'Linear Regression model reach R2 Score: {np.mean(scores)}')

Linear Regression model reach R2 Score: 0.22043461411758963


### Random Forest Regressor

In [16]:
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr = training(regr, column_trans, X_train, y_train, X_test, y_test)

RandomForestRegressor(max_depth=2, random_state=0) reach R2 Score: 0.2773397369807259
RandomForestRegressor(max_depth=2, random_state=0) reach MAE: 293390688.45009744
RandomForestRegressor(max_depth=2, random_state=0) reach MSE: 3.150462846359403e+17
RandomForestRegressor(max_depth=2, random_state=0) reach RMSE: 561289840.1324759


In [17]:
scores = cross_validation_model(regr, X, y)
print(f'Random Forest Regressor model reach R2 Score: {np.mean(scores)}')

Random Forest Regressor model reach R2 Score: 0.20231334212597635


### Decision Tree with max_depth=2, 5

In [18]:
dtr1 = DecisionTreeRegressor(max_depth=2)
dtr2 = DecisionTreeRegressor(max_depth=5)
dtr1 = training(dtr1, column_trans, X_train, y_train, X_test, y_test)
dtr2 = training(dtr2, column_trans, X_train, y_train, X_test, y_test)

DecisionTreeRegressor(max_depth=2) reach R2 Score: 0.22393253782925393
DecisionTreeRegressor(max_depth=2) reach MAE: 307952622.3076537
DecisionTreeRegressor(max_depth=2) reach MSE: 3.383293410408754e+17
DecisionTreeRegressor(max_depth=2) reach RMSE: 581660847.092939
DecisionTreeRegressor(max_depth=5) reach R2 Score: 0.5007779626073321
DecisionTreeRegressor(max_depth=5) reach MAE: 242709886.06749356
DecisionTreeRegressor(max_depth=5) reach MSE: 2.1763760391617062e+17
DecisionTreeRegressor(max_depth=5) reach RMSE: 466516456.2115367


In [19]:
scores_dtr1 = cross_validation_model(dtr1, X, y)
scores_dtr2 = cross_validation_model(dtr2, X, y)
print(f'Decision Tree Regressor model reach R2 Score (depth=2): {np.mean(scores_dtr1)}')
print(f'Decision Tree Regressor model reach R2 Score (depth=5): {np.mean(scores_dtr2)}')

Decision Tree Regressor model reach R2 Score (depth=2): 0.14688068833302084
Decision Tree Regressor model reach R2 Score (depth=5): 0.43887980243307856


### XGBoost Regressor

In [20]:
xgb_r = xg.XGBRegressor(objective ='reg:linear', 
                  n_estimators = 10, seed = 123)
xgb_r = training(xgb_r, column_trans, X_train, y_train, X_test, y_test)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=10, n_jobs=None,
             num_parallel_tree=None, objective='reg:linear', ...) reach R2 Score: 0.754378494730526
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable



In [21]:
scores = cross_validation_model(xgb_r, X, y)
print(f'XGBoost Regressor model reach R2 Score: {np.mean(scores)}')



XGBoost Regressor model reach R2 Score: 0.6818632046285111




In [22]:
# Saving model
import pickle
pickle.dump(lr, open('lr.pkl', 'wb'))

In [24]:
manufacture_date = 2023
brand = 'Hyundai'
model = 'Creta'
type_car = 'SUV / Cross over'
fuel = 'petrol'
condition = 'used'
mileage_v2 = 1000

price = lr.predict(pd.DataFrame([[manufacture_date, brand, model, type_car, fuel, condition, mileage_v2]], columns=['manufacture_date', 'brand', 'model', 'type_car', 'fuel', 'condition', 'mileage_v2']))
print('Price: ', '{:,.0f}'.format(price[0]), 'VNĐ')

Price:  1,003,017,626 VNĐ
