In [26]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
import xgboost as xg 
from sklearn.compose  import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [27]:
data = pd.read_csv('./car_final.csv', index_col=0)

In [28]:
data.shape

(81436, 7)

In [29]:
X = data.drop('price', axis=1)
y = data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=40)

In [30]:
ohe = OneHotEncoder()
ohe.fit(X[['brand', 'model', 'type_car', 'fuel', 'condition']])

In [31]:
column_trans = make_column_transformer((OneHotEncoder(categories = ohe.categories_), ['brand', 'model', 'type_car', 'fuel', 'condition']), remainder = 'passthrough')
column_trans

In [32]:
def training(model, column_trans, X_train, y_train, X_test, y_test):
    pipe = make_pipeline(column_trans, model)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print(f'{model} reach R2 Score: {r2_score(y_test, y_pred)}')
    return pipe

In [33]:
def training_with_gridsearch(model, column_trans, X_train, X_test, y_test, y_train, param_grid):
    pipe = make_pipeline(column_trans, model)
    param_grid = param_grid
    grid = RandomizedSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    print(f'{model} reach R2 Score: {r2_score(y_test, y_pred)}')
    return grid

In [34]:
def cross_validation_model(model, X, y, num_folds=5):
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        scores.append(r2_score(y_test, y_pred))
    return scores

### Lasso Regression

In [35]:
lasso = Lasso()
lasso_model = training(lasso, column_trans, X_train, y_train, X_test, y_test)

Lasso() reach R2 Score: 0.6969453198853416


  model = cd_fast.sparse_enet_coordinate_descent(


In [36]:
scores = cross_validation_model(lasso_model, X, y)
print(f'Lasso reach R2 Score: {np.mean(scores)}')

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


Lasso reach R2 Score: 0.6958589848144274


  model = cd_fast.sparse_enet_coordinate_descent(


### Ridge Regression

In [37]:
ridge_model = Ridge(alpha=1.0)
ridge_model = training(ridge_model, column_trans, X_train, y_train, X_test, y_test)

Ridge() reach R2 Score: 0.7104216489638314


In [38]:
scores = cross_validation_model(ridge_model, X, y)
print(f'Ridge model reach R2 Score: {np.mean(scores)}')

Ridge model reach R2 Score: 0.687376803846982


### Linear Regression

In [39]:
lr = LinearRegression()
lr = training(lr, column_trans, X_train, y_train, X_test, y_test)

LinearRegression() reach R2 Score: 0.7003615480546302


In [40]:
scores = cross_validation_model(lr, X, y)
print(f'Linear Regression model reach R2 Score: {np.mean(scores)}')

Linear Regression model reach R2 Score: 0.6959368181453576


### Random Forest Regressor

In [41]:
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr = training(regr, column_trans, X_train, y_train, X_test, y_test)

RandomForestRegressor(max_depth=2, random_state=0) reach R2 Score: 0.2561416676926127


In [42]:
scores = cross_validation_model(regr, X, y)
print(f'Random Forest Regressor model reach R2 Score: {np.mean(scores)}')

Random Forest Regressor model reach R2 Score: 0.1948759510451641


### Decision Tree with max_depth=2, 5

In [43]:
dtr1 = DecisionTreeRegressor(max_depth=2)
dtr2 = DecisionTreeRegressor(max_depth=5)
dtr1 = training(dtr1, column_trans, X_train, y_train, X_test, y_test)
dtr2 = training(dtr2, column_trans, X_train, y_train, X_test, y_test)

DecisionTreeRegressor(max_depth=2) reach R2 Score: 0.1678125111564598
DecisionTreeRegressor(max_depth=5) reach R2 Score: 0.5007779626073321


In [44]:
scores_dtr1 = cross_validation_model(dtr1, X, y)
scores_dtr2 = cross_validation_model(dtr2, X, y)
print(f'Decision Tree Regressor model reach R2 Score (depth=2): {np.mean(scores_dtr1)}')
print(f'Decision Tree Regressor model reach R2 Score (depth=5): {np.mean(scores_dtr2)}')

Decision Tree Regressor model reach R2 Score (depth=2): 0.12491929949549066
Decision Tree Regressor model reach R2 Score (depth=5): 0.4119715065105429


### XGBoost Regressor

In [45]:
xgb_r = xg.XGBRegressor(objective ='reg:linear', 
                  n_estimators = 10, seed = 123)
xgb_r = training(xgb_r, column_trans, X_train, y_train, X_test, y_test)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=10, n_jobs=None,
             num_parallel_tree=None, objective='reg:linear', ...) reach R2 Score: 0.7397164520570316




In [46]:
scores = cross_validation_model(xgb_r, X, y)
print(f'XGBoost Regressor model reach R2 Score: {np.mean(scores)}')



XGBoost Regressor model reach R2 Score: 0.6810976048455715




In [47]:
# Saving model
import pickle
pickle.dump(lr, open('lr.pkl', 'wb'))

In [48]:
manufacture_date = 2023
brand = 'Hyundai'
model = 'Creta'
type_car = 'SUV / Cross over'
fuel = 'petrol'
condition = 'used'

price = lr.predict(pd.DataFrame([[manufacture_date, brand, model, type_car, fuel, condition]], columns=['manufacture_date', 'brand', 'model', 'type_car', 'fuel', 'condition']))
print('Price: ', '{:,.0f}'.format(price[0]), 'VNĐ')

Price:  552,871,911 VNĐ
