# Import Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [2]:
from DataPreprocessing import *

# XGBoost without Hyperparameter Tuning

In [3]:
# Baseline: XGBoost regressor with library-default hyperparameters.
xg_boost = XGBRegressor()
xg_boost.fit(X_train_std, y_train)

# NOTE: for a regressor, .score() returns the coefficient of determination
# (R^2), not a classification accuracy. The labels are kept as-is so the
# printed text matches the recorded output.
print("Model Accuracy on train set using XGBoost: %.15f" % xg_boost.score(X_train_std, y_train))
print("Model Accuracy on test set using XGBoost: %.15f" % xg_boost.score(X_test_std, y_test))

# Take the square root of the MSE explicitly: the `squared=False` flag of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas np.sqrt(MSE) yields the same RMSE on every version.
rmse = np.sqrt(metrics.mean_squared_error(y_test, xg_boost.predict(X_test_std)))
print("The root mean squared error (RMSE) on test set: {:.15f}".format(rmse))

Model Accuracy on train set using XGBoost: 0.988881086836651
Model Accuracy on test set using XGBoost: 0.867756094866293
The root mean squared error (RMSE) on test set: 190655.060986640746705


# Hyperparameter Tuning

In [4]:
# Exhaustive grid search over 3 * 3 * 3 * 2 = 54 hyperparameter combinations.
# (GridSearchCV is imported in the notebook's top import cell.)
params = {
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 500, 1000],
    'colsample_bytree': [0.3, 0.7],
}
xgbr = XGBRegressor()

# GridSearchCV maximises its scoring function, hence the *negated* MSE.
# `cv` is left at its default, so the number of folds is chosen by scikit-learn.
clf = GridSearchCV(
    estimator=xgbr,
    param_grid=params,
    scoring='neg_mean_squared_error',
    verbose=1,
)
clf.fit(X_train_std, y_train)

print("Best parameters:", clf.best_params_)
# best_score_ is the mean negated MSE over the CV folds for the best candidate;
# flip the sign and take the square root to report it on the RMSE scale.
print("Lowest RMSE: ", (-clf.best_score_) ** 0.5)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
Lowest RMSE:  155431.37811705694


In [5]:
# Refit on the full training set using the hyperparameters that the grid
# search printed as best. The values are hard-coded here, so keep them in sync
# with `clf.best_params_` if the search is re-run on different data.
sklearn_xgb = XGBRegressor(
    colsample_bytree=0.7,
    learning_rate=0.1,
    max_depth=3,
    n_estimators=500,
)
sklearn_xgb.fit(X_train_std, y_train)

# Compute RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
sklearn_rmse = np.sqrt(metrics.mean_squared_error(y_test, sklearn_xgb.predict(X_test_std)))
print(f"RMSE after tune hyperparameters using Scikit-learn library: {sklearn_rmse:.15f}")

# NOTE: .score() on a regressor is R^2, not accuracy; labels are kept unchanged
# so the printed text matches the recorded output.
print("Model Accuracy after tune hyperparameters on train set: %.15f" % sklearn_xgb.score(X_train_std, y_train))
print("Model Accuracy after tune hyperparameters on test set: %.15f" % sklearn_xgb.score(X_test_std, y_test))

RMSE after tune hyperparameters using Scikit-learn library: 159948.575519334146520
Model Accuracy after tune hyperparameters on train set: 0.970755742624392
Model Accuracy after tune hyperparameters on test set: 0.906923564652030


# Prediction

In [6]:
# Use the first three rows of X_test_std as a small sample to predict on.
X_test_predict = X_test_std[:3]
X_test_predict

array([[ 1.38954455, -1.08145768,  1.11558082,  0.30072914, -3.3142684 ,
        -0.74641118,  0.626382  , -0.47460479, -0.18362431, -1.34744894,
        -0.44536017,  0.33188388],
       [ 1.38954455, -0.67091488,  1.11558082,  0.30072914,  0.30172571,
        -0.74641118,  0.4298544 , -0.47460479, -0.18362431, -1.34744894,
        -0.44536017,  0.33188388],
       [ 0.87204969, -0.30142637,  1.11558082,  0.30072914,  0.30172571,
        -0.74641118,  1.29805861, -1.28731519, -1.27964718,  1.4188324 ,
        -0.44536017,  0.33188388]])

In [7]:
# Display the test-set target values for reference against the predictions made later.
y_test

1363     700000
3171     550000
2201     275000
6483     235000
6104     375000
         ...   
2384    3600000
3214    1000000
3542     700000
6395     150000
5739     254999
Name: selling_price, Length: 1343, dtype: int64

In [8]:
# Run the tuned model (sklearn_xgb) on the sample rows held in X_test_predict
# and display the predicted values.
y_predict = sklearn_xgb.predict(X_test_predict)
y_predict

array([660927.  , 602159.8 , 302776.94], dtype=float32)

In [9]:
# Side-by-side table of actual vs. predicted targets for the sampled rows.
# Slice y_test by len(y_predict) rather than a hard-coded 3 so both columns
# always have the same length, even if the sample size is changed upstream.
pd.DataFrame({"y_test": y_test.to_numpy()[:len(y_predict)], "y_predict": y_predict})

Unnamed: 0,y_test,y_predict
0,700000,660927.0
1,550000,602159.8125
2,275000,302776.9375
