<a href="https://colab.research.google.com/github/yashveersinghsohi/Car_Price_Prediction/blob/master/Modeling/CarPrice_04_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import (
    LinearRegression, 
    Lasso, 
    Ridge, 
    ElasticNet
  )
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

import pickle

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so importing it unconditionally breaks this cell on current installs.
# Prefer the standalone package and fall back only for legacy environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



# Data

In [2]:
# Base URL of the feature-engineered CSVs in the project's GitHub repository.
root_dir = (
    "https://raw.githubusercontent.com/yashveersinghsohi/"
    "Car_Price_Prediction/master/Data/Feature_Engineering_Data/"
)

# Training split: pruned feature matrix and its targets.
train_features_path = f"{root_dir}pruned_train_features.csv"
train_targets_path = f"{root_dir}train_targets.csv"

# Validation split: pruned feature matrix and its targets.
val_features_path = f"{root_dir}pruned_val_features.csv"
val_targets_path = f"{root_dir}val_targets.csv"

In [3]:
# Read the training and validation CSVs (features first, then targets).
train_features, train_targets = (
    pd.read_csv(csv_path) for csv_path in (train_features_path, train_targets_path)
)
val_features, val_targets = (
    pd.read_csv(csv_path) for csv_path in (val_features_path, val_targets_path)
)

# Shape report so the reader can check that each split's features and
# targets have the same number of rows.
print(f"Train Features: {train_features.shape}")
print(f"Train Targets: {train_targets.shape}", end="\n\n")

print(f"Validation Features: {val_features.shape}")
print(f"Validation Targets: {val_targets.shape}", end="\n\n")

Train Features: (13351, 18)
Train Targets: (13351, 1)

Validation Features: (3463, 18)
Validation Targets: (3463, 1)



# Models

## Linear Models

Data

In [None]:
# Convert the loaded frames to NumPy arrays; targets are flattened to 1-D.
X_train, X_val = train_features.to_numpy(), val_features.to_numpy()
y_train = np.ravel(train_targets.to_numpy())
y_val = np.ravel(val_targets.to_numpy())

Feature Scaling

In [None]:
# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled = scaler.transform(X_train), scaler.transform(X_val)

Model definitions

In [None]:
# Ordinary least squares has no random_state; the three regularised
# variants share seed 42 and otherwise use library defaults.
lin_reg = LinearRegression()
lasso_reg, ridge_reg, elastic_reg = (
    Regressor(random_state=42) for Regressor in (Lasso, Ridge, ElasticNet)
)

Training

In [None]:
# Fit every linear model on the standardised training data.
# The final call's return value (the fitted ElasticNet) is the cell output.
lin_reg.fit(X=X_train_scaled, y=y_train)
lasso_reg.fit(X=X_train_scaled, y=y_train)
ridge_reg.fit(X=X_train_scaled, y=y_train)
elastic_reg.fit(X=X_train_scaled, y=y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=42, selection='cyclic', tol=0.0001, warm_start=False)

Evaluating

In [None]:
# Training-set predictions for the four linear models.
lin_reg_preds_train = lin_reg.predict(X_train_scaled)
lasso_reg_preds_train = lasso_reg.predict(X_train_scaled)
ridge_reg_preds_train = ridge_reg.predict(X_train_scaled)
elastic_reg_preds_train = elastic_reg.predict(X_train_scaled)

# RMSE is the square root of the mean squared error.
# (Variable name fixed: was misspelled `ridge_reg_rmse_tarin`.)
lin_reg_rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=lin_reg_preds_train))
lasso_reg_rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=lasso_reg_preds_train))
ridge_reg_rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=ridge_reg_preds_train))
elastic_reg_rmse_train = np.sqrt(mean_squared_error(y_true=y_train, y_pred=elastic_reg_preds_train))

print(f"Linear Regression training RMSE: {lin_reg_rmse_train}")
print(f"Lasso Regression training RMSE: {lasso_reg_rmse_train}")
print(f"Ridge Regression training RMSE: {ridge_reg_rmse_train}")
print(f"Elastic Net Regression training RMSE: {elastic_reg_rmse_train}")

Linear Regression training RMSE: 12809.927242521782
Lasso Regression training RMSE: 12809.93868180188
Ridge Regression training RMSE: 12809.934337297578
Elastic Net Regression training RMSE: 13325.71355631917


In [None]:
# Validation-set predictions for the four linear models.
(
    lin_reg_preds_val,
    lasso_reg_preds_val,
    ridge_reg_preds_val,
    elastic_reg_preds_val,
) = (
    fitted.predict(X_val_scaled)
    for fitted in (lin_reg, lasso_reg, ridge_reg, elastic_reg)
)

# Validation RMSE per model.
# NOTE(review): the recorded validation RMSEs are roughly 100x the training
# RMSEs for these models - worth investigating before trusting them.
lin_reg_rmse_val = np.sqrt(mean_squared_error(y_val, lin_reg_preds_val))
lasso_reg_rmse_val = np.sqrt(mean_squared_error(y_val, lasso_reg_preds_val))
ridge_reg_rmse_val = np.sqrt(mean_squared_error(y_val, ridge_reg_preds_val))
elastic_reg_rmse_val = np.sqrt(mean_squared_error(y_val, elastic_reg_preds_val))

print(f"Linear Regression validation RMSE: {lin_reg_rmse_val}")
print(f"Lasso Regression validation RMSE: {lasso_reg_rmse_val}")
print(f"Ridge Regression validation RMSE: {ridge_reg_rmse_val}")
print(f"Elastic Net Regression validation RMSE: {elastic_reg_rmse_val}")

Linear Regression validation RMSE: 1127556.058065489
Lasso Regression validation RMSE: 1128736.3865379978
Ridge Regression validation RMSE: 1128982.8820277965
Elastic Net Regression validation RMSE: 902421.277033476


## KNN Regression

Data

In [None]:
# Rebuild the NumPy arrays from the loaded frames (targets flattened to 1-D).
X_train, X_val = train_features.to_numpy(), val_features.to_numpy()
y_train, y_val = train_targets.to_numpy().ravel(), val_targets.to_numpy().ravel()

Feature Scaling

In [None]:
# Standardise with statistics computed from the training split only.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_val_scaled = (scaler.transform(split) for split in (X_train, X_val))

Model Definition

In [None]:
# Baseline KNN: 5 neighbours, using all available cores (n_jobs=-1).
knn_reg = KNeighborsRegressor(n_neighbors=5, n_jobs=-1)

Training

In [None]:
# Fit the baseline KNN on the standardised training data.
knn_reg.fit(X=X_train_scaled, y=y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                    weights='uniform')

Evaluation

In [None]:
# Training-split RMSE of the baseline KNN.
knn_train_preds = knn_reg.predict(X_train_scaled)
knn_train_rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=knn_train_preds))
print(f"KNN Training RMSE: {knn_train_rmse}")

# Validation-split RMSE. The label previously read "Training", which made
# the two printed lines indistinguishable.
knn_val_preds = knn_reg.predict(X_val_scaled)
knn_val_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=knn_val_preds))
print(f"KNN Validation RMSE: {knn_val_rmse}")

KNN Training RMSE: 7005.565015467748
KNN Training RMSE: 18534.293812885404


Hyperparameter tuning

In [None]:
# Earlier sweeps, kept for reference:
#   Sweep 1 - n_neighbors: 2, 3, 4, 5, 6, 7, 8, 9, 10
#             weights: uniform, distance
#             algorithm: auto, ball_tree, kd_tree, brute
#   Sweep 2 - n_neighbors: 10, 12, 15, 17, 20
#             weights: distance
#             algorithm: brute

# Sweep 3 (active)
knn_params = dict(
    n_neighbors=[9, 10, 11],
    weights=["distance"],
    algorithm=["brute"],
)

# 3-fold cross-validated grid search scored by negated MSE
# (GridSearchCV maximises its score), run on all available cores.
knn_grid = GridSearchCV(
    KNeighborsRegressor(),
    knn_params,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the KNN grid search on the standardised training data.
knn_grid.fit(X=X_train_scaled, y=y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    5.7s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'algorithm': ['brute'], 'n_neighbors': [9, 10, 11],
                         'weights': ['distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=1)

In [None]:
# Display the best KNN configuration found by the grid search.
# NOTE(review): the recorded output shows p=1, but the active grid never
# varies `p` and the estimator repr above shows p=2 - the stored output
# appears to come from a different run; re-run to confirm.
knn_grid.best_estimator_

KNeighborsRegressor(algorithm='brute', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=10, p=1,
                    weights='distance')

In [None]:
# The refitted winner of the grid search (the recorded repr shows refit=True).
knn_best = knn_grid.best_estimator_

# Predictions on both splits, then RMSE for each.
knn_best_train_preds, knn_best_val_preds = (
    knn_best.predict(split) for split in (X_train_scaled, X_val_scaled)
)
knn_best_train_rmse = np.sqrt(mean_squared_error(y_train, knn_best_train_preds))
knn_best_val_rmse = np.sqrt(mean_squared_error(y_val, knn_best_val_preds))

print(f"KNN Best Train RMSE: {knn_best_train_rmse}")
print(f"KNN Best Validation RMSE: {knn_best_val_rmse}")

KNN Best Train RMSE: 1697.3999611902038
KNN Best Validation RMSE: 17978.053890238072


Exporting Best KNN Model

In [None]:
# Exporting Model
# Context managers guarantee the file is flushed and closed; the previous
# bare `open(...)` calls leaked both handles.
knn_best_model_file = 'knn_best.sav'
with open(knn_best_model_file, 'wb') as model_out:
    pickle.dump(knn_best, model_out)

# Sanity Check: reload the pickle and recompute the validation RMSE, which
# should match the value printed for `knn_best` above.
with open(knn_best_model_file, 'rb') as model_in:
    loaded_model = pickle.load(model_in)
np.sqrt(mean_squared_error(y_true=y_val, y_pred=loaded_model.predict(X_val_scaled)))

17978.053890238072

## SVM

Data

In [None]:
# Feature matrices as NumPy arrays.
X_train, X_val = train_features.to_numpy(), val_features.to_numpy()
# Targets flattened from a single-column frame to 1-D vectors.
y_train = np.ravel(train_targets.to_numpy())
y_val = np.ravel(val_targets.to_numpy())

Feature Scaling

In [None]:
# Scaler statistics come from the training split; both splits are then
# transformed with them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

Model Definition

In [None]:
# Baseline support-vector regressor with library defaults (the recorded repr
# below shows an RBF kernel with C=1.0, epsilon=0.1, gamma='scale').
svm_reg = SVR()

Training

In [None]:
# Fit the baseline SVR on the standardised training data.
svm_reg.fit(X=X_train_scaled, y=y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

Evaluation

In [None]:
# Training-split RMSE of the baseline SVR.
svm_train_preds = svm_reg.predict(X_train_scaled)
svm_train_rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=svm_train_preds))
print(f"SVM Training RMSE: {svm_train_rmse}")

# Validation-split RMSE. The label previously read "Training", which made
# the two printed lines indistinguishable.
svm_val_preds = svm_reg.predict(X_val_scaled)
svm_val_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=svm_val_preds))
print(f"SVM Validation RMSE: {svm_val_rmse}")

SVM Training RMSE: 15749.584300446493
SVM Training RMSE: 23000.83099583255


Hyperparameter Tuning

In [None]:
# Earlier sweeps, kept for reference:
#   Sweep 1 - gamma: scale, auto; C: 0.1, 1, 10
#   Sweep 2 - gamma: scale;       C: 5, 10, 15
#   Sweep 3 - gamma: scale;       C: 15, 20, 25, 30
#   Sweep 4 - gamma: scale;       C: 30, 40, 50
#   Sweep 5 - gamma: scale;       C: 50, 75, 100

# Sweep 6 (active)
svm_params = dict(
    gamma=["scale"],
    C=[100, 150, 200],
)

# 3-fold cross-validated grid search scored by negated MSE
# (GridSearchCV maximises its score), run on all available cores.
svm_grid = GridSearchCV(
    SVR(),
    svm_params,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the SVR grid search on the standardised training data.
svm_grid.fit(X=X_train_scaled, y=y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   55.5s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [100, 150, 200], 'gamma': ['scale']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=2)

In [None]:
# Display the best SVR configuration found by the grid search.
# NOTE(review): the recorded best C=200 is the largest value in the active
# grid, so the optimum may lie beyond the searched range.
svm_grid.best_estimator_

SVR(C=200, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [None]:
# The refitted winner of the grid search (the recorded repr shows refit=True).
svm_best = svm_grid.best_estimator_

# Predictions on both splits, then RMSE for each.
svm_best_train_preds, svm_best_val_preds = (
    svm_best.predict(split) for split in (X_train_scaled, X_val_scaled)
)
svm_best_train_rmse = np.sqrt(mean_squared_error(y_train, svm_best_train_preds))
svm_best_val_rmse = np.sqrt(mean_squared_error(y_val, svm_best_val_preds))

print(f"SVM Best Train RMSE: {svm_best_train_rmse}")
print(f"SVM Best Validation RMSE: {svm_best_val_rmse}")

SVM Best Train RMSE: 12036.66270367958
SVM Best Validation RMSE: 20697.83545417761


Exporting Best SVM model

In [None]:
# Exporting Model
# Context managers guarantee the file is flushed and closed; the previous
# bare `open(...)` calls leaked both handles.
svm_best_model_file = 'svm_best.sav'
with open(svm_best_model_file, 'wb') as model_out:
    pickle.dump(svm_best, model_out)

# Sanity Check: reload the pickle and recompute the validation RMSE, which
# should match the value printed for `svm_best` above.
with open(svm_best_model_file, 'rb') as model_in:
    loaded_model = pickle.load(model_in)
np.sqrt(mean_squared_error(y_true=y_val, y_pred=loaded_model.predict(X_val_scaled)))

20697.83545417761

## Random Forests

Data

In [None]:
# Unscaled NumPy arrays for the forest; targets are flattened to 1-D.
X_train, y_train = train_features.to_numpy(), train_targets.to_numpy().ravel()
X_val, y_val = val_features.to_numpy(), val_targets.to_numpy().ravel()

Model Definition

In [None]:
# Baseline random forest with library defaults and a fixed seed
# (the recorded repr below shows n_estimators=100, max_depth=None).
rf_reg = RandomForestRegressor(random_state=42)

Training

In [None]:
# Fit the baseline forest on the raw (unscaled) training features.
rf_reg.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

Evaluation

In [None]:
# Training-split RMSE of the baseline random forest.
rf_train_preds = rf_reg.predict(X_train)
rf_train_rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=rf_train_preds))
print(f"Random Forests Training RMSE: {rf_train_rmse}")

# Validation-split RMSE. The label previously read "Training", which made
# the two printed lines indistinguishable.
rf_val_preds = rf_reg.predict(X_val)
rf_val_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=rf_val_preds))
print(f"Random Forests Validation RMSE: {rf_val_rmse}")

Random Forests Training RMSE: 3308.3360430291054
Random Forests Training RMSE: 17528.3281338265


Hyperparameter Tuning

In [None]:
# Earlier sweeps, kept for reference (all with n_estimators=[100], bootstrap=[True]):
#   Sweep 1 - max_depth: 3, 5, 7; min_samples_split: 50, 100, 150;
#             max_features: 0.7; max_samples: 0.7
#   Sweep 2 - min_samples_split: 30, 50, 70; max_features: 0.5, 0.7, 0.9;
#             max_samples: 0.5, 0.7, 0.9
#   Sweep 3 - min_samples_split: 20, 30, 40; max_features: 0.6, 0.7, 0.8;
#             max_samples: 0.8, 0.9, 1.0
#   Sweep 4 - min_samples_split: 10, 15, 20, 25; max_features: 0.7; max_samples: 0.9
#   Sweep 5 - min_samples_split: 3, 5, 7, 10; max_features: 0.7; max_samples: 0.9

# Sweep 6 (active): only the number of trees varies.
rf_params = dict(
    n_estimators=[100, 500, 1000],
    min_samples_split=[3],
    max_features=[0.7],
    bootstrap=[True],
    max_samples=[0.9],
)

# 3-fold cross-validated grid search scored by negated MSE
# (GridSearchCV maximises its score), run on all available cores.
rf_grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    rf_params,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the random-forest grid search on the raw training features.
rf_grid.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  1.8min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs

In [None]:
# Display the best random-forest configuration found by the grid search
# (the recorded output shows n_estimators=500 was selected).
rf_grid.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features=0.7, max_leaf_nodes=None,
                      max_samples=0.9, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=3, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [None]:
# The refitted winner of the grid search.
rf_best = rf_grid.best_estimator_

# Predictions on both splits, then RMSE for each.
rf_best_train_preds, rf_best_val_preds = (
    rf_best.predict(split) for split in (X_train, X_val)
)
rf_best_train_rmse = np.sqrt(mean_squared_error(y_train, rf_best_train_preds))
rf_best_val_rmse = np.sqrt(mean_squared_error(y_val, rf_best_val_preds))

print(f"Random Forests Best Train RMSE: {rf_best_train_rmse}")
print(f"Random Forests Best Validation RMSE: {rf_best_val_rmse}")

Random Forests Best Train RMSE: 3746.2907656827147
Random Forests Best Validation RMSE: 17494.283153509172


Exporting the best Random Forests Model

In [None]:
# Exporting Model
# Context managers guarantee the file is flushed and closed; the previous
# bare `open(...)` calls leaked both handles.
rf_best_model_file = 'rf_best.sav'
with open(rf_best_model_file, 'wb') as model_out:
    pickle.dump(rf_best, model_out)

# Sanity Check: reload the pickle and recompute the validation RMSE, which
# should match the value printed for `rf_best` above.
with open(rf_best_model_file, 'rb') as model_in:
    loaded_model = pickle.load(model_in)
np.sqrt(mean_squared_error(y_true=y_val, y_pred=loaded_model.predict(X_val)))

17494.283153509172

## XGBoost

Data

In [4]:
# Unscaled NumPy arrays for XGBoost. Targets are flattened to 1-D with
# `.ravel()` to match every other model section in this notebook, instead of
# being left as (n, 1) column vectors.
X_train = train_features.to_numpy()
y_train = train_targets.to_numpy().ravel()
X_val = val_features.to_numpy()
y_val = val_targets.to_numpy().ravel()

Model Definition

In [5]:
# Baseline XGBoost regressor with library defaults and a fixed seed (the
# recorded repr below shows max_depth=3, learning_rate=0.1, n_estimators=100).
xgb_reg = XGBRegressor(random_state=42)

Training

In [6]:
# Validation split monitored during boosting.
# NOTE(review): this same split is used for early stopping and for the
# validation RMSE reported afterwards, so that score is not from unseen data.
eval_set = [(X_val, y_val)]

# Stop when validation RMSE has not improved for 10 consecutive rounds.
xgb_fit_options = dict(
    early_stopping_rounds=10,
    eval_metric="rmse",
    eval_set=eval_set,
    verbose=True,
)
xgb_reg.fit(X_train, y_train, **xgb_fit_options)

[0]	validation_0-rmse:27299.5
Will train until validation_0-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:26216
[2]	validation_0-rmse:25315.1
[3]	validation_0-rmse:24504.7
[4]	validation_0-rmse:23864.8
[5]	validation_0-rmse:23300.3
[6]	validation_0-rmse:22761
[7]	validation_0-rmse:22355
[8]	validation_0-rmse:21985.1
[9]	validation_0-rmse:21699.4
[10]	validation_0-rmse:21412.7
[11]	validation_0-rmse:21185.2
[12]	validation_0-rmse:20992
[13]	validation_0-rmse:20830.1
[14]	validation_0-rmse:20603.3
[15]	validation_0-rmse:20468.2
[16]	validation_0-rmse:20345
[17]	validation_0-rmse:20214.9
[18]	validation_0-rmse:20135.5
[19]	validation_0-rmse:20059.1
[20]	validation_0-rmse:19938.4
[21]	validation_0-rmse:19875.4
[22]	validation_0-rmse:19820
[23]	validation_0-rmse:19772.5
[24]	validation_0-rmse:19706.9
[25]	validation_0-rmse:19649.9
[26]	validation_0-rmse:19605.8
[27]	validation_0-rmse:19533.3
[28]	validation_0-rmse:19485
[29]	validation_0-rmse:19411
[30]	validation_0-rmse:19375.3


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

Evaluation

In [7]:
# Training-split RMSE of the baseline XGBoost model.
xgb_train_preds = xgb_reg.predict(X_train)
xgb_train_rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=xgb_train_preds))
print(f"XGBoost Training RMSE: {xgb_train_rmse}")

# Validation-split RMSE. The label previously read "Training", which made
# the two printed lines indistinguishable.
xgb_val_preds = xgb_reg.predict(X_val)
xgb_val_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=xgb_val_preds))
print(f"XGBoost Validation RMSE: {xgb_val_rmse}")

XGBoost Training RMSE: 9334.947361397864
XGBoost Training RMSE: 18495.2876101272


Hyperparameter Tuning

In [26]:
# Earlier sweeps, kept for reference:
#   Sweep 1 - max_depth: 2, 3, 5, 7; min_child_weight: 1, 3, 5
#   Sweep 2 - max_depth: 7, 9, 11; min_child_weight: 5, 7, 9;
#             subsample: 0.5, 0.7, 0.9; colsample_bytree: 0.5, 0.7, 0.9

# Sweep 3 (active)
xgb_params = dict(
    max_depth=[7, 9, 11],
    min_child_weight=[5],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.6, 0.7, 0.8],
)

# 3-fold cross-validated grid search scored by negated MSE
# (GridSearchCV maximises its score), run on all available cores.
xgb_grid = GridSearchCV(
    XGBRegressor(random_state=42),
    xgb_params,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [27]:
# Extra keyword arguments forwarded by GridSearchCV to every XGBRegressor.fit
# call: stop after 42 rounds without improvement in validation RMSE.
# NOTE(review): each CV fit early-stops on the held-out validation split, so
# the validation RMSE reported afterwards is likely optimistic.
fit_params = dict(
    early_stopping_rounds=42,
    eval_metric="rmse",
    eval_set=[[X_val, y_val]],
)

xgb_grid.fit(X_train, y_train, **fit_params)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   53.4s


[0]	validation_0-rmse:27096.5
Will train until validation_0-rmse hasn't improved in 42 rounds.


[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  1.3min finished


[1]	validation_0-rmse:25845.1
[2]	validation_0-rmse:24723.2
[3]	validation_0-rmse:23691.3
[4]	validation_0-rmse:22812.1
[5]	validation_0-rmse:22046.2
[6]	validation_0-rmse:21378.8
[7]	validation_0-rmse:20810.6
[8]	validation_0-rmse:20362.4
[9]	validation_0-rmse:19932.6
[10]	validation_0-rmse:19570.4
[11]	validation_0-rmse:19253.2
[12]	validation_0-rmse:18996.4
[13]	validation_0-rmse:18784.4
[14]	validation_0-rmse:18603.9
[15]	validation_0-rmse:18465
[16]	validation_0-rmse:18308
[17]	validation_0-rmse:18170.8
[18]	validation_0-rmse:18051.5
[19]	validation_0-rmse:17937.1
[20]	validation_0-rmse:17863.3
[21]	validation_0-rmse:17768.4
[22]	validation_0-rmse:17697.2
[23]	validation_0-rmse:17634.2
[24]	validation_0-rmse:17592
[25]	validation_0-rmse:17568.6
[26]	validation_0-rmse:17532.7
[27]	validation_0-rmse:17498.2
[28]	validation_0-rmse:17455
[29]	validation_0-rmse:17424.9
[30]	validation_0-rmse:17405.2
[31]	validation_0-rmse:17381.1
[32]	validation_0-rmse:17344
[33]	validation_0-rmse:1732

GridSearchCV(cv=3, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0,
                                    importance_type='gain', learning_rate=0.1,
                                    max_delta_step=0, max_depth=3,
                                    min_child_weight=1, missing=None,
                                    n_estimators=100, n_jobs=1, nthread=None,
                                    objective='reg:linear', random_state=42,
                                    reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, seed=None, silent=None,
                                    subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'colsample_bytree': [0.6, 0.7, 0.8],
                         'max_depth': [7, 9, 11], 'min_child_weight': [

In [28]:
# Display the best XGBoost configuration found by the grid search.
# NOTE(review): the recorded best max_depth=11 is the largest value in the
# active grid, so the optimum may lie beyond the searched range.
xgb_grid.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=11, min_child_weight=5, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.8, verbosity=1)

In [29]:
# The refitted winner of the XGBoost grid search.
xgb_best = xgb_grid.best_estimator_

xgb_best_train_preds = xgb_best.predict(X_train)
xgb_best_val_preds = xgb_best.predict(X_val)

xgb_best_train_rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=xgb_best_train_preds))
xgb_best_val_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=xgb_best_val_preds))

# Labels previously said "Random Forests" (copy-paste from the section above),
# attributing these XGBoost scores to the wrong model.
print(f"XGBoost Best Train RMSE: {xgb_best_train_rmse}")
print(f"XGBoost Best Validation RMSE: {xgb_best_val_rmse}")

Random Forests Best Train RMSE: 4490.943228116084
Random Forests Best Validation RMSE: 17135.721823084656


Exporting the best model

In [30]:
# Exporting Model
# Context managers guarantee the file is flushed and closed; the previous
# bare `open(...)` calls leaked both handles.
xgb_best_model_file = 'xgb_best.sav'
with open(xgb_best_model_file, 'wb') as model_out:
    pickle.dump(xgb_best, model_out)

# Sanity Check: reload the pickle and recompute the validation RMSE, which
# should match the value printed for `xgb_best` above.
with open(xgb_best_model_file, 'rb') as model_in:
    loaded_model = pickle.load(model_in)
np.sqrt(mean_squared_error(y_true=y_val, y_pred=loaded_model.predict(X_val)))



17135.721823084656