<a href="https://colab.research.google.com/github/yashveersinghsohi/Car_Price_Prediction/blob/master/Modeling/CarPrice_05_Ensembling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

import pickle

from sklearn.pipeline import make_pipeline

# Data

In [2]:
# Base URL of the feature-engineering outputs in the project's GitHub repository.
root_dir = "https://raw.githubusercontent.com/yashveersinghsohi/Car_Price_Prediction/master/Data/Feature_Engineering_Data/"

# Training split: pruned feature matrix and its targets.
train_features_path = f"{root_dir}pruned_train_features.csv"
train_targets_path = f"{root_dir}train_targets.csv"

# Validation split: pruned feature matrix and its targets.
val_features_path = f"{root_dir}pruned_val_features.csv"
val_targets_path = f"{root_dir}val_targets.csv"

In [3]:
# Read each split's feature matrix and target column from the remote CSVs
# (same read order as before: train features, train targets, val features,
# val targets).
train_features, train_targets = (
    pd.read_csv(csv_url) for csv_url in (train_features_path, train_targets_path)
)
val_features, val_targets = (
    pd.read_csv(csv_url) for csv_url in (val_features_path, val_targets_path)
)

# Report the shapes so a row-count mismatch between features and targets is
# visible immediately.
print(f"Train Features: {train_features.shape}")
print(f"Train Targets: {train_targets.shape}", end="\n\n")

print(f"Validation Features: {val_features.shape}")
print(f"Validation Targets: {val_targets.shape}", end="\n\n")

Train Features: (13351, 18)
Train Targets: (13351, 1)

Validation Features: (3463, 18)
Validation Targets: (3463, 1)



# Models

In [4]:
# k-nearest-neighbours base learner: 10 neighbours, Minkowski metric with p=1,
# distance-weighted predictions, brute-force neighbour search.
knn = KNeighborsRegressor(
    n_neighbors=10,
    weights='distance',
    p=1,
    metric='minkowski',
    metric_params=None,
    algorithm='brute',
    leaf_size=30,
    n_jobs=None,
)

# Random-forest base learner with the tuned hyperparameters.
# `criterion='mse'` and `min_impurity_split=None` used to be passed explicitly
# here; both were simply the library defaults, and both spellings were later
# removed from scikit-learn. Leaving them out keeps the same model (squared-error
# splitting, no impurity-split threshold) while letting this cell run on both
# older and current scikit-learn releases.
rf = RandomForestRegressor(
    bootstrap=True, ccp_alpha=0.0,
    max_depth=None, max_features=0.7, max_leaf_nodes=None,
    max_samples=0.9, min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=3, min_weight_fraction_leaf=0.0,
    n_estimators=500, n_jobs=None, oob_score=False,
    random_state=42, verbose=0, warm_start=False
  )

# XGBoost base learner with the tuned hyperparameters.
# - `objective` uses 'reg:squarederror', the current name of the squared-error
#   objective; the previous value 'reg:linear' is its deprecated alias.
# - The legacy arguments `silent`, `nthread`, `seed` and `missing` were all
#   passed as None (i.e. "use the library default") and are omitted;
#   `verbosity`, `n_jobs` and `random_state` already cover the first three.
xgb = XGBRegressor(
    base_score=0.5, booster='gbtree', colsample_bylevel=1,
    colsample_bynode=1, colsample_bytree=0.8, gamma=0,
    importance_type='gain', learning_rate=0.1, max_delta_step=0,
    max_depth=11, min_child_weight=5, n_estimators=100,
    n_jobs=1, objective='reg:squarederror', random_state=42,
    reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
    subsample=0.8, verbosity=1
  )

# Ensemble 1: KNN + Random Forest + XGBoost

Data

In [5]:
# Convert the DataFrames to NumPy arrays for the estimators.
# The target frames have a single column, so `.to_numpy()` alone yields
# column vectors of shape (n, 1); flatten them to 1-D so that fitting does not
# trigger the `column_or_1d(y, warn=True)` column-vector warning.
X_train = train_features.to_numpy()
y_train = train_targets.to_numpy().ravel()
X_val = val_features.to_numpy()
y_val = val_targets.to_numpy().ravel()

Stack

In [6]:
# Level-0 (base) learners. KNN is wrapped in a pipeline so its inputs are
# standardised first; the tree-based models receive the raw features.
level0 = [
    ('knn', make_pipeline(StandardScaler(), knn)),
    ('rf', rf),
    ('xgb', xgb),
]

# Level-1 (meta) learner that combines the base learners' predictions.
level1 = LinearRegression()

model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)

# Last expression of the cell, so the fitted estimator is displayed.
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)




StackingRegressor(cv=5,
                  estimators=[('knn',
                               Pipeline(memory=None,
                                        steps=[('standardscaler',
                                                StandardScaler(copy=True,
                                                               with_mean=True,
                                                               with_std=True)),
                                               ('kneighborsregressor',
                                                KNeighborsRegressor(algorithm='brute',
                                                                    leaf_size=30,
                                                                    metric='minkowski',
                                                                    metric_params=None,
                                                                    n_jobs=None,
                                                                    n_neighbors=10,
     

Evaluating Stack

In [7]:
# Predict on both splits with the fitted stack.
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

# Replace any prediction that is not strictly positive with 0 before scoring.
train_preds = np.where(train_preds > 0, train_preds, 0)
val_preds = np.where(val_preds > 0, val_preds, 0)

# Root-mean-squared error on each split.
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))

print(f"Training RMSE: {train_rmse}")
print(f"Validation RMSE: {val_rmse}")

Training RMSE: 3103.516807806332
Validation RMSE: 17330.88987330079


This model performs worse than XGBoost alone. Let's remove KNN (the weakest learner) and try stacking again.

# Ensemble 2: Random Forest + XGBoost

Data

In [8]:
# Rebuild the NumPy inputs for the second ensemble from the loaded DataFrames.
# The target frames have a single column, so `.to_numpy()` alone yields
# column vectors of shape (n, 1); flatten them to 1-D so that fitting does not
# trigger the `column_or_1d(y, warn=True)` column-vector warning.
X_train = train_features.to_numpy()
y_train = train_targets.to_numpy().ravel()
X_val = val_features.to_numpy()
y_val = val_targets.to_numpy().ravel()

Stack

In [9]:
# Level-0 (base) learners: only the two tree-based models this time.
level0 = [
    ('rf', rf),
    ('xgb', xgb),
]

# Level-1 (meta) learner that combines the base learners' predictions.
level1 = LinearRegression()

model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)

# Last expression of the cell, so the fitted estimator is displayed.
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)




StackingRegressor(cv=5,
                  estimators=[('rf',
                               RandomForestRegressor(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     criterion='mse',
                                                     max_depth=None,
                                                     max_features=0.7,
                                                     max_leaf_nodes=None,
                                                     max_samples=0.9,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=3,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estim

Evaluating Stack

In [10]:
# Predict on both splits with the fitted stack.
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

# Replace any prediction that is not strictly positive with 0 before scoring.
train_preds = np.where(train_preds > 0, train_preds, 0)
val_preds = np.where(val_preds > 0, val_preds, 0)

# Root-mean-squared error on each split.
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))

print(f"Training RMSE: {train_rmse}")
print(f"Validation RMSE: {val_rmse}")

Training RMSE: 3732.12801745847
Validation RMSE: 17255.334552412325


This model performs better than all the base learners, so let's use it for the final predictions.

# Exporting Model

In [11]:
# This cell is currently disabled: every statement below is commented out, so
# running it does nothing. Uncomment to pickle the fitted stack to `model.sav`
# and reload it as a round-trip check.
# NOTE(review): the `open(...)` calls below are never closed; prefer
# `with open(...) as f:` if this is re-enabled, and only unpickle trusted files.
# NOTE(review): the sanity check scores raw predictions, whereas the evaluation
# cells above replace non-positive predictions with 0 first, so its RMSE may
# differ from the reported validation RMSE.

# # Exporting Model
# model_file = 'model.sav'
# pickle.dump(model, open(model_file, 'wb'))

# # Sanity Check
# loaded_model = pickle.load(open(model_file, 'rb'))
# np.sqrt(mean_squared_error(y_true=y_val, y_pred=loaded_model.predict(X_val)))