# Better Housing Regression w/ Ensemble Models


Data from https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data

Goal: beat the basic regression done in HousingRegression.ipynb. 

Score to beat: .14834 (lower is better)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Data Preprocessing
trainData = pd.read_csv("datasets/train.csv")
submissionData = pd.read_csv("datasets/test.csv")

yvals = trainData["SalePrice"].copy()
xvals = trainData.drop(columns="SalePrice")
price_bins = pd.qcut(yvals, q=8)

xtrain, xtest, ytrain, ytest = train_test_split(xvals, yvals, test_size=.15, stratify=price_bins)

ytrainLog = np.log1p(ytrain)
ytestLog = np.log1p(ytest)

### Feature Engineering

In [3]:
xtrain["TotalSF"] = xtrain["GrLivArea"] + xtrain["TotalBsmtSF"]
xtrain["QualityArea"] = xtrain["OverallQual"] * xtrain["TotalSF"]
xtrain["TotalBath"] = 0.5*xtrain["HalfBath"] + xtrain["FullBath"] + xtrain["BsmtFullBath"] + 0.5*xtrain["BsmtHalfBath"]
xtrain["QualityScore"] = xtrain["OverallQual"] * xtrain["OverallCond"]
xtrain["BasementRatio"] = xtrain["TotalBsmtSF"] / (xtrain["GrLivArea"]+1)
xtrain["LivingSpaceRatio"] = xtrain["GrLivArea"] / (xtrain["LotArea"]+1)

xtest["TotalSF"] = xtest["GrLivArea"] + xtest["TotalBsmtSF"]
xtest["QualityArea"] = xtest["OverallQual"] * xtest["TotalSF"]
xtest["TotalBath"] = 0.5*xtest["HalfBath"] + xtest["FullBath"] + xtest["BsmtFullBath"] + 0.5*xtest["BsmtHalfBath"]
xtest["QualityScore"] = xtest["OverallQual"] * xtest["OverallCond"]
xtest["BasementRatio"] = xtest["TotalBsmtSF"] / (xtest["GrLivArea"]+1)
xtest["LivingSpaceRatio"] = xtest["GrLivArea"] / (xtest["LotArea"]+1)

catVars = list(xtrain.select_dtypes(exclude="number").columns)
catVars.append("MSSubClass")
catVars.append("OverallQual")
catVars.append("OverallCond")
numVars = list(xtrain.select_dtypes(include="number").drop(columns=["Id", "MSSubClass", "OverallQual", "OverallCond"]).columns)
catIndices = [xtrain.columns.get_loc(col) for col in catVars]
numIndices = [xtrain.columns.get_loc(col) for col in numVars]

xtrain.loc[:,numVars] = xtrain.loc[:,numVars].fillna(0)
xtest.loc[:,numVars] = xtest.loc[:,numVars].fillna(0)


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
pipelineNoScaling = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), catIndices),
    ("num", "passthrough", numIndices)
])
pipelineScaling = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), catIndices),
    ("num", StandardScaler(), numIndices)
])

### The Models

1. Elastic Net -> the one built last time
2. Bagging Regressor w/ Decision Trees
3. Random Forest Regressor
4. AdaBoost w/ Decision Trees
5. Gradient Boosting
6. CatBoost

Lastly: use stacking to combine predictions from other models.

In [None]:
# Elastic Net
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
import numpy as np
from sklearn.pipeline import Pipeline

netPipeline = Pipeline([
    ('preprocessing', pipelineScaling),
    ('model', ElasticNet())
])

param_grid = {
    'model__alpha': [0.01, 0.1, 1.0],
    'model__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}

grid_search = GridSearchCV(netPipeline, param_grid, cv=10, 
                           scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search.fit(xtrain, ytrainLog)

elasticNetModel = grid_search.best_estimator_

predictions = elasticNetModel.predict(xtest)
predictions = np.expm1(predictions)
print(f"Elastic Net Validation RMSE: {np.sqrt(np.mean((predictions-ytest)**2))}")

Elastic Net Validation RMSE: 25806.139752147643


In [None]:
# Bagging Regressor w/ Decision Trees
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV

bagPipeline = Pipeline([
    ('preprocessing', pipelineNoScaling),
    ('model', DecisionTreeRegressor())
])

param_grid = {
    'estimator__model__max_depth': [7, 10, 15, 20],
    'estimator__model__max_leaf_nodes': [64, 128, 256, 512],
    'estimator__model__min_samples_split': [5,10,20,50]
}

bagReg = BaggingRegressor(bagPipeline, n_jobs=-1, n_estimators=200, max_samples=.5)

randSearch = RandomizedSearchCV(bagReg, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1, n_iter=20)
randSearch.fit(xtrain,ytrainLog)

bagModel = randSearch.best_estimator_

predictions = bagModel.predict(xtest)
predictions = np.expm1(predictions) 

print(f"Bagging Regressor Validation RMSE: {np.sqrt(np.mean((predictions - ytest)**2))}")

Bagging Regressor Validation RMSE: 24188.360593894668


In [None]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

rfPipeline = Pipeline([
    ('preprocessing', pipelineNoScaling),
    ('model', RandomForestRegressor(n_estimators=200))
])

param_grid = {
    'model__max_depth': [7, 10, 15, 20],
    'model__max_leaf_nodes': [64, 128, 256, 512],
    'model__min_samples_split': [5,10,20,50]
}

randSearch = RandomizedSearchCV(rfPipeline, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1, n_iter=20)
randSearch.fit(xtrain,ytrainLog)
rfModel = randSearch.best_estimator_

predictions = rfModel.predict(xtest)
predictions = np.expm1(predictions) 

print(f"Random Forest Regressor Validation RMSE: {np.sqrt(np.mean((predictions - ytest)**2))}")

Random Forest Regressor Validation RMSE: 23280.390477215875


In [None]:
# AdaBoost w/ Decision Trees
from sklearn.ensemble import AdaBoostRegressor

adaPipeline = Pipeline([
    ('preprocessing', pipelineNoScaling),
    ('model', AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=200, learning_rate=.25))
])

param_grid = {
    'model__estimator__max_depth': [7, 10, 15, 20],
    'model__estimator__max_leaf_nodes': [64, 128, 256, 512],
    'model__estimator__min_samples_split': [5,10,20,50]
}

randSearch = RandomizedSearchCV(adaPipeline, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1, n_iter=20)
randSearch.fit(xtrain,ytrainLog)
adaModel = randSearch.best_estimator_

predictions = adaModel.predict(xtest)
predictions = np.expm1(predictions) 

print(f"AdaBoost Regressor Validation RMSE: {np.sqrt(np.mean((predictions - ytest)**2))}")

NameError: name 'Pipeline' is not defined

In [None]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor

gbPipeline = Pipeline([
    ('preprocessing', pipelineNoScaling),
    ('model', GradientBoostingRegressor(max_depth=15, max_leaf_nodes=600, min_samples_split=10))
])

param_grid = {
    'model__learning_rate': [.05, .1, .25, .5, .75],
    'model__n_estimators': [100, 200, 350, 500]
}

randSearch = RandomizedSearchCV(gbPipeline, param_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1, n_iter=20)
randSearch.fit(xtrain,ytrainLog)
gbModel = randSearch.best_estimator_

predictions = gbModel.predict(xtest)
predictions = np.expm1(predictions) 

print(f"Gradient Boosting Regressor Validation RMSE: {np.sqrt(np.mean((predictions - ytest)**2))}")

Gradient Boosting Regressor Validation RMSE: 24916.04760030638


In [None]:
# CatBoost 
from catboost import CatBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

def fill_categorical_na(X):
    X_copy = X.copy()
    for col in catBoostVars:
        X_copy[col] = X_copy[col].fillna('Missing')
    return X_copy

catBoostVars = catVars[:-3] 

catPipeline = Pipeline([
    ('imputer', FunctionTransformer(fill_categorical_na)),
    ('model', CatBoostRegressor(cat_features=catBoostVars,logging_level="Silent", thread_count=-1))
])

catModel = catPipeline.fit(xtrain, ytrainLog)

predictions = catModel.predict(xtest)
predictions = np.expm1(predictions) 

print(f"CatBoost Regressor Validation RMSE: {np.sqrt(np.mean((predictions - ytest)**2))}")

CatBoost Regressor Validation RMSE: 22494.98428069441


### Stacking

In [None]:
from sklearn.ensemble import StackingRegressor

stack = StackingRegressor(
    estimators=[    
        ("elasticNet", elasticNetModel),
        ("bagging", bagModel),
        ("randForest", rfModel),
        ("adaBoost", adaModel),
        ("gradBoost", gbModel),
        ("catBoost", catModel)
    ],
    final_estimator=RandomForestRegressor(n_jobs=-1, n_estimators=10),
    cv=3
)

stack.fit(xtrain,ytrainLog)

predictions = stack.predict(xtest)
predictions = np.expm1(predictions) 

print(f"CatBoost Regressor Validation RMSE: {np.sqrt(np.mean((predictions - ytest)**2))}")

: 