# Housing Dataset Regression Models

Data from https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data

Not meant to be a competitive kaggle submission, just an overview of various regression methods.

In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [70]:
trainData = pd.read_csv("datasets/train.csv")
submissionData = pd.read_csv("datasets/test.csv")

yvals = trainData["SalePrice"].copy()
xvals = trainData.drop(columns="SalePrice")
price_bins = pd.qcut(yvals, q=8)

xtrain, xtest, ytrain, ytest = train_test_split(xvals, yvals, test_size=.15, stratify=price_bins)

#### The Pipelines

This cell builds 3 pipelines: pipelineNoScaling, pipelineScaling, and pipelineFtEngineering. All 3 apply a one hot encoder to the categorical vars. NoScaling does nothing to the numeric vars. Scaling applies a standard normalization to the numeric vars. Feature engineering adds the log and square of each feature to explore nonlinear relationships.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

catVars = list(xtrain.select_dtypes(exclude="number").columns)
catVars.append("MSSubClass")
catVars.append("OverallQual")
catVars.append("OverallCond")
numVars = list(xtrain.select_dtypes(include="number").drop(columns=["Id", "MSSubClass", "OverallQual", "OverallCond"]).columns)


pipelineNoScaling = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), catVars),
    ("num", "passthrough", numVars)
])
pipelineScaling = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), catVars),
    ("num", StandardScaler(), numVars)
])

In [72]:
xtrain.loc[:,numVars] = xtrain.loc[:,numVars].fillna(0)
xtest.loc[:,numVars] = xtest.loc[:,numVars].fillna(0)

xtrainUnscaled = pipelineNoScaling.fit_transform(xtrain)
xtrainScaled = pipelineScaling.fit_transform(xtrain)
ytrainLog = np.log1p(ytrain)

## Models

In [73]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score

linReg = LinearRegression(n_jobs=-1)
linReg.fit(xtrainUnscaled, ytrain)
testUnscaled = pipelineNoScaling.transform(xtest)
predictions = linReg.predict(testUnscaled)
meancv = cross_val_score(linReg, xtrainUnscaled, ytrain, n_jobs=-1, cv=10, scoring="neg_root_mean_squared_error").mean()
testRmse = root_mean_squared_error(ytest, predictions)

print("Simple Linear Model, no scaling, targeting unscaled y")
print("Mean Cross Val Score:", round(-1*meancv, 2))
print("Test RMSE:           ", round(testRmse,2))

Simple Linear Model, no scaling, targeting unscaled y
Mean Cross Val Score: 27801.31
Test RMSE:            49536.06


In [74]:
from sklearn.model_selection import cross_val_predict

linReg.fit(xtrainUnscaled, ytrainLog)
testUnscaled = pipelineNoScaling.transform(xtest)
predictions = np.expm1(linReg.predict(testUnscaled))
expcvs = np.expm1(cross_val_predict(linReg, xtrainUnscaled, ytrainLog, n_jobs=-1, cv=10))
meancv = root_mean_squared_error(ytrain, expcvs)
testRmse = root_mean_squared_error(ytest, predictions)

print("Simple Linear Model, no scaling, targeting log y")
print("Mean Cross Val Score:", round(meancv, 2))
print("Test RMSE:           ", round(testRmse,2))

Simple Linear Model, no scaling, targeting log y
Mean Cross Val Score: 30663.82
Test RMSE:            105193.5


In [75]:
linReg.fit(xtrainScaled, ytrain)
testScaled = pipelineScaling.transform(xtest)
predictions = linReg.predict(testScaled)
meancv = cross_val_score(linReg, xtrainScaled, ytrain, n_jobs=-1, cv=10, scoring="neg_root_mean_squared_error").mean()
testRmse = root_mean_squared_error(ytest, predictions)

print("Simple Linear Model, training data scaled, targeting unscaled y")
print("Mean Cross Val Score:", round(-meancv, 2))
print("Test RMSE:           ", round(testRmse,2))

Simple Linear Model, training data scaled, targeting unscaled y
Mean Cross Val Score: 38286.68
Test RMSE:            48492.19


In [76]:
from sklearn.linear_model import Ridge

linReg = Ridge(alpha=0.1)
linReg.fit(xtrainScaled, ytrain)
testScaled = pipelineScaling.transform(xtest)
predictions = linReg.predict(testScaled)
meancv = cross_val_score(linReg, xtrainScaled, ytrain, n_jobs=-1, cv=10, scoring="neg_root_mean_squared_error").mean()
testRmse = root_mean_squared_error(ytest, predictions)

print("Ridge Regularized Linear Model, x scaled, targeting unscaled y")
print("Mean Cross Val Score:", round(-1*meancv, 2))
print("Test RMSE:           ", round(testRmse,2))

Ridge Regularized Linear Model, x scaled, targeting unscaled y
Mean Cross Val Score: 28897.15
Test RMSE:            52172.59


In [77]:
from sklearn.linear_model import Lasso

linReg = Lasso(alpha=0.1)
linReg.fit(xtrainScaled, ytrain)
testScaled = pipelineScaling.transform(xtest)
predictions = linReg.predict(testScaled)
meancv = cross_val_score(linReg, xtrainScaled, ytrain, n_jobs=-1, cv=10, scoring="neg_root_mean_squared_error").mean()
testRmse = root_mean_squared_error(ytest, predictions)

print("Lasso Regularized Linear Model, x scaled, targeting unscaled y")
print("Mean Cross Val Score:", round(-1*meancv, 2))
print("Test RMSE:           ", round(testRmse,2))

  model = cd_fast.sparse_enet_coordinate_descent(


Lasso Regularized Linear Model, x scaled, targeting unscaled y
Mean Cross Val Score: 29596.17
Test RMSE:            47936.63


In [78]:
from sklearn.linear_model import ElasticNet

elastic = ElasticNet(alpha=0.1, l1_ratio=0.9)
elastic.fit(xtrainScaled, ytrain)
testScaled = pipelineScaling.transform(xtest)
predictions = elastic.predict(testScaled)
meancv = cross_val_score(elastic, xtrainScaled, ytrain, n_jobs=-1, cv=10, scoring="neg_root_mean_squared_error").mean()
testRmse = root_mean_squared_error(ytest, predictions)

print("Elastic Net Regularized Linear Model, x scaled, targeting unscaled y")
print("Mean Cross Val Score:", round(-1*meancv, 2))
print("Test RMSE:           ", round(testRmse,2))

# elastic net often works best

Elastic Net Regularized Linear Model, x scaled, targeting unscaled y
Mean Cross Val Score: 26882.32
Test RMSE:            49515.59


## Final Model Training

In [79]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

param_grid = {
    'alpha': [0.01, 0.1, 1.0],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}
xvals.loc[:,numVars] = xvals.loc[:,numVars].fillna(0)
xScaled = pipelineScaling.transform(xvals)

grid_search = GridSearchCV(ElasticNet(), param_grid, cv=10, 
                           scoring='neg_root_mean_squared_error', n_jobs=-1)
grid_search.fit(xScaled, yvals)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV Score: {round(-grid_search.best_score_, 2)}")

model = grid_search.best_estimator_

Best parameters: {'alpha': 0.1, 'l1_ratio': 0.9}
Best CV Score: 29947.74


In [108]:
submissionData.loc[:,numVars] = submissionData.loc[:,numVars].fillna(0)
xPredict = pipelineScaling.transform(submissionData)
predictions = model.predict(xPredict)
submission = pd.DataFrame({
    'Id': submissionData["Id"],
    'SalePrice': predictions
})

In [None]:
# submission.to_csv('submission.csv', index=False)