In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from statsmodels.tools.eval_measures import mse, rmse

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Database connection settings. Each value can be overridden through an
# environment variable so credentials do not have to be edited into the
# notebook; the literals are fallbacks that keep the notebook running exactly
# as before when no override is set.
# NOTE(review): the fallback password is still stored here in plain text --
# consider dropping the fallbacks once the environment variables are in place.
postgres_user = os.environ.get('HOUSEPRICES_PG_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_PG_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_PG_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_PG_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_PG_DB', 'houseprices')

In [2]:
# Build an engine from the postgres_* settings, read the whole `houseprices`
# table into the DataFrame `hp`, then release the engine's connections.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

try:
    hp = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    # Dispose even when the query raises, so pooled connections are not
    # left open after a failed load.
    engine.dispose()

# Show the first ten rows as the cell output.
hp.head(10)

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [3]:
import scipy.stats as stats
from scipy.stats.mstats import winsorize

# Add winsorized copies of selected columns to the original dataframe.
# Keys are the new column names, values are the source columns; the limits
# (0, 0.05) are passed to winsorize unchanged for every column.
winsorized_columns = {
    'wins_saleprice': 'saleprice',
    'wins_grlivarea': 'grlivarea',
    'wins_garagearea': 'garagearea',
    'wins_totalbsmtsf': 'totalbsmtsf',
    'wins_firstflrsf': 'firstflrsf',
}
for wins_name, source_name in winsorized_columns.items():
    hp[wins_name] = winsorize(hp[source_name], (0, 0.05))

In [4]:
# Encode two categorical columns as dummy columns on `hp`, dropping the first
# level of each.
# NOTE(review): get_dummies returns a DataFrame, so assigning it to a single
# column presumably relies on each source having exactly two categories --
# verify against the data.
dummy_columns = {
    "dummies_utilities": "utilities",
    "dummies_centralair": "centralair",
}
for dummy_name, source_name in dummy_columns.items():
    hp[dummy_name] = pd.get_dummies(hp[source_name], drop_first=True)

In [5]:
## OLS model

# Target is the winsorized sale price; predictors are the columns listed below.
feature_columns = [
    'overallqual', 'garagecars', 'totrmsabvgrd', 'yearbuilt',
    'wins_totalbsmtsf', 'wins_grlivarea', 'wins_garagearea',
    'dummies_centralair',
]
Y = hp['wins_saleprice']
X = hp[feature_columns]

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable. Later cells reuse X_train / X_test / Y_train / Y_test.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=465)

# Fit ordinary least squares on the training rows.
lrm = LinearRegression().fit(X_train, Y_train)

Y_preds_train = lrm.predict(X_train)
Y_preds_test = lrm.predict(X_test)

# Compute every reported metric first, then print them together.
ols_train_r2 = lrm.score(X_train, Y_train)
ols_test_r2 = lrm.score(X_test, Y_test)
ols_test_mae = mean_absolute_error(Y_test, Y_preds_test)
ols_test_mse = mse(Y_test, Y_preds_test)
ols_test_rmse = rmse(Y_test, Y_preds_test)
# Mean absolute percentage error, expressed in percent.
ols_test_mape = np.mean(np.abs((Y_test - Y_preds_test) / Y_test)) * 100

print("R-squared of the model in training set is: {}".format(ols_train_r2))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(ols_test_r2))
print("Mean absolute error of the prediction is: {}".format(ols_test_mae))
print("Mean squared error of the prediction is: {}".format(ols_test_mse))
print("Root mean squared error of the prediction is: {}".format(ols_test_rmse))
print("Mean absolute percentage error of the prediction is: {}".format(ols_test_mape))

R-squared of the model in training set is: 0.8358338354106741
-----Test set statistics-----
R-squared of the model in test set is: 0.8403868119284769
Mean absolute error of the prediction is: 19642.677645928008
Mean squared error of the prediction is: 684598096.0648007
Root mean squared error of the prediction is: 26164.825550054804
Mean absolute percentage error of the prediction is: 13.01981648871598


In [6]:
# import linear regression models and set alphas
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
# Candidate regularization strengths: 10**p for every integer p in [-10, 39].
alphas = [np.power(10.0, exponent) for exponent in range(-10, 40)]

In [7]:
# Ridge Regression
# Pick alpha from `alphas` by 5-fold cross-validation on the training rows.
ridgeregr = RidgeCV(alphas=alphas, cv=5).fit(X_train, Y_train)

Y_preds_train = ridgeregr.predict(X_train)
Y_preds_test = ridgeregr.predict(X_test)

# Compute every reported metric first, then print them together.
ridge_train_r2 = ridgeregr.score(X_train, Y_train)
ridge_test_r2 = ridgeregr.score(X_test, Y_test)
ridge_test_mae = mean_absolute_error(Y_test, Y_preds_test)
ridge_test_mse = mse(Y_test, Y_preds_test)
ridge_test_rmse = rmse(Y_test, Y_preds_test)
# Mean absolute percentage error, expressed in percent.
ridge_test_mape = np.mean(np.abs((Y_test - Y_preds_test) / Y_test)) * 100

print("Best alpha value is: {}".format(ridgeregr.alpha_))
print("R-squared of the model on the training set is: {}".format(ridge_train_r2))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(ridge_test_r2))
print("Mean absolute error of the prediction is: {}".format(ridge_test_mae))
print("Mean squared error of the prediction is: {}".format(ridge_test_mse))
print("Root mean squared error of the prediction is: {}".format(ridge_test_rmse))
print("Mean absolute percentage error of the prediction is: {}".format(ridge_test_mape))


Best alpha value is: 1.0
R-squared of the model on the training set is: 0.8358334449749661
-----Test set statistics-----
R-squared of the model on the test set is: 0.8404475576080839
Mean absolute error of the prediction is: 19637.06296561745
Mean squared error of the prediction is: 684337551.3246977
Root mean squared error of the prediction is: 26159.846164010556
Mean absolute percentage error of the prediction is: 13.015752906960923


In [8]:
# Lasso Regression
# Pick alpha from `alphas` by 5-fold cross-validation on the training rows.
lassoregr = LassoCV(alphas=alphas, cv=5).fit(X_train, Y_train)

# Predictions for both splits
Y_preds_train = lassoregr.predict(X_train)
Y_preds_test = lassoregr.predict(X_test)

# Compute every reported metric first, then print them together.
lasso_train_r2 = lassoregr.score(X_train, Y_train)
lasso_test_r2 = lassoregr.score(X_test, Y_test)
lasso_test_mae = mean_absolute_error(Y_test, Y_preds_test)
lasso_test_mse = mse(Y_test, Y_preds_test)
lasso_test_rmse = rmse(Y_test, Y_preds_test)
# Mean absolute percentage error, expressed in percent.
lasso_test_mape = np.mean(np.abs((Y_test - Y_preds_test) / Y_test)) * 100

print("Best alpha value is: {}".format(lassoregr.alpha_))
print("R-squared of the model on the training set is: {}".format(lasso_train_r2))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(lasso_test_r2))
print("Mean absolute error of the prediction is: {}".format(lasso_test_mae))
print("Mean squared error of the prediction is: {}".format(lasso_test_mse))
print("Root mean squared error of the prediction is: {}".format(lasso_test_rmse))
print("Mean absolute percentage error of the prediction is: {}".format(lasso_test_mape))


Best alpha value is: 1e-10
R-squared of the model on the training set is: 0.835833835410674
-----Test set statistics-----
R-squared of the model on the test set is: 0.8403868119284774
Mean absolute error of the prediction is: 19642.677645927943
Mean squared error of the prediction is: 684598096.0647985
Root mean squared error of the prediction is: 26164.82555005476
Mean absolute percentage error of the prediction is: 13.019816488715907


In [9]:
# ElasticNet regression
# Pick alpha from `alphas` by 5-fold cross-validation on the training rows.
elasticregr = ElasticNetCV(alphas=alphas, cv=5).fit(X_train, Y_train)

Y_preds_train = elasticregr.predict(X_train)
Y_preds_test = elasticregr.predict(X_test)

# Compute every reported metric first, then print them together.
enet_train_r2 = elasticregr.score(X_train, Y_train)
enet_test_r2 = elasticregr.score(X_test, Y_test)
enet_test_mae = mean_absolute_error(Y_test, Y_preds_test)
enet_test_mse = mse(Y_test, Y_preds_test)
enet_test_rmse = rmse(Y_test, Y_preds_test)
# Mean absolute percentage error, expressed in percent.
enet_test_mape = np.mean(np.abs((Y_test - Y_preds_test) / Y_test)) * 100

print("Best alpha value is: {}".format(elasticregr.alpha_))
print("R-squared of the model on the training set is: {}".format(enet_train_r2))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(enet_test_r2))
print("Mean absolute error of the prediction is: {}".format(enet_test_mae))
print("Mean squared error of the prediction is: {}".format(enet_test_mse))
print("Root mean squared error of the prediction is: {}".format(enet_test_rmse))
print("Mean absolute percentage error of the prediction is: {}".format(enet_test_mape))


Best alpha value is: 0.01
R-squared of the model on the training set is: 0.835822121692912
-----Test set statistics-----
R-squared of the model on the test set is: 0.8407135897877277
Mean absolute error of the prediction is: 19612.77488142502
Mean squared error of the prediction is: 683196510.7510669
Root mean squared error of the prediction is: 26138.028057813906
Mean absolute percentage error of the prediction is: 12.998357666078967


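Comparing the four regressions (OLS, Ridge, Lasso, and ElasticNet), they all perform essentially the same on the test set: R-squared is about 0.84 and the mean absolute percentage error is about 13% in every case. The ElasticNet regression may be very slightly better than the other three, but the difference is marginal.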