### Houseprice dataset:
--------------------------------

#### Load the dataset:

In [1]:
# Import libraries:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse, rmse

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sqlalchemy import create_engine

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format
warnings.filterwarnings('ignore')

# Edit pandas display option to show more rows and columns:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# Query the database to extract dataset:
postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

df = pd.read_sql_query('SELECT * FROM houseprices', con=engine)

# Dispose the connection, as we're only doing a single query:
engine.dispose()

# Print out the head of the dataset:
df.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,landslope,neighborhood,condition1,condition2,bldgtype,housestyle,overallqual,overallcond,yearbuilt,yearremodadd,roofstyle,roofmatl,exterior1st,exterior2nd,masvnrtype,masvnrarea,exterqual,extercond,foundation,bsmtqual,bsmtcond,bsmtexposure,bsmtfintype1,bsmtfinsf1,bsmtfintype2,bsmtfinsf2,bsmtunfsf,totalbsmtsf,heating,heatingqc,centralair,electrical,firstflrsf,secondflrsf,lowqualfinsf,grlivarea,bsmtfullbath,bsmthalfbath,fullbath,halfbath,bedroomabvgr,kitchenabvgr,kitchenqual,totrmsabvgrd,functional,fireplaces,fireplacequ,garagetype,garageyrblt,garagefinish,garagecars,garagearea,garagequal,garagecond,paveddrive,wooddecksf,openporchsf,enclosedporch,threessnporch,screenporch,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


#### Feature engineering & Spliting the dataset into training and test sets:

In [3]:
# Convert categorical variables to dummy variables:
df['centralair'] = pd.get_dummies(df['centralair'], drop_first=True)

# Convert 'mzoning' to dummy variables:
df = pd.concat([df,pd.get_dummies(df['mszoning'], prefix='mszoning', drop_first=True)], axis=1)

df.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,lotconfig,landslope,neighborhood,condition1,condition2,bldgtype,housestyle,overallqual,overallcond,yearbuilt,yearremodadd,roofstyle,roofmatl,exterior1st,exterior2nd,masvnrtype,masvnrarea,exterqual,extercond,foundation,bsmtqual,bsmtcond,bsmtexposure,bsmtfintype1,bsmtfinsf1,bsmtfintype2,bsmtfinsf2,bsmtunfsf,totalbsmtsf,heating,heatingqc,centralair,electrical,firstflrsf,secondflrsf,lowqualfinsf,grlivarea,bsmtfullbath,bsmthalfbath,fullbath,halfbath,bedroomabvgr,kitchenabvgr,kitchenqual,totrmsabvgrd,functional,fireplaces,fireplacequ,garagetype,garageyrblt,garagefinish,garagecars,garagearea,garagequal,garagecond,paveddrive,wooddecksf,openporchsf,enclosedporch,threessnporch,screenporch,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice,mszoning_FV,mszoning_RH,mszoning_RL,mszoning_RM
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,1,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500,0,0,1,0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,1,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500,0,0,1,0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,1,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500,0,0,1,0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,1,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000,0,0,1,0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,1,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000,0,0,1,0


In [4]:
# Include the feature interactions to our model:
df['totalsf'] = df['totalbsmtsf'] + df['firstflrsf'] + df['secondflrsf']
df['int_over_sf'] = df['totalsf'] * df['overallqual']

# Define the target variable and the explanatory variables:
Y = df['saleprice']
X = df[['overallqual', 'grlivarea', 'garagecars', 'int_over_sf', 'overallcond', 'mszoning_FV', 'mszoning_RL']]

# Split the dataset into training set and testing set:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

# Parameter optimization:
alphas = [np.power(10.0,p) for p in np.arange(-10,40,1)]

#### Regression model using OLS:

In [5]:
lrm = LinearRegression()

lrm.fit(X_train, y_train)

# We are making predictions here
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

print('R-squared of the model in training set is: {:.3f}'.format(lrm.score(X_train, y_train)))
print('-----Test set statistics-----')
print('R-squared of the model in test set is: {:.3f}'.format(lrm.score(X_test, y_test)))
print('Mean absolute error of the prediction is: {:.3f}'.format(mean_absolute_error(y_test, y_preds_test)))
print('Mean squared error of the prediction is: {:.3f}'.format(mse(y_test, y_preds_test)))
print('Root mean squared error of the prediction is: {:.3f}'.format(rmse(y_test, y_preds_test)))
print('Mean absolute percentage error of the prediction is: {:.3f}'.format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

R-squared of the model in training set is: 0.771
-----Test set statistics-----
R-squared of the model in test set is: 0.834
Mean absolute error of the prediction is: 22150.843
Mean squared error of the prediction is: 1051050263.907
Root mean squared error of the prediction is: 32419.905
Mean absolute percentage error of the prediction is: 13.040


#### Regression model using RidgeCV:

In [6]:
from sklearn.linear_model import Ridge

# Fitting a ridge regression model. Alpha is the regularization parameter (usually called lambda). 
# As alpha gets larger, parameter shrinkage grows more pronounced.
ridge_cv = RidgeCV(alphas=alphas, cv=10)
ridge_cv.fit(X_train, y_train)

# We are making predictions here
y_preds_train = ridge_cv.predict(X_train)
y_preds_test = ridge_cv.predict(X_test)

print('Best alpha value is: {}'.format(ridge_cv.alpha_))
print('R-squared of the model in training set is: {:.3f}'.format(ridge_cv.score(X_train, y_train)))
print('-----Test set statistics-----')
print('R-squared of the model in test set is: {:.3f}'.format(ridge_cv.score(X_test, y_test)))
print('Mean absolute error of the prediction is: {:.3f}'.format(mean_absolute_error(y_test, y_preds_test)))
print('Mean squared error of the prediction is: {:.3f}'.format(mse(y_test, y_preds_test)))
print('Root mean squared error of the prediction is: {:.3f}'.format(rmse(y_test, y_preds_test)))
print('Mean absolute percentage error of the prediction is: {:.3f}'.format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

Best alpha value is: 10.0
R-squared of the model in training set is: 0.771
-----Test set statistics-----
R-squared of the model in test set is: 0.836
Mean absolute error of the prediction is: 22098.684
Mean squared error of the prediction is: 1044036447.772
Root mean squared error of the prediction is: 32311.553
Mean absolute percentage error of the prediction is: 12.998


#### Regression model using Lasso:

In [7]:
from sklearn.linear_model import Lasso

# Fitting a ridge regression model. Alpha is the regularization parameter (usually called lambda). 
# As alpha gets larger, parameter shrinkage grows more pronounced.
lasso_cv = LassoCV(alphas=alphas, cv=10)
lasso_cv.fit(X_train, y_train)

# We are making predictions here
y_preds_train = lasso_cv.predict(X_train)
y_preds_test = lasso_cv.predict(X_test)

print('Best alpha value is: {}'.format(lasso_cv.alpha_))
print('R-squared of the model in training set is: {:.3f}'.format(lasso_cv.score(X_train, y_train)))
print('-----Test set statistics-----')
print('R-squared of the model in test set is: {:.3f}'.format(lasso_cv.score(X_test, y_test)))
print('Mean absolute error of the prediction is: {:.3f}'.format(mean_absolute_error(y_test, y_preds_test)))
print('Mean squared error of the prediction is: {:.3f}'.format(mse(y_test, y_preds_test)))
print('Root mean squared error of the prediction is: {:.3f}'.format(rmse(y_test, y_preds_test)))
print('Mean absolute percentage error of the prediction is: {:.3f}'.format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

Best alpha value is: 1e-10
R-squared of the model in training set is: 0.771
-----Test set statistics-----
R-squared of the model in test set is: 0.834
Mean absolute error of the prediction is: 22150.843
Mean squared error of the prediction is: 1051050263.907
Root mean squared error of the prediction is: 32419.905
Mean absolute percentage error of the prediction is: 13.040


#### Regression model using ElasticNet:

In [8]:
from sklearn.linear_model import ElasticNet

# Fitting a ridge regression model. Alpha is the regularization parameter (usually called lambda). 
# As alpha gets larger, parameter shrinkage grows more pronounced.
elasticnet_cv = ElasticNetCV(alphas=alphas, cv=10)
elasticnet_cv.fit(X_train, y_train)

# We are making predictions here
y_preds_train = elasticnet_cv.predict(X_train)
y_preds_test = elasticnet_cv.predict(X_test)

print('Best alpha value is: {}'.format(elasticnet_cv.alpha_))
print('R-squared of the model in training set is: {:.3f}'.format(elasticnet_cv.score(X_train, y_train)))
print('-----Test set statistics-----')
print('R-squared of the model in test set is: {:.3f}'.format(elasticnet_cv.score(X_test, y_test)))
print('Mean absolute error of the prediction is: {:.3f}'.format(mean_absolute_error(y_test, y_preds_test)))
print('Mean squared error of the prediction is: {:.3f}'.format(mse(y_test, y_preds_test)))
print('Root mean squared error of the prediction is: {:.3f}'.format(rmse(y_test, y_preds_test)))
print('Mean absolute percentage error of the prediction is: {:.3f}'.format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

Best alpha value is: 0.01
R-squared of the model in training set is: 0.771
-----Test set statistics-----
R-squared of the model in test set is: 0.835
Mean absolute error of the prediction is: 22111.190
Mean squared error of the prediction is: 1046594775.723
Root mean squared error of the prediction is: 32351.117
Mean absolute percentage error of the prediction is: 13.011


As we can see from the 4 different regression models above, using Lasso regression with a parameter (alpha =100) would return the best model with lowest R-squared, Mean absolute error, Mean squared error, Root mean squared error and Mean absolute % of error in our model prediction.