In [15]:
### Load the houseprices data from Thinkful's database.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV

# Notebook-wide display and warning configuration.

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show floats in pandas output with three decimal places.
pd.options.display.float_format = '{:.3f}'.format
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any warnings raised by the model fits further
# down (e.g. during cross-validation) -- confirm that is intended.
warnings.filterwarnings('ignore')

# Load the `houseprices` table into `house_df`.
#
# Connection settings can be overridden through environment variables so that
# credentials do not have to be edited into (and committed with) the notebook.
# The defaults are the values this notebook has always used, so running it
# without any environment set up behaves exactly as before.
import os
from urllib.parse import quote_plus

postgres_user = os.environ.get('HOUSEPRICES_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_DB_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_DB_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_DB_NAME', 'houseprices')

# User name and password are URL-escaped: characters such as '@', ':' or '/'
# in a password would otherwise be parsed as part of the URL structure.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote_plus(postgres_user), quote_plus(postgres_pw),
    postgres_host, postgres_port, postgres_db))

try:
    house_df = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    # no need for an open connection, as we're only doing a single query;
    # disposing in `finally` releases the pool even when the query fails
    engine.dispose()

In [16]:
# Feature preparation: one-hot encode `centralair`, add a total-square-footage
# column, build the design matrix / target and split them into train and test.

# Encode once and reuse the result for both the frame and the column-name list.
centralair_dummies = pd.get_dummies(house_df['centralair'], prefix="centralair", drop_first=True)
dummy_column_names = list(centralair_dummies.columns)

# Drop dummy columns left over from an earlier run of this cell before
# concatenating. Without this, re-running the cell appends the same
# `centralair_*` columns again and the duplicates end up in X.
house_df = pd.concat(
    [house_df.drop(columns=dummy_column_names, errors='ignore'), centralair_dummies],
    axis=1)
house_df['totalsf'] = house_df['totalbsmtsf'] + house_df['firstflrsf'] + house_df['secondflrsf']

# Target is log(1 + saleprice); every metric reported below is on this scale.
Y = np.log1p(house_df['saleprice'])
X = house_df[['overallqual', 'grlivarea', 'garagecars', 'totalsf'] + dummy_column_names]
# statsmodels' add_constant adds a constant column to the design matrix.
X = sm.add_constant(X)

# 80/20 split with a fixed seed so the reported numbers are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=37)

# Candidate regularisation strengths for the *CV estimators: 10**-10 ... 10**39.
alphas = [np.power(10.0, p) for p in np.arange(-10, 40, 1)]

In [17]:
### Try OLS, lasso, ridge, and elastic net regression using the same model specification. This time, you need to do k-fold 
### cross-validation to choose the best hyperparameter values for your models. Scikit-learn has RidgeCV, LassoCV, and 
### ElasticNetCV that you can utilize to do this. Which model is the best? Why?

In [18]:
### OLS: fit ordinary least squares and report train/test fit statistics.

lrm = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

# Per-row absolute error relative to the actual target value on the test set.
abs_pct_errors = np.abs((y_test - y_preds_test) / y_test)

print("R-squared of the model in the training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
for template, value in (
    ("R-squared of the model in the test set is: {}", lrm.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}", np.mean(abs_pct_errors) * 100),
):
    print(template.format(value))

R-squared of the model in the training set is: 0.8191858742162685
-----Test set statistics-----
R-squared of the model in the test set is: 0.7776690719006142
Mean absolute error of the prediction is: 0.12720268943550483
Mean squared error of the prediction is: 0.034147143979617184
Root mean squared error of the prediction is: 0.18478945851865355
Mean absolute percentage error of the prediction is: 1.0609997089249261


In [20]:
# Lasso: pick alpha from `alphas` by 5-fold cross-validation, then report
# train/test fit statistics for the selected model.

lassoregr = LassoCV(alphas=alphas, cv=5).fit(X_train, y_train)  # fit() returns the fitted estimator
y_preds_train = lassoregr.predict(X_train)
y_preds_test = lassoregr.predict(X_test)

# Per-row absolute error relative to the actual target value on the test set.
abs_pct_errors = np.abs((y_test - y_preds_test) / y_test)

print("R-squared of the model on the training set is: {}".format(lassoregr.score(X_train, y_train)))
print("-----Test set statistics-----")
for template, value in (
    ("R-squared of the model on the test set is: {}", lassoregr.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}", np.mean(abs_pct_errors) * 100),
):
    print(template.format(value))

R-squared of the model on the training set is: 0.8191858742162685
-----Test set statistics-----
R-squared of the model on the test set is: 0.7776690719634503
Mean absolute error of the prediction is: 0.12720268941039048
Mean squared error of the prediction is: 0.03414714396996638
Root mean squared error of the prediction is: 0.18478945849254058
Mean absolute percentage error of the prediction is: 1.060999708705768


In [21]:
# Ridge: pick alpha from `alphas` by 5-fold cross-validation, then report
# train/test fit statistics for the selected model.

ridgeregr = RidgeCV(alphas=alphas, cv=5).fit(X_train, y_train)  # fit() returns the fitted estimator
y_preds_train = ridgeregr.predict(X_train)
y_preds_test = ridgeregr.predict(X_test)

# Per-row absolute error relative to the actual target value on the test set.
abs_pct_errors = np.abs((y_test - y_preds_test) / y_test)

print("R-squared of the model on the training set is: {}".format(ridgeregr.score(X_train, y_train)))
print("-----Test set statistics-----")
for template, value in (
    ("R-squared of the model on the test set is: {}", ridgeregr.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}", mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}", mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}", rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}", np.mean(abs_pct_errors) * 100),
):
    print(template.format(value))

R-squared of the model on the training set is: 0.8191820460530164
-----Test set statistics-----
R-squared of the model on the test set is: 0.7777610861264382
Mean absolute error of the prediction is: 0.12717114846683264
Mean squared error of the prediction is: 0.03413301178917366
Root mean squared error of the prediction is: 0.18475121593422236
Mean absolute percentage error of the prediction is: 1.0607252270165501


In [None]:
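# OLS and Lasso produce very similar results: Lasso's mean error values are smaller than OLS's only from about the
# tenth decimal place onward, so between those two Lasso is marginally the better option.
# Of the three models fitted above, however, Ridge has the lowest test-set errors (MAE, MSE, RMSE and MAPE) and the
# highest test-set R-squared, which makes it the best option here -- although all of the differences are very small.
# TODO: elastic net (ElasticNetCV), which the task statement also asks for, has not been fitted yet.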