### 1) Load the houseprices data

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('houseprices.cvs')

### 2) Reimplement your model from the previous checkpoint.

In [3]:
num_col = ['overallqual', 'grlivarea', 'garagecars', 'garagearea', 'totalbsmtsf', 'firstflrsf']
cat_col = ['exterqual', 'kitchenqual']

In [4]:
df2 = pd.concat([df[num_col], df[cat_col], df['saleprice']], axis = 1)

In [5]:
for col in cat_col:
    df2 = pd.concat([df2, pd.get_dummies(df[col], drop_first=True, prefix = col)], axis = 1)

In [6]:
Y = np.log(df2['saleprice'])
X = df2.drop(['saleprice', 'exterqual', 'kitchenqual'], axis = 1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [8]:
lrm = LinearRegression()
lrm.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [9]:
pred_train = lrm.predict(X_train)
pred_test = lrm.predict(X_test)

print("R-squared of the model in the training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model in the test set is: {}".format(lrm.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, pred_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, pred_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, pred_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - pred_test) / y_test)) * 100))

R-squared of the model in the training set is: 0.8003528283156789
-----Test set statistics-----
R-squared of the model in the test set is: 0.8329179475267747
Mean absolute error of the prediction is: 0.12340351327054319
Mean squared error of the prediction is: 0.030593672673704088
Root mean squared error of the prediction is: 0.17491047045189745
Mean absolute percentage error of the prediction is: 1.0445968696594476


### 3) Try OLS, Lasso, Ridge, and ElasticNet regression using the same model specification. This time, you need to do k-fold cross-validation to choose the best hyperparameter values for your models. Which model is the best? Why?

#### 3.1) OLS

In [10]:
lrm2 = LinearRegression()

In [11]:
for i in np.arange(2,11):
    print('Cross validation score with {0} folds for OLS: {1:.4f}'.format(i, cross_val_score(lrm2, X, Y, cv = i, scoring = 'r2').mean()))
lrm2_best_nfolds = np.argmax([cross_val_score(lrm2, X, Y, cv = i, scoring = 'r2').mean() for i in np.arange(2,11)]) + 2
print('Best cross validation score: {0:.4f} with {1} folds'.format(cross_val_score(lrm2, X, Y, cv = i, scoring = 'r2').mean(), lrm2_best_nfolds))

Cross validation score with 2 folds for OLS: 0.7950
Cross validation score with 3 folds for OLS: 0.7949
Cross validation score with 4 folds for OLS: 0.7944
Cross validation score with 5 folds for OLS: 0.7952
Cross validation score with 6 folds for OLS: 0.7940
Cross validation score with 7 folds for OLS: 0.7957
Cross validation score with 8 folds for OLS: 0.7971
Cross validation score with 9 folds for OLS: 0.7965
Cross validation score with 10 folds for OLS: 0.7929
Best cross validation score: 0.7929 with 8 folds


In [12]:
pred_OLS = cross_val_predict(lrm2, X, Y, cv = lrm2_best_nfolds)

print("R-squared of the model in the training set is: {}".format(metrics.r2_score(Y, pred_OLS)))
print("-----Test set statistics-----")
# print("R-squared of the model in the test set is: {}".format(cross_validate()))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(Y, pred_OLS)))
print("Mean squared error of the prediction is: {}".format(mse(Y, pred_OLS)))
print("Root mean squared error of the prediction is: {}".format(rmse(Y, pred_OLS)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((Y - pred_OLS) / Y)) * 100))

R-squared of the model in the training set is: 0.7962754912474016
-----Test set statistics-----
Mean absolute error of the prediction is: 0.12289336788755638
Mean squared error of the prediction is: 0.03248438348612612
Root mean squared error of the prediction is: 0.1802342461524061
Mean absolute percentage error of the prediction is: 1.030242480331095


#### 3.2) Lasso Regression
For Lasso regression, I need to set two things:  a list of alphas for the model to pick, and how many folds to use.  I set the list of alphas as 10^i, where i ranges from -30 to 30 in steps of 2.  I find the optimal number of folds with a for loop similar to the one for OLS.  The R-squared value for Lasso regression does not increase after 3 folds, so 3 folds are used.  The optimal value for alpha is 0.0001.

In [13]:
alpha_lst = [10**int(a) for a in np.arange(-30, 30, 2)]

In [14]:
for i in np.arange(2,6):
    tmp = LassoCV(alphas = alpha_lst, cv = i)
    tmp.fit(X_train, y_train)
    print('R-squared value is {} with {} folds'.format(tmp.score(X_train, y_train), i))

R-squared value is 0.8003293246312333 with 2 folds
R-squared value is 0.8003293246312333 with 3 folds
R-squared value is 0.8003293246312333 with 4 folds
R-squared value is 0.8003293246312333 with 5 folds


In [15]:
lasso = LassoCV(alphas = alpha_lst, cv=3)
lasso.fit(X_train, y_train)
print('The alpha value for Lasso is: {}'.format(lasso.alpha_))

The alpha value for Lasso is: 0.0001


In [16]:
y_lasso_train = lasso.predict(X_train)
y_lasso_test = lasso.predict(X_test)

print("R-squared of the model on the Lasso training set is: {}".format(lasso.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the Lasso test set is: {}".format(lasso.score(X_test, y_test)))
print("Mean absolute error of the Lasso prediction is: {}".format(mean_absolute_error(y_test, y_lasso_test)))
print("Mean squared error of the Lasso prediction is: {}".format(mse(y_test, y_lasso_test)))
print("Root mean squared error of the Lasso prediction is: {}".format(rmse(y_test, y_lasso_test)))
print("Mean absolute percentage error of the Lasso prediction is: {}".format(np.mean(np.abs((y_test - y_lasso_test) / y_test)) * 100))


R-squared of the model on the Lasso training set is: 0.8003293246312333
-----Test set statistics-----
R-squared of the model on the Lasso test set is: 0.8325503147419295
Mean absolute error of the Lasso prediction is: 0.12348407821102601
Mean squared error of the Lasso prediction is: 0.030660988324410974
Root mean squared error of the Lasso prediction is: 0.17510279359396574
Mean absolute percentage error of the Lasso prediction is: 1.0452117801384997


#### 3.3) Ridge Regression
The same alpha values from section 3.2 are used to the test Ridge regression model, and number of folds range from 2 to 4.  All the folds had the same R-squared value, so I will be using the same number of folds as I used for Lasso regression.  The optimal alpha value is 1.  

In [21]:
for i in np.arange(2,5):
    tmp = RidgeCV(alphas = alpha_lst, cv = i)
    tmp.fit(X_train, y_train)
    print('R-squared value is {} with {} folds'.format(tmp.score(X_train, y_train), i))

R-squared value is 0.8003191321739819 with 2 folds
R-squared value is 0.8003191321739819 with 3 folds
R-squared value is 0.8003191321739819 with 4 folds


In [23]:
ridge = RidgeCV(alphas=alpha_lst, cv = 3)
ridge.fit(X_train, y_train)
print('The alpha value for Ridge is: {}'.format(ridge.alpha_))

The alpha value for Ridge is: 1


In [24]:
y_ridge_train = ridge.predict(X_train)
y_ridge_test = ridge.predict(X_test)

print("R-squared of the model on the Ridge training set is: {}".format(ridge.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the Ridge test set is: {}".format(ridge.score(X_test, y_test)))
print("Mean absolute error of the Ridge prediction is: {}".format(mean_absolute_error(y_test, y_ridge_test)))
print("Mean squared error of the Ridge prediction is: {}".format(mse(y_test, y_ridge_test)))
print("Root mean squared error of the Ridge prediction is: {}".format(rmse(y_test, y_ridge_test)))
print("Mean absolute percentage error of the Ridge prediction is: {}".format(np.mean(np.abs((y_test - y_ridge_test) / y_test)) * 100))

R-squared of the model on the Ridge training set is: 0.8003191321739819
-----Test set statistics-----
R-squared of the model on the Ridge test set is: 0.8323754038836404
Mean absolute error of the Ridge prediction is: 0.1235609326546338
Mean squared error of the Ridge prediction is: 0.03069301549588971
Root mean squared error of the Ridge prediction is: 0.1751942222103506
Mean absolute percentage error of the Ridge prediction is: 1.0458381765585514


#### 3.4) ElasticNet
Since ElasticNet regression is a blend of Lasso and Ridge regression, it too has an alpha hyperparameter.  It has an additional hyperparameter called l1_ratio.  The documentation suggests using an array closer to one, such as [.1, .5, .7, .9, .95, .99, 1].  Again, the R-squared value for 2-4 folds is the same.  I will use 3 folds to be consistent with the other models.

In [26]:
for i in np.arange(2,5):
    tmp = ElasticNetCV(alphas = alpha_lst, l1_ratio=  [.1, .5, .7, .9, .95, .99, 1],cv = i)
    tmp.fit(X_train, y_train)
    print('R-squared value is {} with {} folds'.format(tmp.score(X_train, y_train), i))

R-squared value is 0.8003293246312333 with 2 folds
R-squared value is 0.8003293246312333 with 3 folds
R-squared value is 0.8003293246312333 with 4 folds


In [28]:
elastic = ElasticNetCV(alphas= alpha_lst, l1_ratio= [.1, .5, .7, .9, .95, .99, 1], cv = 3)
elastic.fit(X_train, y_train)
print('The alpha value and l1_ratio for ElasticNet are {} and {}'.format(elastic.alpha_, elastic.l1_ratio_))

The alpha value and l1_ratio for ElasticNet are 0.0001 and 1.0


In [30]:
y_elastic_train = elastic.predict(X_train)
y_elastic_test = elastic.predict(X_test)

print("R-squared of the model on the ElasticNet training set is: {}".format(elastic.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the ElasticNet test set is: {}".format(elastic.score(X_test, y_test)))
print("Mean absolute error of the ElasticNet prediction is: {}".format(mean_absolute_error(y_test, y_elastic_test)))
print("Mean squared error of the ElasticNet prediction is: {}".format(mse(y_test, y_elastic_test)))
print("Root mean squared error of the ElasticNet prediction is: {}".format(rmse(y_test, y_elastic_test)))
print("Mean absolute percentage error of the ElasticNet prediction is: {}".format(np.mean(np.abs((y_test - y_elastic_test) / y_test)) * 100))

R-squared of the model on the ElasticNet training set is: 0.8003293246312333
-----Test set statistics-----
R-squared of the model on the ElasticNet test set is: 0.8325503147419295
Mean absolute error of the ElasticNet prediction is: 0.12348407821102601
Mean squared error of the ElasticNet prediction is: 0.030660988324410974
Root mean squared error of the ElasticNet prediction is: 0.17510279359396574
Mean absolute percentage error of the ElasticNet prediction is: 1.0452117801384997


#### 3.5) Which model is the best?
A table below summarizes the R-squared value for training and test sets, the MAE, MSE, RMSE, and MAPE for all four models.  OLS had the lowest R-squared values, but its error metrics were mixed.  Overall, the Lasso, Ridge, and ElasticNet regressions had similar reults.  In fact, the l1 ratio for the ElasticNet regression was 1, which means it is a Lasso regression.  Ridge regression has the lower R-squared value for the test set, but generally has higher error metrics.  It is difficult to pick if Ridge or Lasso regression is better in this case.  

Generally, I don't think results between each model should be this close together.  My guess is that my work in the feature engineering phase was not optimal.

| Model | R-squared Training Set  | R-squared Test Set  | MAE  | MSE  | RMSE  | MAPE  |
|------|------|
|   OLS  | 0.7963| NA  | 0.1229  | 0.0325  | 0.1802  | 1.030 |
|   Lasso  |   0.8003  |   0.8326  |   0.1235  |   0.0307  |   0.1751  |   1.045  |
|   Ridge  |   0.8003  |   0.8324  |   0.1236  |   0.0307  |   0.1752  |   1.046  |
|   ElasticNet  |   0.8003  |   0.8326  |   0.1235  |   0.0307  |   0.1751  |   1.045|  