# 18.7 - Overfitting and Regularization

In [82]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sqlalchemy import create_engine

import os
import warnings

# Silence every library warning so the cell outputs stay readable.
# NOTE(review): a blanket filter also hides any warnings the estimators fitted
# in later cells may emit -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Connection settings for the PostgreSQL database that holds the data.
# Each value can be overridden through an environment variable so credentials
# do not have to live in the notebook; the fallbacks keep the notebook runnable
# unchanged when no variable is set.
# NOTE(review): confirm the fallback credentials are safe to publish; if not,
# remove the defaults and require the environment variables instead.
postgres_user = os.environ.get('HOUSEPRICES_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_DB_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_DB_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_DB_NAME', 'houseprices')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

# Dispose of the engine even when the query raises, so the connection pool is
# never left open by a failed load.
try:
    house = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    engine.dispose()

# Target column and the eight feature columns used by every model below.
y = house['saleprice']
X = house[['grlivarea', 'totalbsmtsf', 'fullbath', 'halfbath', 'overallqual', 'overallcond', 'yearbuilt', 'garagearea']]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=465)

### Model 1: Linear Regression

In [83]:
# Ordinary least squares baseline, fitted on the training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)

### Model 2: Lasso

In [84]:
# Candidate regularization strengths: one value per power of ten, from 10**-10
# up to 10**39 (50 values). Reused by the Lasso, Ridge and Elastic Net searches.
alphas = [pow(10, exponent) for exponent in range(-10, 40)]

# Lasso (L1 penalty) with the penalty strength chosen by cross-validation
# over the `alphas` grid, fitted on the training split.
lasso = LassoCV(alphas=alphas).fit(X_train, y_train)

### Model 3: Ridge

In [85]:
# Ridge (L2 penalty) with the penalty strength chosen by cross-validation
# over the same `alphas` grid, fitted on the training split.
ridge = RidgeCV(alphas=alphas).fit(X_train, y_train)

### Model 4: Elastic Net

In [86]:
# Elastic Net (combined L1/L2 penalty) with the penalty strength chosen by
# cross-validation over the same `alphas` grid, fitted on the training split.
elastic = ElasticNetCV(alphas=alphas).fit(X_train, y_train)

### Evaluation

In [87]:
# Print train/test fit quality for each fitted model, in the order they were
# fitted. `regs` and `types` are parallel lists: one heading per model.
regs = [lr, lasso, ridge, elastic]
types = ['--------- Linear Regression ---------',
         '--------------- Lasso ---------------',
         '--------------- Ridge ---------------',
         '------------ Elastic Net ------------']
for heading, model in zip(types, regs):
    print(heading)
    # Only estimators that selected a penalty expose `alpha_`; the plain
    # linear regression is skipped here without masking unrelated errors.
    if hasattr(model, 'alpha_'):
        print('Alpha: \t\t\t{}'.format(model.alpha_))
    # Predict once on the held-out split and reuse the result for every
    # test-set error metric below.
    test_pred = model.predict(X_test)
    print('R-squared(Train): \t{}'.format(round(model.score(X_train, y_train),3)))
    print('R-squared(Test): \t{}'.format(round(model.score(X_test, y_test),3)))
    print('MAE: \t\t\t{}'.format(round(mean_absolute_error(y_test, test_pred),2)))
    print('MSE: \t\t\t{}'.format(round(mse(y_test, test_pred),2)))
    print('RMSE: \t\t\t{}'.format(round(rmse(y_test, test_pred),3)))
    # Mean absolute percentage error, expressed in percent of the true value.
    print('MAPE: \t\t\t{}%'.format(round(np.mean(np.abs((y_test-test_pred)*100/y_test)),3)))
    print('')

--------- Linear Regression ---------
R-squared(Train): 	0.771
R-squared(Test): 	0.783
MAE: 			24169.31
MSE: 			1457655930.83
RMSE: 			38179.26
MAPE: 			14.833%

--------------- Lasso ---------------
Alpha: 			1000
R-squared(Train): 	0.77
R-squared(Test): 	0.784
MAE: 			23986.78
MSE: 			1447958841.78
RMSE: 			38052.054
MAPE: 			14.612%

--------------- Ridge ---------------
Alpha: 			100
R-squared(Train): 	0.77
R-squared(Test): 	0.785
MAE: 			23719.37
MSE: 			1443478608.15
RMSE: 			37993.139
MAPE: 			14.428%

------------ Elastic Net ------------
Alpha: 			0.1
R-squared(Train): 	0.77
R-squared(Test): 	0.784
MAE: 			23895.26
MSE: 			1448314250.94
RMSE: 			38056.724
MAPE: 			14.584%



All four models perform comparably. Each regularized model shows a slight improvement over OLS on every test-set metric (training R-squared is marginally lower, as expected when the coefficients are penalized). Ridge performs best, although the margin is very small: its RMSE is \\$37,993, compared to \\$38,179 for OLS.