In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [10]:
import warnings
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.metrics import mean_absolute_error

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.graphics.gofplots import qqplot
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.eval_measures import mse, rmse

from scipy import stats

from sqlalchemy import create_engine

import seaborn as sns
import matplotlib.pyplot as plt

<IPython.core.display.Javascript object>

In [3]:
import os
from urllib.parse import quote_plus

# NOTE(review): this silences every warning for the rest of the kernel session,
# including deprecation warnings from pandas / scikit-learn.
warnings.filterwarnings("ignore")

# Connection settings are read from the environment when present. The fallbacks
# are the values this cell previously hardcoded, so an unconfigured run behaves
# exactly as before.
# NOTE(review): credentials should not live in a notebook; prefer setting
# POSTGRES_USER / POSTGRES_PW in the environment and dropping the fallbacks.
postgres_user = os.environ.get("POSTGRES_USER", "dsbc_student")
postgres_pw = os.environ.get("POSTGRES_PW", "7*.8G9QH21")
postgres_host = os.environ.get("POSTGRES_HOST", "142.93.121.174")
postgres_port = os.environ.get("POSTGRES_PORT", "5432")
postgres_db = os.environ.get("POSTGRES_DB", "houseprices")

# User and password are URL-escaped so characters such as "@", "/" or ":" in a
# credential cannot corrupt the connection string.
engine = create_engine(
    "postgresql://{}:{}@{}:{}/{}".format(
        quote_plus(postgres_user),
        quote_plus(postgres_pw),
        postgres_host,
        postgres_port,
        postgres_db,
    )
)
try:
    house_prices_df = pd.read_sql_query("select * from houseprices", con=engine)
finally:
    # Only a single query is needed, so release the connection pool right away,
    # even when the query fails.
    engine.dispose()

<IPython.core.display.Javascript object>

In [5]:
def clean_house_prices_df(house_prices_df, n_top_corr=10):
    """Turn the raw house-prices table into the frame used for modelling.

    Steps, in order:

    1. Index the rows by ``id`` and replace ``saleprice`` with its natural
       log, stored as ``log_saleprice``.
    2. Keep only the ``n_top_corr`` numeric columns with the largest absolute
       correlation with ``log_saleprice``. The target correlates perfectly
       with itself, so it occupies one of those slots.
    3. Drop ``poolqc``, ``alley``, ``fence``, ``fireplacequ`` and
       ``miscfeature``, then drop every row that still has a missing value.
    4. Of the remaining text columns keep only ``exterqual``, ``bsmtqual``,
       ``kitchenqual`` and ``centralair``.
    5. Encode ``centralair`` as 1 for ``"Y"`` and 0 otherwise, and map the
       quality grades Fa/TA/Gd/Ex to 1/2/3/4 (other values pass through).
    6. Remove ``fullbath``, ``garagearea``, ``totalbsmtsf`` and ``exterqual``
       if they are still present.

    Parameters
    ----------
    house_prices_df : pandas.DataFrame
        Raw table. Must contain ``id``, ``saleprice`` and the columns named in
        steps 3-5. It is not modified.
    n_top_corr : int, default 10
        How many numeric columns (target included) survive step 2.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame indexed by ``id``, with ``log_saleprice`` as the target.

    Raises
    ------
    KeyError
        If ``id``, ``saleprice`` or one of the columns required by steps 3-5
        is missing from the input.
    """
    # set_index returns a new frame, so the caller's data is left untouched.
    log_df = house_prices_df.set_index("id")

    # Log-transform the target.
    log_df["log_saleprice"] = np.log(log_df["saleprice"])
    log_df = log_df.drop(columns=["saleprice"])

    # Rank numeric columns by |correlation| with the target. Restricting to
    # numeric columns first keeps corr() from choking on text columns, which
    # raises on pandas >= 2.0.
    numeric_df = log_df.select_dtypes("number")
    abs_corr = numeric_df.corr()["log_saleprice"].abs()
    top_cols = set(abs_corr.sort_values(ascending=False).head(n_top_corr).index)
    weak_numeric_cols = [c for c in numeric_df.columns if c not in top_cols]
    log_df = log_df.drop(columns=weak_numeric_cols)

    # These columns go before dropna() - presumably because they are mostly
    # empty and would otherwise remove nearly every row (TODO confirm).
    sparse_cols = ["poolqc", "alley", "fence", "fireplacequ", "miscfeature"]
    log_df = log_df.drop(columns=sparse_cols)
    log_df = log_df.dropna()

    # Discard every text column that is not explicitly kept. Both object and
    # dedicated string dtypes count as text.
    kept_text_cols = ["exterqual", "bsmtqual", "kitchenqual", "centralair"]
    unused_text_cols = [
        c
        for c in log_df.columns
        if c not in kept_text_cols
        and (
            pd.api.types.is_object_dtype(log_df[c])
            or pd.api.types.is_string_dtype(log_df[c])
        )
    ]
    log_df = log_df.drop(columns=unused_text_cols)

    # Encode centralair as binary.
    log_df["centralair"] = (log_df["centralair"] == "Y").astype(int)

    # Encode the quality columns as ordinals. Values missing from the map are
    # passed through unchanged; mapping element-wise avoids depending on
    # replace()'s deprecated silent downcast to get integer columns.
    quality_map = {"Fa": 1, "TA": 2, "Gd": 3, "Ex": 4}
    for col in ["exterqual", "bsmtqual", "kitchenqual"]:
        log_df[col] = log_df[col].map(lambda grade: quality_map.get(grade, grade))

    # NOTE(review): presumably removed to reduce multicollinearity - confirm.
    # errors="ignore" because the correlation filter above may already have
    # dropped some of them.
    log_df = log_df.drop(
        columns=["fullbath", "garagearea", "totalbsmtsf", "exterqual"],
        errors="ignore",
    )

    return log_df


<IPython.core.display.Javascript object>

In [7]:
# Run the raw table through the cleaning function and show the result.
houses = house_prices_df.pipe(clean_house_prices_df)
houses

Unnamed: 0_level_0,overallqual,yearbuilt,yearremodadd,bsmtqual,centralair,firstflrsf,grlivarea,kitchenqual,garagecars,log_saleprice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,7,2003,2003,3,1,856,1710,3,2,12.247694
2,6,1976,1976,3,1,1262,1262,2,2,12.109011
3,7,2001,2002,3,1,920,1786,3,2,12.317167
4,7,1915,1970,2,1,961,1717,3,3,11.849398
5,8,2000,2000,3,1,1145,2198,3,3,12.429216
...,...,...,...,...,...,...,...,...,...,...
1456,6,1999,2000,3,1,953,1647,2,2,12.072541
1457,6,1978,1988,3,1,2073,2073,2,2,12.254863
1458,7,1941,2006,2,1,1188,2340,3,1,12.493130
1459,5,1950,1996,2,1,1078,1078,3,1,11.864462


<IPython.core.display.Javascript object>

In [8]:
# The target is the log sale price; every other column is a predictor.
# add_constant supplies the column statsmodels needs for an intercept.
y = houses["log_saleprice"]
X = sm.add_constant(houses.drop(columns=["log_saleprice"]))

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

<IPython.core.display.Javascript object>

In [9]:
# Ordinary least squares on the training split; X_train already carries the
# constant column added before the split.
ols_model = sm.OLS(endog=y_train, exog=X_train)
lm_results = ols_model.fit()

lm_results.summary()

0,1,2,3
Dep. Variable:,log_saleprice,R-squared:,0.826
Model:,OLS,Adj. R-squared:,0.825
Method:,Least Squares,F-statistic:,560.7
Date:,"Wed, 08 Apr 2020",Prob (F-statistic):,0.0
Time:,20:49:09,Log-Likelihood:,454.3
No. Observations:,1070,AIC:,-888.6
Df Residuals:,1060,BIC:,-838.8
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.6586,0.754,8.835,0.000,5.180,8.137
overallqual,0.0753,0.006,11.771,0.000,0.063,0.088
yearbuilt,0.0006,0.000,2.120,0.034,4.51e-05,0.001
yearremodadd,0.0013,0.000,3.751,0.000,0.001,0.002
bsmtqual,0.0513,0.012,4.256,0.000,0.028,0.075
centralair,0.2201,0.026,8.339,0.000,0.168,0.272
firstflrsf,0.0001,1.58e-05,7.656,0.000,8.97e-05,0.000
grlivarea,0.0002,1.32e-05,17.607,0.000,0.000,0.000
kitchenqual,0.0566,0.011,4.989,0.000,0.034,0.079

0,1,2,3
Omnibus:,698.839,Durbin-Watson:,2.063
Prob(Omnibus):,0.0,Jarque-Bera (JB):,26715.326
Skew:,-2.436,Prob(JB):,0.0
Kurtosis:,26.989,Cond. No.,532000.0


<IPython.core.display.Javascript object>

In [12]:
# Score the OLS model on the held-out rows. The target is log_saleprice, so
# every metric below (including the percentage error) is on the log scale.
y_true = y_test
y_pred = lm_results.predict(X_test)

metric_pairs = [
    ("mae", mean_absolute_error(y_true, y_pred)),
    ("mse", mse(y_true, y_pred)),
    ("rmse", rmse(y_true, y_pred)),
    ("mape", np.mean(np.abs((y_true - y_pred) / y_true)) * 100),
]
evaluators = [label for label, _ in metric_pairs]
evaluations = [score for _, score in metric_pairs]

evaluation_df = pd.DataFrame({"evaluators": evaluators, "values": evaluations})
evaluation_df

Unnamed: 0,evaluators,values
0,mae,0.113693
1,mse,0.029106
2,rmse,0.170605
3,mape,0.947434


<IPython.core.display.Javascript object>

In [14]:
# Candidate regularisation strengths for the CV models below: one value per
# power of ten, from 1e-10 up to 1e39.
alphas = list(np.power(10.0, np.arange(-10, 40, 1)))
alphas

[1e-10,
 1e-09,
 1e-08,
 1e-07,
 1e-06,
 1e-05,
 0.0001,
 0.001,
 0.01,
 0.1,
 1.0,
 10.0,
 100.0,
 1000.0,
 10000.0,
 100000.0,
 1000000.0,
 10000000.0,
 100000000.0,
 1000000000.0,
 10000000000.0,
 100000000000.0,
 1000000000000.0,
 10000000000000.0,
 100000000000000.0,
 1000000000000000.0,
 1e+16,
 1e+17,
 1e+18,
 1e+19,
 1e+20,
 1e+21,
 1e+22,
 1e+23,
 1e+24,
 1e+25,
 1e+26,
 1e+27,
 1e+28,
 1e+29,
 1e+30,
 1e+31,
 1e+32,
 1e+33,
 1e+34,
 1e+35,
 1e+36,
 1e+37,
 1e+38,
 1e+39]

<IPython.core.display.Javascript object>

In [15]:
# Lasso regression; the penalty strength is chosen from `alphas` by 5-fold CV.
lasso_cv = LassoCV(alphas=alphas, cv=5)
lasso_cv.fit(X_train, y_train)

# Predictions on both splits.
y_preds_train = lasso_cv.predict(X_train)
y_preds_test = lasso_cv.predict(X_test)

# Compute every reported figure first, then print the report in one place.
lasso_r2_train = lasso_cv.score(X_train, y_train)
lasso_r2_test = lasso_cv.score(X_test, y_test)
lasso_mae = mean_absolute_error(y_test, y_preds_test)
lasso_mse = mse(y_test, y_preds_test)
lasso_rmse = rmse(y_test, y_preds_test)
lasso_mape = np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100

print("Best alpha value is: {}".format(lasso_cv.alpha_))
print("R-squared of the model in training set is: {}".format(lasso_r2_train))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(lasso_r2_test))
print("Mean absolute error of the prediction is: {}".format(lasso_mae))
print("Mean squared error of the prediction is: {}".format(lasso_mse))
print("Root mean squared error of the prediction is: {}".format(lasso_rmse))
print("Mean absolute percentage error of the prediction is: {}".format(lasso_mape))

Best alpha value is: 1e-10
R-squared of the model in training set is: 0.8264177121949328
-----Test set statistics-----
R-squared of the model in test set is: 0.7769455119205917
Mean absolute error of the prediction is: 0.11369319864212621
Mean squared error of the prediction is: 0.029106213793965115
Root mean squared error of the prediction is: 0.17060543307282192
Mean absolute percentage error of the prediction is: 0.9474336664273999


<IPython.core.display.Javascript object>

In [16]:
# Plain (unpenalised) linear regression, as a baseline for the CV models.
lrm = LinearRegression()
lrm.fit(X_train, y_train)

# Predictions on both splits.
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

# Compute every reported figure first, then print the report in one place.
lrm_r2_train = lrm.score(X_train, y_train)
lrm_r2_test = lrm.score(X_test, y_test)
lrm_mae = mean_absolute_error(y_test, y_preds_test)
lrm_mse = mse(y_test, y_preds_test)
lrm_rmse = rmse(y_test, y_preds_test)
lrm_mape = np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100

print("R-squared of the model in training set is: {}".format(lrm_r2_train))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(lrm_r2_test))
print("Mean absolute error of the prediction is: {}".format(lrm_mae))
print("Mean squared error of the prediction is: {}".format(lrm_mse))
print("Root mean squared error of the prediction is: {}".format(lrm_rmse))
print("Mean absolute percentage error of the prediction is: {}".format(lrm_mape))

R-squared of the model in training set is: 0.8264177121949328
-----Test set statistics-----
R-squared of the model in test set is: 0.7769455115426959
Mean absolute error of the prediction is: 0.1136931987308856
Mean squared error of the prediction is: 0.02910621384327646
Root mean squared error of the prediction is: 0.1706054332173406
Mean absolute percentage error of the prediction is: 0.9474336671913627


<IPython.core.display.Javascript object>

In [17]:
# Ridge regression; the penalty strength is chosen from `alphas` by 5-fold CV.
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train, y_train)

# Predictions on both splits.
y_preds_train = ridge_cv.predict(X_train)
y_preds_test = ridge_cv.predict(X_test)

# Compute every reported figure first, then print the report in one place.
ridge_r2_train = ridge_cv.score(X_train, y_train)
ridge_r2_test = ridge_cv.score(X_test, y_test)
ridge_mae = mean_absolute_error(y_test, y_preds_test)
ridge_mse = mse(y_test, y_preds_test)
ridge_rmse = rmse(y_test, y_preds_test)
ridge_mape = np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100

print("Best alpha value is: {}".format(ridge_cv.alpha_))
print("R-squared of the model in training set is: {}".format(ridge_r2_train))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(ridge_r2_test))
print("Mean absolute error of the prediction is: {}".format(ridge_mae))
print("Mean squared error of the prediction is: {}".format(ridge_mse))
print("Root mean squared error of the prediction is: {}".format(ridge_rmse))
print("Mean absolute percentage error of the prediction is: {}".format(ridge_mape))

Best alpha value is: 1e-10
R-squared of the model in training set is: 0.8264177121949328
-----Test set statistics-----
R-squared of the model in test set is: 0.7769455115427579
Mean absolute error of the prediction is: 0.11369319873087476
Mean squared error of the prediction is: 0.02910621384326836
Root mean squared error of the prediction is: 0.17060543321731686
Mean absolute percentage error of the prediction is: 0.9474336671912698


<IPython.core.display.Javascript object>

In [18]:
# Elastic net regression; the penalty strength is chosen from `alphas` by
# 5-fold CV.
elasticnet_cv = ElasticNetCV(alphas=alphas, cv=5)
elasticnet_cv.fit(X_train, y_train)

# Predictions on both splits.
y_preds_train = elasticnet_cv.predict(X_train)
y_preds_test = elasticnet_cv.predict(X_test)

# Compute every reported figure first, then print the report in one place.
elasticnet_r2_train = elasticnet_cv.score(X_train, y_train)
elasticnet_r2_test = elasticnet_cv.score(X_test, y_test)
elasticnet_mae = mean_absolute_error(y_test, y_preds_test)
elasticnet_mse = mse(y_test, y_preds_test)
elasticnet_rmse = rmse(y_test, y_preds_test)
elasticnet_mape = np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100

print("Best alpha value is: {}".format(elasticnet_cv.alpha_))
print("R-squared of the model in training set is: {}".format(elasticnet_r2_train))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(elasticnet_r2_test))
print("Mean absolute error of the prediction is: {}".format(elasticnet_mae))
print("Mean squared error of the prediction is: {}".format(elasticnet_mse))
print("Root mean squared error of the prediction is: {}".format(elasticnet_rmse))
print(
    "Mean absolute percentage error of the prediction is: {}".format(elasticnet_mape)
)

Best alpha value is: 1e-10
R-squared of the model in training set is: 0.8264177121949328
-----Test set statistics-----
R-squared of the model in test set is: 0.7769455117646251
Mean absolute error of the prediction is: 0.11369319868071746
Mean squared error of the prediction is: 0.029106213814317074
Root mean squared error of the prediction is: 0.17060543313246818
Mean absolute percentage error of the prediction is: 0.9474336667596669


<IPython.core.display.Javascript object>