In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import math
import statsmodels.api as sm
from datetime import datetime, timedelta
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error



%matplotlib inline

## American Housing Survey Questionnaire - 2017

About: https://www.census.gov/programs-surveys/ahs.html

Download: http://www2.census.gov/programs-surveys/ahs/2017/AHS%202017%20National%20PUF%20v3.0%20Flat%20CSV.zip?#

Codebook: https://www.census.gov/data-tools/demo/codebook/ahs/ahsdict.html

Definitions: https://www2.census.gov/programs-surveys/ahs/2017/2017%20AHS%20Definitions.pdf?#


In [None]:
# Read the AHS 2017 national flat file, report its (rows, columns) size,
# and preview the first rows.
df = pd.read_csv(filepath_or_buffer="data/ahs2017n.csv")
print((len(df.index), len(df.columns)))
df.head(5)

## Cleaning and Exploration

In [None]:
# Columns kept for modelling. MARKETVAL is deliberately last: later cells use
# cols[:-1] as the candidate predictors and MARKETVAL as the target.
cols = [
    "YRBUILT",    # year built
    "LOTSIZE",    # size of lot
    "STORIES",    # number of stories
    "BEDROOMS",   # number of bedrooms
    "BATHROOMS",  # number of bathrooms
    "KITCHENS",   # number of kitchens
    "NUMPEOPLE",  # number of people living in unit
    "TOTROOMS",   # total rooms
    "HHAGE",      # householder age
    "HINCP",      # household income
    "MARKETVAL",  # current market value
]

# All row filters on the raw frame, combined into a single boolean mask.
keep_rows = (
    (df["NUMPEOPLE"] > 0)
    & (df["HINCP"] < 1_000_000)
    & (df["MARKETVAL"] < 2_000_000)
    & (df["MARKETVAL"] > 1_000)
)

# .copy() makes df_filtered an independent frame, so the column assignments
# below do not write into a slice of df (no SettingWithCopyWarning).
df_filtered = df.loc[keep_rows, cols].copy()

# BATHROOMS and LOTSIZE hold strings that can contain single quotes; strip the
# quotes and convert to int. The conversion runs after the row filters so only
# the rows that are kept get parsed.
for quoted_col in ("BATHROOMS", "LOTSIZE"):
    df_filtered[quoted_col] = (
        df_filtered[quoted_col].str.replace("'", "", regex=False).astype(int)
    )

# Keep only rows whose (now numeric) LOTSIZE is positive.
df_filtered = df_filtered[df_filtered["LOTSIZE"] > 0]

df_filtered.shape

In [None]:
# One histogram per selected column; the result is bound to `_` so the array
# of Axes objects is not echoed as cell output.
_ = df_filtered.hist(
    column=cols,
    bins=25,
    figsize=(20, 20),
)

## K-Folds Cross Validation

In [None]:
# Shuffle the rows once with a fixed seed, then deal them round-robin into
# num_folds folds (column "k"). Fold 0 is held out as the test set; all other
# folds together form the training set.
num_folds = 5
fold_seed = 42  # fixed so Restart & Run All reproduces the same split

# sort_index() puts the rows back into a canonical order before shuffling, so
# re-running this cell gives the same split instead of reshuffling an already
# shuffled frame (assumes the index labels are unique -- TODO confirm).
df_filtered = df_filtered.sort_index().sample(frac=1, random_state=fold_seed)

# Vectorized fold ids 0, 1, ..., num_folds-1, 0, 1, ... in shuffled row order.
# This replaces a global counter mutated inside DataFrame.apply, which was
# slow and relied on apply calling the function exactly once per row, in order.
df_filtered["k"] = np.arange(len(df_filtered)) % num_folds

df_train = df_filtered[df_filtered["k"] != 0]
df_test = df_filtered[df_filtered["k"] == 0]

## Model 1 - Multiple Linear Regression - Forward Stepwise Selection

In [None]:
# Forward stepwise selection for ordinary least squares.
#
# At each outer step every remaining candidate is tried on top of the
# predictors selected so far; the candidate giving the lowest MSE on df_test
# is moved from `predictors` to `selected_predictors`.
#
# Results left for later cells:
#   selected_predictors  - all predictors, in the order they were added
#   errors               - best df_test MSE after each step
#   smallest_error       - lowest value in `errors`
#   smallest_error_index - step (0-based) at which that value was reached
predictors = cols[:-1]  # every column except the last one (the target)
selected_predictors = []
errors = []
smallest_error = float("inf")
smallest_error_index = 0

# The targets do not change between trials, so look them up once.
Y = df_train["MARKETVAL"]
y_true = df_test["MARKETVAL"]

for j in range(len(predictors)):

    smallest_mse_index = 0
    smallest_mse = float("inf")

    for i, predictor in enumerate(predictors):

        trial_predictors = selected_predictors + [predictor]

        # has_constant="add" always appends the intercept column. The default
        # ("skip") silently omits it when some column is already constant, so
        # a predictor that is constant in only one of the two splits would
        # give train and test design matrices with different columns.
        X = sm.add_constant(df_train[trial_predictors], has_constant="add")
        X_test = sm.add_constant(df_test[trial_predictors], has_constant="add")

        model = sm.OLS(Y, X)
        results = model.fit()
        y_pred = results.predict(X_test)
        # Same metric function as the Lasso and Ridge cells.
        mse = mean_squared_error(y_true, y_pred)

        # Strict "<" keeps the earliest candidate on ties.
        if mse < smallest_mse:
            smallest_mse = mse
            smallest_mse_index = i

    selected_predictors.append(predictors.pop(smallest_mse_index))
    errors.append(smallest_mse)

    if smallest_mse < smallest_error:
        smallest_error = smallest_mse
        smallest_error_index = j
        

In [None]:
# errors[i] is the MSE of the model with i + 1 predictors, so the x values
# start at 1; plt.plot(errors) alone would start the axis at 0 and contradict
# the "Number of predictors" label.
n_predictors = range(1, len(errors) + 1)
plt.plot(n_predictors, errors)
_ = plt.ylabel("MSE")
_ = plt.xlabel("Number of predictors")

In [None]:
# Report the best predictor subset found by the stepwise search, then refit
# OLS on the training rows with exactly that subset and show the summary.
multiple_linear_regression_error = smallest_error
best_predictors = selected_predictors[:smallest_error_index + 1]
print(f"Top predictors: {best_predictors!s}")
print(f"MSE: {multiple_linear_regression_error!s}")

Y = df_train["MARKETVAL"]
X = sm.add_constant(df_train[best_predictors])
model = sm.OLS(Y, X)
results = model.fit()
results.summary()

## Model 2 - Lasso Regression - Forward Stepwise Selection

In [None]:
# Forward stepwise selection with a default-parameter Lasso model.
# Each round fits one model per remaining candidate (on top of the predictors
# already chosen) and keeps the candidate with the lowest MSE on df_test.
predictors = cols[:-1]
selected_predictors = []
errors = []
smallest_error = float("inf")
smallest_error_index = 0

train_target = df_train["MARKETVAL"]
test_target = df_test["MARKETVAL"]

step = 0
while predictors:
    round_best_pos, round_best_mse = 0, float("inf")

    for pos, candidate in enumerate(predictors):
        trial_columns = selected_predictors + [candidate]
        lasso = Lasso()
        lasso.fit(df_train[trial_columns], train_target)
        trial_mse = mean_squared_error(
            test_target, lasso.predict(df_test[trial_columns])
        )
        # Strict comparison: the first candidate wins a tie.
        if trial_mse < round_best_mse:
            round_best_pos, round_best_mse = pos, trial_mse

    # Move this round's winner from the candidates to the selected list.
    selected_predictors.append(predictors.pop(round_best_pos))
    errors.append(round_best_mse)

    if round_best_mse < smallest_error:
        smallest_error, smallest_error_index = round_best_mse, step
    step += 1

In [None]:
# errors[i] is the MSE of the model with i + 1 predictors, so the x values
# start at 1; plt.plot(errors) alone would start the axis at 0 and contradict
# the "Number of predictors" label.
n_predictors = range(1, len(errors) + 1)
plt.plot(n_predictors, errors)
_ = plt.ylabel("MSE")
_ = plt.xlabel("Number of predictors")

In [None]:
# Record the best Lasso MSE and show the predictor subset that achieved it.
lasso_regression_error = smallest_error
best_predictors = selected_predictors[:smallest_error_index + 1]
print(f"Top predictors: {best_predictors!s}")
print(f"MSE: {lasso_regression_error!s}")

## Model 3 - Ridge Regression - Forward Stepwise Selection

In [None]:
# Forward stepwise selection with a default-parameter Ridge model.
# Each round fits one model per remaining candidate (on top of the predictors
# already chosen) and keeps the candidate with the lowest MSE on df_test.
predictors = cols[:-1]
selected_predictors = []
errors = []
smallest_error = float("inf")
smallest_error_index = 0

train_target = df_train["MARKETVAL"]
test_target = df_test["MARKETVAL"]

step = 0
while predictors:
    round_best_pos, round_best_mse = 0, float("inf")

    for pos, candidate in enumerate(predictors):
        trial_columns = selected_predictors + [candidate]
        ridge = Ridge()
        ridge.fit(df_train[trial_columns], train_target)
        trial_mse = mean_squared_error(
            test_target, ridge.predict(df_test[trial_columns])
        )
        # Strict comparison: the first candidate wins a tie.
        if trial_mse < round_best_mse:
            round_best_pos, round_best_mse = pos, trial_mse

    # Move this round's winner from the candidates to the selected list.
    selected_predictors.append(predictors.pop(round_best_pos))
    errors.append(round_best_mse)

    if round_best_mse < smallest_error:
        smallest_error, smallest_error_index = round_best_mse, step
    step += 1

In [None]:
# errors[i] is the MSE of the model with i + 1 predictors, so the x values
# start at 1; plt.plot(errors) alone would start the axis at 0 and contradict
# the "Number of predictors" label.
n_predictors = range(1, len(errors) + 1)
plt.plot(n_predictors, errors)
_ = plt.ylabel("MSE")
_ = plt.xlabel("Number of predictors")

In [None]:
# Record the best Ridge MSE and show the predictor subset that achieved it.
ridge_regression_error = smallest_error
best_predictors = selected_predictors[:smallest_error_index + 1]
print(f"Top predictors: {best_predictors!s}")
print(f"MSE: {ridge_regression_error!s}")

In [None]:
# Best MSE of each model, one per line, in the order OLS, Lasso, Ridge.
for model_error in (
    multiple_linear_regression_error,
    lasso_regression_error,
    ridge_regression_error,
):
    print(model_error)