In [4]:
# data manipulation/viz
import os 
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

# modeling
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.formula.api import glm 

from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso

(1459, 80)
(1460, 81)


In [24]:
# Data cleaning
print("Count number of missing values per variable:")
print(train.isnull().sum()[train.isnull().sum() != 0])

# function to clean the missing values
def na_clean(df):
    
    # some vars are just too missing so I remove the field
    df = df.drop(columns = ["PoolQC", "MiscFeature"])

    # replace some numeric vars w/ median
    median_replace_vars = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageArea']
    for var in median_replace_vars:
        df[var].fillna(df[var].median(), inplace = True)
    
    # replace some num vars w/ 0
    zero_replace_vars = ['BsmtFullBath', 'BsmtHalfBath', 'GarageCars']
    for var in zero_replace_vars:
        df[var].fillna(0, inplace = True)
    
    # replace some cat vars w/ most freq value 
    df['MasVnrType'].fillna('None', inplace = True)
    df['Electrical'].fillna('SBrkr', inplace = True)
    df['MSZoning'].fillna('RL', inplace = True)
    df['SaleType'].fillna('WD', inplace = True)
    df['Utilities'].fillna('AllPub', inplace = True)
    df['KitchenQual'].fillna('TA', inplace = True)
    df['Functional'].fillna('Typ', inplace = True)

    # other cat vars just put missing if there isn't a glaring most popular category
    replace_missing_vars = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
        'BsmtFinType2', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'Fence', 'Exterior1st', 
        'Exterior2nd', 'FireplaceQu']
    for var in replace_missing_vars:
        df[var].fillna("Missing", inplace = True)

    return df

train = na_clean(train)
test = na_clean(test)

# make sure there are no more missing values
print("\nNumber of missing values after running na_clean()")
print("Missing values in train: {}".format(train.isnull().sum().sum()))
print("Missing values in test: {}".format(test.isnull().sum().sum()))

Count number of missing values per variable:
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

Number of missing values after running na_clean()
Missing values in train: 0
Missing values in test: 0


In [58]:
# keep adapting this formula to change how we feature engineer
def feature_eng(df, test_data = False):
    '''Input either the training or test data. 
    2nd arg set to True if it's the testing data. That way we ignore the final log transformation on sale price'''

    # num features to just binarize b/c few houses have the feature
    df["SwimmingPool"] = df['PoolArea'].map(lambda x: 0 if x==0 else 1)
    df["3SsnPorch"] = df['3SsnPorch'].map(lambda x: 0 if x==0 else 1)
    df["ScreenPorch"] = df['ScreenPorch'].map(lambda x: 0 if x==0 else 1)

    # re-factoring vars:
    # group the irregularities into 2 factor levels
    df['LotShape'] = df['LotShape'].map({'Reg': 'Reg', 'IR1': 'Reg', 'IR2': 'Irreg', 'IR3': 'Irreg'})

    # simplifying MSSubClass because we have the year built in another feature
    df['MSSubClass'] = df['MSSubClass'].map(lambda x: 
        "1_story"   if (x in (20, 30, 40, 120)) else(
        "1.5_story" if (x in (45, 50, 150)) else(
        "2_story"   if (x in (60, 70, 75, 160, 180, 190)) else(
        "split"     if (x in (80, 85)) else(
        "duplex"    if (x ==90) else(
        "other"))))))

    # deciding to drop a few features for various reasons
    vars_to_drop = [
        "LowQualFinSF", # hardly any variation
        "LandSlope", # not much variation
        "PoolArea", # binarized above
        "MiscVal", # not much variation
        "Utilities", # only 1 obs in training data different from regular
        #"KitchenAbvGr" # hardly any variation. But, Deva included in lm's so including it now.
        ]
    df.drop(columns=vars_to_drop, inplace=True) 

    # adding a remodeled feature
    df['Remodeled'] = (df.YearRemodAdd-df.YearBuilt) == 0

    # total inside area will be a sum of 1st and 2nd floor sq ft
    df['Total_Inside_Area'] = df['1stFlrSF'] + df['2ndFlrSF']
    df.drop(columns = ['1stFlrSF', '2ndFlrSF', 'GrLivArea'], inplace = True)
    
    # Expensive Neighborhoods based on earlier EDA
    Expensive_neighborhoods = ['Somerst', 'Blmngtn', 'BrDale', 'NridgHt', 'StoneBr', 'MeadowV']
    df['Expensive_neighborhood'] = train['Neighborhood'].apply(lambda x: any([k in x for k in Expensive_neighborhoods]))
    

    
    # log transformations
    df['Log_Total_Inside_Area'] = np.log(df.Total_Inside_Area)
    df['Log_LotArea'] = np.log(df.LotArea)
    df['Log_BasementSF'] = np.log(df.TotalBsmtSF + 1)

    # simplify the bathrooms variable
    df['Bathrooms'] = df.BsmtFullBath + 0.5*df.BsmtHalfBath + df.FullBath + 0.5*df.HalfBath
    df.drop(columns = ['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath'], inplace = True)

    # get log of sale price which will be our actual response variable
    if test_data:
        pass 
    else:
        df['LogSalePrice'] = np.log(df.SalePrice)

    return df

In [59]:
# Load data
test = pd.read_csv("../../Data/test.csv")
train = pd.read_csv("../../Data/train.csv")
sample = pd.read_csv("../../Data/sample_submission.csv")
print(test.shape)
print(train.shape)

train = na_clean(train)
test = na_clean(test)


# run function on test and train
train = feature_eng(train)
test = feature_eng(test, test_data=True)

(1459, 80)
(1460, 81)


In [7]:
train.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,...,MoSold,YrSold,SaleType,SaleCondition,SalePrice,SwimmingPool,Remodeled,Total_Inside_Area,Bathrooms,LogSalePrice
0,1,2_story,RL,65.0,8450,Pave,Missing,Reg,Lvl,Inside,...,2,2008,WD,Normal,208500,0,True,1710,3.5,12.247694
1,2,1_story,RL,80.0,9600,Pave,Missing,Reg,Lvl,FR2,...,5,2007,WD,Normal,181500,0,True,1262,2.5,12.109011
2,3,2_story,RL,68.0,11250,Pave,Missing,Reg,Lvl,Inside,...,9,2008,WD,Normal,223500,0,False,1786,3.5,12.317167
3,4,2_story,RL,60.0,9550,Pave,Missing,Reg,Lvl,Corner,...,2,2006,WD,Abnorml,140000,0,False,1717,2.0,11.849398
4,5,2_story,RL,84.0,14260,Pave,Missing,Reg,Lvl,FR2,...,12,2008,WD,Normal,250000,0,True,2198,3.5,12.429216


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,...,SalePrice,SwimmingPool,Remodeled,Total_Inside_Area,Log_Total_Inside_Area,Log_LotArea,Log_BasementSF,Bathrooms,LogSalePrice,Expensive_neighborhood
0,1,2_story,RL,65.0,8450,Pave,Missing,Reg,Lvl,Inside,...,208500,0,True,1710,7.444249,9.041922,6.753438,3.5,12.247694,False
1,2,1_story,RL,80.0,9600,Pave,Missing,Reg,Lvl,FR2,...,181500,0,True,1262,7.140453,9.169518,7.141245,2.5,12.109011,False
2,3,2_story,RL,68.0,11250,Pave,Missing,Reg,Lvl,Inside,...,223500,0,False,1786,7.487734,9.328123,6.825460,3.5,12.317167,False
3,4,2_story,RL,60.0,9550,Pave,Missing,Reg,Lvl,Corner,...,140000,0,False,1717,7.448334,9.164296,6.629363,2.0,11.849398,False
4,5,2_story,RL,84.0,14260,Pave,Missing,Reg,Lvl,FR2,...,250000,0,True,2198,7.695303,9.565214,7.044033,3.5,12.429216,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,2_story,RL,62.0,7917,Pave,Missing,Reg,Lvl,Inside,...,175000,0,False,1647,7.406711,8.976768,6.860664,2.5,12.072541,False
1456,1457,1_story,RL,85.0,13175,Pave,Missing,Reg,Lvl,Inside,...,210000,0,False,2073,7.636752,9.486076,7.341484,3.0,12.254863,False
1457,1458,2_story,RL,66.0,9042,Pave,Missing,Reg,Lvl,Inside,...,266500,0,False,2340,7.757906,9.109636,7.050123,2.0,12.493130,False
1458,1459,1_story,RL,68.0,9717,Pave,Missing,Reg,Lvl,Inside,...,142125,0,False,1078,6.982863,9.181632,6.983790,2.0,11.864462,False


In [67]:
# Copied from Jared

X, Y = train[["Log_Total_Inside_Area","OverallQual","Log_LotArea",'Bathrooms',
              'TotRmsAbvGrd','Expensive_neighborhood',"Remodeled",'Log_BasementSF'
             ]], train["LogSalePrice"]

# I cut at 876 to split our data into 60% train, 40% test
# for accuracy check
X_train = X[:876]
Y_train = Y[:876]
X_test = X[877:]
Y_test = Y[877:]

# iterate over a smaple of alpha values
for i in [0, 1.0e-10, 0.0001, 0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 10.0]:
    # define Lasso model
    model = Lasso(alpha=i)
    
    # fit to our training data
    model.fit(X_train, Y_train)

    # 10-fold cross-validation for evaluating fit of training data
    # for MAE calculation. see example in cite in top line
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

    # shamelessly copied form online, calculate the mean absolute error
    scores = cross_val_score(model, X_train, Y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

    scores = absolute(scores)
    print("Lasso alpha is equal to: ", i)
    print('Mean MAE: %.3f (%.3f)' % (mean(scores), std(scores)))
    
    # calculate accuracy of lasso regression with built-in tools
    print('Accuracy:', model.score(X_test, Y_test))

  model.fit(X_train, Y_train)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso alpha is equal to:  0
Mean MAE: 0.134 (0.012)
Accuracy: 0.7613477933608038
Lasso alpha is equal to:  1e-10
Mean MAE: 0.134 (0.012)
Accuracy: 0.7613477932928194
Lasso alpha is equal to:  0.0001
Mean MAE: 0.134 (0.012)
Accuracy: 0.761485286966637
Lasso alpha is equal to:  0.001
Mean MAE: 0.134 (0.012)
Accuracy: 0.7626688331909669
Lasso alpha is equal to:  0.01
Mean MAE: 0.137 (0.012)
Accuracy: 0.7673379457450866
Lasso alpha is equal to:  0.1
Mean MAE: 0.186 (0.013)
Accuracy: 0.6544391714540575
Lasso alpha is equal to:  0.5
Mean MAE: 0.319 (0.026)
Accuracy: -0.003263483235423026
Lasso alpha is equal to:  1.0
Mean MAE: 0.319 (0.026)
Accuracy: -0.003263483235423026
Lasso alpha is equal to:  2.0
Mean MAE: 0.319 (0.026)
Accuracy: -0.003263483235423026
Lasso alpha is equal to:  10.0
Mean MAE: 0.319 (0.026)
Accuracy: -0.003263483235423026
