In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import (train_test_split, cross_val_score)
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.preprocessing import (StandardScaler, PolynomialFeatures)
from sklearn import metrics

In [2]:
ames = pd.read_csv('../datasets/ames_cleaned.csv')

In [3]:
ames.shape

(2046, 141)

In [4]:
ames.tail()

Unnamed: 0,id,lot_shape,utilities,land_slope,overall_qual,exter_qual,bsmt_qual,bsmt_exposure,bsmtfin_type_1,heating_qc,...,garage_area^2,garage_area age,garage_area ext_blt_area,garage_area total_bath,age^2,age ext_blt_area,age total_bath,ext_blt_area^2,ext_blt_area total_bath,total_bath^2
2041,1587,3,4,3,6.5,3.5,3.5,2,6,5,...,270400.0,520.0,143520.0,1560.0,1.0,276.0,3.0,76176.0,828.0,9.0
2042,785,3,4,3,4.5,3.0,3.0,0,4,5,...,290521.0,37191.0,85162.0,539.0,4761.0,10902.0,69.0,24964.0,158.0,1.0
2043,916,4,4,3,6.0,3.0,3.0,0,1,4,...,116964.0,27702.0,0.0,513.0,6561.0,0.0,121.5,0.0,0.0,2.25
2044,639,4,4,3,4.5,3.0,3.0,0,3,3,...,86436.0,15582.0,96726.0,588.0,2809.0,17437.0,106.0,108241.0,658.0,4.0
2045,10,4,4,3,6.0,3.0,3.0,0,1,4,...,195364.0,4862.0,88400.0,1105.0,121.0,2200.0,27.5,40000.0,500.0,6.25


# Standard Scaling and Train Test Split

In [5]:
# drop 'id'
ames.drop(columns='id', inplace=True)

In [None]:
# continuous variables to be scaled.
'''poly_fts=['lot_frontage',
 'lot_area',
 'total_bsmt_sf',
 'gr_liv_area',
 'garage_area',
 'age',
 'ext_blt_area',
 'total_bath',
 'lot_frontage^2',
 'lot_frontage lot_area',
 'lot_frontage total_bsmt_sf',
 'lot_frontage gr_liv_area',
 'lot_frontage garage_area',
 'lot_frontage age',
 'lot_frontage ext_blt_area',
 'lot_frontage total_bath',
 'lot_area^2',
 'lot_area total_bsmt_sf',
 'lot_area gr_liv_area',
 'lot_area garage_area',
 'lot_area age',
 'lot_area ext_blt_area',
 'lot_area total_bath',
 'total_bsmt_sf^2',
 'total_bsmt_sf gr_liv_area',
 'total_bsmt_sf garage_area',
 'total_bsmt_sf age',
 'total_bsmt_sf ext_blt_area',
 'total_bsmt_sf total_bath',
 'gr_liv_area^2',
 'gr_liv_area garage_area',
 'gr_liv_area age',
 'gr_liv_area ext_blt_area',
 'gr_liv_area total_bath',
 'garage_area^2',
 'garage_area age',
 'garage_area ext_blt_area',
 'garage_area total_bath',
 'age^2',
 'age ext_blt_area',
 'age total_bath',
 'ext_blt_area^2',
 'ext_blt_area total_bath',
 'total_bath^2']

In [None]:
# assign back to dataframe 
#ames.loc[:,poly_fts] = scaled

In [6]:
X = ames.drop(columns=['saleprice'])
y = ames['saleprice']

In [7]:
X.shape

(2046, 139)

In [8]:
y.shape

(2046,)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state = 68)

In [10]:
ss =StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Baseline Mean Squared Error

The baseline mean squared error is calculated by setting all predict target values (yhat) as mean of all sale prices in test set.

In [11]:
# custom function to calculate MSE
# metrics.mean_squared_error does not support broadcasting for vector - scalar (mean of y_train)
def mse(y, yhat):
    mse = sum((y-yhat)**2)/len(y)
    return mse

In [12]:
rmse_base_train = np.sqrt(mse(y_train, y_train.mean()))
print('The baseline root mean square error for train set is {:.0f}.'.format(rmse_base_train))

The baseline root mean square error for train set is 79557.


In [13]:
rmse_base_test = np.sqrt(mse(y_test, y_test.mean()))
print('The baseline root mean square error for train set is {:.0f}.'.format(rmse_base_test))

The baseline root mean square error for train set is 78334.


# Linear Regression

In [14]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [15]:
# custom function to calculate root mean squared error
def rmse(model, X, y):
    result = -cross_val_score(model, 
                             X, 
                             y, 
                             cv=5, scoring = 'neg_mean_squared_error').mean()
    return np.sqrt(result)

In [16]:
# custom function to calculate R2 score
def r2_score(model, X, y):
    result = model.score(X, y)
    return result

**Train**

In [17]:
r2_lr_train = r2_score(lr, X_train, y_train)
print('The R2 score using linear regression on train set is {:.3f}.'.format(r2_lr_train))

The R2 score using linear regression on train set is 0.942.


In [18]:
rmse_lr_train = rmse(lr, X_train, y_train)
print('The root mean square error using linear regression on train set is {:.0f}.'.format(rmse_lr_train))

The root mean square error using linear regression on train set is 18985451997313820.


**Test**

In [19]:
r2_lr_test = r2_score(lr, X_test, y_test)
print('The R2 score using linear regression on test set is {:.3f}.'.format(r2_lr_test))

The R2 score using linear regression on test set is -977009030102862921728.000.


In [20]:
rmse_lr_test = rmse(lr, X_test, y_test)
print('The root mean square error using linear regression on train set is {:.0f}.'.format(rmse_lr_test))

The root mean square error using linear regression on train set is 5403665849856550.


# Ridge Regression

## Ridge Regression CV

In [21]:
r_alphas = np.logspace(0,5,200)
ridge_cv = RidgeCV(alphas=r_alphas,
                  store_cv_values=True)
ridge_cv.fit(X_train, y_train);

In [22]:
# optimal alpha
r_best_alpha = ridge_cv.alpha_
print(r_best_alpha)

54.158713780794734


## Optimal Alpha

In [23]:
ridge = Ridge(alpha=r_best_alpha)
ridge.fit(X_train, y_train)

Ridge(alpha=54.158713780794734)

**Train**

In [24]:
r2_ridge_train = r2_score(ridge, X_train, y_train)
print('The R2 score using ridge regression on train set is {:.3f}.'.format(r2_ridge_train))

The R2 score using ridge regression on train set is 0.940.


In [25]:
rmse_ridge_train = rmse(ridge, X_train, y_train)
print('The root mean square error using ridge regression on train set is {:.0f}.'.format(rmse_ridge_train))

The root mean square error using ridge regression on train set is 21864.


**Test**

In [26]:
r2_ridge_test = r2_score(ridge, X_test, y_test)
print('The R2 score using ridge regression on test set is {:.3f}.'.format(r2_ridge_test))

The R2 score using ridge regression on test set is 0.908.


In [27]:
rmse_ridge_test = rmse(ridge, X_test, y_test)
print('The root mean square error using ridge regression on test set is {:.0f}.'.format(rmse_ridge_test))

The root mean square error using ridge regression on test set is 25994.


# Lasso Regression

## Lasso Regression CV

In [28]:
lasso_cv = LassoCV(n_alphas=1000, cv= 5)
lasso_cv.fit(X_train, y_train)

LassoCV(cv=5, n_alphas=1000)

In [29]:
# optimal alpha
l_best_alpha = lasso_cv.alpha_
print(l_best_alpha)

126.3353282596484


## Optimal Alpha

In [30]:
lasso = Lasso(alpha=l_best_alpha, max_iter=2000)

In [31]:
lasso.fit(X_train, y_train)

Lasso(alpha=126.3353282596484, max_iter=2000)

In [32]:
coefs = pd.Series(lasso.coef_, index = X_train.columns)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [33]:
pd.set_option('display.max_rows',200)

In [34]:
coefs.sort_values()

NameError: name 'coefs' is not defined

**Train**

In [35]:
r2_lasso_train = r2_score(lasso, X_train, y_train)
print('The R2 score using ridge regression on train set is {:.3f}.'.format(r2_lasso_train))

The R2 score using ridge regression on train set is 0.940.


In [36]:
rmse_lasso_train = rmse(lasso, X_train, y_train)
print('The root mean square error using lasso regression on train set is {:.0f}.'.format(rmse_lasso_train))

The root mean square error using lasso regression on train set is 21901.


**Test**

In [37]:
r2_lasso_test = r2_score(lasso, X_test, y_test)
print('The R2 score using lasso regression on test set is {:.3f}.'.format(r2_lasso_test))

The R2 score using lasso regression on test set is 0.908.


In [38]:
rmse_lasso_test = rmse(lasso, X_test, y_test)
print('The root mean square error using lasso regression on test set is {:.0f}.'.format(rmse_lasso_test))

The root mean square error using lasso regression on test set is 26456.


# Elastic Net Regression

## Elastic Net CV

In [39]:
#enet_alphas = np.arange(0.5,1,0.005)
enet_ratios = [.1,.5,.7,.9,.95,.99,1]

In [40]:
enet_cv = ElasticNetCV(n_alphas=1000, l1_ratio = enet_ratios, cv=5)
enet_cv.fit(X_train, y_train)

ElasticNetCV(cv=5, l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], n_alphas=1000)

In [41]:
e_best_alpha = enet_cv.alpha_
print(e_best_alpha)

126.3353282596484


In [42]:
e_best_ratio = enet_cv.l1_ratio_
print(e_best_ratio)

1.0


## Optimal Alpha and L1-Ratio

In [43]:
enet = ElasticNet(alpha = e_best_alpha, l1_ratio=e_best_ratio, max_iter=2000)
enet.fit(X_train, y_train)

ElasticNet(alpha=126.3353282596484, l1_ratio=1.0, max_iter=2000)

**Train**

In [44]:
r2_enet_train = r2_score(enet, X_train, y_train)
print('The R2 score using elastic net regression on train set is {:.3f}.'.format(r2_enet_train))

The R2 score using elastic net regression on train set is 0.940.


In [45]:
rmse_enet_train = rmse(enet, X_train, y_train)
print('The root mean square error using elastic net regression on train set is {:.0f}.'.format(rmse_enet_train))

The root mean square error using elastic net regression on train set is 21901.


**Test**

In [46]:
r2_enet_test = r2_score(enet, X_test, y_test)
print('The R2 score using elastic net regression on test set is {:.3f}.'.format(r2_enet_test))

The R2 score using elastic net regression on test set is 0.908.


In [47]:
rmse_enet_test = rmse(enet, X_test, y_test)
print('The root mean square error using elastic net regression on test set is {:.0f}.'.format(rmse_enet_test))

The root mean square error using elastic net regression on test set is 26456.


# Evaluation and Hyperparameters Selection

# Kaggle Submission

Apply the same preprocessing steps from Part 1 to test data.

In [48]:
test = pd.read_csv('../datasets/test.csv')
test.shape

(879, 80)

**Imputation**

In [49]:
# custom function to rename columns to lower case and replace whitespaces with underscore.
def rename_cols(df):
    old_cols = df.columns
    new_cols = [x.lower().replace(' ','_') for x in old_cols]
    #create dictionary with old column label as key and new label as value
    col_dict = {old:new for old, new in zip(old_cols,new_cols)}
    df.rename(columns=col_dict, inplace=True)
    return df

In [50]:
test = rename_cols(test)
test.drop(columns='pid',inplace=True)
test['lot_frontage'].fillna(0, inplace=True)
test.drop(columns='alley', inplace=True)
test['mas_vnr_type'].fillna(test['mas_vnr_type'].mode()[0], inplace=True)
test['mas_vnr_area'].fillna(test['mas_vnr_area'].median(), inplace=True)

In [51]:
bsmt = [col for col in test.columns if 'bsmt' in col]
for col in bsmt:
    if test[col].dtype == 'object':
        test[col].fillna('NA', inplace=True)
    elif test[col].dtype == 'float64':
        test[col].fillna(0, inplace=True)

In [52]:
garage = [col for col in test.columns if 'garage' in col]
for col in garage:
    if test[col].dtype == 'object':
        test[col].fillna('NA', inplace=True)
    elif test[col].dtype == 'float64':
        test[col].fillna(0, inplace=True)
test.drop(columns='garage_yr_blt', inplace=True)

In [53]:
test.drop(columns=['pool_qc','pool_area','misc_feature'], inplace=True)
test['fence'].fillna('NA', inplace=True)
test['fireplace_qu'].fillna('NA', inplace=True)
test['electrical'].fillna(test['electrical'].mode()[0], inplace=True)

In [54]:
test.isnull().sum().sum()

0

**Feature Engineering**

In [55]:
test['age'] = test['yr_sold']-test['year_built']
test.drop(columns=['year_built','year_remod/add','yr_sold'], inplace=True)

In [56]:
area_cols = [x for x in test.columns if 'area' in x or 'sf' in x or 'porch' in x]
test['ext_blt_area'] = test['mas_vnr_area']+test['wood_deck_sf']+test['open_porch_sf']+\
                       test['enclosed_porch']+test['3ssn_porch']+test['screen_porch']
test.drop(columns=['mas_vnr_area','wood_deck_sf','open_porch_sf','enclosed_porch',\
                   '3ssn_porch','screen_porch','garage_cars'], inplace=True)
test.drop(columns=['bsmtfin_sf_1','bsmtfin_sf_2','bsmt_unf_sf',\
                   '1st_flr_sf','2nd_flr_sf','low_qual_fin_sf'], inplace=True)

In [57]:
room_cols = [x for x in test.columns if 'room' in x or 'abvgr' in x or 'bath' in x]
test['total_bath']= test['bsmt_full_bath']+0.5*test['bsmt_half_bath']+test['full_bath']+0.5*test['half_bath']
test.drop(columns=['bsmt_full_bath','bsmt_half_bath','full_bath','half_bath','totrms_abvgrd'],inplace=True)

In [58]:
test.replace({'exter_qual':{'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
              'exter_cond':{'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
               'bsmt_qual':{'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
               'bsmt_cond':{'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
              'heating_qc':{'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
            'kitchen_qual':{'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
            'fireplace_qu':{'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
           'garage_finish':{'NA':0,'Unf':1,'RFn':2,'Fin':3},
             'garage_qual':{'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5},
             'garage_cond':{'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5}
             }, 
            inplace=True
            )

In [59]:
test['overall_qual']=(test['overall_qual']+test['overall_cond'])/2
test['exter_qual']=(test['exter_qual']+test['exter_cond'])/2
test['bsmt_qual']=(test['bsmt_qual']+test['bsmt_cond'])/2
test['garage_qual']=(test['garage_qual']+test['garage_cond']+test['garage_finish'])/3

In [60]:
test.drop(columns=['overall_cond','exter_cond','bsmt_cond',\
                   'garage_finish','garage_cond','condition_2','bsmtfin_type_2'], inplace=True)

In [61]:
test.replace({'lot_shape':{'IR3':1,'IR2':2,'IR1':3,'Reg':4},
              'utilities':{'ELO':1,'NoSeWa':2,'NoSewr':3,'AllPub':4},
             'land_slope':{'Sev':1,'Mod':2,'Gtl':3},
          'bsmt_exposure':{'NA':0,'No':0,'Mn':1,'Av':2,'Gd':3},
          'bsmt_exposure':{'NA':0,'No':0,'Mn':1,'Av':2,'Gd':3},
         'bsmtfin_type_1':{'NA':0,'Unf':1,'LwQ':2,'Rec':3,'BLQ':4,'ALQ':5,'GLQ':6},
             'electrical':{'Mix':1,'FuseP':2,'FuseF':3,'FuseA':4,'SBrkr':5},
             'functional':{'Sal':0,'Sev':0,'Maj2':1,'Maj1':1,'Mod':2,'Min2':3,'Min1':3,'Typ':4},
            'paved_drive':{'N':1,'P':2,'Y':3},
                  'fence':{'NA':0,'MnWw':1,'GdWo':2,'MnPrv':3,'GdPrv':4}
             }, 
            inplace=True
            )

In [62]:
test.drop(columns=['ms_subclass','exterior_2nd','roof_matl','heating','sale_type','exterior_1st'], inplace=True)

In [63]:
test.replace({'ms_zoning':{'C (all)':'non R','A (agr)':'non R','I (all)':'non R'},
             'mas_vnr_type':{'CBlock':'Others','BrkCmn':'Others','Stone':'Others'}
             }, 
            inplace=True
            )

In [64]:
dum_cols = ['ms_zoning','street','land_contour','lot_config','neighborhood','condition_1','bldg_type',\
            'house_style','roof_style','mas_vnr_type','foundation','central_air','garage_type']

In [65]:
test = pd.get_dummies(test, columns=dum_cols,drop_first=True)

In [66]:
test.drop(columns='misc_val',inplace=True)

In [68]:
test.shape

(879, 104)

**Interaction Terms**

In [69]:
cont_cols = ['lot_frontage','lot_area','total_bsmt_sf','gr_liv_area',\
             'garage_area','age','ext_blt_area','total_bath']

In [70]:
poly = PolynomialFeatures(degree = 2, include_bias=False)
X_poly=poly.fit_transform(test[cont_cols])
poly_fts = poly.get_feature_names(cont_cols)
test.drop(columns=cont_cols, inplace=True)

In [71]:
test.loc[:,poly_fts] = X_poly

**Standard Scaling**

In [74]:
# scaler already fitted using train set
X_kag = ss.transform(test.drop(columns='id'))

In [None]:
#test.loc[:,poly_fts] = scaled

**Final Predictor Variables**

In [75]:
y_kag_pred = enet.predict(X_kag)

In [76]:
# concatenate predicted values to dataframe
test.loc[:,'saleprice'] = y_kag_pred

In [77]:
# subset 'id' and 'saleprice' for submission
subm_cols = ['id', 'saleprice']
final = test[subm_cols]
final = final.sort_values(by='id')

In [78]:
final.to_csv('../datasets/submission2.csv', index=False)