This is the initial notebook to explore the data and run the first models. 

In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

from fastai.imports import *
from fastai.tabular.all import *

from numpy import random
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

In [2]:
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')
train = pd.read_csv('train.csv')

In [3]:
sample.shape

(1459, 2)

In [4]:
sample.head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [5]:
test.shape

(1459, 80)

In [6]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Based on initial analysis the train and test datasets have similar characteristics, so it will be easier to combine them for imputation and data analysis work. 

In [7]:
train_test = pd.concat([train, test], ignore_index=True)

In [8]:
train_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,
2915,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,
2916,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,
2917,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,


In [9]:
train_test.describe().round().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,2919.0,1460.0,843.0,1.0,730.0,1460.0,2190.0,2919.0
MSSubClass,2919.0,57.0,43.0,20.0,20.0,50.0,70.0,190.0
LotFrontage,2433.0,69.0,23.0,21.0,59.0,68.0,80.0,313.0
LotArea,2919.0,10168.0,7887.0,1300.0,7478.0,9453.0,11570.0,215245.0
OverallQual,2919.0,6.0,1.0,1.0,5.0,6.0,7.0,10.0
OverallCond,2919.0,6.0,1.0,1.0,5.0,5.0,6.0,9.0
YearBuilt,2919.0,1971.0,30.0,1872.0,1954.0,1973.0,2001.0,2010.0
YearRemodAdd,2919.0,1984.0,21.0,1950.0,1965.0,1993.0,2004.0,2010.0
MasVnrArea,2896.0,102.0,179.0,0.0,0.0,0.0,164.0,1600.0
BsmtFinSF1,2918.0,441.0,456.0,0.0,0.0,368.0,733.0,5644.0


GarageYrBlt showed a high value of 2207, which was well past the latest YrSold of 2010. However, there was only one of those values, so this will be ignored. 

In [10]:
train_test['GarageYrBlt'][train_test['GarageYrBlt'] > 2010].count()

1

In [11]:
train_test.describe(include=object).round().T

Unnamed: 0,count,unique,top,freq
MSZoning,2915,5,RL,2265
Street,2919,2,Pave,2907
Alley,198,2,Grvl,120
LotShape,2919,4,Reg,1859
LandContour,2919,4,Lvl,2622
Utilities,2917,2,AllPub,2916
LotConfig,2919,5,Inside,2133
LandSlope,2919,3,Gtl,2778
Neighborhood,2919,25,NAmes,443
Condition1,2919,9,Norm,2511


In [12]:
pd.isnull(train_test).sum()[pd.isnull(train_test).sum() > 0]

MSZoning           4
LotFrontage      486
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        24
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinSF1         1
BsmtFinType2      80
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageCars         1
GarageArea         1
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
SalePrice       1459
dtype: int64

It looks like Alley, FireplaceQu, PoolQC, Fence and MiscFeature have significant numbers of missing data. So it will be best to eliminate those rows. 

In [13]:
drop_high_nan=['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
train_test=train_test.drop(drop_high_nan,axis=1)

There are a number of rows that have less than 5 rows with missing data. Since some of these are categorical and some are continuous data, their missing data will be replaced with the most frequent value. 

In [14]:
small_nan_cols = ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
                  'TotalBsmtSF', 'Electrical', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'GarageCars', 
                  'GarageArea','SaleType', 'SaleCondition']

In [15]:
small_impute = SimpleImputer(strategy='most_frequent')
train_test[small_nan_cols] = pd.DataFrame(small_impute.fit_transform(train_test[small_nan_cols]),columns=small_nan_cols)

Now there are 13 columns that need to be imputed. 

In [16]:
pd.isnull(train_test).sum()[pd.isnull(train_test).sum() > 0]

LotFrontage      486
MasVnrType        24
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinType2      80
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageQual       159
GarageCond       159
SalePrice       1459
dtype: int64

To classify the remaining columns to be imputed and take a look at the range of data in each column. 

In [17]:
impute_cols = ['LotFrontage','MasVnrType','MasVnrArea','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
               'GarageType','GarageYrBlt','GarageFinish','GarageQual','GarageCond']

In [18]:
for i in impute_cols:
    print(train_test.groupby([i], dropna=False).size())    

LotFrontage
21.0      50
22.0       1
24.0      49
25.0       1
26.0       3
        ... 
182.0      1
195.0      1
200.0      1
313.0      2
NaN      486
Length: 129, dtype: int64
MasVnrType
BrkCmn       25
BrkFace     879
None       1742
Stone       249
NaN          24
dtype: int64
MasVnrArea
0.0       1738
1.0          3
3.0          1
11.0         1
14.0         4
          ... 
1224.0       2
1290.0       1
1378.0       1
1600.0       1
NaN         23
Length: 445, dtype: int64
BsmtQual
Ex      258
Fa       88
Gd     1209
TA     1283
NaN      81
dtype: int64
BsmtCond
Fa      104
Gd      122
Po        5
TA     2606
NaN      82
dtype: int64
BsmtExposure
Av      418
Gd      276
Mn      239
No     1904
NaN      82
dtype: int64
BsmtFinType1
ALQ    429
BLQ    269
GLQ    849
LwQ    154
Rec    288
Unf    851
NaN     79
dtype: int64
BsmtFinType2
ALQ      52
BLQ      68
GLQ      34
LwQ      87
Rec     105
Unf    2493
NaN      80
dtype: int64
GarageType
2Types       23
Attchd     1723
Basment

The following columns seem to have one value significantly larger than the rest, and it would probably be best to use the mode, or most common, value to feel each NaN value: MasVnrType, MasVnrArea, BsmtCond, BsmtExposure, BsmtFinType2, GarageType, GarageFinish, GarageQual, and GarageCond. That is represents 9 out of the 13 columns. 

BsmtQual has two values larger than the rest: Gd and TA. But it only has 2.8% NaNs, so simply using the mode might be good enough.  

GarageYrBlt has 59 NaNs out 2919 rows which is only 2%. It has a dispersed set of values, so it might be easiest just to have any NaNs have the same value as YearBuilt. 

BsmtFinType1 has only 2.7% value of NaNs, and most two of its largest values are GLQ and Unf. It might be easiest to use the mode here. 

LotFrontage has 486 NaNs out of 2919 rows which is a pretty high 16.7%. It has a dispersed range of values, but looking at its characteristics from the describe function above, it seems to have a pretty even distribution with a mean of 10,168 and a median of 9,453. So using the mean to fill in the NaNs seems reasonable. If it turns out this value has a high impact on the predictability of the SalePrice, then it might be good to revisit this assumption. 

In [19]:
mode_cols = ['MasVnrType', 'MasVnrArea', 'BsmtCond', 'BsmtExposure', 'BsmtFinType2', 'GarageType', 'GarageFinish', 
             'GarageQual','GarageCond', 'BsmtQual', 'BsmtFinType1']

In [20]:
mode_impute = SimpleImputer(strategy='most_frequent')
train_test[mode_cols] = pd.DataFrame(mode_impute.fit_transform(train_test[mode_cols]),columns=mode_cols)

In [21]:
train_test['LotFrontage'].fillna((train_test['LotFrontage'].mean()), inplace=True)

In [22]:
train_test['GarageYrBlt'] = train_test['GarageYrBlt'].fillna(train_test['YearBuilt'])

After the work above the only column with NaNs is SalePrice which reflects the values correctly not included in the test dataset. 

In [23]:
pd.isnull(train_test).sum()[pd.isnull(train_test).sum() > 0]

SalePrice    1459
dtype: int64

It will be necessary to identify all the columns that have non-numeric object values and then convert them to numeric values. 

In [24]:
obj_cols = list(train_test.select_dtypes(['object']).columns)

In [25]:
obj_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'BsmtFullBath',
 'BsmtHalfBath',
 'KitchenQual',
 'Functional',
 'GarageType',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [26]:
train_test[obj_cols].head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Functional,GarageType,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,RL,Pave,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Typ,Attchd,RFn,2.0,548.0,TA,TA,Y,WD,Normal
1,RL,Pave,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,...,Typ,Attchd,RFn,2.0,460.0,TA,TA,Y,WD,Normal
2,RL,Pave,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,...,Typ,Attchd,RFn,2.0,608.0,TA,TA,Y,WD,Normal
3,RL,Pave,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,...,Typ,Detchd,Unf,3.0,642.0,TA,TA,Y,WD,Abnorml
4,RL,Pave,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,...,Typ,Attchd,RFn,3.0,836.0,TA,TA,Y,WD,Normal


In [27]:
for column in obj_cols:
     train_test[column] = pd.factorize(train_test[column], sort=True)[0]

  uniques = Index(uniques)


In [28]:
train_test[obj_cols].head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Functional,GarageType,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,3,1,3,3,0,4,0,5,2,2,...,6,1,1,2,283,4,4,2,8,4
1,3,1,3,3,0,2,0,24,1,2,...,6,1,1,2,202,4,4,2,8,4
2,3,1,0,3,0,4,0,5,2,2,...,6,1,1,2,337,4,4,2,8,4
3,3,1,0,3,0,0,0,6,2,2,...,6,5,2,3,366,4,4,2,8,0
4,3,1,0,3,0,2,0,15,2,2,...,6,1,1,3,495,4,4,2,8,4


To separate the train_test dataset back into the train and test datasets, identify the independent and dependent columns, and create the validation split.

In [72]:
train = train_test[train_test['SalePrice'].notnull()].copy()
test = train_test[train_test['SalePrice'].isnull()].drop('SalePrice',axis=1)

KeyError: "['SalePrice'] not found in axis"

In [27]:
X = train.drop('SalePrice',axis=1)
y = train.SalePrice

In [66]:
X.shape

(1460, 75)

In [28]:
X

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,3,65.0,8450,1,3,3,0,4,...,61,0,0,0,0,0,2,2008,8,4
1,2,20,3,80.0,9600,1,3,3,0,2,...,0,0,0,0,0,0,5,2007,8,4
2,3,60,3,68.0,11250,1,0,3,0,4,...,42,0,0,0,0,0,9,2008,8,4
3,4,70,3,60.0,9550,1,0,3,0,0,...,35,272,0,0,0,0,2,2006,8,0
4,5,60,3,84.0,14260,1,0,3,0,2,...,84,0,0,0,0,0,12,2008,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3,62.0,7917,1,3,3,0,4,...,40,0,0,0,0,0,8,2007,8,4
1456,1457,20,3,85.0,13175,1,3,3,0,4,...,0,0,0,0,0,0,2,2010,8,4
1457,1458,70,3,66.0,9042,1,3,3,0,4,...,60,0,0,0,0,2500,5,2010,8,4
1458,1459,20,3,68.0,9717,1,3,3,0,4,...,0,112,0,0,0,0,4,2010,8,4


In [29]:
y

0       208500.0
1       181500.0
2       223500.0
3       140000.0
4       250000.0
          ...   
1455    175000.0
1456    210000.0
1457    266500.0
1458    142125.0
1459    147500.0
Name: SalePrice, Length: 1460, dtype: float64

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=15)

In [31]:
X_train.shape, X_test.shape

((1095, 75), (365, 75))

In [32]:
y_train.shape, y_test.shape

((1095,), (365,))

In [33]:
xgb_model = xgb.XGBRegressor()

In [34]:
xgb_model.fit(X_train, y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [35]:
y_pred_xgb = xgb_model.predict(X_test)

In [36]:
y_pred_xgb.shape

(365,)

In [37]:
X_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1342,1343,60,3,69.305795,9375,1,3,3,0,4,...,87,0,0,0,0,0,8,2007,8,4
448,449,50,4,50.0,8600,1,3,0,0,4,...,0,0,0,0,0,0,6,2006,8,4
44,45,20,3,70.0,7945,1,3,3,0,4,...,0,0,0,0,0,0,5,2006,8,4
38,39,20,3,68.0,7922,1,3,3,0,4,...,52,0,0,0,0,0,1,2010,8,0
1165,1166,20,3,79.0,9541,1,0,3,0,4,...,114,0,0,0,0,0,9,2009,6,5


In [38]:
print('RMSLE:', np.sqrt(mean_squared_log_error(y_test, y_pred_xgb)))

RMSLE: 0.14317940724007835


In [39]:
rf_model = RandomForestRegressor()

In [40]:
rf_model.fit(X_train, y_train)

RandomForestRegressor()

In [41]:
y_pred_rf = rf_model.predict(X_test)

In [42]:
print('RMSLE:', np.sqrt(mean_squared_log_error(y_test, y_pred_rf)))

RMSLE: 0.1426407679954631


In [76]:
submit = test[['Id']]
submit = submit.reset_index(drop=True)

In [80]:
submit.head()

Unnamed: 0,Id,SalePrice
0,1461,123464.390625
1,1462,150218.5
2,1463,169867.0
3,1464,191323.234375
4,1465,190229.671875


In [64]:
submit.shape

(365, 2)

In [61]:
y_pred_xgb.shape

(365,)

In [65]:
y_pred_xgb[0:5]

array([257878.23, 135112.9 , 145891.8 , 137550.33, 218759.97],
      dtype=float32)

In [79]:
submit['SalePrice'] = xgb_model.predict(test)

In [81]:
submit.to_csv('submit_xgb_firstlook.csv', index=False)

In [82]:
submit['SalePrice'] = rf_model.predict(test)

sample.to_csv('submit_rf_firstlook.csv', index=False)

In [119]:
perm = PermutationImportance(xgb.XGBRegressor(), random_state=1,n_iter =10,cv=5).fit(X, y)
eli5.show_weights(perm, feature_names = X_test.columns.tolist(),top=100)

Weight,Feature
0.2425  ± 0.0851,OverallQual
0.1813  ± 0.0778,GrLivArea
0.0399  ± 0.0333,TotalBsmtSF
0.0308  ± 0.0446,BsmtFinSF1
0.0172  ± 0.0394,GarageCars
0.0129  ± 0.0118,YearBuilt
0.0122  ± 0.0234,BsmtQual
0.0111  ± 0.0095,OverallCond
0.0109  ± 0.0173,1stFlrSF
0.0109  ± 0.0172,YearRemodAdd


In [113]:
xgb_importance = pd.DataFrame(dict(cols=X_test.columns, imp=xgb_model.feature_importances_));

In [114]:
xgb_importance.head()

Unnamed: 0,cols,imp
0,Id,0.000384
1,MSSubClass,0.000264
2,MSZoning,0.003276
3,LotFrontage,0.001054
4,LotArea,0.00254


In [115]:
xgb_importance.sort_values(by=['imp'], inplace=True, ascending=False)

In [116]:
xgb_importance.head(25)

Unnamed: 0,cols,imp
16,OverallQual,0.445142
48,FullBath,0.163204
29,BsmtQual,0.06135
7,LandContour,0.03711
40,CentralAir,0.035767
45,GrLivArea,0.03028
60,GarageArea,0.026689
53,TotRmsAbvGrd,0.015846
51,KitchenAbvGr,0.013151
52,KitchenQual,0.013062


In [97]:
rf_importance = pd.DataFrame(dict(cols=X_test.columns, imp=rf_model.feature_importances_));

In [101]:
rf_importance.head()

Unnamed: 0,cols,imp
16,OverallQual,0.520204
45,GrLivArea,0.1038
43,2ndFlrSF,0.041737
60,GarageArea,0.040312
42,1stFlrSF,0.040189


In [99]:
rf_importance.sort_values(by=['imp'], inplace=True, ascending=False)

In [104]:
rf_importance.head(55)

Unnamed: 0,cols,imp
16,OverallQual,0.520204
45,GrLivArea,0.1038
43,2ndFlrSF,0.041737
60,GarageArea,0.040312
42,1stFlrSF,0.040189
37,TotalBsmtSF,0.038646
33,BsmtFinSF1,0.027259
4,LotArea,0.014294
48,FullBath,0.013968
59,GarageCars,0.012879
