This is the initial notebook to explore the data and run the first models. 

In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

from fastai.imports import *
from fastai.tabular.all import *

from numpy import random
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

In [101]:
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')
train = pd.read_csv('train.csv')

In [102]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


Based on initial analysis the train and test datasets have similar characteristics, so it will be easier to combine them for imputation and data analysis work. 

In [103]:
train_test = pd.concat([train, test], ignore_index=True)

In [104]:
train_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0


In [105]:
train_test.describe().round().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,2919.0,1460.0,843.0,1.0,730.0,1460.0,2190.0,2919.0
MSSubClass,2919.0,57.0,43.0,20.0,20.0,50.0,70.0,190.0
LotFrontage,2433.0,69.0,23.0,21.0,59.0,68.0,80.0,313.0
LotArea,2919.0,10168.0,7887.0,1300.0,7478.0,9453.0,11570.0,215245.0
OverallQual,2919.0,6.0,1.0,1.0,5.0,6.0,7.0,10.0
OverallCond,2919.0,6.0,1.0,1.0,5.0,5.0,6.0,9.0
YearBuilt,2919.0,1971.0,30.0,1872.0,1954.0,1973.0,2001.0,2010.0
YearRemodAdd,2919.0,1984.0,21.0,1950.0,1965.0,1993.0,2004.0,2010.0
MasVnrArea,2896.0,102.0,179.0,0.0,0.0,0.0,164.0,1600.0
BsmtFinSF1,2918.0,441.0,456.0,0.0,0.0,368.0,733.0,5644.0


In [106]:
train_test.describe(include=object).round().T

Unnamed: 0,count,unique,top,freq
MSZoning,2915,5,RL,2265
Street,2919,2,Pave,2907
Alley,198,2,Grvl,120
LotShape,2919,4,Reg,1859
LandContour,2919,4,Lvl,2622
Utilities,2917,2,AllPub,2916
LotConfig,2919,5,Inside,2133
LandSlope,2919,3,Gtl,2778
Neighborhood,2919,25,NAmes,443
Condition1,2919,9,Norm,2511


In [107]:
pd.isnull(train_test).sum()[pd.isnull(train_test).sum() > 0]

MSZoning           4
LotFrontage      486
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        24
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinSF1         1
BsmtFinType2      80
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageCars         1
GarageArea         1
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
SalePrice       1459
dtype: int64

It looks like Alley, FireplaceQu, PoolQC, Fence and MiscFeature have significant numbers of missing data. So it will be best to eliminate those rows. There are a number of rows that have less than 5 rows with missing data. Since some of these are categorical and some are continuous data, their missing data will be replaced with the most frequent value. 

In [108]:
drop_high_nan=['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
train_test=train_test.drop(drop_high_nan,axis=1)
small_nan_cols = ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
                  'TotalBsmtSF', 'Electrical', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'GarageCars', 
                  'GarageArea','SaleType', 'SaleCondition']
small_impute = SimpleImputer(strategy='most_frequent')
train_test[small_nan_cols] = pd.DataFrame(small_impute.fit_transform(train_test[small_nan_cols]),columns=small_nan_cols)

The following columns seem to have one value significantly larger than the rest, and it would probably be best to use the mode, or most common, value to feel each NaN value: MasVnrType, MasVnrArea, BsmtCond, BsmtExposure, BsmtFinType2, GarageType, GarageFinish, GarageQual, and GarageCond. That is represents 9 out of the 13 columns. 

BsmtQual has two values larger than the rest: Gd and TA. But it only has 2.8% NaNs, so simply using the mode might be good enough.  

GarageYrBlt has 59 NaNs out 2919 rows which is only 2%. It has a dispersed set of values, so it might be easiest just to have any NaNs have the same value as YearBuilt. 

BsmtFinType1 has only 2.7% value of NaNs, and most two of its largest values are GLQ and Unf. It might be easiest to use the mode here. 

LotFrontage has 486 NaNs out of 2919 rows which is a pretty high 16.7%. It has a dispersed range of values, but looking at its characteristics from the describe function above, it seems to have a pretty even distribution with a mean of 10,168 and a median of 9,453. So using the mean to fill in the NaNs seems reasonable. If it turns out this value has a high impact on the predictability of the SalePrice, then it might be good to revisit this assumption. 

In [109]:
mode_cols = ['MasVnrType', 'MasVnrArea', 'BsmtCond', 'BsmtExposure', 'BsmtFinType2', 'GarageType', 'GarageFinish', 
             'GarageQual','GarageCond', 'BsmtQual', 'BsmtFinType1']
mode_impute = SimpleImputer(strategy='most_frequent')
train_test[mode_cols] = pd.DataFrame(mode_impute.fit_transform(train_test[mode_cols]),columns=mode_cols)
train_test['LotFrontage'].fillna((train_test['LotFrontage'].mean()), inplace=True)
train_test['GarageYrBlt'] = train_test['GarageYrBlt'].fillna(train_test['YearBuilt'])

In [110]:
pd.isnull(train_test).sum()[pd.isnull(train_test).sum() > 0]

SalePrice    1459
dtype: int64

In [111]:
train_test['BsmtQual'].unique()

array(['Gd', 'TA', 'Ex', 'Fa'], dtype=object)

In [112]:
train_test['BsmtFinType1'].unique()

array(['GLQ', 'ALQ', 'Unf', 'Rec', 'BLQ', 'LwQ'], dtype=object)

In [113]:
train_test.BsmtQual = train_test.BsmtQual.replace({"Ex": 110, "Gd": 95, "TA": 85, "Fa": 75, "Po": 60, "NA": 0})

In [114]:
train_test.BsmtFinType1 = train_test.BsmtFinType1.replace({"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2, "Unf": 1,
                                                         "NA": 0})

It will be necessary to identify all the columns that have non-numeric object values and then convert them to numeric values. 

In [115]:
obj_cols = list(train_test.select_dtypes(['object']).columns)
obj_cols

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'BsmtFullBath',
 'BsmtHalfBath',
 'KitchenQual',
 'Functional',
 'GarageType',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [116]:
train_test['BsmtFinSF1'].head()

0    706.0
1    978.0
2    486.0
3    216.0
4    655.0
Name: BsmtFinSF1, dtype: object

In [117]:
for column in obj_cols:
     train_test[column] = pd.factorize(train_test[column], sort=True)[0]

  uniques = Index(uniques)


In [118]:
train_test['BsmtFinSF1'].head()

0    512
1    716
2    332
3    116
4    469
Name: BsmtFinSF1, dtype: int64

In [119]:
train_test[obj_cols].head()

Unnamed: 0,MSZoning,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,Functional,GarageType,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
0,3,1,3,3,0,4,0,5,2,2,...,6,1,1,2,283,4,4,2,8,4
1,3,1,3,3,0,2,0,24,1,2,...,6,1,1,2,202,4,4,2,8,4
2,3,1,0,3,0,4,0,5,2,2,...,6,1,1,2,337,4,4,2,8,4
3,3,1,0,3,0,0,0,6,2,2,...,6,5,2,3,366,4,4,2,8,0
4,3,1,0,3,0,2,0,15,2,2,...,6,1,1,3,495,4,4,2,8,4


Creating a series of new features that might improve predictibility.

In [120]:
train_test['QualCondSum'] = train_test['OverallQual'] + train_test['OverallCond']
train_test['RemodTime'] = train_test['YearRemodAdd'] - train_test['YearBuilt']
train_test['BsmtFinTypeSF1'] = train_test['BsmtFinType1'] * train_test['BsmtFinSF1']
train_test['BsmtFin%'] = train_test['BsmtFinSF1'] / train_test['TotalBsmtSF']
train_test['TotalFlrSF'] = train_test['1stFlrSF'] + train_test['2ndFlrSF']
train_test['TotalFinSF'] = train_test['GrLivArea'] + train_test['BsmtFinSF1']
train_test['GarageCarArea'] = train_test['GarageArea'] * train_test['GarageCars']
train_test['TotalSF'] = train_test['1stFlrSF'] + train_test['2ndFlrSF'] + train_test['TotalBsmtSF']

In [91]:
'''train_test['TotalBathrooms'] = (train_test['FullBath'] + (0.5 * train_test['HalfBath']) +
                               train_test['BsmtFullBath'] + (0.5 * train_test['BsmtHalfBath']))
train_test['YrBltPlusRemod'] = train_test['YearBuilt'] + train_test['YearRemodAdd']
train_test["HighQualSF"] = train_test["GrLivArea"]+train_test["1stFlrSF"] + train_test["2ndFlrSF"]
+0.5*train_test["GarageArea"]+0.5*train_test["TotalBsmtSF"]+1*train_test["MasVnrArea"]
train_test["Age"] = train_test["YrSold"]-train_test["YearBuilt"]'''

To create a column with the log of the SalePrice to match the evaluation process in the contest. 

In [121]:
train_test['LogSalePrice'] = train_test['SalePrice'].apply(np.log)

In [122]:
train_test['LogSalePrice'].head()

0    12.247694
1    12.109011
2    12.317167
3    11.849398
4    12.429216
Name: LogSalePrice, dtype: float64

In [123]:
train_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,SalePrice,QualCondSum,RemodTime,BsmtFinTypeSF1,BsmtFin%,TotalFlrSF,TotalFinSF,GarageCarArea,TotalSF,LogSalePrice
0,1,60,3,65.0,8450,1,3,3,0,4,...,208500.0,12,0,3072,1.961686,1710,2222,566,1971,12.247694
1,2,20,3,80.0,9600,1,3,3,0,2,...,181500.0,14,0,3580,1.187396,1262,1978,404,1865,12.109011
2,3,60,3,68.0,11250,1,0,3,0,4,...,223500.0,12,1,1992,1.081433,1786,2118,674,2093,12.317167
3,4,70,3,60.0,9550,1,0,3,0,0,...,140000.0,12,55,580,0.640884,1717,1833,1098,1898,11.849398
4,5,60,3,84.0,14260,1,0,3,0,2,...,250000.0,13,0,2814,0.930556,2198,2667,1485,2702,12.429216


To separate the train_test dataset back into the train and test datasets, identify the independent and dependent columns, and create the validation split.

In [124]:
train = train_test[train_test['SalePrice'].notnull()].copy()
test = train_test[train_test['SalePrice'].isnull()].drop(['SalePrice','LogSalePrice'],axis=1)
X = train.drop(['SalePrice','LogSalePrice'],axis=1)
y = train.LogSalePrice

In [125]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,SaleType,SaleCondition,QualCondSum,RemodTime,BsmtFinTypeSF1,BsmtFin%,TotalFlrSF,TotalFinSF,GarageCarArea,TotalSF
1460,1461,20,2,80.0,11622,1,3,3,0,4,...,8,4,11,0,954,1.139785,896,1214,428,1175
1461,1462,20,3,81.0,14267,1,0,3,0,0,...,8,4,12,0,3390,1.039877,1329,2007,85,1981
1462,1463,60,3,74.0,13830,1,0,3,0,4,...,8,4,10,1,3504,1.859873,1629,2213,448,1943
1463,1464,60,3,78.0,9978,1,0,3,0,4,...,8,4,12,0,2550,1.362179,1604,2029,424,1916
1464,1465,120,3,43.0,5005,1,0,1,0,4,...,8,4,13,0,760,0.245955,1280,1432,492,1898


In [126]:
xgb_model = xgb.XGBRegressor()

In [127]:
X,y = shuffle(X,y, random_state=42)
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

In [128]:
def rmse_cv(model,X,y):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=6))
    return rmse

In [129]:
print(rmse_cv(xgb_model,X,y).mean())

0.14058326427456178


In [135]:
perm = PermutationImportance(xgb.XGBRegressor(), random_state=1,n_iter =10,cv=5).fit(X, y)
eli5.show_weights(perm, feature_names = X.columns.tolist(),top=100)

Weight,Feature
0.2492  ± 0.1353,TotalSF
0.0864  ± 0.0351,OverallQual
0.0248  ± 0.0307,TotalFinSF
0.0212  ± 0.0119,QualCondSum
0.0205  ± 0.0183,YearRemodAdd
0.0179  ± 0.0256,YearBuilt
0.0137  ± 0.0173,LotArea
0.0102  ± 0.0172,GarageCarArea
0.0078  ± 0.0087,OverallCond
0.0048  ± 0.0103,MSZoning


In [130]:
submit = test[['Id']]
submit = submit.reset_index(drop=True)

In [131]:
xgb_model.fit(X,y)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [132]:
submit_predict = xgb_model.predict(test)
submit_predict = np.exp(submit_predict)

In [133]:
submit['SalePrice'] = submit_predict

In [134]:
submit.to_csv('submit_xgb_add_features.csv', index=False)