In [37]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing
import seaborn as sns
import warnings
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
# StratifiedKFold and GridSearchCV are imported from sklearn.model_selection:
# the older sklearn.cross_validation and sklearn.grid_search modules were
# deprecated in scikit-learn 0.18 and removed in 0.20, so importing from them
# raises ImportError on current installs.
# NOTE(review): model_selection.StratifiedKFold is built with n_splits and gets
# the labels through .split(X, y), unlike the old StratifiedKFold(y, n_folds);
# any call site outside this view must use the new signature.
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
# Public import path; the sklearn.ensemble.gradient_boosting submodule path was
# deprecated and later removed.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

warnings.filterwarnings('ignore')
plt.style.use("seaborn-whitegrid")
%matplotlib inline

In [38]:
# Load the training set from a path relative to the notebook's working directory.
# NOTE(review): presumably the "NA" category codes in the CSV are parsed as NaN by
# read_csv's default missing-value handling (e.g. Alley has only 91 non-null rows) — confirm.
trainData = pd.read_csv('data/train.csv')

In [39]:
# Report the number of rows in the training frame.
print("Training Data Size: {}".format(trainData.shape[0]))

Training Data Size: 1460


In [40]:
# Per-column dtype and non-null count; any column reporting fewer non-null
# entries than the frame has rows contains missing values.
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
trainData.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [42]:
# Print the dataset's field documentation so the category codes can be looked
# up without leaving the notebook.
with open("data/data_description.txt", mode="r") as description_file:
    description_text = description_file.read()
    print(description_text)

MSSubClass: Identifies the type of dwelling involved in the sale.	

        20	1-STORY 1946 & NEWER ALL STYLES
        30	1-STORY 1945 & OLDER
        40	1-STORY W/FINISHED ATTIC ALL AGES
        45	1-1/2 STORY - UNFINISHED ALL AGES
        50	1-1/2 STORY FINISHED ALL AGES
        60	2-STORY 1946 & NEWER
        70	2-STORY 1945 & OLDER
        75	2-1/2 STORY ALL AGES
        80	SPLIT OR MULTI-LEVEL
        85	SPLIT FOYER
        90	DUPLEX - ALL STYLES AND AGES
       120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
       150	1-1/2 STORY PUD - ALL AGES
       160	2-STORY PUD - 1946 & NEWER
       180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
       190	2 FAMILY CONVERSION - ALL STYLES AND AGES

MSZoning: Identifies the general zoning classification of the sale.
		
       A	Agriculture
       C	Commercial
       FV	Floating Village Residential
       I	Industrial
       RH	Residential High Density
       RL	Residential Low Density
       RP	Residential Low Density Park 
       RM

In [43]:
# NOTE(review): plain assignment binds a second name to the SAME DataFrame object;
# no copy is made, so every later in-place change to trainData is also visible
# through trainDataNorm. If an independent frame is intended this needs
# trainData.copy() — TODO confirm intent.
trainDataNorm = trainData

In [44]:
# (rows, columns) of the training frame.
trainData.shape

(1460, 81)

In [45]:
# Rows with a missing MSSubClass. A notebook cell only renders its final
# expression, so this first frame must be displayed explicitly or its result is
# silently discarded. (info() reports no nulls for MSSubClass, so an empty
# frame is the expected result.)
display(trainData[trainData["MSSubClass"].isnull()])
# Rows with a missing LotFrontage; rendered as the cell's output.
trainData[trainData["LotFrontage"].isnull()]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
12,13,20,RL,,12968,Pave,,IR2,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,144000
14,15,20,RL,,10920,Pave,,IR1,Lvl,AllPub,...,0,,GdWo,,0,5,2008,WD,Normal,157000
16,17,20,RL,,11241,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,700,3,2010,WD,Normal,149000
24,25,20,RL,,8246,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,5,2010,WD,Normal,154000
31,32,20,RL,,8544,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,6,2008,WD,Normal,149350
42,43,85,RL,,9180,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,12,2007,WD,Normal,144000
43,44,20,RL,,9200,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,7,2008,WD,Normal,130250
50,51,60,RL,,13869,Pave,,IR2,Lvl,AllPub,...,0,,,,0,7,2007,WD,Normal,177000
64,65,60,RL,,9375,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,,0,2,2009,WD,Normal,219500


### MSSubClass
This refers to the type of dwelling. Let's encode this to a simpler range of values.

In [46]:
# Create the shared label encoder and learn the distinct MSSubClass codes.
# fit() returns the encoder itself, so naming it on the last line displays it.
le = preprocessing.LabelEncoder().fit(trainData["MSSubClass"])
le

LabelEncoder()

In [47]:
# Replace the MSSubClass dwelling codes with consecutive integer labels.
# Fit and transform happen together so this cell does not depend on which
# column `le` was last fitted on in some other cell, and re-running it after
# the column has already been encoded no longer fails on unseen labels.
trainData["MSSubClass"] = le.fit_transform(trainData["MSSubClass"])

In [49]:
# Distinct encoded MSSubClass labels.
set(trainData["MSSubClass"].tolist())

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}

Now we have values 0-14 (15 categories) for types of dwelling.

### MSZoning
Zoning classification, needs encoding.

In [50]:
# Learn the distinct MSZoning categories. The shared encoder `le` is refitted
# here, which replaces whatever mapping it learned from the previous column.
le.fit(trainData["MSZoning"])

LabelEncoder()

In [51]:
# Replace the MSZoning strings with integer labels, then show the labels in use.
# Fit and transform happen together so the cell does not rely on `le` still
# being fitted on MSZoning from another cell, and it can be re-run safely.
trainData["MSZoning"] = le.fit_transform(trainData["MSZoning"])
set(trainData["MSZoning"].tolist())

{0, 1, 2, 3, 4}

### LotFrontage, LotArea
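These are continuous numeric values, so no encoding is needed. Note that LotFrontage has missing values (only 1201 of 1460 rows are non-null) that still need to be handled.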

### Street, Alley, LotShape, LandContour, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, HouseStyle
All need encoding

In [54]:
# Swap the Street strings for integer labels (the shared encoder is refitted
# on this column), then list the distinct labels.
trainData["Street"] = le.fit_transform(trainData["Street"])
set(trainData["Street"].tolist())

{0, 1}

In [72]:
# Distinct Alley values.
# NOTE(review): the captured output already contains 'Na', which only exists
# after the fillna cell that follows (execution counts 72 vs 71); on a
# top-to-bottom run this cell would show nan instead.
set(trainData["Alley"].tolist())

{'Grvl', 'Na', 'Pave'}

In [71]:
# Fill nan values with the 'Na' string so the column can be label-encoded.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# selected column is chained assignment and may act on a temporary object,
# leaving trainData unchanged (it does so under pandas Copy-on-Write).
trainData['Alley'] = trainData['Alley'].fillna('Na')

In [73]:
# Encode Alley as integer labels in a single fit-and-transform call, then show
# the distinct labels.
trainData["Alley"] = le.fit_transform(trainData["Alley"])
set(trainData["Alley"].tolist())

{0, 1, 2}

In [56]:
# Encode LotShape as integer labels, then show the distinct labels.
trainData["LotShape"] = le.fit_transform(trainData["LotShape"])
set(trainData["LotShape"].tolist())

{0, 1, 2, 3}

In [57]:
# Encode LandContour as integer labels, then show the distinct labels.
trainData["LandContour"] = le.fit_transform(trainData["LandContour"])
set(trainData["LandContour"].tolist())

{0, 1, 2, 3}

In [58]:
# Encode Utilities as integer labels, then show the distinct labels.
trainData["Utilities"] = le.fit_transform(trainData["Utilities"])
set(trainData["Utilities"].tolist())

{0, 1}

In [59]:
# Encode LotConfig as integer labels, then show the distinct labels.
trainData["LotConfig"] = le.fit_transform(trainData["LotConfig"])
set(trainData["LotConfig"].tolist())

{0, 1, 2, 3, 4}

In [66]:
# Encode LandSlope as integer labels, then show the distinct labels.
trainData["LandSlope"] = le.fit_transform(trainData["LandSlope"])
set(trainData["LandSlope"].tolist())

{0, 1, 2}

In [61]:
# Encode Neighborhood as integer labels, then show the distinct labels.
trainData["Neighborhood"] = le.fit_transform(trainData["Neighborhood"])
set(trainData["Neighborhood"].tolist())

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24}

In [62]:
# Encode Condition1 as integer labels, then show the distinct labels.
trainData["Condition1"] = le.fit_transform(trainData["Condition1"])
set(trainData["Condition1"].tolist())

{0, 1, 2, 3, 4, 5, 6, 7, 8}

In [63]:
# Encode Condition2 as integer labels, then show the distinct labels.
# The encoder is refitted per column, so Condition1 and Condition2 labels are
# assigned independently of each other.
trainData["Condition2"] = le.fit_transform(trainData["Condition2"])
set(trainData["Condition2"].tolist())

{0, 1, 2, 3, 4, 5, 6, 7}

In [64]:
# Encode BldgType as integer labels, then show the distinct labels.
trainData["BldgType"] = le.fit_transform(trainData["BldgType"])
set(trainData["BldgType"].tolist())

{0, 1, 2, 3, 4}

In [65]:
# Encode HouseStyle as integer labels, then show the distinct labels.
trainData["HouseStyle"] = le.fit_transform(trainData["HouseStyle"])
set(trainData["HouseStyle"].tolist())

{0, 1, 2, 3, 4, 5, 6, 7}

### OverallQual, OverallCond 
Ordinal ratings on a 1-10 scale (in the training data OverallCond only takes the values 1-9).

In [78]:
# Distinct OverallQual ratings present in the training data.
set(trainData["OverallQual"].tolist())

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}

In [80]:
# Distinct OverallCond ratings present in the training data.
set(trainData["OverallCond"].tolist())

{1, 2, 3, 4, 5, 6, 7, 8, 9}

### YearBuilt, YearRemodAdd  
Year value. There are gaps between years.  
There is a strict temporal relationship between values;  
I would not consider these categorical.  
YearRemodAdd == construction date if no remodel.

In [83]:
# Number of distinct construction years.
len(trainData["YearBuilt"].unique())

112

In [84]:
# Span in years between the oldest and the newest construction date.
int(trainData["YearBuilt"].max() - trainData["YearBuilt"].min())

138

In [93]:
# Summary statistics for the construction year.
trainData.loc[:, "YearBuilt"].describe()

count    1460.000000
mean     1971.267808
std        30.202904
min      1872.000000
25%      1954.000000
50%      1973.000000
75%      2000.000000
max      2010.000000
Name: YearBuilt, dtype: float64

In [94]:
# Number of distinct remodel years.
len(trainData["YearRemodAdd"].unique())

61

In [95]:
# Span in years between the earliest and the latest remodel date.
int(trainData["YearRemodAdd"].max() - trainData["YearRemodAdd"].min())

60

In [96]:
# Summary statistics for the remodel year.
trainData.loc[:, "YearRemodAdd"].describe()

count    1460.000000
mean     1984.865753
std        20.645407
min      1950.000000
25%      1967.000000
50%      1994.000000
75%      2004.000000
max      2010.000000
Name: YearRemodAdd, dtype: float64

### RoofStyle, RoofMatl, Exterior1st, Exterior2nd, MasVnrType
All need encoding

In [97]:
# Distinct RoofStyle categories before encoding.
set(trainData["RoofStyle"].tolist())

{'Flat', 'Gable', 'Gambrel', 'Hip', 'Mansard', 'Shed'}

In [98]:
# Encode RoofStyle as integer labels, then show the distinct labels.
trainData["RoofStyle"] = le.fit_transform(trainData["RoofStyle"])
set(trainData["RoofStyle"].tolist())

{0, 1, 2, 3, 4, 5}

In [99]:
# Distinct RoofMatl categories before encoding.
set(trainData["RoofMatl"].tolist())

{'ClyTile',
 'CompShg',
 'Membran',
 'Metal',
 'Roll',
 'Tar&Grv',
 'WdShake',
 'WdShngl'}

In [101]:
# Encode RoofMatl as integer labels, then show the distinct labels.
trainData["RoofMatl"] = le.fit_transform(trainData["RoofMatl"])
set(trainData["RoofMatl"].tolist())

{0, 1, 2, 3, 4, 5, 6, 7}

In [105]:
# Distinct Exterior1st categories before encoding.
set(trainData["Exterior1st"].tolist())

{'AsbShng',
 'AsphShn',
 'BrkComm',
 'BrkFace',
 'CBlock',
 'CemntBd',
 'HdBoard',
 'ImStucc',
 'MetalSd',
 'Plywood',
 'Stone',
 'Stucco',
 'VinylSd',
 'Wd Sdng',
 'WdShing'}

In [106]:
# Encode Exterior1st as integer labels, then show the distinct labels.
trainData["Exterior1st"] = le.fit_transform(trainData["Exterior1st"])
set(trainData["Exterior1st"].tolist())

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}

In [107]:
# Distinct Exterior2nd categories before encoding.
set(trainData["Exterior2nd"].tolist())

{'AsbShng',
 'AsphShn',
 'Brk Cmn',
 'BrkFace',
 'CBlock',
 'CmentBd',
 'HdBoard',
 'ImStucc',
 'MetalSd',
 'Other',
 'Plywood',
 'Stone',
 'Stucco',
 'VinylSd',
 'Wd Sdng',
 'Wd Shng'}

In [108]:
# Encode Exterior2nd as integer labels, then show the distinct labels.
# The encoder is refitted on this column, so its labels are assigned
# independently of the Exterior1st labels.
trainData["Exterior2nd"] = le.fit_transform(trainData["Exterior2nd"])
set(trainData["Exterior2nd"].tolist())

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}

In [109]:
# Distinct MasVnrType values before cleaning.
set(trainData["MasVnrType"].tolist())

{nan, 'BrkCmn', 'BrkFace', 'None', 'Stone'}

In [110]:
# Fill nan values with the 'Unknown' string so the column can be label-encoded.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# selected column is chained assignment and may act on a temporary object,
# leaving trainData unchanged (it does so under pandas Copy-on-Write).
trainData['MasVnrType'] = trainData['MasVnrType'].fillna('Unknown')

In [111]:
# Encode MasVnrType as integer labels, then show the distinct labels.
trainData["MasVnrType"] = le.fit_transform(trainData["MasVnrType"])
set(trainData["MasVnrType"].tolist())

{0, 1, 2, 3, 4}