<a href="https://colab.research.google.com/github/timchoh585/kaggle-data/blob/master/Housing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Base Imports

In [0]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

Load the train and test data and build a combined table to make data clean-up easier. The same mappings will be applied to each DataFrame independently of the combined table; the combined table is used only to look for correlations across various statistical tests.

In [12]:
# Train/test CSVs hosted in the kaggle-data GitHub repository.
train_url = 'https://raw.githubusercontent.com/timchoh585/kaggle-data/master/house%20prices/train.csv'
test_url = 'https://raw.githubusercontent.com/timchoh585/kaggle-data/master/house%20prices/test.csv'

train_df = pd.read_csv(train_url)
test_df = pd.read_csv(test_url)

# Stack train and test for joint inspection. ignore_index=True gives the
# combined frame a fresh, unique 0..n-1 index instead of carrying over each
# file's own row labels, so label-based selection on `combine` is unambiguous.
combine = pd.concat([train_df, test_df], ignore_index=True)

print(train_df.columns.values)

# Only the last expression of a cell is rendered automatically, so the
# previews have to be displayed explicitly or their results are discarded.
# `display` is provided by the IPython kernel this notebook runs in.
display(train_df.head())
display(train_df.tail())

# Column dtypes and non-null counts for each file, then summary statistics
# of the combined frame as the cell's rendered output.
train_df.info()
print('_'*40)
test_df.info()
print('_'*40)
combine.describe()

['Id' 'MSSubClass' 'MSZoning' 'LotFrontage' 'LotArea' 'Street' 'Alley'
 'LotShape' 'LandContour' 'Utilities' 'LotConfig' 'LandSlope'
 'Neighborhood' 'Condition1' 'Condition2' 'BldgType' 'HouseStyle'
 'OverallQual' 'OverallCond' 'YearBuilt' 'YearRemodAdd' 'RoofStyle'
 'RoofMatl' 'Exterior1st' 'Exterior2nd' 'MasVnrType' 'MasVnrArea'
 'ExterQual' 'ExterCond' 'Foundation' 'BsmtQual' 'BsmtCond' 'BsmtExposure'
 'BsmtFinType1' 'BsmtFinSF1' 'BsmtFinType2' 'BsmtFinSF2' 'BsmtUnfSF'
 'TotalBsmtSF' 'Heating' 'HeatingQC' 'CentralAir' 'Electrical' '1stFlrSF'
 '2ndFlrSF' 'LowQualFinSF' 'GrLivArea' 'BsmtFullBath' 'BsmtHalfBath'
 'FullBath' 'HalfBath' 'BedroomAbvGr' 'KitchenAbvGr' 'KitchenQual'
 'TotRmsAbvGrd' 'Functional' 'Fireplaces' 'FireplaceQu' 'GarageType'
 'GarageYrBlt' 'GarageFinish' 'GarageCars' 'GarageArea' 'GarageQual'
 'GarageCond' 'PavedDrive' 'WoodDeckSF' 'OpenPorchSF' 'EnclosedPorch'
 '3SsnPorch' 'ScreenPorch' 'PoolArea' 'PoolQC' 'Fence' 'MiscFeature'
 'MiscVal' 'MoSold' 'YrSold' 'SaleTy

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,...,OverallQual,PoolArea,SalePrice,ScreenPorch,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
count,2919.0,2919.0,2919.0,2919.0,2918.0,2918.0,2917.0,2917.0,2918.0,2919.0,...,2919.0,2919.0,1460.0,2919.0,2919.0,2918.0,2919.0,2919.0,2919.0,2919.0
mean,1159.581706,336.483727,2.602261,2.860226,441.423235,49.582248,0.429894,0.061364,560.772104,23.098321,...,6.089072,2.251799,180921.19589,16.06235,6.451524,1051.777587,93.709832,1971.312778,1984.264474,2007.792737
std,392.362079,428.701456,25.188169,0.822693,455.610826,169.205611,0.524736,0.245687,439.543659,64.244246,...,1.409947,35.663946,79442.502883,56.184365,1.569379,440.766258,126.526589,30.291442,20.894344,1.314964
min,334.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,34900.0,0.0,2.0,0.0,0.0,1872.0,1950.0,2006.0
25%,876.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,220.0,0.0,...,5.0,0.0,129975.0,0.0,5.0,793.0,0.0,1953.5,1965.0,2007.0
50%,1082.0,0.0,0.0,3.0,368.5,0.0,0.0,0.0,467.0,0.0,...,6.0,0.0,163000.0,0.0,6.0,989.5,0.0,1973.0,1993.0,2008.0
75%,1387.5,704.0,0.0,3.0,733.0,0.0,1.0,0.0,805.5,0.0,...,7.0,0.0,214000.0,0.0,7.0,1302.0,168.0,2001.0,2004.0,2009.0
max,5095.0,2065.0,508.0,8.0,5644.0,1526.0,3.0,2.0,2336.0,1012.0,...,10.0,800.0,755000.0,576.0,15.0,6110.0,1424.0,2010.0,2010.0,2010.0


check for null values

In [1]:
# Boolean mask per column: True where the column holds at least one missing value.
has_missing = combine.isnull().any(axis=0)
# Names of the columns flagged above, as a plain list.
combine.columns[has_missing].tolist()

NameError: ignored

create integer mappings for all NULL columns first

In [0]:
# Integer codes, presumably for the 'Alley' column; "NA" is coded as 0.
# NOTE(review): pd.read_csv treats the string "NA" as missing by default, so
# the loaded frames presumably hold NaN rather than "NA" here — confirm and
# fill/convert those values before applying this map.
alley_map = dict(Grvl=1, Pave=2, NA=0)

Show a correlation map to see which features are "most" important

In [0]:
# Pairwise correlation of the numeric columns only. Selecting them explicitly
# keeps this cell working on pandas >= 2.0, where DataFrame.corr() no longer
# drops non-numeric (string) columns silently and raises instead.
corr = combine.select_dtypes(include='number').corr()

# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0.
# A '{:.2g}' format string renders two significant digits on every version.
corr.style.background_gradient(cmap='coolwarm').format('{:.2g}')

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtFinSF1,BsmtFinSF2,BsmtFullBath,BsmtHalfBath,BsmtUnfSF,EnclosedPorch,Fireplaces,FullBath,GarageArea,GarageCars,GarageYrBlt,GrLivArea,HalfBath,Id,KitchenAbvGr,LotArea,LotFrontage,LowQualFinSF,MSSubClass,MasVnrArea,MiscVal,MoSold,OpenPorchSF,OverallCond,OverallQual,PoolArea,SalePrice,ScreenPorch,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
1stFlrSF,1.0,-0.25,0.044,0.11,0.46,0.084,0.26,0.011,0.3,-0.066,0.41,0.37,0.49,0.44,0.26,0.56,-0.1,-0.0087,0.076,0.33,0.46,-0.013,-0.25,0.4,0.093,0.04,0.24,-0.16,0.48,0.12,0.61,0.098,0.39,0.8,0.23,0.31,0.24,-0.013
2ndFlrSF,-0.25,1.0,-0.032,0.5,-0.16,-0.098,-0.16,-0.06,-0.00038,0.055,0.17,0.4,0.13,0.18,0.086,0.66,0.61,-0.022,0.069,0.032,0.027,0.018,0.31,0.12,-0.0053,0.014,0.19,0.0055,0.25,0.045,0.32,0.011,0.58,-0.21,0.09,0.018,0.16,-0.019
3SsnPorch,0.044,-0.032,1.0,-0.048,0.051,-0.023,0.027,0.027,-0.0058,-0.033,0.019,0.015,0.029,0.023,0.021,0.0063,-0.023,-0.047,-0.021,0.016,0.028,-0.0045,-0.038,0.014,-0.00079,0.027,-0.0094,0.044,0.019,-0.0065,0.045,-0.03,-0.026,0.038,-0.0039,0.016,0.037,0.023
BedroomAbvGr,0.11,0.5,-0.048,1.0,-0.11,-0.031,-0.16,0.019,0.18,0.05,0.087,0.36,0.074,0.093,-0.045,0.52,0.25,0.0031,0.24,0.13,0.23,0.07,-0.0088,0.078,0.00025,0.056,0.086,-0.0085,0.073,0.037,0.17,0.0073,0.67,0.053,0.032,-0.053,-0.022,-0.02
BsmtFinSF1,0.46,-0.16,0.051,-0.11,1.0,-0.055,0.64,0.078,-0.48,-0.1,0.29,0.082,0.31,0.26,0.19,0.21,-0.0073,-0.017,-0.086,0.19,0.22,-0.066,-0.064,0.3,0.093,-0.00094,0.12,-0.05,0.28,0.084,0.39,0.097,0.052,0.54,0.22,0.28,0.15,0.023
BsmtFinSF2,0.084,-0.098,-0.023,-0.031,-0.055,1.0,0.16,0.099,-0.24,0.033,0.066,-0.075,0.0031,-0.015,-0.069,-0.018,-0.032,0.018,-0.038,0.084,0.047,-0.0049,-0.073,-0.016,-0.0051,-0.0096,-0.0059,0.042,-0.043,0.045,-0.011,0.063,-0.048,0.089,0.098,-0.028,-0.062,0.0089
BsmtFullBath,0.26,-0.16,0.027,-0.16,0.64,0.16,1.0,-0.15,-0.4,-0.068,0.17,-0.019,0.18,0.16,0.15,0.061,-0.033,0.00015,-0.018,0.13,0.11,-0.047,0.0099,0.14,-0.0046,-0.0036,0.081,-0.042,0.16,0.044,0.23,0.053,-0.039,0.33,0.19,0.21,0.13,0.045
BsmtHalfBath,0.011,-0.06,0.027,0.019,0.078,0.099,-0.15,1.0,-0.11,-0.0097,0.039,-0.047,-0.021,-0.033,-0.058,-0.044,-0.058,0.01,-0.065,0.026,-0.026,-0.013,-0.0019,0.015,0.037,0.023,-0.035,0.084,-0.041,0.067,-0.017,0.042,-0.05,0.012,0.052,-0.03,-0.046,-0.02
BsmtUnfSF,0.3,-0.00038,-0.0058,0.18,-0.48,-0.24,-0.4,-0.11,1.0,0.005,0.0048,0.27,0.16,0.18,0.17,0.23,-0.036,-0.014,0.065,0.021,0.11,0.047,-0.13,0.09,-0.01,0.023,0.12,-0.14,0.28,-0.032,0.21,-0.049,0.25,0.41,-0.039,0.13,0.17,-0.038
EnclosedPorch,-0.066,0.055,-0.033,0.05,-0.1,0.033,-0.068,-0.0097,0.005,1.0,0.00097,-0.12,-0.11,-0.13,-0.3,0.0033,-0.082,0.022,0.028,0.021,0.012,0.087,-0.021,-0.11,0.0087,-0.021,-0.06,0.071,-0.14,0.093,-0.13,-0.064,0.015,-0.086,-0.12,-0.37,-0.22,-0.0011


Need to create mappings from this data for all remaining values that were not covered by the NULL pass.