In [1]:
%matplotlib inline

import os, sys, csv
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style="ticks", color_codes=True)

from xgboost import XGBClassifier, XGBRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from sklearn.mixture import GaussianMixture
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from sklearn.linear_model import LinearRegression, ElasticNetCV, LassoCV
#from sklearn import ensemble, tree, linear_model



In [2]:
base_path = '../data/house-prices-advanced-regression-techniques'
!ls {base_path}

data_description.txt   sample_submission.csv.gz  test.csv.gz  train.csv.gz
sample_submission.csv  test.csv			 train.csv


In [3]:
train = pd.read_csv(os.path.join(base_path, 'train.csv'))
test = pd.read_csv(os.path.join(base_path, 'test.csv'))
#label = pd.read_csv(os.path.join(base_path, 'trainLabels.csv'), header=None)
print(train.shape)
#print(label.shape)
print(test.shape)

(1460, 81)
(1459, 80)


In [4]:
train.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [None]:
features.isnull().sum().sort_values(ascending=False)[0:20]

## feature engineering

In [5]:
full_data = [train, test]

In [6]:
for ii, dataset in enumerate(full_data):
    dataset = dataset.drop(['YearRemodAdd','YearBuilt','Id'], axis=1, errors='ignore')
    full_data[ii] = dataset

In [7]:
for ii, dataset in enumerate(full_data):
    dataset= dataset.drop(['LotFrontage', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1, errors='ignore')
    full_data[ii] = dataset

In [8]:
for ii, dataset in enumerate(full_data):
    dataset[['GarageFinish','GarageQual','GarageType','GarageYrBlt','GarageCond']] = \
        dataset[['GarageFinish','GarageQual','GarageType','GarageYrBlt','GarageCond']].fillna('NoGarage')

    dataset[['GarageFinish','GarageQual','GarageType','GarageYrBlt','GarageCond']] = \
        dataset[['GarageFinish','GarageQual','GarageType','GarageYrBlt','GarageCond']].fillna('NoGarage')

    dataset[['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']] = \
        dataset[['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']].fillna('NoBSMT') 
    
    dataset[['MSSubClass','MoSold','YrSold']] = dataset[['MSSubClass','MoSold','YrSold']].astype(str)
    
    for _col in list(dataset.columns[dataset.isnull().sum() > 0]):
        dataset[_col] = dataset[_col].fillna(dataset[_col].mode()[0])
    full_data[ii] = dataset

In [9]:
for dataset in full_data:
    # Check any number of columns with NaN
    print(dataset.isnull().any().sum(), ' / ', len(dataset.columns))
    # Check any number of data points with NaN
    print(dataset.isnull().any(axis=1).sum(), ' / ', len(test))

0  /  72
0  /  1459
0  /  71
0  /  1459


In [13]:
full_data[1].head(1)

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,11622,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,0,0,0,120,0,0,6,2010,WD,Normal


In [16]:
features_dm_all = []
for dataset in full_data:
    features_dm = pd.get_dummies(dataset[['MSZoning','Street','LotConfig']],
                   prefix=['MSZoning','Street','LotConfig'])
    features_dm_all.append(features_dm)

In [19]:
for ii, dataset in enumerate(full_data):
    dataset['LotShape'] = dataset['LotShape'].map({'IR3':0, 'IR2':1, 'IR1':2, 'Reg':3})
    dataset['LandSlope'] = dataset['LandSlope'].map({'Sev':0, 'Mod':1, 'Gtl':2})
    dataset['MasVnrType'] = dataset['MasVnrType'].map({'None':0, 'BrkCmn':0, 'BrkFace':1, 'Stone':2})
    dataset['ExterQual'] = dataset['ExterQual'].map({'Ex':0, 'Gd':1, 'TA':2, 'Fa':3, 'Po':4, 'NoBSMT': 5})
    dataset['ExterCond'] = dataset['ExterCond'].map({'Ex':0, 'Gd':1, 'TA':2, 'Fa':3, 'Po':4, 'NoBSMT': 5})
    dataset['BsmtQual'] = dataset['BsmtQual'].map({'Ex':0, 'Gd':1, 'TA':2, 'Fa':3, 'Po':4, 'NoBSMT': 5})
    dataset['BsmtCond'] = dataset['BsmtCond'].map({'Ex':0, 'Gd':1, 'TA':2, 'Fa':3, 'Po':4, 'NoBSMT': 5})
    dataset['BsmtExposure'] = dataset['BsmtExposure'].map({'NoBSMT':0, 'No':1, 'Mn':2, 'Av':3, 'Gd':4})
    dataset['BsmtFinType1'] = dataset['BsmtFinType1'].map({'NoBSMT':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6})
    dataset['BsmtFinType2'] = dataset['BsmtFinType2'].map({'NoBSMT':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6})
    dataset['HeatingQC'] = dataset['HeatingQC'].map({'Ex':0, 'Gd':1, 'TA':2, 'Fa':3, 'Po':4, 'NA': 5})
    dataset['CentralAir'] = dataset['CentralAir'].map({'N':0, 'Y':1})
    dataset['KitchenQual'] = dataset['KitchenQual'].map({'Ex':0, 'Gd':1, 'TA':2, 'Fa':3, 'Po':4, 'NA': 5})
    dataset['Functional'] = dataset['Functional'].map({'Sal':0, 'Sev':1, 'Maj2':2, 'Maj1':3, 'Mod':4, 'Min2':5, 'Min1':6, 'Typ':7})
    dataset['GarageFinish'] = dataset['GarageFinish'].map({'NoGarage':0, 'Unf':1, 'RFn':2, 'Fin':3})
    dataset['GarageQual'] = dataset['GarageQual'].map({'Ex':0, 'Gd':1, 'TA':2, 'Fa':3, 'Po':4, 'NoGarage': 5})
    dataset['GarageCond'] = dataset['GarageCond'].map({'Ex':0, 'Gd':1, 'TA':2, 'Fa':3, 'Po':4, 'NoGarage': 5})
    dataset['PavedDrive'] = dataset['PavedDrive'].map({'N':0, 'P':1, 'Y':2})
    
    full_data[ii] = dataset

In [20]:
ii = 0
for dataset, features_dm in zip(full_data, features_dm_all):
    dataset = pd.concat([dataset, features_dm], axis=1)
    full_data[ii] = dataset
    ii += 1

In [24]:
# 'OverallQual','GrLivArea','GarageCars'
for ii, dataset in enumerate(full_data):
    dataset['OverallQual_sqt'] = dataset['OverallQual'] ** 2
    dataset['GrLivArea_sqt'] = dataset['GrLivArea'] ** 2
    dataset['GarageCars_sqt'] = dataset['GarageCars'] ** 2
    
    full_data[ii] = dataset

In [25]:
for ii, dataset in enumerate(full_data):
    dataset = dataset.drop(dataset.select_dtypes(include=['O']).columns.tolist(), axis=1, errors='ignore')
    full_data[ii] = dataset

In [46]:
train = full_data[0]
test = full_data[1]

In [None]:
colormap = plt.cm.RdBu
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(train.astype(float).corr(),linewidths=0.1,vmax=1.0, 
            square=True, cmap=colormap, linecolor='white', annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x7efeb061b710>

In [27]:
# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.cross_validation import KFold

In [49]:
ntest


1459

In [47]:
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
    

In [50]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [51]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [52]:
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

In [53]:
y_train = train['SalePrice'].ravel()
train = train.drop(['SalePrice'], axis=1)
x_train = train.values # Creates an array of the train data
x_test = test.values # Creats an array of the test data

In [54]:
y_train.shape

(1460,)

In [None]:
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
#rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier

print("Training is complete")

In [None]:
#rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
gb_feature = gb.feature_importances(x_train,y_train)

In [None]:
#rf_features = [0.10474135,  0.21837029,  0.04432652,  0.02249159,  0.05432591,  0.02854371
#  ,0.07570305,  0.01088129 , 0.24247496,  0.13685733 , 0.06128402]
et_features = [ 0.12165657,  0.37098307  ,0.03129623 , 0.01591611 , 0.05525811 , 0.028157
  ,0.04589793 , 0.02030357 , 0.17289562 , 0.04853517,  0.08910063]
ada_features = [0.028 ,   0.008  ,      0.012   ,     0.05866667,   0.032 ,       0.008
  ,0.04666667 ,  0.     ,      0.05733333,   0.73866667,   0.01066667]
gb_features = [ 0.06796144 , 0.03889349 , 0.07237845 , 0.02628645 , 0.11194395,  0.04778854
  ,0.05965792 , 0.02774745,  0.07462718,  0.4593142 ,  0.01340093]


In [None]:
cols = train.columns.values
# Create a dataframe with features
feature_dataframe = pd.DataFrame( {'features': cols,
     'Random Forest feature importances': rf_features,
     'Extra Trees  feature importances': et_features,
      'AdaBoost feature importances': ada_features,
    'Gradient Boost feature importances': gb_features
    })

In [None]:
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.head()