In [1]:
import numpy as np 
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.linear_model import Ridge, Lasso
from sklearn import svm

import xgboost as xgb




In [2]:
# Read the train and test CSV files from the local ./data directory.
train_path = "./data/train.csv"
test_path = "./data/test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

Create a baseline using Ridge, Lasso, and XGBoost regressors trained on the numerical features only.

In [3]:
# Stack the train and test frames row-wise into one combined frame.
frames = [train, test]
df = pd.concat(frames, axis=0)

In [4]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,,3,1Fam,TA,No,706.0,0.0,...,WD,0,Pave,8,856.0,AllPub,0,2003,2003,2008
1,1262,0,0,,3,1Fam,TA,Gd,978.0,0.0,...,WD,0,Pave,6,1262.0,AllPub,298,1976,1976,2007
2,920,866,0,,3,1Fam,TA,Mn,486.0,0.0,...,WD,0,Pave,6,920.0,AllPub,0,2001,2002,2008
3,961,756,0,,3,1Fam,Gd,No,216.0,0.0,...,WD,0,Pave,7,756.0,AllPub,0,1915,1970,2006
4,1145,1053,0,,4,1Fam,TA,Av,655.0,0.0,...,WD,0,Pave,9,1145.0,AllPub,192,2000,2000,2008


In [5]:
# Flag every column that holds at least one missing value, then count the flags.
has_nan = df.isnull().any()
col_nan = int(has_nan.sum())
print("Number of columns with NaN: {}/{}".format(col_nan, df.shape[1]))

Number of columns with NaN: 35/81


In [6]:
# Keep rows where SalePrice value is not null
keep_rows = df['SalePrice'].notnull()
# Split into inputs and labels before removing Nans naively as
# SalePrice has Nans as well.
# The label is selected (not popped) and the inputs are built with drop(),
# so `df` itself is left untouched: the cell can be re-run without a
# KeyError and `df` still matches what earlier cells displayed.
y = df.loc[keep_rows, ['SalePrice']]  # one-column DataFrame
X = df.drop('SalePrice', axis=1)
# Keep only columns where there are no Nan values. The null check runs over
# every row of the combined frame before the rows are filtered by keep_rows.
X = X.loc[keep_rows, X.columns[X.notnull().all()]]

In [7]:
# Confirm dimensions make sense: print the shape of the inputs, then the labels.
for frame in (X, y):
    print(frame.shape)

(1460, 46)
(1460, 1)


In [8]:
# Collect the names of all columns whose dtype is not 'object',
# preserving the column order of X.
numeric = [name for name in X.columns if X[name].dtype != 'object']

# Report each retained column together with its dtype.
for feature in numeric:
    print("{}: {}".format(feature, X[feature].dtype))

1stFlrSF: int64
2ndFlrSF: int64
3SsnPorch: int64
BedroomAbvGr: int64
EnclosedPorch: int64
Fireplaces: int64
FullBath: int64
GrLivArea: int64
HalfBath: int64
Id: int64
KitchenAbvGr: int64
LotArea: int64
LowQualFinSF: int64
MSSubClass: int64
MiscVal: int64
MoSold: int64
OpenPorchSF: int64
OverallCond: int64
OverallQual: int64
PoolArea: int64
ScreenPorch: int64
TotRmsAbvGrd: int64
WoodDeckSF: int64
YearBuilt: int64
YearRemodAdd: int64
YrSold: int64


In [9]:
# Naively restrict the inputs to the non-object columns gathered in `numeric`.
# NOTE(review): `Id` is one of those columns and looks like a row identifier
# rather than a property of the house - confirm it is meant to be a feature.
X = X.loc[:, numeric]

In [10]:
# Preview the first five rows of the numeric-only inputs.
X.head(n=5)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,EnclosedPorch,Fireplaces,FullBath,GrLivArea,HalfBath,Id,...,OpenPorchSF,OverallCond,OverallQual,PoolArea,ScreenPorch,TotRmsAbvGrd,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,3,0,0,2,1710,1,1,...,61,5,7,0,0,8,0,2003,2003,2008
1,1262,0,0,3,0,1,2,1262,0,2,...,0,8,6,0,0,6,298,1976,1976,2007
2,920,866,0,3,0,1,2,1786,1,3,...,42,5,7,0,0,6,0,2001,2002,2008
3,961,756,0,3,272,1,1,1717,0,4,...,35,5,7,0,0,7,0,1915,1970,2006
4,1145,1053,0,4,0,1,2,2198,1,5,...,84,5,8,0,0,9,192,2000,2000,2008


In [12]:
# Construct pipelines. Each one standardises the inputs ('scl'), applies
# PCA ('pca'), and ends with the regressor under the 'regr' step name.
pipe_ridge = Pipeline([('scl', StandardScaler()),
            ('pca', PCA()),
            ('regr', Ridge())])

pipe_lasso = Pipeline([('scl', StandardScaler()),
            ('pca', PCA()),
            ('regr', Lasso())])

pipe_xgb = Pipeline([('scl', StandardScaler()),
            ('pca', PCA()),
            ('regr', xgb.XGBRegressor())])

# Parameters to use for grid search. Keys follow the Pipeline convention
# '<step name>__<parameter name>'.
param_ridge = {
    'pca__n_components': (2,3,4),
    'regr__alpha': np.logspace(0,10,10) # alpha >= 0
}

param_lasso = {
    'pca__n_components': (2,3,4),
    'regr__alpha': np.logspace(-10,5,10)
}

# Unlike the two linear models, this grid has no 'pca__n_components' entry,
# so the PCA step runs with its default setting for XGB.
# The ranges are materialised as lists so the grid values are plain
# sequences on both Python 2 and Python 3.
param_xgb = {
 'regr__max_depth': list(range(3,10,2)),
 'regr__min_child_weight': list(range(1,6,2))
}

# Create dictionary for easy referencing:
# key -> (display name, pipeline, parameter grid)
pipe_dict = {
    "ridge": ('Ridge', pipe_ridge, param_ridge),
    "lasso": ('Lasso', pipe_lasso, param_lasso),
    "xgb": ('XGB', pipe_xgb, param_xgb)
}

# For cross validation: 5 random 80/20 splits with a fixed seed so every
# model is evaluated on the same splits.
regressor = ["ridge", "lasso", "xgb"]
cv =  ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Track the winner across models:
#   best_pipe - the pipeline object that was handed to the winning search
#   best_regr - the refit best estimator from the winning search
#   best      - the winning cross-validated score
best_pipe = best_regr = best = None

for regr in regressor:
    model, pipe, param_grid = pipe_dict[regr]
    # No `scoring` is passed, so the search uses the estimator's own score
    # method (presumably R^2 for these regressors - see scikit-learn docs).
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv)
    grid.fit(X, y)

    print("{}: The best parameters are {} with a score of {:0.6f}".format(
          model, grid.best_params_, grid.best_score_)
    )
    # `best` starts out as None. Comparing a float against None raises
    # TypeError on Python 3, so the first model is accepted explicitly
    # instead of relying on Python 2's None-sorts-lowest ordering.
    if best is None or grid.best_score_ > best:
        best_pipe = pipe
        best_regr = grid.best_estimator_
        best = grid.best_score_

Ridge: The best parameters are {'regr__alpha': 1.0, 'pca__n_components': 4} with a score of 0.753937




Lasso: The best parameters are {'regr__alpha': 0.00046415888336127822, 'pca__n_components': 4} with a score of 0.754328
XGB: The best parameters are {'regr__max_depth': 3, 'regr__min_child_weight': 3} with a score of 0.833328
