In [172]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.linear_model import Lasso

I will be going through the Housing dataset and utilizing Pipelines to clean and prep the data for modeling.

Choices for filling null values:
After looking through the data the values that were NaN looked to be that the house did not have that feature.  For this reason I filled the NaN values in the LotFrontage with 0 and then for the object columns with Nan values I filled those in with the string 'NA'

For the year columns instead of treating them as a continuous variable I turned them into a category of Decade and added it to the object columns to be dummied.

One problem I had when making dummy columns was that some categories do not show up in the train data that are present in the test data, and vice versa.  To work around this I concatinated the train and test data together excluding the Saleprice.

In [173]:
df = pd.read_csv('train.csv', index_col='Id')
test = pd.read_csv('test.csv', index_col='Id')


In [None]:
df = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))

In [230]:
def objectnan_to_NA(df):
    for i in range(len(df.columns)):
        if df.dtypes[i] == 'object':
            df[df.columns[i]].fillna('NA',inplace=True)
    return df
objectnan_to_NA_tf = FunctionTransformer(objectnan_to_NA, validate=False)

def areanan_to_zero(df):
    for i in range(len(df.columns)):
        if df.dtypes[i] == 'float64' or df.dtypes[i] == 'int64':
            df[df.columns[i]].fillna(0, inplace=True)
    return df
areanan_to_zero_tf = FunctionTransformer(areanan_to_zero, validate=False)

def float_to_int(df):
    for i in range(len(df.columns)):
        if df.dtypes[i] == 'float64':
            df[df.columns[i]] = df[df.columns[i]].astype(int)
    return df
float_to_int_tf = FunctionTransformer(float_to_int, validate=False)

def decade(year):
    return str(year)[0:3] + '0'

def years_to_decades(df):
    years = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']
    for column in years:
        df[column] = df[column].apply(decade)
    return df
years_to_decades_tf = FunctionTransformer(years_to_decades, validate=False)

def cat_dums(df):
    for c in df.columns:
        if c not in areas and c != 'SalePrice':
            c_cats = np.union1d(df[c].values, df[c].values)
            df[c] = pd.Categorical(df[c], categories=c_cats)
    return df
cat_dums_tf = FunctionTransformer(cat_dums, validate=False)
def dummies(df):
    areas = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF',
         'LowQualFinSF', 'GrLivArea','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','PoolArea','MiscVal', 'ScreenPorch']
    return pd.get_dummies(df, columns=[cols for cols in df.columns if (cols not in areas and cols != 'SalePrice')])

dummies_tf = FunctionTransformer(dummies, validate=False)
years = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']
areas = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF',
         'LowQualFinSF', 'GrLivArea','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','PoolArea','MiscVal', 'ScreenPorch']
def clean_features(df):
    return df[areas]
clean_features_tf = FunctionTransformer(clean_features, validate=False)

def test_base_features(df):
    return df[train.shape[0]:]
test_base_features_tf = FunctionTransformer(test_base_features, validate=False)


In [238]:
def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, df, df['SalePrice'], scoring="neg_mean_squared_error"))
    return(rmse)

In [232]:
dummy_pipe = Pipeline([
    ('objectnan_to_NA', objectnan_to_NA_tf),
    ('years_to_decades', years_to_decades_tf),
    ('cat_dums', cat_dums_tf),
    ('dummies',dummies_tf)
])

In [234]:
dummy_pipe.fit_transform(df).dtypes

LotFrontage              int32
LotArea                  int64
MasVnrArea               int32
BsmtFinSF1               int64
BsmtFinSF2               int64
BsmtUnfSF                int64
TotalBsmtSF              int64
1stFlrSF                 int64
2ndFlrSF                 int64
LowQualFinSF             int64
GrLivArea                int64
GarageArea               int64
WoodDeckSF               int64
OpenPorchSF              int64
EnclosedPorch            int64
3SsnPorch                int64
ScreenPorch              int64
PoolArea                 int64
MiscVal                  int64
SalePrice                int64
MSSubClass_20            uint8
MSSubClass_30            uint8
MSSubClass_40            uint8
MSSubClass_45            uint8
MSSubClass_50            uint8
MSSubClass_60            uint8
MSSubClass_70            uint8
MSSubClass_75            uint8
MSSubClass_80            uint8
MSSubClass_85            uint8
                         ...  
MiscFeature_TenC         uint8
MoSold_1

In [224]:
area_pipe = Pipeline([
    ('areanan_to_zero', areanan_to_zero_tf),
    ('float_to_int', float_to_int_tf),
    ('clean_features', clean_features_tf)
])

In [214]:
clean_pipe.fit_transform(df).dtypes

LotFrontage      int32
LotArea          int64
MasVnrArea       int32
BsmtFinSF1       int64
BsmtFinSF2       int64
BsmtUnfSF        int64
TotalBsmtSF      int64
1stFlrSF         int64
2ndFlrSF         int64
LowQualFinSF     int64
GrLivArea        int64
GarageArea       int64
WoodDeckSF       int64
OpenPorchSF      int64
EnclosedPorch    int64
3SsnPorch        int64
PoolArea         int64
MiscVal          int64
ScreenPorch      int64
dtype: object

In [215]:
features = FeatureUnion([
    ('clean_pipe',clean_pipe),
    ('dummies', dummies_tf)
])

In [216]:
features.fit_transform(df)

array([[   65,  8450,   196, ...,     0,     1,     0],
       [   80,  9600,     0, ...,     0,     1,     0],
       [   68, 11250,   162, ...,     0,     1,     0],
       ..., 
       [   66,  9042,     0, ...,     0,     1,     0],
       [   68,  9717,     0, ...,     0,     1,     0],
       [   75,  9937,     0, ...,     0,     1,     0]], dtype=int64)

In [235]:
pipe = Pipeline([
    ('dummy_pipe',dummy_pipe),
    ('ss', StandardScaler()),
    ('lasso', Lasso())    
])

In [236]:
pipe.fit(df,df['SalePrice'])

Pipeline(steps=[('dummy_pipe', Pipeline(steps=[('objectnan_to_NA', FunctionTransformer(accept_sparse=False,
          func=<function objectnan_to_NA at 0x000000000D393B38>,
          inv_kw_args=None, inverse_func=None, kw_args=None, pass_y=False,
          validate=False)), ('years_to_decades', FunctionTran...e=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [239]:
rmse_cv(pipe).mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


ValueError: operands could not be broadcast together with shapes (487,384) (408,) (487,384) 

In [220]:
params = {
    'lasso__alpha': np.linspace(0,5,10)
}

In [221]:
kf = KFold(n_splits=5, shuffle=True, random_state=2008)

In [222]:
gs = GridSearchCV(pipe, param_grid = params, scoring='r2', cv=kf)

In [223]:

gs.fit(df,df['SalePrice'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


ValueError: operands could not be broadcast together with shapes (292,379) (431,) (292,379) 

In [114]:
gs.best_score_

0.72059794630882845

In [58]:
354 / 146.

2.4246575342465753