In [192]:
# Frequently used libraries
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# ML
import sklearn
#import xgboost as xgb

#
from scipy.stats import skew

#
from sklearn.preprocessing import LabelBinarizer

In [193]:
# Read the train and test CSV files
train = pd.read_csv('../input/house/train.csv')
test = pd.read_csv('../input/house/test.csv')

# Drop training rows with a large living area but a low sale price ...
big_but_cheap = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
train = train.drop(train.index[big_but_cheap.values])

# ... and then rows with a sale price above 700000
too_expensive = train['SalePrice'] > 700000
train = train.drop(train.index[too_expensive.values])

train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [194]:
from sklearn.base import BaseEstimator, TransformerMixin

class DFSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the configured columns of a DataFrame.

    Parameters
    ----------
    attribute_names : column label(s) used to index the incoming frame.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; the selector itself is returned."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column label(s)."""
        wanted = self.attribute_names
        return X[wanted]
    
class SkewTransform(BaseEstimator, TransformerMixin):
    """Apply ``np.log1p`` to the columns whose sample skewness is high.

    Parameters
    ----------
    num_skew : bool, default True
        When False the transformer passes its input through untouched.
    cat_encode : bool, default True
        Not used by this transformer. It is stored only so that every
        constructor argument is available to ``get_params`` / ``clone``.
    threshold : float, default 0.75
        Columns whose skewness (NaNs dropped) is strictly greater than this
        value are log-transformed.

    Attributes
    ----------
    skewed_feats_ : pandas.Index
        Labels of the columns selected during ``fit``.
    """

    def __init__(self, num_skew=True, cat_encode=True, threshold=0.75):
        self.num_skew = num_skew
        self.cat_encode = cat_encode
        self.threshold = threshold

    def _skewed_columns(self, X):
        """Return the labels of the columns of ``X`` above the skew threshold."""
        skewness = X.apply(lambda col: skew(col.dropna()))
        return skewness[skewness > self.threshold].index

    def fit(self, X, y=None):
        """Learn which columns are skewed so that later calls to
        ``transform`` treat every data set the same way."""
        if self.num_skew:
            self.skewed_feats_ = self._skewed_columns(X)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the skewed columns replaced by log1p."""
        if not self.num_skew:
            return X
        cols = getattr(self, 'skewed_feats_', None)
        if cols is None:
            # Not fitted: fall back to the historical behaviour of deriving
            # the skewed columns from the frame being transformed.
            cols = self._skewed_columns(X)
        # Work on a copy so the caller's frame is never modified in place.
        out = X.copy()
        if len(cols):
            out[cols] = np.log1p(out[cols])
        return out
class DFImputer(BaseEstimator, TransformerMixin):
    """Impute missing values column by column.

    Columns of dtype object are imputed with the most frequent value
    in the column.

    Columns of other types are imputed with the median of the column.

    Attributes
    ----------
    fill : pandas.Series
        Fill value per column, learned in ``fit`` and indexed by column label.
    """

    def __init__(self):
        pass

    @staticmethod
    def _fill_value(col):
        """Return the replacement value for a single column."""
        if col.dtype == np.dtype('O'):
            counts = col.value_counts()
            # An all-missing column has no most-frequent value; leave it as NaN
            # instead of failing with an IndexError.
            return counts.index[0] if len(counts) else np.nan
        return col.median()

    def fit(self, X, y=None):
        """Compute the fill value of every column of ``X``."""
        self.fill = pd.Series([self._fill_value(X[c]) for c in X],
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries replaced by the learned values."""
        return X.fillna(self.fill)
    
class CateEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode a DataFrame with ``pd.get_dummies``.

    The dummy column layout seen during ``fit`` is remembered so that
    ``transform`` returns the same columns for any later data set:
    categories missing from the new data become all-zero columns and
    categories unseen during ``fit`` are dropped.

    Attributes
    ----------
    columns_ : pandas.Index
        Column labels produced by ``pd.get_dummies`` on the fit data.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Record the dummy columns generated from ``X``."""
        self.columns_ = pd.get_dummies(X).columns
        return self

    def transform(self, X, y=None):
        """Return the dummy-encoded frame, aligned to the fitted columns."""
        cate_encoded = pd.get_dummies(X)
        columns = getattr(self, 'columns_', None)
        # When not fitted, keep the historical behaviour (raw get_dummies output).
        if columns is not None and not cate_encoded.columns.equals(columns):
            cate_encoded = cate_encoded.reindex(columns=columns, fill_value=0)
        return cate_encoded

In [195]:
# The target is the natural log of the sale price; the features are every
# other training column.
target_col = 'SalePrice'
y = np.log(train.loc[:, target_col])
X = train.drop(labels=[target_col], axis=1)

# Stack the training features on top of the test set so both go through the
# same preprocessing. Row labels are kept as-is (not reset).
df = pd.concat(objs=[X, test], axis=0)
#df.set_index('Id',inplace=True)


In [196]:
# missing values

# Columns whose missing entries are replaced by the literal string "None"
none_cols = ("PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
             'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
             'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
             "MasVnrType", 'MSSubClass')
for col in none_cols:
    df[col] = df[col].fillna("None")

# Columns whose missing entries are replaced by 0
zero_cols = ('GarageArea', 'GarageCars',
             'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
             'BsmtFullBath', 'BsmtHalfBath', "MasVnrArea")
for col in zero_cols:
    df[col] = df[col].fillna(0)

# Columns whose missing entries are replaced by the most frequent value
mode_cols = ('MSZoning', 'Electrical', 'KitchenQual',
             'Exterior1st', 'Exterior2nd', 'SaleType')
for col in mode_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# LotFrontage: median of the rows sharing the same Neighborhood
df["LotFrontage"] = df.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))

# GarageYrBlt: one year before the earliest recorded year.
# Series.min() skips NaN; the built-in min() returns NaN whenever the first
# element happens to be NaN, which would leave the column unfilled.
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['GarageYrBlt'].min() - 1)

df["Functional"] = df["Functional"].fillna("Typ")

# errors='ignore' keeps the cell re-runnable after the column is already gone
df = df.drop(['Utilities'], axis=1, errors='ignore')

In [197]:
# MSSubClass is stored as text from here on, so it ends up among the
# object-dtype columns.
df['MSSubClass'] = df['MSSubClass'].astype(str)

# Combined area: basement + first floor + second floor
df['TotalSF'] = df['TotalBsmtSF'].add(df['1stFlrSF']).add(df['2ndFlrSF'])

In [198]:
# Partition the column labels by dtype: object columns vs. everything else
is_object = (df.dtypes == 'object').values
num_attr = df.columns[~is_object]
cate_attr = df.columns[is_object]

In [199]:
from sklearn.preprocessing import Imputer,StandardScaler,RobustScaler
from sklearn.pipeline import Pipeline, FeatureUnion
# Numeric branch: select the non-object columns, log-transform the skewed
# ones, then standardise. (DFImputer is not part of the pipeline; RobustScaler
# is the alternative scaler that was tried.)
numpipe = Pipeline(steps=[
    ('selector', DFSelector(num_attr)),
    ('trans', SkewTransform()),
    ('scaler', StandardScaler()),
])

# Categorical branch: select the object columns and one-hot encode them.
catepipe = Pipeline(steps=[
    ('selector', DFSelector(cate_attr)),
    ('encode', CateEncoder()),
])

# Both branches run on the same input and their outputs are concatenated.
full_pipeline = FeatureUnion([
    ('num_pipe', numpipe),
    ('cat_pipe', catepipe),
])

In [200]:
# Fit the preprocessing pipeline on the combined train+test frame and
# transform it in one pass.
df_trans=full_pipeline.fit_transform(df)

In [201]:
# The first len(train) rows of the transformed matrix are the training
# features; the remaining rows belong to the test set.
n_train = len(train)
Xed, test_trans = df_trans[:n_train], df_trans[n_train:]

In [202]:
from sklearn.model_selection import cross_val_score, GridSearchCV
# base learner
from sklearn.linear_model import Ridge,Lasso,ElasticNet 
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
#import lightgbm as lgb

In [203]:
def getbestparam(model, param_grid, cv=10):
    """Grid-search ``model`` and report its cross-validated RMSE.

    The search and the final scoring both use the notebook-level ``Xed``
    (features) and ``y`` (target) with ``neg_mean_squared_error`` scoring.

    Parameters
    ----------
    model : estimator
        Estimator to tune. It is modified in place: the best parameters found
        are applied to it with ``set_params``.
    param_grid : dict or list of dicts
        Grid passed to ``GridSearchCV``.
    cv : int, default 10
        Number of folds for both the grid search and the final scoring.

    Prints the tuned model, then ``mean (std)`` of the per-fold RMSE.
    """
    grid_search = GridSearchCV(model, param_grid=param_grid, cv=cv,
                               scoring='neg_mean_squared_error')
    grid_search.fit(Xed, y)
    model.set_params(**grid_search.best_params_)
    # print() with a single argument behaves the same on Python 2 and 3
    print(model)

    # Scores are negated MSE values, so flip the sign before the square root
    scores = cross_val_score(model, Xed, y,
                             scoring='neg_mean_squared_error', cv=cv)
    rmse_scores = np.sqrt(-scores)
    print('{0:f} ({1:f})'.format(rmse_scores.mean(), rmse_scores.std()))

In [90]:
# Ridge regression: search over the regularisation strength
model = Ridge()
param_grid = [dict(alpha=[5, 10, 20, 30])]
getbestparam(model, param_grid)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
0.109201 (0.014640)


In [28]:
# Lasso: search over the regularisation strength
model = Lasso()
param_grid = [dict(alpha=[0.0001, 0.0005, 0.001])]
getbestparam(model, param_grid)

Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
0.107393 (0.015098)


In [29]:
# Elastic net: search over the regularisation strength and the L1/L2 mix
model = ElasticNet()
param_grid = [dict(
    alpha=[0.0001, 0.0008, 0.001],
    l1_ratio=[0.45, 0.48, 0.50, 0.6, 0.7],
)]
getbestparam(model, param_grid)

ElasticNet(alpha=0.0008, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
0.107403 (0.015296)


In [162]:
# Random forest grid. Only the model and grid are defined here; this cell
# does not launch the search.
model = RandomForestRegressor()
param_grid = [dict(
    n_estimators=[300],
    max_depth=[10, 20, None],
    max_features=['sqrt', None],
    min_samples_leaf=[2, 6, 10],
)]

In [157]:
# Gradient boosting grid. Only the model and grid are defined here; this cell
# does not launch the search.
model = GradientBoostingRegressor()
param_grid = [dict(
    n_estimators=[300],
    learning_rate=[0.01, 0.03, 0.1],
    max_depth=[5, 10, 15, 20],
    max_features=['sqrt', None],
    min_samples_leaf=[2, 6, 10],
    loss=['ls', 'huber'],
)]

In [246]:
# XGBoost grid. Only the model and grid are defined here; this cell does not
# launch the search.
model = xgb.XGBRegressor()
param_grid = [dict(
    n_estimators=[1000],
    learning_rate=[0.005, 0.01, 0.1],
    max_depth=[5, 10, 15, 20],
    reg_alpha=[0.01, 0.1],
    reg_lambda=[0.01, 0.1],
    subsample=[0.6, 0.8],
    colsample_bytree=[0.6, 0.8],
    gamma=[0.01, 0.1],
)]

In [225]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked regressor: base models feed out-of-fold predictions to a meta-model.

    Parameters
    ----------
    base_models : sequence of estimators
        Each one is cloned and fitted once per fold.
    meta_model : estimator
        Cloned and fitted on the out-of-fold predictions of the base models.
    n_folds : int, default 5
        Number of shuffled K-fold splits (fixed ``random_state=156``).

    Attributes
    ----------
    base_models_ : list of lists
        For every base model, the clones fitted on each fold.
    meta_model_ : estimator
        The fitted clone of ``meta_model``.
    """

    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit fold-wise clones of the base models, then the meta-model.

        ``X`` may be anything ``pd.DataFrame`` accepts; ``y`` may be a pandas
        Series or any array-like (it is indexed positionally).
        """
        X = pd.DataFrame(X)
        # Positional indexing on a plain array works for Series, lists and
        # ndarrays alike, whereas ``.iloc`` exists only on pandas objects.
        y_values = np.asarray(y)
        self.base_models_ = [list() for _ in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)

        # One column of out-of-fold predictions per base model; these become
        # the training features of the meta-model.
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            # print() with a single argument behaves the same on Python 2 and 3
            print(model)
            for train_index, holdout_index in kfold.split(X, y_values):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X.iloc[train_index], y_values[train_index])
                y_pred = instance.predict(X.iloc[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred

        # Train the cloned meta-model on the out-of-fold predictions
        self.meta_model_.fit(out_of_fold_predictions, y_values)
        return self

    def predict(self, X):
        """Predict with the meta-model on fold-averaged base predictions.

        For every base model the predictions of its fold clones are averaged;
        the resulting columns are the meta-model's input features.
        """
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [243]:
# Linear base learners with fixed hyper-parameters
ridge, lasso, Enet = (
    Ridge(alpha=10),
    Lasso(alpha=0.0005),
    ElasticNet(l1_ratio=0.5, alpha=0.0008),
)

In [226]:
# Stack the three linear learners under a default Ridge meta-model and fit
# the ensemble on the training matrix.
base_learners = (ridge, lasso, Enet)
stacked_averaged_models = StackingAveragedModels(
    base_models=base_learners,
    meta_model=Ridge(),
)
stacked_averaged_models.fit(Xed, y)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
ElasticNet(alpha=0.0008, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)


StackingAveragedModels(base_models=(Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001), Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cy...False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)),
            meta_model=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
            n_folds=5)

In [244]:
# Fit the Lasso model on the full training matrix; y is the log of SalePrice.
lasso.fit(Xed,y)

Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [245]:
# Predictions are on the log scale (y = log(SalePrice)), so exponentiate them
# before writing the submission file.
predicted_price = np.exp(lasso.predict(test_trans))
sub = pd.DataFrame({'Id': test.Id, 'SalePrice': predicted_price},
                   columns=['Id', 'SalePrice'])
sub.to_csv('submission.csv', index=False)


In [235]:
# Scratch check: log1p(9) = ln(1 + 9) = ln(10)
np.log1p(9)

2.3025850929940459