## Modelling

In [2]:
import future
import numpy as np 
import pandas as pd 
from sklearn.linear_model import Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb

### Reload_data

In [3]:
train = pd.read_csv('intermediate_data/train_ft_processed.csv')
test = pd.read_csv('intermediate_data/test_ft_processed.csv')
y_train = pd.read_csv('intermediate_data/train_log1p.csv')
y_train = y_train['SalePrice']

### Base Models

In [5]:
train = train[:100]
y_train = y_train[:100]

In [6]:
# param_grid = {'lasso__alpha':[0.0001, 0.001, 0.005, 0.01, 0.1, 1, 10]}
param_grid = {'lasso__alpha':[0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009]}

lasso = make_pipeline(RobustScaler(), Lasso(random_state=228))
gs = GridSearchCV(lasso, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
gs.fit(train, y_train)
print gs.best_params_
print np.sqrt(-gs.best_score_)

{'lasso__alpha': 0.008}
0.153171283567


In [13]:
# param_grid = [{'kernelridge__kernel':['linear'], 
#                'kernelridge__alpha':[0.0001, 0.001, 0.01, 0.1, 1, 10]}, 
#              {'kernelridge__kernel':['polynomial'],
#               'kernelridge__degree': [1,2,3,4],
#               'kernelridge__coef0': [0.001, 0.01, 0.1, 1, 10, 100],
#               'kernelridge__alpha':[0.0001, 0.001, 0.005, 0.01, 0.1, 1, 10]},
#              ]
param_grid = [
             {'kernelridge__kernel':['polynomial'],
              'kernelridge__degree': [1,2],
              'kernelridge__coef0': [500, 1000, 10000, 100000],
              'kernelridge__alpha':[0.03, 0.05, 0.08]},
             ]

krr = make_pipeline(RobustScaler(), KernelRidge())
gs = GridSearchCV(krr, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
gs.fit(train, y_train)
print gs.best_params_
print np.sqrt(-gs.best_score_)

{'kernelridge__alpha': 0.05, 'kernelridge__coef0': 100000, 'kernelridge__kernel': 'polynomial', 'kernelridge__degree': 1}
0.154524956261


In [16]:
param_grid = [{'n_estimators':[500,1000,1500,2000],
               'max_depth': [None,3,6,10],
               'max_features': ['auto','sqrt']}
             ]
rf = RandomForestRegressor(n_jobs=-1, random_state=228)
gs = GridSearchCV(rf, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
gs.fit(train, y_train)
print gs.best_params_
print np.sqrt(-gs.best_score_)

{'max_features': 'auto', 'n_estimators': 500, 'max_depth': 10}
0.175374760254


In [18]:
param_grid = [{'n_estimators':[500,1000,1500,2000],
               'learning_rate':[0.1, 0.2, 0.3],
               'max_depth':[3,6,10],
               'subsample':[0.8, 1],
               'colsample_bytree': [0.8, 1],
               'reg_lambda':[0.01,0.1,1]}
             ]
xgb_model = xgb.XGBRegressor(n_jobs=-1, random_state=228)
gs = GridSearchCV(xgb_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
gs.fit(train, y_train)
print gs.best_params_
print np.sqrt(-gs.best_score_)

{'colsample_bytree': 1, 'learning_rate': 0.1, 'n_estimators': 500, 'subsample': 0.8, 'reg_lambda': 0.01, 'max_depth': 3}
0.161471961196


## Stacking models

### In this approach, we add a meta-model on averaged base models and use the out-of-folds predictions of these base models to train our meta-model.

In [19]:
import future
import numpy as np 
import pandas as pd 
from sklearn.linear_model import Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [20]:
train = pd.read_csv('intermediate_data/train_ft_processed.csv')
test = pd.read_csv('intermediate_data/test_ft_processed.csv')
y_train = pd.read_csv('intermediate_data/train_log1p.csv')

train.reset_index(inplace=True)
test.reset_index(inplace=True)
y_train.reset_index(inplace=True)

y_train = y_train['SalePrice']

train = train.values.astype(np.float)
test = test.values.astype(np.float)
y_train = y_train.values.astype(np.float)

### 1. As XGBoost does not work with mlxtend package, I have to write a stacking method myself
### 2. XGBoost does not support clone method in sklearn

In [21]:
# write a stacking model with lasso, rf, xgb

class StackingModels():
    def __init__(self, n_folds=5):
        self.lasso_folds = []
        self.krr_folds = []
        self.rf_folds = []
        self.xgb_folds = []
        self.meta_model = KernelRidge(alpha =0.01, kernel = 'linear')
        self.n_folds = n_folds
   
    # We again fit the data and initiate new model on each fold, clone does not work on xgb
    def fit(self, X, y):  
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        
        lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.008,random_state=228))
        krr = KernelRidge(alpha=0.05,kernel='polynomial',degree=1,coef0=100000)
        rf = RandomForestRegressor(n_estimators=500,max_depth=10,max_features='auto',random_state=228)
        # xgb_model = xgb.XGBRegressor(n_estimators=2000, random_state=228)
        out_of_fold_predictions = np.zeros((X.shape[0], 4))
        
        for i in range(self.n_folds):
            
            lasso_ = clone(lasso)
            krr_ = clone(krr)
            rf_ = clone(rf)
            xgb_ = xgb.XGBRegressor(colsample_bytree=1,learning_rate=0.1,n_estimators=500,
                                    subsample=0.8,reg_lambda=0.01,max_depth=3,random_state=228)
            
            self.lasso_folds.append(lasso_)
            self.krr_folds.append(krr_)
            self.rf_folds.append(rf_)
            self.xgb_folds.append(xgb_)
            
            holdout_ind = np.array(range(X.shape[0])[i::self.n_folds])
            train_ind = np.array(list(set(range(X.shape[0]))-set(holdout_ind)))
            
            lasso_.fit(X[train_ind], y[train_ind])
            krr_.fit(X[train_ind], y[train_ind])
            rf_.fit(X[train_ind], y[train_ind])
            xgb_.fit(X[train_ind], y[train_ind])
            
            y_pred = lasso_.predict(X[holdout_ind])
            out_of_fold_predictions[holdout_ind, 0] = y_pred  
            y_pred = krr_.predict(X[holdout_ind])
            out_of_fold_predictions[holdout_ind, 1] = y_pred 
            y_pred = rf_.predict(X[holdout_ind])
            out_of_fold_predictions[holdout_ind, 2] = y_pred          
            y_pred = xgb_.predict(X[holdout_ind])
            out_of_fold_predictions[holdout_ind, 3] = y_pred
                
        self.meta_model.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):

        lasso_pred = np.column_stack([model.predict(X) for model in self.lasso_folds]).mean(axis=1)
        krr_pred = np.column_stack([model.predict(X) for model in self.krr_folds]).mean(axis=1)
        rf_pred = np.column_stack([model.predict(X) for model in self.rf_folds]).mean(axis=1)
        xgb_pred = np.column_stack([model.predict(X) for model in self.xgb_folds]).mean(axis=1)
        
        meta_ft = np.column_stack((lasso_pred, krr_pred, rf_pred, xgb_pred))
            
        return self.meta_model.predict(meta_ft)

### cross_val_score does not work on self defined StackingModels, write a cross validation myself

In [22]:
# ##### for test
train = train[:100]
y_train = y_train[:100]

n_folds=5
rmse = []
for train_index, test_index in KFold(n_folds, shuffle=True).split(train):
    stack_ = StackingModels()
    stack_.fit(train[train_index], y_train[train_index])
    rmse.append(np.sqrt(np.mean((stack_.predict(train[test_index])-y_train[test_index])**2)))
score = np.array(rmse)

print("Stacking Models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

Stacking Models score: 0.1468 (0.0512)


## Submission!

In [23]:
import future
import numpy as np 
import pandas as pd 
from sklearn.linear_model import Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [24]:
train = pd.read_csv('intermediate_data/train_ft_processed.csv')
test = pd.read_csv('intermediate_data/test_ft_processed.csv')
y_train = pd.read_csv('intermediate_data/train_log1p.csv')

train.reset_index(inplace=True)
test.reset_index(inplace=True)
y_train.reset_index(inplace=True)

y_train = y_train['SalePrice']

train = train.values.astype(np.float)
test = test.values.astype(np.float)
y_train = y_train.values.astype(np.float)

In [25]:
stack = StackingModels()
stack.fit(train, y_train)
pred = stack.predict(test)

In [26]:
sample = pd.read_csv('raw_data/sample_submission.csv')
sample.drop(columns=['SalePrice'], inplace=True)
sub = pd.read_csv('raw_data/test.csv')
sub = sub['Id'].values

pred_inverse = np.expm1(pred)

sub = sub.reshape(-1,1)
pred_inverse = pred_inverse.reshape(-1,1)
sub = np.concatenate((sub, pred_inverse), axis=1)
sub = pd.DataFrame(sub, columns=['Id','SalePrice'])

sub = sample.merge(sub, on=['Id'], how='left')

sub.to_csv('outputs/20181029_skewFix_GridSearch_stack_4_krr.csv', index=False)
print len(sub)
print sub.isnull().sum()

1459
Id           0
SalePrice    0
dtype: int64
