In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()


#modeling
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb

import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

In [4]:
train = pd.read_csv('data/train_preProcess.csv')
test = pd.read_csv('data/test_preProcess.csv')

In [5]:
train_y = train.SalePrice
train.drop(['SalePrice'], axis=1, inplace=True)

train.drop(['Id'], axis=1, inplace=True)

test_id = test['Id']
test.drop(['Id'], axis=1, inplace=True)

df = pd.concat([train, test], axis= 0)

In [7]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha = 0, random_state=123))

ridge_kern = make_pipeline(RobustScaler(), KernelRidge(alpha=0.05,degree = 1, coef0 = 4.5))

ridge= make_pipeline(RobustScaler(), Ridge(alpha=0.05, random_state=123))

ENet = make_pipeline(RobustScaler(), ElasticNet(alpha= 0.001, l1_ratio=0.37931034482758619, 
                    random_state=123))

KNN = make_pipeline(RobustScaler(), KNeighborsRegressor(n_neighbors= 7, weights='distance'))


XGB = xgb.XGBRegressor(colsample_bytree= 0.060000000000000005,
 gamma= 0.017000000000000001,
 learning_rate= 0.029999999999999999,
 max_depth= 3, n_estimators= 1315, silent = 1, random_state = 123, nthread = 4)

In [30]:
models = [ENet, XGB, ridge_kern, lasso, KNN]
meta_model = [ridge]

In [31]:
#create an empty list in insert predictions from each model 
out_of_fold_predictions = np.zeros((train.shape[0], len(models)))
#we are going to split the training set to 5 folds
kfold = KFold(n_splits=5, shuffle=True, random_state=123)

#gather all the model parameters into a list so we can iterate over it
base_models = [list() for x in models]

for i, model in enumerate(models):
    #create training set and holdout set with "kfold"
    for train_index, holdout_index in kfold.split(train.values, train_y.values):
        #iterate through models
        model = models[i]
        base_models[i].append(model)
        #fit training set with current model
        model.fit(train.values[train_index], train_y.values[train_index])
        #predict with holdout set with current model
        y_pred = model.predict(train.values[holdout_index])
        #add predictions to out_of_fold_predictions
        out_of_fold_predictions[holdout_index, i] = y_pred
        
    #after iterate through all the models, we would fit meta model with out_of_fold preds
    meta_model[0].fit(out_of_fold_predictions, train_y.values)
    

out_of_bag_preds  = np.zeros((train.shape[0], len(models)))

#retrain all models with the whole training data
for i, model in enumerate(models):
    model = models[i]
    model.fit(train.values, train_y)
    y_pred = model.predict(train.values)
    #insert prediction of current model to the list
    out_of_bag_preds[:, i] = y_pred

#calculate cv scores
cv_score = cross_val_score(meta_model[0], out_of_bag_preds, train_y, cv=10, scoring="neg_mean_squared_error")
print("the 10-fold cv score is: ", np.mean(np.sqrt(np.abs(cv_score))))
#predict out_of_bag_preds with meta model
final_pred = meta_model[0].predict(out_of_bag_preds)

the 10-fold cv score is:  0.000130513250686


In [32]:
#create an empty list in insert predictions from each model 
out_of_fold_predictions = np.zeros((train.shape[0], len(models)))
#we are going to split the training set to 5 folds
kfold = KFold(n_splits=5, shuffle=True, random_state=123)

#gather all the model parameters into a list so we can iterate over it
base_models = [list() for x in models]

for i, model in enumerate(models):
    #create training set and holdout set with "kfold"
    for train_index, holdout_index in kfold.split(train.values, train_y.values):
        #iterate through models
        model = models[i]
        base_models[i].append(model)
        #fit training set with current model
        model.fit(train.values[train_index], train_y.values[train_index])
        #predict with holdout set with current model
        y_pred = model.predict(train.values[holdout_index])
        #add predictions to out_of_fold_predictions
        out_of_fold_predictions[holdout_index, i] = y_pred
        
    #after iterate through all the models, we would fit meta model with out_of_fold preds
    meta_model[0].fit(out_of_fold_predictions, train_y.values)

In [33]:
out_of_bag_preds  = np.zeros((test.shape[0], len(models)))

#retrain all models with the whole training data
for i, model in enumerate(models):
    model = models[i]
    model.fit(train.values, train_y)
    y_pred = model.predict(test.values)
    #insert prediction of current model to the list
    out_of_bag_preds[:, i] = y_pred

#predict out_of_bag_preds with meta model
final_pred = meta_model[0].predict(out_of_bag_preds)

In [34]:
final_pred = np.exp(final_pred) - 1
df_sub = pd.DataFrame({'id': test_id, "SalePrice":final_pred})
df_sub.to_csv('stacking_out.csv', index=False)