In [1]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from mlxtend.regressor import StackingRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.svm import SVR
import xgboost as xgb
import lightgbm as lgb

from modules.featureEng import *
from modules.modelEng import *

# to display entire data
#pd.set_option('display.max_rows', 1500)
#pd.set_option('display.max_columns', 500)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
"""
Load Data
"""

# Read the train and test CSVs the same way: the 'Id' column becomes the
# row index of each frame.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
"""
Feature Engineering
"""

# Separate the 'SalePrice' target from the remaining training columns;
# test_df is passed through unchanged.
train_Y_bf = train_df['SalePrice']
train_X_bf = train_df.drop(columns='SalePrice')
test_X_bf = test_df

### for testing ###
#from sklearn.model_selection import train_test_split
#train_X_bf, test_X_bf, train_Y_bf, test_Y = train_test_split(train_X_bf, train_Y_bf, random_state=777)
###################

# pre_processing receives copies, so the *_bf objects above are not modified
# by whatever it does internally.
train_X, train_Y, test_X = pre_processing(
    train_X_bf.copy(),
    train_Y_bf.copy(),
    test_X_bf.copy(),
)

In [4]:
# Optional multicollinearity diagnostic (disabled). When un-commented it adds
# a constant column to train_X and pairs every column name with its variance
# inflation factor. Requires statsmodels.
#from statsmodels.stats.outliers_influence import variance_inflation_factor
#from statsmodels.tools.tools import add_constant
#t = add_constant(train_X)
#list(zip(t.columns,[variance_inflation_factor(t.values, i) for i in range(t.shape[1])]))

In [5]:
"""
Predict house price
"""

# Candidate base models. Each one goes through grid_search (imported from
# modules.modelEng; presumably returns the best estimator found -- confirm).
# Only the SVR is enabled at the moment; the rest are kept as toggles.
#elastic_net = grid_search(train_X, train_Y, ElasticNet())
#lasso = grid_search(train_X, train_Y, Lasso())
#ridge = grid_search(train_X, train_Y, Ridge())
#random_forest = grid_search(train_X, train_Y, RandomForestRegressor())
#gradient_boost_regressor = grid_search(train_X, train_Y, GradientBoostingRegressor())
#XGBoost = grid_search(train_X, train_Y, xgb.XGBRegressor())
#light_GBM = grid_search(train_X, train_Y, lgb.LGBMRegressor())
support_vector_regressor = grid_search(train_X, train_Y, SVR())

# Stack the enabled base models; the SVR currently serves as both the single
# base regressor and the meta regressor.
# Full line-up used in earlier experiments:
#   [elastic_net, lasso, ridge, random_forest, XGBoost, light_GBM]
base_regressors = [support_vector_regressor]
stacked_regression = StackingRegressor(
    regressors=base_regressors,
    meta_regressor=support_vector_regressor,
)
stacked_regression.fit(train_X, train_Y)
stacked = stacked_regression.predict(test_X)

# Final prediction: only the stacked model contributes (weight 1).
# Disabled blend terms, kept for reference:
#   (0.1 * elastic_net.predict(test_X)) +
#   (0.1 * lasso.predict(test_X)) +
#   (0.1 * ridge.predict(test_X)) +
#   (0.05 * random_forest.predict(test_X)) +
#   (1 * support_vector_regressor.predict(test_X)) +
#   (0.3 * XGBoost.predict(test_X)) +
#   (0.2 * light_GBM.predict(test_X)) +
# np.expm1 undoes a log1p transform -- presumably applied to SalePrice inside
# pre_processing; TODO confirm.
ensembled = np.expm1(1 * stacked)

# Last expression of the cell: displays the stacked model's score on the
# training data itself (not a held-out estimate).
stacked_regression.score(train_X, train_Y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.1min finished


Model: GradientBoostingRegressor Score: (0.11195378540045162) Best params: {'learning_rate': 0.01, 'loss': 'huber', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 20, 'min_samples_split': 3, 'n_estimators': 3000, 'random_state': 45}
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.01, loss='huber', max_depth=4,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=20, min_samples_split=3,
                          min_weight_fraction_leaf=0.0, n_estimators=3000,
                          n_iter_no_change=None, presort='auto',
                          random_state=45, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)


0.9605572793620101

In [6]:
### for testing ###
#RMSE = np.mean((ensembled - test_Y)**2)**(1/2) 
#print('Score : ' + str(RMSE))
###################

"""
Export submission data
"""
# Take the Ids directly from the frame that was handed to pre_processing.
# The previous version rebuilt them as
#     test_X.index + (len(train_X_bf) - len(train_X) + 1)
# which is only correct if pre_processing re-indexes the test rows
# positionally after the surviving training rows AND the original Ids are
# contiguous after the training Ids. If either assumption breaks, wrong Ids
# are written silently.
# NOTE(review): assumes pre_processing keeps test rows in their original
# order and drops none of them -- the length check below guards the latter.
submission_ids = test_X_bf.index
if len(submission_ids) != len(ensembled):
    raise ValueError(
        'Number of predictions (%d) does not match number of test rows (%d)'
        % (len(ensembled), len(submission_ids))
    )

submission = pd.DataFrame({
    'Id': submission_ids,
    'SalePrice': ensembled
})
# index=False: the Ids are already a regular column, so the positional row
# index is not written to the file.
submission.to_csv('data/submission.csv', index=False)

In [7]:
# Optional visual check (disabled): histograms of the train_X columns.
#train_X.hist(bins=50, figsize=(15,10))