In [1]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from mlxtend.regressor import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.svm import SVR

from modules.featureEng import *
from modules.modelEng import *

# to display entire data
pd.set_option('display.max_rows', 1500)
pd.set_option('display.max_columns', 150)

In [2]:
"""
Load Data
"""

train_df = pd.read_csv('data/train.csv', index_col='Id')
test_df = pd.read_csv('data/test.csv', index_col='Id')

In [3]:
"""
Feature Engineering
"""

train_X = train_df.loc[:,train_df.columns!='SalePrice']
train_Y = np.log1p(train_df['SalePrice'])
test_X = test_df.copy()

train_X, test_X = pre_processing(train_X, test_X)

In [None]:
"""
Predict house price
"""

elastic_net = grid_search(train_X, train_Y, ElasticNet())
lasso = grid_search(train_X, train_Y, Lasso())
ridge = grid_search(train_X, train_Y, Ridge())
random_forest = grid_search(train_X, train_Y, RandomForestRegressor())
support_vector_regressor = grid_search(train_X, train_Y, SVR())

stacked_regression = StackingRegressor(
        regressors=[elastic_net, lasso, ridge, random_forest, support_vector_regressor],
        meta_regressor=SVR(kernel='sigmoid', gamma='scale')
)

stacked_regression.fit(train_X, train_Y)

stacked = stacked_regression.predict(test_X)

ensembled = np.expm1((0.3 * elastic_net.predict(test_X)) +
                     (0.2 * lasso.predict(test_X)) +
                     (0.2 * ridge.predict(test_X)) +
                     (0.2 * random_forest.predict(test_X)) +
                     (0.1 * stacked))

ensembled

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished


Estimator: ElasticNet score: (0.14664710402602504) best params: {'alpha': 0.0003, 'l1_ratio': 0.9, 'max_iter': 10000}
ElasticNet(alpha=0.0003, copy_X=True, fit_intercept=True, l1_ratio=0.9,
           max_iter=10000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Estimator: Lasso score: (0.14675520856656327) best params: {'alpha': 0.0005, 'normalize': False}
Lasso(alpha=0.0005, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Estimator: Ridge score: (0.14740483547658) best params: {'alpha': 10.5}
Ridge(alpha=10.5, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)
Fitting 5 folds for each of 27

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 104 out of 135 | elapsed:    0.7s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:    0.9s finished


Estimator: RandomForestRegressor score: (0.16403177918274586) best params: {'max_depth': 5, 'min_samples_leaf': 5, 'n_estimators': 20}
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=5, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


In [None]:
"""
Export submission data
"""
submission = pd.DataFrame({
    'Id':test_X.index,
    'SalePrice':ensembled
})
submission.to_csv('data/submission.csv', index=False)