In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge

from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

## Import Cleaned Data

In [3]:
# Load the cleaned train and test tables from CSV files in the working directory.
train = pd.read_csv('train_new.csv')
test = pd.read_csv('test_new.csv')

In [5]:
# Pull the regression target out of the frame, then keep only predictor columns
# (both 'Id' and the target column are removed from the feature frame).
# NOTE: `train` is overwritten here, so this cell only runs once per data load.
y = train['SalePrice']
train = train.drop(['Id', 'SalePrice'], axis=1)

In [6]:
# Sanity check: report (rows, columns) of both frames now that the target and
# 'Id' have been dropped from `train`.
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

Train shape: (1451, 341)
Test shape: (1459, 341)


In [7]:
# Hold out 30% of the labelled rows; the fixed random_state makes the split repeatable.
holdout_split = train_test_split(
    train,
    y,
    test_size=0.30,
    random_state=0,
)
X_train, X_test, y_train, y_test = holdout_split

# Report the resulting shapes of each piece.
print(f'X_train : {X_train.shape}')
print(f'X_test : {X_test.shape}')
print(f'y_train : {y_train.shape}')
print(f'y_test : {y_test.shape}')

X_train : (1015, 341)
X_test : (436, 341)
y_train : (1015,)
y_test : (436,)


In [8]:
# Standardise features with statistics learned from the training split only,
# then apply the same fitted scaler to the hold-out split.
# NOTE: X_train / X_test are overwritten with their scaled versions, so re-running
# this cell would refit stdSc on already-scaled data; re-run from the split cell instead.
stdSc = StandardScaler()
stdSc.fit(X_train)

X_train = pd.DataFrame(
    stdSc.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    stdSc.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [9]:
from sklearn.model_selection import KFold # for repeated K-fold cross validation
from sklearn.model_selection import cross_val_score # score evaluation
from sklearn.model_selection import cross_val_predict # prediction
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import GradientBoostingRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import time
# Shared random seed passed to the CV splitter and to the seeded estimators below.
SEED = 42

In [10]:
# Shuffled 10-fold splitter (a single pass, not repeated K-fold); the fixed seed
# makes every consumer of `kfolds` see the same partition of the rows.
kfolds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=SEED,
)

def rmse(y, y_pred):
    """Return the root-mean-squared error between `y` and `y_pred`.

    Parameters
    ----------
    y : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, aligned with `y`.

    Returns
    -------
    The square root of `mean_squared_error(y, y_pred)`.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

def evaluate_model_cv(model, X, y, cv=None):
    """Return the per-fold cross-validated RMSE of `model` on (`X`, `y`).

    Parameters
    ----------
    model : estimator
        Estimator handed to `cross_val_score`.
    X, y :
        Features and target forwarded unchanged to `cross_val_score`.
    cv : optional
        Cross-validation strategy forwarded to `cross_val_score`. When omitted,
        the notebook-level `kfolds` splitter is used (the previous behaviour).

    Returns
    -------
    numpy array with one RMSE value per fold.
    """
    if cv is None:
        cv = kfolds

    # The scorer reports *negative* MSE (higher is better), so flip the sign
    # before taking the square root.
    neg_mse_per_fold = cross_val_score(
        model, X, y, scoring="neg_mean_squared_error", cv=cv
    )
    return np.sqrt(-neg_mse_per_fold)

In [29]:
def construct_models():
    """Build the unfitted candidate regressors, keyed by a short model name.

    Uses the notebook-level `kfolds` splitter for the *CV linear models and
    `SEED` for the estimators that are seeded here.

    Returns
    -------
    dict
        Maps 'ridge', 'lasso', 'elasticnet', 'svr', 'gbr', 'lgbm' and 'xgboost'
        (in that insertion order) to a freshly constructed estimator.
    """
    # Regularisation grids searched internally by the *CV linear models.
    ridge_alpha_grid = [0.005, 0.01, 0.1, 1, 5, 10, 15]
    lasso_alpha_grid = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
    enet_alpha_grid = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
    enet_l1_ratio_grid = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

    # Hyper-parameters of the boosted-tree models, kept as keyword tables.
    gbr_params = dict(
        n_estimators=3000,
        learning_rate=0.05,
        max_depth=4,
        max_features='sqrt',
        min_samples_leaf=15,
        min_samples_split=10,
        loss='huber',
        random_state=SEED,
    )
    lgbm_params = dict(
        objective='regression',
        num_leaves=4,
        learning_rate=0.01,
        n_estimators=5000,
        max_bin=200,
        bagging_fraction=0.75,
        bagging_freq=5,
        bagging_seed=7,
        feature_fraction=0.2,
        feature_fraction_seed=7,
        verbose=-1,
        colsample_bytree=None,
        subsample=None,
        subsample_freq=None,
    )
    xgb_params = dict(
        learning_rate=0.01,
        n_estimators=3460,
        max_depth=3,
        min_child_weight=0,
        gamma=0,
        subsample=0.7,
        colsample_bytree=0.7,
        verbosity=0,
        objective='reg:squarederror',
        nthread=-1,
        scale_pos_weight=1,
        seed=SEED,
        reg_alpha=0.00006,
    )

    # Insertion order matters: the cells below iterate over this dict in order.
    return {
        'ridge': RidgeCV(alphas=ridge_alpha_grid, cv=kfolds),
        'lasso': LassoCV(alphas=lasso_alpha_grid, random_state=SEED, cv=kfolds, max_iter=100000),
        'elasticnet': ElasticNetCV(alphas=enet_alpha_grid, cv=kfolds,
                                   l1_ratio=enet_l1_ratio_grid, max_iter=100000),
        'svr': SVR(C=20, epsilon=0.008, gamma=0.0003),
        'gbr': GradientBoostingRegressor(**gbr_params),
        'lgbm': LGBMRegressor(**lgbm_params),
        'xgboost': XGBRegressor(**xgb_params),
    }

# Build the dictionary of unfitted estimators that the cells below fit and evaluate.
models = construct_models()

In [30]:
# Fit every candidate on the training split and report its in-sample RMSE.
for name, model in models.items():
    # Start counting time (covers fitting plus in-sample scoring)
    start = time.perf_counter()

    # fit() updates the estimator in place, so the entry stored in `models`
    # is the fitted object used by the later cells.
    model.fit(X_train, y_train)

    # Predict on the same DataFrame the model was fitted on, so the feature names
    # seen at fit time and at predict time agree (a bare ndarray drops them).
    rmse_result = rmse(y_train, model.predict(X_train))
    print(f'{name}\'s rmse after training: {rmse_result}')

    # Compute time for executing each algo
    run = time.perf_counter() - start
    print(f'Computational runtime of this algo: {round(run, 2)} seconds\n')

ridge's rmse after training: 0.08155772280917212
Computational runtime of this algo: 1.84 seconds

lasso's rmse after training: 0.08392343814400487
Computational runtime of this algo: 7.3 seconds

elasticnet's rmse after training: 0.0835580227754813
Computational runtime of this algo: 31.65 seconds

svr's rmse after training: 0.04139831980774069
Computational runtime of this algo: 0.86 seconds

gbr's rmse after training: 0.03602560608418233
Computational runtime of this algo: 14.47 seconds

lgbm's rmse after training: 0.06458878955879056
Computational runtime of this algo: 3.94 seconds

xgboost's rmse after training: 0.04056840334885486
Computational runtime of this algo: 19.69 seconds



In [31]:
# Per-model cross-validation results: raw fold scores, their mean and their spread.
cv_rmse_result, cv_rmse_mean, cv_rmse_std = {}, {}, {}

for name, model in models.items():
    # Start counting time
    start = time.perf_counter()

    fold_scores = evaluate_model_cv(model, X_train, y_train)
    cv_rmse_result[name] = fold_scores
    cv_rmse_mean[name] = fold_scores.mean()
    cv_rmse_std[name] = fold_scores.std()
    print(f'Finish {name}\'s model')

    # Compute time for executing each algo
    run = time.perf_counter() - start
    print(f'Computational runtime of this algo: {round(run, 2)} seconds\n')

Finish ridge's model
Computational runtime of this algo: 19.52 seconds

Finish lasso's model
Computational runtime of this algo: 78.24 seconds

Finish elasticnet's model
Computational runtime of this algo: 199.14 seconds

Finish svr's model
Computational runtime of this algo: 2.94 seconds

Finish gbr's model
Computational runtime of this algo: 150.2 seconds

Finish lgbm's model
Computational runtime of this algo: 42.94 seconds

Finish xgboost's model
Computational runtime of this algo: 187.4 seconds



In [32]:
# Summary table of the cross-validated RMSE per model (one row per model name).
# The mean column was previously misspelled 'cv_rsme_mean'.
ML_cv = pd.DataFrame({'cv_rmse_mean': cv_rmse_mean, 'cv_rmse_std': cv_rmse_std})
ML_cv

Unnamed: 0,cv_rsme_mean,cv_rmse_std
ridge,0.116259,0.008429
lasso,0.111549,0.00803
elasticnet,0.111875,0.008017
svr,0.116066,0.013142
gbr,0.113841,0.010256
lgbm,0.113976,0.009972
xgboost,0.112972,0.010082


In [33]:
# Type 1 stacking model: six base regressors feed an XGBoost meta-regressor, which
# also receives the original features (use_features_in_secondary=True).
# random_state pins the shuffling of the stacker's internal CV folds; without it
# the out-of-fold meta-features, and hence the stacked predictions, vary per run.
stack_model = StackingCVRegressor(regressors=(models['ridge'], models['lasso'], models['xgboost'],
                                              models['elasticnet'], models['gbr'], models['lgbm']),
                                  meta_regressor=models['xgboost'],
                                  use_features_in_secondary=True,
                                  random_state=SEED)

In [34]:
# Time performance counter (covers fitting plus in-sample scoring)
start = time.perf_counter()

stack_model = stack_model.fit(X_train, y_train)
print('Finish training')

# In-sample RMSE of the fitted stack. No cross-validation is involved in this
# number, so the message says so explicitly.
rmse_stack = rmse(y_train, stack_model.predict(X_train))
print(f'stack_model\'s rmse (without cv) after training: {rmse_stack}')

# Compute time for executing each algo
run = time.perf_counter() - start
print(f'Computational runtime of this algo: {round(run, 2)} seconds\n')

Finish training
stack_model's rmse (using cv) after training: 0.035773373105488025
Computational runtime of this algo: 407.66 seconds



In [35]:
def blend_models_predict(X):
    """Return the weighted blend of the fitted models' predictions for `X`.

    Reads the notebook-level `models` dict and `stack_model`; all of them must
    already be fitted.

    Parameters
    ----------
    X : array-like
        Feature matrix; converted to a NumPy array before prediction.

    Returns
    -------
    The element-wise weighted sum of the individual predictions.
    """
    # Convert once and share the array, instead of rebuilding it for every model.
    features = np.array(X)

    # (weight, fitted model) pairs; the weights add up to 1.0. The order is kept
    # so the terms are accumulated in the same sequence as before.
    weighted_models = (
        (0.05, models['ridge']),
        (0.05, models['lasso']),
        (0.05, models['elasticnet']),
        (0.15, models['gbr']),
        (0.15, models['lgbm']),
        (0.25, models['xgboost']),
        (0.3, stack_model),
    )
    return sum(weight * model.predict(features) for weight, model in weighted_models)

In [36]:
# In-sample error of the blended ensemble on the training split.
print('RMSLE score on train data:')
blend_train_pred = blend_models_predict(np.array(X_train))
print(rmse(y_train, blend_train_pred))

RMSLE score on train data:
0.0421510285058451


In [37]:
# Scale the competition test set with the scaler fitted on the training split.
# The result is stored under a new name: overwriting `test` made this cell
# non-idempotent (a second run would scale already-scaled data).
test_scaled = pd.DataFrame(stdSc.transform(test), columns=test.columns, index=test.index)

# Blended predictions, still on the target scale the models were trained on.
test_pred = blend_models_predict(test_scaled)


In [38]:
# Inspect the blended test predictions; these values are printed before the
# np.expm1 back-transform that is applied in a later cell.
print(test_pred)

[9.29641504 9.43226314 9.57495978 ... 9.62646897 9.17034131 9.84694192]


In [39]:
# Map predictions back from the modelling scale.
# NOTE(review): expm1 is the inverse of log1p, so this assumes SalePrice was
# log1p-transformed when the cleaned data was produced — confirm upstream.
# Not idempotent: re-running this cell applies expm1 again to converted values.
test_pred = np.expm1(test_pred)

In [40]:
# Pair each row's Id from the raw test file with its predicted SalePrice.
# The two pieces are joined side by side on their default positional index.
test_ids = pd.read_csv('test.csv').Id
sale_price = pd.DataFrame(test_pred, columns=['SalePrice'])
submission = pd.concat([test_ids, sale_price], axis=1)

In [41]:
# Write the submission (Id, SalePrice) to CSV without the pandas index column.
submission.to_csv('August17.csv', index=False)