In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# The bundled seaborn style was renamed to 'seaborn-v0_8' in matplotlib 3.6 and
# the bare 'seaborn' name was later removed; use whichever name this install has.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import ElasticNet, LinearRegression, Lasso,Ridge
from sklearn.linear_model import ElasticNetCV, LassoCV
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor, RandomForestRegressor

In [3]:
from numpy import hstack

## Import Data 

In [4]:
# Read the train and test tables from their CSV files (train first, then test).
train, test = map(pd.read_csv, ('train_v2.csv', 'test_v2.csv'))

In [5]:
# Pull the target out first, then keep only the feature columns.
# NOTE: `train` is rebound here, so this cell cannot be re-run without
# reloading the CSV (the dropped columns would no longer exist).
y = train['SalePrice']
train = train.drop(['Id', 'SalePrice'], axis=1)

In [6]:
# Sanity check: both tables should have the same number of feature columns.
print(f'Train shape: {train.shape}', f'Test shape: {test.shape}', sep='\n')

Train shape: (1451, 335)
Test shape: (1459, 335)


In [7]:
# Hold out 30% of the rows as a validation split; random_state pins the shuffle.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    y,
    test_size=0.30,
    random_state=0,
)
# The printed labels say "test", but the shapes shown are those of the validation split.
print(
    f'X_train : {X_train.shape}',
    f'X_test : {X_val.shape}',
    f'y_train : {y_train.shape}',
    f'y_test : {y_val.shape}',
    sep='\n',
)

X_train : (1015, 335)
X_test : (436, 335)
y_train : (1015,)
y_test : (436,)


In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so no validation information is used.
# NOTE: X_train / X_val are rebound to their scaled versions; re-running this
# cell would refit `stdSc` on already-scaled data.
stdSc = StandardScaler().fit(X_train)
X_train = pd.DataFrame(
    stdSc.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_val = pd.DataFrame(
    stdSc.transform(X_val),
    index=X_val.index,
    columns=X_val.columns,
)

In [9]:
# MSE wrapped as a scorer; greater_is_better=False makes sklearn negate it,
# which is why the helpers below flip the sign before taking the square root.
scorer = make_scorer(mean_squared_error, greater_is_better=False)


def rmse_train(model):
    """Return the 10-fold cross-validated RMSE scores of `model` on X_train / y_train."""
    negated_mse = cross_val_score(model, X_train, y_train, scoring=scorer, cv=10)
    return np.sqrt(-negated_mse)


def rmse_val(model):
    """Return the 10-fold cross-validated RMSE scores of `model` on X_val / y_val.

    The model is cross-validated within the validation split itself; it is not
    scored with a model fitted on the training split.
    """
    negated_mse = cross_val_score(model, X_val, y_val, scoring=scorer, cv=10)
    return np.sqrt(-negated_mse)

In [16]:
def get_models(feature_fraction_seed=6):
    """Build the unfitted base models used by the ensemble.

    Parameters
    ----------
    feature_fraction_seed : int, default=6
        Seed handed to LightGBM's ``feature_fraction_seed``. It was previously
        drawn with an unseeded ``np.random.randint(10)`` on every call, so two
        runs of the notebook could build different LightGBM models. The default
        of 6 is the value shown in the notebook's recorded output.

    Returns
    -------
    list of (str, estimator)
        Named, unfitted estimators in the order
        elasticNet, lasso, Ridge, xgboost, lightbgm.
    """
    models = list()
    models.append(('elasticNet', ElasticNet(l1_ratio=0.3, alpha=0.01, max_iter=300000, tol=1e-9)))
    models.append(('lasso', Lasso(alpha=0.003, max_iter=400000, tol=1e-9)))
    models.append(('Ridge', Ridge(alpha=300, tol=1e-9)))
    # Disabled candidates, kept for reference:
#     models.append(('knr', KNeighborsRegressor()))
#     models.append(('xgb', XGBRegressor(n_estimators=150, eta=0.1, min_child_weight=10)))
#     models.append(('svr', SVR(kernel='poly', degree=1, C=0.1, coef0=3, max_iter=100000)))
#     models.append(('elasticNetCV', ElasticNetCV(l1_ratio=[0.001,0.003,0.01,0.03,0.1,0.3, 1,3],
#                                                 alphas=[0.001,0.003,0.01,0.03,0.1,0.3, 1,3],
#                                                 max_iter=200000,
#                                                 cv=10,
#                                                 tol=0.000001)))
#     models.append(('lassoCV', LassoCV(alphas=[0.001,0.003,0.01,0.03,0.1,0.3, 1,3],
#                                       max_iter=100000,
#                                       cv=10,tol=0.000001)))
    models.append(('xgboost', XGBRegressor(n_estimators=1000,
                                             objective='reg:squarederror',
                                             max_depth=5,
                                             nthread=-1,
                                             eta=0.1,
                                             subsample=0.5,
                                             colsample_bytree=0.7,
                                             min_child_weight=0,
                                             reg_lambda=0.01,
                                             reg_alpha=0.01
                                            )))
    # NOTE(review): LightGBM documents `colsample_bytree` as an alias of
    # `feature_fraction`, so only one of the two values below can take effect.
    # Confirm which one is intended and drop the other.
    models.append(('lightbgm', LGBMRegressor(boosting_type='goss',
                                              objective='regression',
                                              learning_rate=0.1,
                                              n_estimators=1000,
                                              colsample_bytree=0.5,
                                              feature_fraction=0.7,
                                              feature_fraction_bynode=0.7,
                                              # fixed seed instead of a fresh random draw per call
                                              feature_fraction_seed=feature_fraction_seed,
                                              num_leaves=10,
                                              lambda_l1=0.01,
                                              lambda_l2=0.1
                                             )))
    return models

In [11]:
def fit_ensemble(models, X_train, X_val, y_train, y_val):
    """Fit the base models, then fit a stacking blender on their hold-out predictions.

    Each estimator in `models` is fitted in place on (X_train, y_train) and
    asked to predict X_val. Those predictions, one column per base model, form
    the feature matrix on which the returned StackingRegressor is fitted
    against y_val.

    NOTE(review): the StackingRegressor is given `models` as its own
    estimators, so it trains its own copies of them on the prediction matrix
    rather than on the original features. Confirm this double stacking is
    intended.

    Returns
    -------
    StackingRegressor
        Fitted on the (n_val_samples, n_models) matrix of base predictions.
    """
    holdout_columns = []
    for _name, estimator in models:
        print(estimator)
        estimator.fit(X_train, y_train)
        # column vector so the predictions can be stacked side by side
        holdout_columns.append(estimator.predict(X_val).reshape(-1, 1))
    meta_features = hstack(holdout_columns)

    meta_learner = RandomForestRegressor(
        bootstrap=True,
        random_state=1,
        min_samples_leaf=5,
        min_samples_split=2,
        max_depth=15,
        max_features='sqrt',
    )
    stacker = StackingRegressor(estimators=models, final_estimator=meta_learner, cv=10)
    stacker.fit(meta_features, y_val)
    return stacker

In [12]:
def predict_ensemble(models, blender, X, y, stack_weight=0.25):
    """Predict with the blending ensemble and print the blender's own RMSE.

    Parameters
    ----------
    models : list of (name, estimator)
        Fitted base models; each must expose ``predict(X)``.
    blender : fitted estimator
        Meta-model whose ``predict`` takes the matrix of base predictions
        (one column per entry of `models`).
    X : array-like
        Features to predict on.
    y : array-like
        True targets for `X`; used only for the printed RMSE.
    stack_weight : float, default=0.25
        Weight of the blender's prediction in the final average. The remaining
        ``1 - stack_weight`` is split equally between the base models, which
        for five models gives the previous fixed weights of 0.15 each.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Weighted average of the base predictions and the blender prediction.
    """
    # one column of predictions per base model
    base_preds = hstack([np.asarray(model.predict(X)).reshape(-1, 1) for _, model in models])

    print('Model: Blender, StackRegressor')
    stack_gen = blender.predict(base_preds)
    # np.sqrt(MSE) instead of squared=False: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    print(f'Stack Gen RMSE: {np.sqrt(mean_squared_error(y, stack_gen))}')

    # Weights follow the number of base models instead of assuming exactly five.
    n_models = len(models)
    weights = np.append(np.full(n_models, (1.0 - stack_weight) / n_models), stack_weight)
    blend_inputs = np.concatenate((base_preds, np.reshape(stack_gen, (-1, 1))), axis=1)
    return np.sum(blend_inputs * weights, axis=1)

In [17]:
# Build the unfitted base models, then fit them on the training split and fit
# the blender on their predictions for the validation split.
models = get_models()
blender = fit_ensemble(models, X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val)

ElasticNet(alpha=0.01, l1_ratio=0.3, max_iter=300000, tol=1e-09)
Lasso(alpha=0.003, max_iter=400000, tol=1e-09)
Ridge(alpha=300, tol=1e-09)
XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=0.7, eta=0.1, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=5,
             min_child_weight=0, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=None, nthread=-1, num_parallel_tree=None,
             random_state=None, reg_alpha=0.01, reg_lambda=0.01,
             scale_pos_weight=None, subsample=0.5, tree_method=None,
             validate_parameters=None, verbosity=None)
LGBMRegressor(boosting_type='goss', colsample_bytree=0.5, feature_fraction=0.7,
              feature_fraction_bynode=0.7, feature_fraction_seed=6,
              lambda_l1=0.01, lambda_l2=0.1, n_estimators=1000, num_leaves=

In [18]:
# Report train and validation RMSE for every base model.
for name, model in models:
    # fit the model on the training split
    model.fit(X_train, y_train)
    print(f'Model: {name}')
    yhat_train = model.predict(X_train)
    yhat_val = model.predict(X_val)
    # np.sqrt(MSE) instead of squared=False: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    print(f'Train rmse: {np.sqrt(mean_squared_error(y_train, yhat_train))}')
    print(f'Val rmse: {np.sqrt(mean_squared_error(y_val, yhat_val))}')
    print('-------------------------------')

Model: elasticNet
Train rmse: 0.09194476768336686
Val rmse: 0.10326579327091416
-------------------------------
Model: lasso
Train rmse: 0.09191378018721592
Val rmse: 0.1036235152449364
-------------------------------
Model: Ridge
Train rmse: 0.08886586208671236
Val rmse: 0.10661316733953716
-------------------------------
Model: xgboost
Train rmse: 0.0011188035439923024
Val rmse: 0.11692198025662312
-------------------------------
Model: lightbgm
Train rmse: 0.01497783809459202
Val rmse: 0.12114754103279138
-------------------------------


In [19]:
# RMSE of the weighted blend on both splits.
# np.sqrt(MSE) is used instead of squared=False, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
# NOTE: the blender was fitted on the validation split, so the validation
# figure below is not an out-of-sample estimate.
print('Training Set---------')
yhat_train = predict_ensemble(models, blender, X_train, y_train)
score = np.sqrt(mean_squared_error(y_train, yhat_train))
print('Blending train rmse: %.7f' % score)

print('\nValidation Set---------')
yhat_val = predict_ensemble(models, blender, X_val, y_val)
score = np.sqrt(mean_squared_error(y_val, yhat_val))
print('Blending val rmse: %.7f' % score)

Training Set---------
Model: Blender, StackRegressor
Stack Gen RMSE: 0.07332640294370807
Blending train rmse: 0.0583444

Validation Set---------
Model: Blender, StackRegressor
Stack Gen RMSE: 0.09156254158733766
Blending val rmse: 0.0984635


## Submission

In [80]:
def submission_predict(models, blender, X, stack_weight=0.25):
    """Predict with the blending ensemble on unlabeled data.

    Parameters
    ----------
    models : list of (name, estimator)
        Fitted base models; each must expose ``predict(X)``.
    blender : fitted estimator
        Meta-model whose ``predict`` takes the matrix of base predictions
        (one column per entry of `models`).
    X : array-like
        Features to predict on.
    stack_weight : float, default=0.25
        Weight of the blender's prediction in the final average. The remaining
        ``1 - stack_weight`` is split equally between the base models, which
        for five models gives the previous fixed weights of 0.15 each.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Weighted average of the base predictions and the blender prediction.
    """
    # one column of predictions per base model
    base_preds = hstack([np.asarray(model.predict(X)).reshape(-1, 1) for _, model in models])
    stacked = np.reshape(blender.predict(base_preds), (-1, 1))

    # Weights follow the number of base models instead of assuming exactly five.
    n_models = len(models)
    weights = np.append(np.full(n_models, (1.0 - stack_weight) / n_models), stack_weight)
    blend_inputs = np.concatenate((base_preds, stacked), axis=1)
    return np.sum(blend_inputs * weights, axis=1)

In [81]:
# Reload the test features and scale them with the scaler that was fitted on
# the training split, keeping the original column names and row index.
test = pd.read_csv('test_v2.csv').pipe(
    lambda raw: pd.DataFrame(stdSc.transform(raw), index=raw.index, columns=raw.columns)
)
prediction = submission_predict(models, blender, test)
prediction

array([11.68677344, 12.001685  , 12.09417187, ..., 11.97469046,
       11.66828026, 12.29861061])

In [82]:
# expm1 undoes a log1p transform; presumably SalePrice was log1p-transformed
# when train_v2.csv was prepared. TODO confirm against the preprocessing step.
# The result goes into a new name so `prediction` is not overwritten:
# re-running this cell used to apply expm1 a second time.
sale_price = np.expm1(prediction)
submission = pd.DataFrame(sale_price, columns=['SalePrice'], index=pd.read_csv('test.csv').Id)
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,118986.465744
1462,163028.264559
1463,178825.595405
1464,192027.316499
1465,179278.509608


In [83]:
# External submissions that are averaged with this notebook's predictions.
blend_1, blend_2, blend_3, blend_4 = map(
    pd.read_csv,
    (
        'blend_dataset/House_Prices_submit.csv',
        'blend_dataset/best_submission.csv',
        'blend_dataset/hybrid_solution.csv',
        'blend_dataset/lasso_sol22_Median.csv',
    ),
)

In [84]:
# Index by Id so the later average aligns rows on Id.
blend_1 = blend_1.set_index('Id')
blend_1.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,121118.161176
1462,166296.677557
1463,185264.902418
1464,195140.874098
1465,190226.389555


In [85]:
# Index by Id so the later average aligns rows on Id.
blend_2 = blend_2.set_index('Id')
blend_2.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,122242.0
1462,163769.0
1463,184763.0
1464,199176.0
1465,191727.0


In [86]:
# Index by Id so the later average aligns rows on Id.
blend_3 = blend_3.set_index('Id')
blend_3.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,120635.089551
1462,166995.910189
1463,188855.10508
1464,201677.418621
1465,189999.640754


In [87]:
# Swap the two header names: in this file the column headed 'SalePrice' is
# used as the Id and the column headed 'id' as the price.
blend_4 = blend_4.rename({'SalePrice': 'Id', 'id': 'SalePrice'}, axis='columns')

In [88]:
# Index by Id so the later average aligns rows on Id.
blend_4 = blend_4.set_index('Id')
blend_4.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,123869.209797
1462,160819.061703
1463,178337.500093
1464,199800.787062
1465,198795.333026


In [89]:
# Equal-weight (0.2 each) average of this notebook's prediction and the four
# external submissions; the Series are aligned on their Id index.
# NOTE: `submission` is overwritten, so re-running this cell blends again.
submission['SalePrice'] = sum(
    0.2 * frame.SalePrice
    for frame in (submission, blend_1, blend_2, blend_3, blend_4)
)
submission = np.floor(submission)
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,121370.0
1462,164181.0
1463,183209.0
1464,197564.0
1465,190005.0


In [90]:
# plt.boxplot(submission)

In [91]:
# Crude adjustment of the extreme predictions: values at or below the 1st
# percentile are scaled by 0.8 and values at or above the 99th by 1.1.
# Both thresholds are taken before either adjustment is applied.
q1 = submission['SalePrice'].quantile(0.01)
q2 = submission['SalePrice'].quantile(0.99)

price = submission['SalePrice']
submission['SalePrice'] = price.where(price > q1, price * 0.8)
price = submission['SalePrice']
submission['SalePrice'] = price.where(price < q2, price * 1.1)
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,121370.0
1462,164181.0
1463,183209.0
1464,197564.0
1465,190005.0


In [92]:
# Round prices down to whole numbers again (the tail scaling can reintroduce
# fractions) and write the submission file, with Id as the index column.
submission = submission.apply(np.floor)
submission.to_csv('BE_FinalEstimatorForest_v5.csv')
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,121370.0
1462,164181.0
1463,183209.0
1464,197564.0
1465,190005.0


In [None]:
def predict_stackgen(models, blender, X, y):
    """Return the blender's prediction alone, without the weighted blend.

    Parameters
    ----------
    models : list of (name, estimator)
        Fitted base models; each must expose ``predict(X)``.
    blender : fitted estimator
        Meta-model whose ``predict`` takes the matrix of base predictions
        (one column per entry of `models`).
    X : array-like
        Features to predict on.
    y : array-like
        True targets for `X`; used only for the printed RMSE.

    Returns
    -------
    numpy.ndarray
        Output of ``blender.predict`` on the base-model predictions.
    """
    # one column of predictions per base model
    base_preds = hstack([np.asarray(model.predict(X)).reshape(-1, 1) for _, model in models])

    print('Model: Blender, StackRegressor')
    # Predict once and reuse the result for both the report and the return value.
    stack_gen = blender.predict(base_preds)
    # np.sqrt(MSE) instead of squared=False: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    print(f'Stack Gen RMSE: {np.sqrt(mean_squared_error(y, stack_gen))}')
    return stack_gen

In [None]:
# RMSE of the stacking blender alone on both splits.
# The validation half previously called predict_ensemble, so the two numbers
# came from different predictors; both now use predict_stackgen.
# np.sqrt(MSE) is used instead of squared=False, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
print('Training Set---------')
yhat_train = predict_stackgen(models, blender, X_train, y_train)
score = np.sqrt(mean_squared_error(y_train, yhat_train))
print('Blending train rmse: %.7f' % score)

print('\nValidation Set---------')
yhat_val = predict_stackgen(models, blender, X_val, y_val)
score = np.sqrt(mean_squared_error(y_val, yhat_val))
print('Blending val rmse: %.7f' % score)