# House Prices Model

## Imports

In [1]:
import numpy as np
import pandas as pd
import math

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb

%load_ext autoreload
%autoreload 2

## Load Data

In [2]:
X_train = pd.read_csv("./processed_dataset/X_train.csv").values
y_train = pd.read_csv("./processed_dataset/y_train.csv").values.ravel()
X_test = pd.read_csv("./processed_dataset/X_test.csv").values

## Model Options

In [3]:
grid_search_options = {
    # choose from sorted(metrics.SCORERS.keys())
    'scoring': 'neg_mean_squared_log_error',

    # jobs to run in parallel (-1 means using all processors)
    'n_jobs': 8,

    # cross-validation splitting strategy
    # (None, to use the default 5-fold cross validation)
    'cv': None,
    'verbose': 10
}

## SVM

In [4]:
# SVR RANDOM
scaler = StandardScaler()
estimator = SVR(cache_size=1000)
pipe = make_pipeline(
    scaler,
    estimator
)

svr_rand_grid = {
    'C': [1, 10, 100], 'kernel': ['linear', 'rbf']
}

svr_rand_grid = {pipe.steps[1][0] + '__' +
                 k: v for k, v in svr_rand_grid.items()}

svr_rand = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=svr_rand_grid,
    n_iter=2,
    **grid_search_options
)

In [5]:
# svr_rand.fit(X_train, y_train)

# print("best estimator: ", svr_rand.best_estimator_)
# print("best cross_validation score: ", round(sqrt(-svr_rand.best_score_), 5))

In [4]:
# # SVR PARAMETERS

# svr_param_grid = {
#     'C': [1, 10, 100], 'kernel': ['linear']
# }

# svr_param_grid = {pipe.steps[1][0] + '__' + k: v for k, v in svr_param_grid.items()}

# svr_grid = GridSearchCV(
#     estimator=SVR(),
#     param_grid=svr_param_grid,
#     **grid_search_options
# )
# svr_grid.fit(X_train, y_train)

# print("best estimator: ", svr_grid.best_estimator_)
# print("best cross_validation score: ", round(sqrt(-svr_grid.best_score_), 5))

## Random Forest

In [5]:
# RF RANDOM

# It is necessary to bundle together the scaler with the estimator
# otherwise the scaler fits all the trainset
# and information leaks int the cross-validation

scaler = StandardScaler()
estimator = RandomForestRegressor()
pipe = make_pipeline(scaler, estimator)

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=500, stop=4000, num=8)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(30, 110, num=9)]
max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

rf_rand_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    #     'min_samples_split': min_samples_split,
    #     'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}

rf_rand_grid = {pipe.steps[1][0] + '__' +
                k: v for k, v in rf_rand_grid.items()}

rf_rand = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=rf_rand_grid,
    n_iter=10,
    **grid_search_options
)

In [6]:
# rf_rand.fit(X_train, y_train)

# print("best estimator: ", rf_rand.best_estimator_)
# print("best cross_validation score: ", round(sqrt(-rf_rand.best_score_), 5))

In [None]:
# # RF PARAMETERS

# rf_param_grid = [
#     {'n_estimators': [800, 1400, 1600]},
#     {'max_features': ['auto', 'sqrt']},
#     {'max_depth': [None, 50, 60, 70]},
#     {'bootstrap': [True, False]}
# ]

# rf_grid = GridSearchCV(
#     estimator=RandomForestRegressor(),
#     param_grid=rf_param_grid,
#     **grid_search_options
# )

# rf_grid.fit(X_train, y_train)

# print("best estimator: ", rf_grid.best_estimator_)
# print("best cross_validation score: ", round(sqrt(-rf_grid.best_score_),5))

## XGBoost

In [8]:
xgboost = xgb.XGBRegressor(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    silent=1,
    random_state=7,
    nthread=-1
)

xgboost = xgb.XGBRegressor(
    learning_rate=0.01,
    n_estimators=3460,
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    nthread=-1,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.00006
)

In [10]:
xgb_rand_grid = {
    'learning_rate': [0.01, 0.05],
    'n_estimators': [2000, 3000],
    'max_depth': [3, 4],
    'min_child_weight': [0, 1, 2],
    'gamma': [0, 0.5, 1, 2],
    'subsample': [0.5, 0.6, 0.7],
    'colsample_bytree': [0.4, 0.6, 0.7],
    'objective': ['reg:squarederror'],
    'nthread': [-1]
}

xgboost = xgb.XGBRegressor()

xgb_random = RandomizedSearchCV(
    estimator=xgboost, 
    param_distributions=xgb_rand_grid,
    n_iter=10,
    **grid_search_options
)

In [None]:
xgb_random.fit(X_train, y_train)

In [13]:
print("best estimator: ", xgb_random.best_estimator_)
print("best cross_validation score: ", round(np.sqrt(-xgb_random.best_score_), 5))

best estimator:  XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, max_delta_step=0, max_depth=4,
             min_child_weight=2, missing=nan, monotone_constraints='()',
             n_estimators=3000, n_jobs=-1, nthread=-1, num_parallel_tree=1,
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=0.7, tree_method='exact', validate_parameters=1,
             verbosity=None)
best cross_validation score:  0.00884


In [16]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))


# def cv_rmse(model, X, y):
#     kfolds = KFold(n_splits=5, shuffle=True)
#     rmse = np.sqrt(-cross_val_score(model, X, y,
#                                     scoring="neg_mean_squared_error", cv=kfolds))
#     return (rmse)

In [10]:
# cv_rmse(xgboost, X_train, y_train)

array([0.11513306, 0.11096707, 0.10428389, 0.11088586, 0.12052089])

In [11]:
# xgboost.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.01, max_delta_step=0, max_depth=3,
             min_child_weight=0, missing=nan, monotone_constraints='()',
             n_estimators=3460, n_jobs=-1, nthread=-1, num_parallel_tree=1,
             random_state=27, reg_alpha=6e-05, reg_lambda=1, scale_pos_weight=1,
             seed=27, subsample=0.7, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [12]:
# cv_rmse(rf_rand, X_train, y_train)

## Select Best Model

In [14]:
# select model
model = xgb_random

# predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

In [17]:
# test set evaluation
rmse = rmsle(y=y_train, y_pred=y_pred_train)
print("trainset RMSE: ", round(rmse, 5))

trainset RMSE:  0.03796


In [18]:
df_test = pd.read_csv('./input/test.csv')

predictions = pd.DataFrame(np.exp(y_pred_test), columns=['SalePrice'])
submission = pd.concat([df_test['Id'], predictions], axis=1)
submission.head()

Unnamed: 0,Id,SalePrice
0,1461,122013.078125
1,1462,159874.03125
2,1463,191197.875
3,1464,192373.375
4,1465,181166.375


In [19]:
submission['SalePrice'].describe()

count      1459.000000
mean     178454.218750
std       78826.281250
min       39919.558594
25%      127867.539062
50%      156499.156250
75%      209989.687500
max      662572.250000
Name: SalePrice, dtype: float64

In [20]:
submission.to_csv('output/submission.csv', index=False)

In [22]:
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f ./output/submission.csv -m test_upload

Successfully submitted to House Prices: Advanced Regression Techniques



  0%|          | 0.00/22.6k [00:00<?, ?B/s]
 35%|###5      | 8.00k/22.6k [00:00<00:00, 53.9kB/s]
100%|##########| 22.6k/22.6k [00:03<00:00, 6.18kB/s]


## Appendix (unused code)

In [None]:
# mse = make_scorer(mean_squared_error ,greater_is_better=False)

In [None]:
# r1 = SVR(C=1000, kernel='linear')
# r2 = RandomForestRegressor(n_estimators=1000)

# er = VotingRegressor(
#     estimators=[('svr', r1), ('rf', r2)],
#     n_jobs=-1,
#     verbose=True
# )

# er.fit(X_train, y_train)
# y_pred = er.predict(X_test)