# Competition where the metric is the RMSE of the log of sale prices
[Competition](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data?select=train.csv)

In [1]:
# Work from the project folder (~/kaggle) so that the relative data and
# output paths used in later cells resolve against it.
import os

home_dir = os.path.expanduser('~')
os.chdir(os.path.join(home_dir, 'kaggle'))

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load data into dataframes

In [3]:
# Both competition CSVs sit in the same folder, relative to the working directory.
data_dir = os.path.join('data', 'house-prices-advanced-regression')

train_path = os.path.join(data_dir, 'train.csv')
test_path = os.path.join(data_dir, 'test.csv')

train_data = pd.read_csv(train_path)
# The untouched test frame is kept because its Id column is needed when the
# submission file is written.
test_data_original = pd.read_csv(test_path)

## Select numerical and categorical variables, and separate target from features

In [4]:
# The competition is scored on log prices, so the models are trained on
# log(SalePrice) directly.
y = np.log(train_data['SalePrice'])

# Keep only model inputs: drop the target, the row identifier and LowQualFinSF
# (excluded from the feature set) from train, and the matching columns from test.
train_data = train_data.drop(columns=['SalePrice', 'Id', 'LowQualFinSF'])
test_data = test_data_original.drop(columns=['Id', 'LowQualFinSF'])

# MSSubClass has a numeric dtype but is routed to the categorical pipeline,
# so it is taken out of the numerical list.
numerical_columns = list(train_data.select_dtypes(include=np.number).columns)
numerical_columns.remove('MSSubClass')

categorical_columns = [col for col in train_data.columns if col not in numerical_columns]

print(train_data.shape, test_data.shape)

(1460, 78) (1459, 78)


## Some EDA and feature removal

In [5]:
# Histogram used to eyeball the numerical distributions (left disabled):
# fig = train_data.hist(column=numerical_columns, figsize=(20, 20), rwidth=0.9)

# Numerical features that are sent through a PowerTransformer instead of a
# plain StandardScaler in the preprocessing step.
gaussable_columns = [
    'LotFrontage', 'OverallQual', 'OverallCond',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
    'BedroomAbvGr', 'TotRmsAbvGrd', 'GarageArea',
]


## Add a missing-data indicator column for every column with missing data

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

class AddMissingIndicator(BaseEstimator, TransformerMixin):
    """Append a 0/1 ``<column>_is_missing`` flag for every column that had NaNs at fit time.

    The set of flagged columns is learned in ``fit`` and reused unchanged in
    ``transform``, so the output always has the same columns regardless of
    which values happen to be missing in the data being transformed.

    Attributes
    ----------
    num_cols_with_na : list
        Labels of the columns that contained at least one missing value in
        the data passed to ``fit``.
    """

    def __init__(self):
        self.num_cols_with_na = []

    def fit(self, X, y=None):
        """Record which columns of ``X`` contain at least one missing value.

        ``X`` is wrapped with ``pd.DataFrame``; ``y`` is ignored.
        Returns ``self``.
        """
        df = pd.DataFrame(X)
        self.num_cols_with_na = [c for c in df.columns if df[c].isna().any()]
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` as a DataFrame with the indicator columns appended.

        Each indicator is 1 where the source value is missing and 0 otherwise.
        The original columns are returned untouched (still containing NaN).
        """
        # Explicit copy: pd.DataFrame(X) may share data with X, and the new
        # columns must never leak into the caller's frame.
        df_to_transform = pd.DataFrame(X).copy()
        for c in self.num_cols_with_na:
            missing_col_name = f"{c}_is_missing"
            # Read from the wrapped frame rather than X so the lookup does not
            # depend on X itself supporting label indexing and .isna().
            df_to_transform[missing_col_name] = np.where(df_to_transform[c].isna(), 1, 0)

        return df_to_transform


# Categorical features: fill NaN with an explicit 'Missing' level, then one-hot
# encode. handle_unknown='ignore' keeps transform from failing on categories
# that were not seen during fit.
#
# The encoder is left at its default (sparse) output on purpose: the keyword that
# requests dense output was `sparse=` up to scikit-learn 1.1 and `sparse_output=`
# from 1.2 on (`sparse=` was later removed), so neither spelling works on every
# version. Dense output is enforced once, on the ColumnTransformer below.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('OHE', OneHotEncoder(handle_unknown='ignore'))
])

# Numerical features that are not power-transformed get a plain standard scaling.
normal_numeric_columns = [c for c in numerical_columns if c not in gaussable_columns]

# Missing-value flags are added before imputation, otherwise the information
# about which values were missing would be lost.
standard_num_pipeline = Pipeline(steps=[
    ('missing_indicator', AddMissingIndicator()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Same as above, but with a PowerTransformer in place of the StandardScaler.
gauss_num_pipeline = Pipeline(steps=[
    ('missing_indicator', AddMissingIndicator()),
    ('imputer', SimpleImputer(strategy='median')),
    ('power_scaler', PowerTransformer())
])


# sparse_threshold=0 makes the ColumnTransformer always return a dense array,
# even though the one-hot block is produced as a sparse matrix.
preprocessing = ColumnTransformer(
    transformers=[
        ('normal_numerical', standard_num_pipeline, normal_numeric_columns),
        ('gauss_numerical', gauss_num_pipeline, gaussable_columns),
        ('categorical', cat_pipeline, categorical_columns)
    ],
    sparse_threshold=0
)



In [7]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error 

## Test different regression models

In [8]:
from sklearn.linear_model import Lasso, Ridge, BayesianRidge
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor


from sklearn.base import clone

# Candidate regressors with (mostly) default hyper-parameters, keyed by the
# name that is also used as the pipeline step name (and therefore as the
# '<name>__<param>' prefix in the grid searches).
models = {
    'ridge': Ridge(),
    'lasso': Lasso(random_state=1),
    'xgboost': XGBRegressor(random_state=1, eval_metric='rmse'),
    'GBR': GradientBoostingRegressor(loss='squared_error', random_state=1),
    'lgbm': LGBMRegressor(),
    'cat_boost': CatBoostRegressor(silent=True),
    'b_ridge': BayesianRidge()
}

# One pipeline per model. Every pipeline gets its own clone of the unfitted
# preprocessing transformer: sharing a single instance would mean that fitting
# one pipeline directly silently refits the preprocessor used by all the others.
pipelines = {
    name: Pipeline(steps=[
        ('preprocessing', clone(preprocessing)),
        (name, model)
    ])
    for name, model in models.items()
}

def get_rmse(predictor, X, y, cv=5):
    """Return the mean cross-validated 'neg_root_mean_squared_error' of ``predictor``.

    NOTE: despite the function name, the returned value is the *negated* RMSE
    (the scorer is "greater is better"), so it is <= 0. Negate the result to
    get the RMSE itself.

    Parameters
    ----------
    predictor : estimator passed to ``cross_val_score``.
    X, y : features and target passed to ``cross_val_score``.
    cv : cross-validation setting forwarded to ``cross_val_score``
        (default 5, the value that used to be hard-coded).

    Returns
    -------
    Mean of the per-fold scores.
    """
    fold_scores = cross_val_score(predictor, X, y, scoring='neg_root_mean_squared_error', cv=cv)
    return fold_scores.mean()


# Baseline: cross-validated score of every pipeline before any tuning.
# get_rmse scores with 'neg_root_mean_squared_error', so the printed numbers
# are negated RMSE values (closer to zero is better).
for model_name, pipeline in pipelines.items():
    neg_rmse = get_rmse(pipeline, train_data, y)
    print(f"Model: {model_name}")
    print(f'RMSE: {neg_rmse}')
    print()


Model: ridge
RMSE: -0.1328379901771169

Model: lasso
RMSE: -0.3992282792085989

Model: xgboost
RMSE: -0.13945533982395122

Model: GBR
RMSE: -0.1265072676050559

Model: lgbm
RMSE: -0.13256667689915594

Model: cat_boost
RMSE: -0.11871624177756683

Model: b_ridge
RMSE: -0.12952290582498036



## GridSearchCV parameters for models with a small number of parameters

In [47]:
# Search spaces for the linear models, keyed by pipeline name. Parameter keys
# use the '<step name>__<parameter>' form so they address the model step
# inside each pipeline.
param_grid_small_model = {
    'ridge': {
        'ridge__alpha': np.logspace(1.5, 1.8, num=11),
        # NOTE(review): tol may have no effect with Ridge's default solver,
        # which would make this axis redundant - confirm before widening it.
        'ridge__tol': np.logspace(-6, -5, num=11),
    },
    'lasso': {
        'lasso__alpha': np.logspace(-2, 0, num=11),
        'lasso__tol': np.logspace(-4, -3.5, num=2),
    },
}

In [48]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid search for every model that has an entry in
# param_grid_small_model, using GridSearchCV's default cross-validation.
for model_name, grid in param_grid_small_model.items():
    print(f"Model name: {model_name}")
    pipe = pipelines[model_name]
    search = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        scoring='neg_root_mean_squared_error',
    )
    search.fit(train_data, y)
    print(f'Best values: {search.best_params_}')
    # The scorer is the negated RMSE (not MAE), so label the value accordingly.
    print(f'Best score (negative RMSE): {search.best_score_}')
    print()

Model name: ridge
Best values: {'ridge__alpha': 31.622776601683793, 'ridge__tol': 1e-06}
Best score MAE: -0.13034821571196342

Model name: lasso
Best values: {'lasso__alpha': 0.01, 'lasso__tol': 0.0001}
Best score MAE: -0.1484399573235937



## Grid search for models with many params

In [29]:
# Staged search spaces, keyed by pipeline name. Each list entry is one stage:
# the stages are searched in order, and the winners of earlier stages are held
# fixed while the next stage's parameters are explored.
consecutive_param_grids = {
    'xgboost': [
        # Stage 0: tree depth and number of trees (the single-value entries
        # just pin those settings).
        {
            'xgboost__max_depth': range(1, 4),
            'xgboost__n_estimators': range(200, 500, 40),
            'xgboost__min_child_weight': [1],
            'xgboost__eval_metric': ['rmse'],
            'xgboost__gamma': [0],
        },
        # Stage 1: row and column subsampling fractions.
        {
            'xgboost__subsample': [tenths / 10.0 for tenths in range(6, 11)],
            'xgboost__colsample_bytree': [tenths / 10.0 for tenths in range(2, 9)],
        },
        # Stage 2: L1 regularisation strength.
        {
            'xgboost__reg_alpha': [1e-4, 1e-3, 1e-2],
        },
    ],
}


# Staged grid for GBR, currently disabled:
# consecutive_param_grids['GBR'] = [
#     {
#         'GBR__n_estimators': range(300, 400, 10), 
#         'GBR__subsample': [1],
#         'GBR__loss': ['squared_error'],
#         'GBR__random_state': [1]
#     }, 
#     {
#         'GBR__max_depth': range(2,5), 
#         'GBR__min_samples_split': range(8,20,2),
#     }
# ]


In [30]:
# Staged grid search: for each model, run its stages in order and carry the
# winning values forward into the following stages.
for pipe_name, stage_grids in consecutive_param_grids.items():
    print(pipe_name)
    pipe = pipelines[pipe_name]
    best_params = {}
    for step, stage_grid in enumerate(stage_grids):
        # Earlier winners are stored as one-element lists, so merging them
        # with this stage's grid pins them while the new parameters vary.
        sum_of_settings = dict(best_params)
        sum_of_settings.update(stage_grid)
        print(sum_of_settings)
        stage_search = GridSearchCV(pipe, param_grid=sum_of_settings, scoring='neg_root_mean_squared_error')
        stage_search.fit(train_data, y)
        for param_name, best_value in stage_search.best_params_.items():
            best_params[param_name] = [best_value]
        print(f'Best settings on step {step}: {best_params}')
        # The scorer is negated RMSE, so this value is <= 0.
        print(f'RMSE: {stage_search.best_score_}')

    print()
    print(f"Model {pipe_name} best params: ")
    print(best_params)


xgboost
{'xgboost__max_depth': range(1, 4), 'xgboost__n_estimators': range(200, 500, 40), 'xgboost__min_child_weight': [1], 'xgboost__eval_metric': ['rmse'], 'xgboost__gamma': [0]}
Best settings on step 0: {'xgboost__eval_metric': ['rmse'], 'xgboost__gamma': [0], 'xgboost__max_depth': [2], 'xgboost__min_child_weight': [1], 'xgboost__n_estimators': [320]}
RMSE: -0.12516491630533721
{'xgboost__eval_metric': ['rmse'], 'xgboost__gamma': [0], 'xgboost__max_depth': [2], 'xgboost__min_child_weight': [1], 'xgboost__n_estimators': [320], 'xgboost__subsample': [0.6, 0.7, 0.8, 0.9, 1.0], 'xgboost__colsample_bytree': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]}
Best settings on step 1: {'xgboost__eval_metric': ['rmse'], 'xgboost__gamma': [0], 'xgboost__max_depth': [2], 'xgboost__min_child_weight': [1], 'xgboost__n_estimators': [320], 'xgboost__colsample_bytree': [0.4], 'xgboost__subsample': [1.0]}
RMSE: -0.12477966423507567
{'xgboost__eval_metric': ['rmse'], 'xgboost__gamma': [0], 'xgboost__max_depth': [2

In [14]:
from sklearn.ensemble import VotingRegressor
# Hyper-parameters applied to each pipeline before ensembling, keyed by
# pipeline name. An empty dict means the model keeps the settings it was
# created with.
final_best_params = {
    'xgboost': {
        'xgboost__eval_metric': 'rmse',
        'xgboost__max_depth': 2,
        'xgboost__n_estimators': 320,
        'xgboost__min_child_weight': 1,
        'xgboost__gamma': 0.0,
        'xgboost__colsample_bytree': 0.4,
        'xgboost__subsample': 1,
        'xgboost__reg_alpha': 0.001,
    },
    'GBR': {
        'GBR__loss': 'squared_error',
        'GBR__n_estimators': 380,
        'GBR__random_state': 1,
        'GBR__subsample': 1,
        'GBR__max_depth': 4,
        'GBR__min_samples_split': 8,
    },
    'ridge': {
        'ridge__alpha': 31.622776601683793,
        'ridge__tol': 1e-06,
    },
    'lgbm': {},
    'cat_boost': {},
    'b_ridge': {},
}

# Apply the chosen parameters to every pipeline, score each one, and collect
# them as (name, estimator) pairs for the voting ensemble.
list_of_models = []
scores = []
for pipe_name, tuned_params in final_best_params.items():
    # set the corresponding model with the optimal parameters
    pipe = pipelines[pipe_name].set_params(**tuned_params)
    pipelines[pipe_name] = pipe
    list_of_models.append((pipe_name, pipe))
    neg_rmse = get_rmse(pipe, train_data, y)
    # get_rmse returns the negated RMSE; store the positive error for weighting.
    scores.append(-neg_rmse)
    print(f'Model {pipe_name}: rmse {neg_rmse:.5f}')

# Weight each model by (lowest error / its error): the best model gets weight 1
# and every other model proportionally less.
lowest_error = min(scores)
weights = [lowest_error / score for score in scores]

final_estimator = VotingRegressor(list_of_models, weights=weights)
ensemble_neg_rmse = get_rmse(final_estimator, train_data, y)
print(f'Ensemble model rmse: {ensemble_neg_rmse:.5f}')

Model xgboost: rmse -0.12436
Model GBR: rmse -0.12421
Model ridge: rmse -0.13035
Model lgbm: rmse -0.13257
Model cat_boost: rmse -0.11872
Model b_ridge: rmse -0.12952
Ensemble model rmse: -0.11720


## Stacking model

In [None]:
from sklearn.ensemble import StackingRegressor 
from sklearn.linear_model import LinearRegression

# final_best_params['lasso'] = {
#     'lasso__alpha': 0.01, 
#     'lasso__tol': 0.0001
# }


# Stack every pipeline listed in final_best_params, with a CatBoost regressor
# as the final estimator that combines their predictions.
print(list(final_best_params))
all_estimators = [(name, pipelines[name]) for name in final_best_params]
meta_learner = CatBoostRegressor(silent=True)
stacked_model = StackingRegressor(estimators=all_estimators, final_estimator=meta_learner)


print('Stacked model')
stacked_neg_rmse = get_rmse(stacked_model, train_data, y)
print(f'RMSE: {stacked_neg_rmse:.5f}')

## Generate final submission 

In [28]:
# Fit the voting ensemble on the full training set and predict the test set.
final_predictions = final_estimator.fit(train_data, y).predict(test_data)

# The target was log(SalePrice), so exponentiate to get prices back.
predicted_prices = np.exp(final_predictions)
print(predicted_prices.shape, test_data_original['Id'].shape)

output = pd.DataFrame({
    'Id': test_data_original['Id'],
    'SalePrice': predicted_prices,
})
output.to_csv('code/house-prices-advanced-regression/big_ensemble_model.csv', index=False)

(1459,) (1459,)
