In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

import os
import random

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

SEED = 42
seed_everything(SEED)

In [2]:
df = pd.read_csv('predict_home_value.csv')
df.head()

Unnamed: 0,ID,LOTAREA,BLDGTYPE,HOUSESTYLE,OVERALLCOND,YEARBUILT,ROOFSTYLE,EXTERCOND,FOUNDATION,BSMTCOND,...,GARAGETYPE,GARAGEFINISH,GARAGECARS,GARAGECOND,POOLAREA,POOLQC,FENCE,MOSOLD,YRSOLD,SALEPRICE
0,1,8450,1Fam,2Story,5,2003,Gable,TA,PConc,TA,...,Attchd,RFn,2,TA,0,,,2,2008,208500
1,2,9600,1Fam,1Story,8,1976,Gable,TA,CBlock,TA,...,Attchd,RFn,2,TA,0,,,5,2007,181500
2,3,11250,1Fam,2Story,5,2001,Gable,TA,PConc,TA,...,Attchd,RFn,2,TA,0,,,9,2008,223500
3,4,9550,1Fam,2Story,5,1915,Gable,TA,BrkTil,Gd,...,Detchd,Unf,3,TA,0,,,2,2006,140000
4,5,14260,1Fam,2Story,5,2000,Gable,TA,PConc,TA,...,Attchd,RFn,3,TA,0,,,12,2008,250000


In [3]:
df.shape

(1460, 32)

In [4]:
cat_cols = ['BLDGTYPE','HOUSESTYLE','ROOFSTYLE','EXTERCOND','FOUNDATION','BSMTCOND','HEATING','HEATINGQC','CENTRALAIR','ELECTRICAL','KITCHENQUAL','FIREPLACEQU','GARAGETYPE','GARAGEFINISH','GARAGECOND','POOLQC','FENCE']
num_cols = df.select_dtypes(include=['int64']).columns

In [5]:
df[cat_cols].head()

Unnamed: 0,BLDGTYPE,HOUSESTYLE,ROOFSTYLE,EXTERCOND,FOUNDATION,BSMTCOND,HEATING,HEATINGQC,CENTRALAIR,ELECTRICAL,KITCHENQUAL,FIREPLACEQU,GARAGETYPE,GARAGEFINISH,GARAGECOND,POOLQC,FENCE
0,1Fam,2Story,Gable,TA,PConc,TA,GasA,Ex,Y,SBrkr,Gd,,Attchd,RFn,TA,,
1,1Fam,1Story,Gable,TA,CBlock,TA,GasA,Ex,Y,SBrkr,TA,TA,Attchd,RFn,TA,,
2,1Fam,2Story,Gable,TA,PConc,TA,GasA,Ex,Y,SBrkr,Gd,TA,Attchd,RFn,TA,,
3,1Fam,2Story,Gable,TA,BrkTil,Gd,GasA,Gd,Y,SBrkr,Gd,Gd,Detchd,Unf,TA,,
4,1Fam,2Story,Gable,TA,PConc,TA,GasA,Ex,Y,SBrkr,Gd,TA,Attchd,RFn,TA,,


### Imputing NaN values and Encoding

In [6]:
impute_categorical = SimpleImputer(strategy="most_frequent")
onehot_categorical = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [7]:
df[cat_cols] = pd.DataFrame(impute_categorical.fit_transform(df[cat_cols]), columns=cat_cols)

In [8]:
df_cat = pd.DataFrame(onehot_categorical.fit_transform(df[cat_cols]), columns=onehot_categorical.get_feature_names())

In [9]:
df_pd = pd.concat([df[num_cols], df_cat], axis=1)

In [10]:
df_pd.shape
df_pd.head()

Unnamed: 0,ID,LOTAREA,OVERALLCOND,YEARBUILT,FULLBATH,HALFBATH,BEDROOMABVGR,KITCHENABVGR,TOTRMSABVGRD,FIREPLACES,...,x14_Gd,x14_Po,x14_TA,x15_Ex,x15_Fa,x15_Gd,x16_GdPrv,x16_GdWo,x16_MnPrv,x16_MnWw
0,1,8450,5,2003,2,1,3,1,8,0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,2,9600,8,1976,2,0,3,1,6,1,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,3,11250,5,2001,2,1,3,1,6,1,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,4,9550,5,1915,1,0,3,1,7,1,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,5,14260,5,2000,2,1,4,1,9,1,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [11]:
from sklearn.model_selection import train_test_split

feature_names = df_pd.columns

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X = df_pd.drop(['ID','SALEPRICE'], axis=1)
y = df_pd['SALEPRICE']

X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, shuffle=True, random_state=42)

In [12]:
print(df_pd.shape)
print("\n")
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
#print("\n")
print("X_test: ", X_test.shape)
print("y_test: ", y_test.shape)

(1460, 97)


X_train:  (1022, 95)
y_train:  (1022,)
X_test:  (438, 95)
y_test:  (438,)


## HyperOpt

In [13]:
#import required packages
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import gc
from hyperopt import hp, tpe, Trials, STATUS_OK
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample
#optional but advised
import warnings
warnings.filterwarnings('ignore')

#GLOBAL HYPEROPT PARAMETERS
NUM_EVALS = 1000 #number of hyperopt evaluation rounds
N_FOLDS = 5 #number of cross-validation folds on data in each evaluation round

#LIGHTGBM PARAMETERS
LGBM_MAX_LEAVES = 2**11 #maximum number of leaves per tree for LightGBM
LGBM_MAX_DEPTH = 25 #maximum tree depth for LightGBM
EVAL_METRIC_LGBM_REG = 'mae' #LightGBM regression metric. Note that 'rmse' is more commonly used 
EVAL_METRIC_LGBM_CLASS = 'auc'#LightGBM classification metric

#XGBOOST PARAMETERS
XGB_MAX_LEAVES = 2**12 #maximum number of leaves when using histogram splitting
XGB_MAX_DEPTH = 25 #maximum tree depth for XGBoost
EVAL_METRIC_XGB_REG = 'mae' #XGBoost regression metric
EVAL_METRIC_XGB_CLASS = 'auc' #XGBoost classification metric

#CATBOOST PARAMETERS
CB_MAX_DEPTH = 8 #maximum tree depth in CatBoost
OBJECTIVE_CB_REG = 'MAE' #CatBoost regression metric
OBJECTIVE_CB_CLASS = 'Logloss' #CatBoost classification metric

#OPTIONAL OUTPUT
BEST_SCORE = 0

def quick_hyperopt(data, labels, package='lgbm', num_evals=NUM_EVALS, diagnostic=False):
    
    #==========
    #LightGBM
    #==========
    
    if package=='lgbm':
        
        print('Running {} rounds of LightGBM parameter optimisation:'.format(num_evals))
        #clear space
        gc.collect()
        
        integer_params = ['max_depth',
                         'num_leaves',
                          'max_bin',
                         'min_data_in_leaf',
                         'min_data_in_bin']
        
        def objective(space_params):
            
            #cast integer params from float to int
            for param in integer_params:
                space_params[param] = int(space_params[param])
            
            #extract nested conditional parameters
            if space_params['boosting']['boosting'] == 'goss':
                top_rate = space_params['boosting'].get('top_rate')
                other_rate = space_params['boosting'].get('other_rate')
                #0 <= top_rate + other_rate <= 1
                top_rate = max(top_rate, 0)
                top_rate = min(top_rate, 0.5)
                other_rate = max(other_rate, 0)
                other_rate = min(other_rate, 0.5)
                space_params['top_rate'] = top_rate
                space_params['other_rate'] = other_rate
            
            subsample = space_params['boosting'].get('subsample', 1.0)
            space_params['boosting'] = space_params['boosting']['boosting']
            space_params['subsample'] = subsample
            
            #for classification, set stratified=True and metrics=EVAL_METRIC_LGBM_CLASS
            cv_results = lgb.cv(space_params, train, nfold = N_FOLDS, stratified=False,
                                early_stopping_rounds=100, metrics=EVAL_METRIC_LGBM_REG, seed=42)
            
            best_loss = cv_results['l1-mean'][-1] #'l2-mean' for rmse
            #for classification, comment out the line above and uncomment the line below:
            #best_loss = 1 - cv_results['auc-mean'][-1]
            #if necessary, replace 'auc-mean' with '[your-preferred-metric]-mean'
            return{'loss':best_loss, 'status': STATUS_OK }
        
        train = lgb.Dataset(data, labels, free_raw_data = False)
                
        #integer and string parameters, used with hp.choice()
        boosting_list = [{'boosting': 'gbdt',
                          'subsample': hp.uniform('subsample', 0.5, 1)},
                         {'boosting': 'goss',
                          'subsample': 1.0,
                         'top_rate': hp.uniform('top_rate', 0, 0.5),
                         'other_rate': hp.uniform('other_rate', 0, 0.5)}] #if including 'dart', make sure to set 'n_estimators'
        metric_list = ['MAE', 'RMSE'] 
        #for classification comment out the line above and uncomment the line below
        #metric_list = ['auc'] #modify as required for other classification metrics
        objective_list_reg = ['huber', 'gamma', 'fair', 'tweedie']
        objective_list_class = ['binary', 'cross_entropy']
        #for classification set objective_list = objective_list_class
        objective_list = objective_list_reg

        space ={'boosting' : hp.choice('boosting', boosting_list),
                'num_leaves' : hp.quniform('num_leaves', 2, LGBM_MAX_LEAVES, 1),
                'max_depth': hp.quniform('max_depth', 2, LGBM_MAX_DEPTH, 1),
                'max_bin': hp.quniform('max_bin', 32, 255, 1),
                'min_data_in_leaf': hp.quniform('min_data_in_leaf', 1, 256, 1),
                'min_data_in_bin': hp.quniform('min_data_in_bin', 1, 256, 1),
                'min_gain_to_split' : hp.quniform('min_gain_to_split', 0.1, 5, 0.01),
                'lambda_l1' : hp.uniform('lambda_l1', 0, 5),
                'lambda_l2' : hp.uniform('lambda_l2', 0, 5),
                'learning_rate' : hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'metric' : hp.choice('metric', metric_list),
                'objective' : hp.choice('objective', objective_list),
                'feature_fraction' : hp.quniform('feature_fraction', 0.5, 1, 0.01),
                'bagging_fraction' : hp.quniform('bagging_fraction', 0.5, 1, 0.01),
                'verbose': -1,
                'n_jobs': 1
            }
        
        #optional: activate GPU for LightGBM
        #follow compilation steps here:
        #https://www.kaggle.com/vinhnguyen/gpu-acceleration-for-lightgbm/
        #then uncomment lines below:
        #space['device'] = 'gpu'
        #space['gpu_platform_id'] = 0,
        #space['gpu_device_id'] =  0

        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
                
        #fmin() will return the index of values chosen from the lists/arrays in 'space'
        #to obtain actual values, index values are used to subset the original lists/arrays
        best['boosting'] = boosting_list[best['boosting']]['boosting']#nested dict, index twice
        best['metric'] = metric_list[best['metric']]
        best['objective'] = objective_list[best['objective']]
                
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    #==========
    #XGBoost
    #==========
    
    if package=='xgb':
        
        print('Running {} rounds of XGBoost parameter optimisation:'.format(num_evals))
        #clear space
        gc.collect()
        
        integer_params = ['max_depth']
        
        def objective(space_params):
            
            for param in integer_params:
                space_params[param] = int(space_params[param])
                
            #extract multiple nested tree_method conditional parameters
            #libera te tutemet ex inferis
            if space_params['tree_method']['tree_method'] == 'hist':
                max_bin = space_params['tree_method'].get('max_bin')
                space_params['max_bin'] = int(max_bin)
                if space_params['tree_method']['grow_policy']['grow_policy']['grow_policy'] == 'depthwise':
                    grow_policy = space_params['tree_method'].get('grow_policy').get('grow_policy').get('grow_policy')
                    space_params['grow_policy'] = grow_policy
                    space_params['tree_method'] = 'hist'
                else:
                    max_leaves = space_params['tree_method']['grow_policy']['grow_policy'].get('max_leaves')
                    space_params['grow_policy'] = 'lossguide'
                    space_params['max_leaves'] = int(max_leaves)
                    space_params['tree_method'] = 'hist'
            else:
                space_params['tree_method'] = space_params['tree_method'].get('tree_method')
                
            #for classification replace EVAL_METRIC_XGB_REG with EVAL_METRIC_XGB_CLASS
            cv_results = xgb.cv(space_params, train, nfold=N_FOLDS, metrics=[EVAL_METRIC_XGB_REG],
                             early_stopping_rounds=100, stratified=True, seed=42, verbose_eval=0)
            
            best_loss = cv_results['test-mae-mean'].iloc[-1] #or 'test-rmse-mean' if using RMSE
            #for classification, comment out the line above and uncomment the line below:
            #best_loss = 1 - cv_results['test-auc-mean'].iloc[-1]
            #if necessary, replace 'test-auc-mean' with 'test-[your-preferred-metric]-mean'
            return{'loss':best_loss, 'status': STATUS_OK }
        
        train = xgb.DMatrix(data, labels)
        
        #integer and string parameters, used with hp.choice()
        boosting_list = ['gbtree', 'gblinear'] #if including 'dart', make sure to set 'n_estimators'
        metric_list = ['MAE', 'RMSE'] 
        #for classification comment out the line above and uncomment the line below
        #metric_list = ['auc']
        #modify as required for other classification metrics classification
        
        tree_method = [{'tree_method' : 'exact'},
               {'tree_method' : 'approx'},
               {'tree_method' : 'hist',
                'max_bin': hp.quniform('max_bin', 2**3, 2**7, 1),
                'grow_policy' : {'grow_policy': {'grow_policy':'depthwise'},
                                'grow_policy' : {'grow_policy':'lossguide',
                                                  'max_leaves': hp.quniform('max_leaves', 32, XGB_MAX_LEAVES, 1)}}}]
        
        #if using GPU, replace 'exact' with 'gpu_exact' and 'hist' with
        #'gpu_hist' in the nested dictionary above
        
        objective_list_reg = ['reg:linear', 'reg:gamma', 'reg:tweedie']
        objective_list_class = ['reg:logistic', 'binary:logistic']
        #for classification change line below to 'objective_list = objective_list_class'
        objective_list = objective_list_reg
        
        space ={'boosting' : hp.choice('boosting', boosting_list),
                'tree_method' : hp.choice('tree_method', tree_method),
                'max_depth': hp.quniform('max_depth', 2, XGB_MAX_DEPTH, 1),
                'reg_alpha' : hp.uniform('reg_alpha', 0, 5),
                'reg_lambda' : hp.uniform('reg_lambda', 0, 5),
                'min_child_weight' : hp.uniform('min_child_weight', 0, 5),
                'gamma' : hp.uniform('gamma', 0, 5),
                'learning_rate' : hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
                'eval_metric' : hp.choice('eval_metric', metric_list),
                'objective' : hp.choice('objective', objective_list),
                'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1, 0.01),
                'colsample_bynode' : hp.quniform('colsample_bynode', 0.1, 1, 0.01),
                'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),
                'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
                'nthread' : -1,
                'verbosity': 0,
            }
        
        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
        
        best['tree_method'] = tree_method[best['tree_method']]['tree_method']
        best['boosting'] = boosting_list[best['boosting']]
        best['eval_metric'] = metric_list[best['eval_metric']]
        best['objective'] = objective_list[best['objective']]
        
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        if 'max_leaves' in best:
            best['max_leaves'] = int(best['max_leaves'])
        if 'max_bin' in best:
            best['max_bin'] = int(best['max_bin'])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    #==========
    #CatBoost
    #==========
    
    if package=='cb':
        
        print('Running {} rounds of CatBoost parameter optimisation:'.format(num_evals))
        
        #clear memory 
        gc.collect()
            
        integer_params = ['depth',
                          #'one_hot_max_size', #for categorical data
                          'min_data_in_leaf',
                          'max_bin']
        
        def objective(space_params):
                        
            #cast integer params from float to int
            for param in integer_params:
                space_params[param] = int(space_params[param])
                
            #extract nested conditional parameters
            if space_params['bootstrap_type']['bootstrap_type'] == 'Bayesian':
                bagging_temp = space_params['bootstrap_type'].get('bagging_temperature')
                space_params['bagging_temperature'] = bagging_temp
                
            if space_params['grow_policy']['grow_policy'] == 'LossGuide':
                max_leaves = space_params['grow_policy'].get('max_leaves')
                space_params['max_leaves'] = int(max_leaves)
                
            space_params['bootstrap_type'] = space_params['bootstrap_type']['bootstrap_type']
            space_params['grow_policy'] = space_params['grow_policy']['grow_policy']
                           
            #random_strength cannot be < 0
            space_params['random_strength'] = max(space_params['random_strength'], 0)
            #fold_len_multiplier cannot be < 1
            space_params['fold_len_multiplier'] = max(space_params['fold_len_multiplier'], 1)
                       
            #for classification set stratified=True
            cv_results = cb.cv(train, space_params, fold_count=N_FOLDS, 
                             early_stopping_rounds=25, stratified=True, partition_random_seed=42)
           
            #best_loss = cv_results['test-MAE-mean'].iloc[-1] #'test-RMSE-mean' for RMSE
            #for classification, comment out the line above and uncomment the line below:
            best_loss = cv_results['test-Logloss-mean'].iloc[-1]
            #if necessary, replace 'test-Logloss-mean' with 'test-[your-preferred-metric]-mean'
            
            return{'loss':best_loss, 'status': STATUS_OK}
        
        train = cb.Pool(data, labels.astype('float32'))
        
        #integer and string parameters, used with hp.choice()
        bootstrap_type = [#{'bootstrap_type':'Poisson'}, 
                           {'bootstrap_type':'Bayesian',
                            'bagging_temperature' : hp.loguniform('bagging_temperature', np.log(1), np.log(50))},
                          {'bootstrap_type':'Bernoulli'}] 
        LEB = ['No', 'AnyImprovement'] #remove 'Armijo' if not using GPU
        #score_function = ['Correlation', 'L2', 'NewtonCorrelation', 'NewtonL2']
        grow_policy = [{'grow_policy':'SymmetricTree'},
                       {'grow_policy':'Depthwise'},
                       {'grow_policy':'Lossguide',
                        'max_leaves': hp.quniform('max_leaves', 2, 32, 1)}]
        eval_metric_list_reg = ['MAE', 'RMSE']
        eval_metric_list_class = ['Logloss', 'AUC']
        #for classification change line below to 'eval_metric_list = eval_metric_list_class'
        eval_metric_list = eval_metric_list_class
                
        space ={'depth': hp.quniform('depth', 2, CB_MAX_DEPTH, 1),
                'max_bin' : hp.quniform('max_bin', 1, 32, 1), #if using CPU just set this to 254
                'l2_leaf_reg' : hp.uniform('l2_leaf_reg', 0, 5),
                'min_data_in_leaf' : hp.quniform('min_data_in_leaf', 1, 50, 1),
                'random_strength' : hp.loguniform('random_strength', np.log(0.005), np.log(5)),
                #'one_hot_max_size' : hp.quniform('one_hot_max_size', 2, 16, 1), #uncomment if using categorical features
                'bootstrap_type' : hp.choice('bootstrap_type', bootstrap_type),
                'learning_rate' : hp.uniform('learning_rate', 0.05, 0.25),
                'eval_metric' : hp.choice('eval_metric', eval_metric_list),
                'objective' : OBJECTIVE_CB_CLASS,
                #'score_function' : hp.choice('score_function', score_function), #crashes kernel - reason unknown
                'leaf_estimation_backtracking' : hp.choice('leaf_estimation_backtracking', LEB),
                'grow_policy': hp.choice('grow_policy', grow_policy),
                #'colsample_bylevel' : hp.quniform('colsample_bylevel', 0.1, 1, 0.01),# CPU only
                'fold_len_multiplier' : hp.loguniform('fold_len_multiplier', np.log(1.01), np.log(2.5)),
                'od_type' : 'Iter',
                'od_wait' : 25,
                'task_type' : 'GPU',
                'verbose' : 0,
            }
        
        #optional: run CatBoost without GPU
        #uncomment line below
        space['task_type'] = 'CPU'
            
        trials = Trials()
        best = fmin(fn=objective,
                    space=space,
                    algo=tpe.suggest,
                    max_evals=num_evals, 
                    trials=trials)
        
        #unpack nested dicts first
        best['bootstrap_type'] = bootstrap_type[best['bootstrap_type']]['bootstrap_type']
        best['grow_policy'] = grow_policy[best['grow_policy']]['grow_policy']
        best['eval_metric'] = eval_metric_list[best['eval_metric']]
        
        #best['score_function'] = score_function[best['score_function']] 
        #best['leaf_estimation_method'] = LEM[best['leaf_estimation_method']] #CPU only
        best['leaf_estimation_backtracking'] = LEB[best['leaf_estimation_backtracking']]        
        
        #cast floats of integer params to int
        for param in integer_params:
            best[param] = int(best[param])
        if 'max_leaves' in best:
            best['max_leaves'] = int(best['max_leaves'])
        
        print('{' + '\n'.join('{}: {}'.format(k, v) for k, v in best.items()) + '}')
        
        if diagnostic:
            return(best, trials)
        else:
            return(best)
    
    else:
        print('Package not recognised. Please use "lgbm" for LightGBM, "xgb" for XGBoost or "cb" for CatBoost.')

In [75]:
xgb_params = quick_hyperopt(X_train, y_train, 'xgb', 1000)

Running 1000 rounds of XGBoost parameter optimisation:
100%|█████████████████████████████████████████████████| 1000/1000 [05:17<00:00,  3.15trial/s, best loss: 28140.5308594]
{boosting: gbtree
colsample_bylevel: 0.52
colsample_bynode: 0.64
colsample_bytree: 0.64
eval_metric: RMSE
gamma: 0.4025242673403483
learning_rate: 0.19960440670066074
max_depth: 8
min_child_weight: 4.832441995554798
objective: reg:linear
reg_alpha: 2.1471799927248414
reg_lambda: 0.004916348867284151
subsample: 0.9500000000000001
tree_method: exact}


In [76]:
xgb_params

{'boosting': 'gbtree',
 'colsample_bylevel': 0.52,
 'colsample_bynode': 0.64,
 'colsample_bytree': 0.64,
 'eval_metric': 'RMSE',
 'gamma': 0.4025242673403483,
 'learning_rate': 0.19960440670066074,
 'max_depth': 8,
 'min_child_weight': 4.832441995554798,
 'objective': 'reg:linear',
 'reg_alpha': 2.1471799927248414,
 'reg_lambda': 0.004916348867284151,
 'subsample': 0.9500000000000001,
 'tree_method': 'exact'}

In [97]:
xgb_params = {
    'boosting': 'gbtree',
    'colsample_bylevel': 0.52,
    'colsample_bynode': 0.64,
    'colsample_bytree': 0.64,
    'eval_metric': 'rmse',
    'gamma': 0.4025242673403483,
    'learning_rate': 0.19960440670066074,
    'max_depth': 8,
    'min_child_weight': 4.832441995554798,
    'objective': 'reg:linear',
    'reg_alpha': 2.1471799927248414,
    'reg_lambda': 0.004916348867284151,
    'subsample': 0.9500000000000001,
    'tree_method': 'exact'
}

In [98]:
reg_xgb = xgb.XGBRegressor()
reg_xgb.set_params(**xgb_params)
reg_xgb.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=5)

[0]	validation_0-rmse:159292.73438	validation_1-rmse:161079.79688
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:129286.82812	validation_1-rmse:132362.03125
[2]	validation_0-rmse:105374.43750	validation_1-rmse:108899.24219
[3]	validation_0-rmse:86544.04688	validation_1-rmse:90758.95312
[4]	validation_0-rmse:71596.00781	validation_1-rmse:76619.03906
[5]	validation_0-rmse:59698.97656	validation_1-rmse:65127.64453
[6]	validation_0-rmse:50439.01953	validation_1-rmse:57296.27734
[7]	validation_0-rmse:43297.63672	validation_1-rmse:50702.06250
[8]	validation_0-rmse:37543.15234	validation_1-rmse:45993.57422
[9]	validation_0-rmse:33135.32422	validation_1-rmse:43100.21875
[10]	validation_0-rmse:30148.29102	validation_1-rmse:41062.36719
[11]	validation_0-rmse:27580.50977	validation_1-rmse:39254.04688
[12]	validation_0-rmse:25860.95508	validation_1-rmse:38185.21094
[

XGBRegressor(base_score=0.5, booster='gbtree', boosting='gbtree',
             colsample_bylevel=0.52, colsample_bynode=0.64,
             colsample_bytree=0.64, eval_metric='rmse',
             gamma=0.4025242673403483, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.19960440670066074,
             max_delta_step=0, max_depth=8, min_child_weight=4.832441995554798,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, objective='reg:linear', random_state=0,
             reg_alpha=2.1471799927248414, reg_lambda=0.004916348867284151,
             scale_pos_weight=1, subsample=0.9500000000000001,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [123]:
df_pd.head(2)['SALEPRICE']

0    208500
1    181500
Name: SALEPRICE, dtype: int64

In [122]:
reg_xgb.predict(X[0:2])

array([202224.55, 175889.7 ], dtype=float32)

In [136]:
from sklearn.metrics import mean_squared_error
from math import sqrt
y_pred_te = reg_xgb.predict(X_test)
sqrt(mean_squared_error(y_test, y_pred_te))

34960.065385903836

### LightGBM

In [128]:
lgb_params = quick_hyperopt(X_train, y_train, 'lgbm', 1000)

Running 1000 rounds of LightGBM parameter optimisation:
100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [07:47<00:00,  2.14trial/s, best loss: 22635.80794816267]
{bagging_fraction: 0.84
boosting: gbdt
feature_fraction: 0.67
lambda_l1: 3.0904688104661124
lambda_l2: 0.10041209033823585
learning_rate: 0.039574312573301115
max_bin: 120
max_depth: 7
metric: MAE
min_data_in_bin: 42
min_data_in_leaf: 1
min_gain_to_split: 4.5600000000000005
num_leaves: 1487
objective: tweedie
subsample: 0.6481492939476654}


In [129]:
lgb_params

{'bagging_fraction': 0.84,
 'boosting': 'gbdt',
 'feature_fraction': 0.67,
 'lambda_l1': 3.0904688104661124,
 'lambda_l2': 0.10041209033823585,
 'learning_rate': 0.039574312573301115,
 'max_bin': 120,
 'max_depth': 7,
 'metric': 'MAE',
 'min_data_in_bin': 42,
 'min_data_in_leaf': 1,
 'min_gain_to_split': 4.5600000000000005,
 'num_leaves': 1487,
 'objective': 'tweedie',
 'subsample': 0.6481492939476654}

In [14]:
lgb_params = {
    'bagging_fraction': 0.84,
    'boosting': 'gbdt',
    'feature_fraction': 0.67,
    'lambda_l1': 3.0904688104661124,
    'lambda_l2': 0.10041209033823585,
    'learning_rate': 0.039574312573301115,
    'max_bin': 120,
    'max_depth': 7,
    'metric': 'MAE',
    'min_data_in_bin': 42,
    'min_data_in_leaf': 1,
    'min_gain_to_split': 4.5600000000000005,
    'num_leaves': 1487,
    'objective': 'tweedie',
    'subsample': 0.6481492939476654
}

In [15]:
reg_lgb = lgb.LGBMRegressor()
reg_lgb.set_params(**lgb_params) 
reg_lgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=10)

[1]	valid_0's l1: 57670.1
Training until validation scores don't improve for 10 rounds
[2]	valid_0's l1: 55924.3
[3]	valid_0's l1: 54224.5
[4]	valid_0's l1: 52570.7
[5]	valid_0's l1: 51108.8
[6]	valid_0's l1: 49656.5
[7]	valid_0's l1: 48372.5
[8]	valid_0's l1: 47071.4
[9]	valid_0's l1: 45749.5
[10]	valid_0's l1: 44531.9
[11]	valid_0's l1: 43353.1
[12]	valid_0's l1: 42275.3
[13]	valid_0's l1: 41184.5
[14]	valid_0's l1: 40252.9
[15]	valid_0's l1: 39283.1
[16]	valid_0's l1: 38320.9
[17]	valid_0's l1: 37504.6
[18]	valid_0's l1: 36652
[19]	valid_0's l1: 35869.8
[20]	valid_0's l1: 35144.2
[21]	valid_0's l1: 34531.1
[22]	valid_0's l1: 33800.7
[23]	valid_0's l1: 33154.7
[24]	valid_0's l1: 32571.2
[25]	valid_0's l1: 32042
[26]	valid_0's l1: 31506.4
[27]	valid_0's l1: 31014.7
[28]	valid_0's l1: 30521.1
[29]	valid_0's l1: 30074.6
[30]	valid_0's l1: 29639.6
[31]	valid_0's l1: 29199.5
[32]	valid_0's l1: 28803.3
[33]	valid_0's l1: 28443.6
[34]	valid_0's l1: 28118.1
[35]	valid_0's l1: 27769.7
[36]	va

LGBMRegressor(bagging_fraction=0.84, boosting='gbdt', feature_fraction=0.67,
              lambda_l1=3.0904688104661124, lambda_l2=0.10041209033823585,
              learning_rate=0.039574312573301115, max_bin=120, max_depth=7,
              metric='MAE', min_data_in_bin=42, min_data_in_leaf=1,
              min_gain_to_split=4.5600000000000005, num_leaves=1487,
              objective='tweedie', subsample=0.6481492939476654)

In [17]:
from sklearn.metrics import mean_squared_error
from math import sqrt
y_pred_te = reg_lgb.predict(X_test)
sqrt(mean_squared_error(y_test, y_pred_te))

34899.1627728025

In [25]:
import torch
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

reg_tabnet = TabNetRegressor(
    n_d=20,
    n_a=20,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=0.18),
    seed=0
)


reg_tabnet.fit(
    X_train=X_train, y_train=y_train.values.reshape(-1,1),
    eval_set=[(X_train, y_train.values.reshape(-1,1)), (X_test, y_test.values.reshape(-1,1))],
    eval_name=['train', 'valid'],
    eval_metric=['rmse'],
    max_epochs=1000 , 
    patience=40,
    #batch_size=1024, virtual_batch_size=128,
    #num_workers=0,
    drop_last=False,
)

Device used : cpu
epoch 0  | loss: 38892769280.0| train_rmse: 197202.74068| valid_rmse: 198436.50396|  0:00:00s
epoch 1  | loss: 38890680320.0| train_rmse: 197185.38462| valid_rmse: 198418.8346|  0:00:00s
epoch 2  | loss: 38887477248.0| train_rmse: 197157.531| valid_rmse: 198391.81905|  0:00:01s
epoch 3  | loss: 38883151872.0| train_rmse: 197122.87692| valid_rmse: 198356.18599|  0:00:01s
epoch 4  | loss: 38877188096.0| train_rmse: 197065.77806| valid_rmse: 198300.82247|  0:00:01s
epoch 5  | loss: 38868578304.0| train_rmse: 196919.98851| valid_rmse: 198156.75754|  0:00:02s
epoch 6  | loss: 38858465280.0| train_rmse: 196733.13057| valid_rmse: 197949.36374|  0:00:02s
epoch 7  | loss: 38845718528.0| train_rmse: 196623.0619| valid_rmse: 197833.66564|  0:00:03s
epoch 8  | loss: 38830223360.0| train_rmse: 196391.85345| valid_rmse: 197577.09732|  0:00:03s
epoch 9  | loss: 38812409856.0| train_rmse: 196202.01396| valid_rmse: 197389.37047|  0:00:03s
epoch 10 | loss: 38791589888.0| train_rmse: 19

epoch 88 | loss: 28109084672.0| train_rmse: 158167.47086| valid_rmse: 159772.22512|  0:00:37s
epoch 89 | loss: 27878967296.0| train_rmse: 158037.15376| valid_rmse: 159700.99775|  0:00:37s
epoch 90 | loss: 27620061184.0| train_rmse: 157909.04688| valid_rmse: 159572.81102|  0:00:38s
epoch 91 | loss: 27344791552.0| train_rmse: 158135.75482| valid_rmse: 159932.12879|  0:00:38s
epoch 92 | loss: 27114770432.0| train_rmse: 158164.86463| valid_rmse: 160008.81637|  0:00:39s
epoch 93 | loss: 26859554816.0| train_rmse: 156848.23699| valid_rmse: 158862.0073|  0:00:39s
epoch 94 | loss: 26585108480.0| train_rmse: 155412.84951| valid_rmse: 157434.44539|  0:00:39s
epoch 95 | loss: 26348267520.0| train_rmse: 154165.59538| valid_rmse: 156051.32332|  0:00:40s
epoch 96 | loss: 26090328064.0| train_rmse: 152759.38928| valid_rmse: 154634.148|  0:00:40s
epoch 97 | loss: 25850636288.0| train_rmse: 151754.61727| valid_rmse: 153665.00195|  0:00:41s
epoch 98 | loss: 25564751872.0| train_rmse: 151080.98259| valid

epoch 176| loss: 9609788416.0| train_rmse: 89011.48058| valid_rmse: 91681.84273|  0:01:15s
epoch 177| loss: 9370310656.0| train_rmse: 78827.72623| valid_rmse: 81981.23025|  0:01:15s
epoch 178| loss: 9236921344.0| train_rmse: 74806.87276| valid_rmse: 77326.3935|  0:01:15s
epoch 179| loss: 9025915904.0| train_rmse: 74476.60723| valid_rmse: 76460.09492|  0:01:16s
epoch 180| loss: 8914034688.0| train_rmse: 77070.42575| valid_rmse: 79070.15883|  0:01:16s
epoch 181| loss: 8842962944.0| train_rmse: 77682.76211| valid_rmse: 79605.03034|  0:01:17s
epoch 182| loss: 8720616448.0| train_rmse: 76001.68904| valid_rmse: 78172.51418|  0:01:17s
epoch 183| loss: 8575387136.0| train_rmse: 72242.73407| valid_rmse: 74710.25826|  0:01:17s
epoch 184| loss: 8473285632.0| train_rmse: 66098.08515| valid_rmse: 69471.10842|  0:01:18s
epoch 185| loss: 8267112448.0| train_rmse: 59114.0593| valid_rmse: 62791.82119|  0:01:18s
epoch 186| loss: 8256608768.0| train_rmse: 57228.82938| valid_rmse: 61717.58727|  0:01:19s
e

epoch 267| loss: 2491787520.0| train_rmse: 56447.02094| valid_rmse: 58457.82496|  0:02:02s
epoch 268| loss: 2373321984.0| train_rmse: 59681.50713| valid_rmse: 61780.26454|  0:02:03s
epoch 269| loss: 2366068224.0| train_rmse: 65358.88223| valid_rmse: 66635.27838|  0:02:03s
epoch 270| loss: 2463415808.0| train_rmse: 73280.07111| valid_rmse: 74420.49694|  0:02:03s
epoch 271| loss: 2651817472.0| train_rmse: 77291.62627| valid_rmse: 78341.85513|  0:02:04s
epoch 272| loss: 2860750592.0| train_rmse: 76241.66085| valid_rmse: 77104.98568|  0:02:04s
epoch 273| loss: 2798136064.0| train_rmse: 71146.21484| valid_rmse: 72324.49409|  0:02:05s
epoch 274| loss: 2741483264.0| train_rmse: 61420.11641| valid_rmse: 62870.14092|  0:02:05s
epoch 275| loss: 2511270144.0| train_rmse: 38895.0705| valid_rmse: 42719.47162|  0:02:06s
epoch 276| loss: 2313064704.0| train_rmse: 36803.8769| valid_rmse: 41628.87309|  0:02:06s
epoch 277| loss: 2127762304.0| train_rmse: 49310.68861| valid_rmse: 52728.51862|  0:02:07s
e

epoch 358| loss: 1196623488.0| train_rmse: 45619.92523| valid_rmse: 52227.1468|  0:02:39s
epoch 359| loss: 1214882304.0| train_rmse: 50606.15293| valid_rmse: 56778.97504|  0:02:40s
epoch 360| loss: 1171945728.0| train_rmse: 49667.51382| valid_rmse: 56200.32737|  0:02:40s
epoch 361| loss: 1016335360.0| train_rmse: 39710.116| valid_rmse: 47725.62037|  0:02:41s
epoch 362| loss: 917497536.0| train_rmse: 37607.78375| valid_rmse: 46099.36639|  0:02:41s
epoch 363| loss: 921652800.0| train_rmse: 30268.10812| valid_rmse: 40024.82118|  0:02:42s
epoch 364| loss: 831636736.0| train_rmse: 30504.19166| valid_rmse: 38676.17659|  0:02:42s
epoch 365| loss: 947042368.0| train_rmse: 31440.94027| valid_rmse: 39394.57957|  0:02:43s
epoch 366| loss: 950457728.0| train_rmse: 30146.96527| valid_rmse: 39201.83897|  0:02:43s
epoch 367| loss: 933328128.0| train_rmse: 31288.61647| valid_rmse: 41236.27693|  0:02:44s
epoch 368| loss: 852032000.0| train_rmse: 35865.13156| valid_rmse: 43994.11441|  0:02:45s
epoch 369

In [26]:
from sklearn.metrics import mean_squared_error
from math import sqrt
y_pred_te = reg_tabnet.predict(X_test)
sqrt(mean_squared_error(y_test, y_pred_te))

38676.17658841352

In [14]:
cb_params = {
    'bootstrap_type': 'Bernoulli',
    'depth': 3,
    'eval_metric': 'RMSE',
    'fold_len_multiplier': 1.4301509382549529,
    'grow_policy': 'Depthwise',
    'l2_leaf_reg': 0.0032859943038929806,
    'leaf_estimation_backtracking': 'AnyImprovement',
    'learning_rate': 0.24999323560121908,
    'max_bin': 10,
    #'max_leaves': 17,
    'min_data_in_leaf': 21,
    'random_strength': 0.2417876297895158
}

In [15]:
reg_cb = cb.CatBoostRegressor(**cb_params, early_stopping_rounds=10)
reg_cb.fit(X_train, y_train, eval_set = (X_test, y_test))

0:	learn: 66434.5627851	test: 71265.8045578	best: 71265.8045578 (0)	total: 159ms	remaining: 2m 38s
1:	learn: 58352.8717430	test: 63320.5250020	best: 63320.5250020 (1)	total: 163ms	remaining: 1m 21s
2:	learn: 52765.6756905	test: 57529.0917964	best: 57529.0917964 (2)	total: 167ms	remaining: 55.5s
3:	learn: 47769.3427375	test: 52347.6590845	best: 52347.6590845 (3)	total: 170ms	remaining: 42.3s
4:	learn: 44382.7045445	test: 48413.6067838	best: 48413.6067838 (4)	total: 172ms	remaining: 34.3s
5:	learn: 41481.6759834	test: 45705.0868017	best: 45705.0868017 (5)	total: 175ms	remaining: 29s
6:	learn: 39622.5273551	test: 43904.7916330	best: 43904.7916330 (6)	total: 178ms	remaining: 25.2s
7:	learn: 37993.6620384	test: 42400.3928719	best: 42400.3928719 (7)	total: 180ms	remaining: 22.4s
8:	learn: 36884.9297062	test: 41203.2854774	best: 41203.2854774 (8)	total: 183ms	remaining: 20.2s
9:	learn: 36202.4997931	test: 40415.3997831	best: 40415.3997831 (9)	total: 185ms	remaining: 18.3s
10:	learn: 35197.765

<catboost.core.CatBoostRegressor at 0x21a25d52850>

In [16]:
from sklearn.metrics import mean_squared_error
from math import sqrt
y_pred_te = reg_cb.predict(X_test)
sqrt(mean_squared_error(y_test, y_pred_te))

36498.998402355035