# Hyperparameter search

After having selected a number of promising feature set candidates in Notebook 4, we perform similar validation experiments in order to select optimal hyperparameters for the GBDT (CatBoost) and random forest regressors.

In these validation experiments, we use the same training and validation sets as in Notebook 4: December 2014 is used for validation, and all the preceding months are used for training. Similarly, clipped RMSE is used as the performance metric.

In the experiments, grid searches are performed over plausible hyperparameter spaces of the respective regressors. The search is performed separately for each of the five feature subsets with 9, 16 or 25 features selected.


In [1]:
import sys
import os.path
import json
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
from itertools import product
import gc
from tqdm import tqdm_notebook
import re
from catboost import CatBoostRegressor, Pool
from utils import load_feature_set, clipped_rmse, HoldOut
import pickle

# Running inside Google Colab is detected by its package being imported.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive

    drive.mount('/content/gdrive')
    # When no SETTINGS.json exists yet, create one that points at a
    # hard-coded data directory on the mounted drive.
    if not os.path.isfile('SETTINGS.json'):
        config = {'DATA_DIR': '/content/gdrive/My Drive/kaggle-c1'}
        with open('SETTINGS.json', 'w') as outfile:
            json.dump(config, outfile)

# The data directory is always read back from SETTINGS.json.
with open('SETTINGS.json') as config_file:
    config = json.load(config_file)

DATA_DIR = config['DATA_DIR']
DATA_FOLDER = DATA_DIR

print('Using DATA_DIR ', DATA_DIR)

Using DATA_DIR  /home/vvi/repos/kaggle-final-project/datadir


In [2]:
def iterate_parameter_combinations(options):
    """Expand a parameter grid into a list of concrete parameter dicts.

    Args:
        options: Mapping from parameter name to an iterable of candidate
            values for that parameter.

    Returns:
        A list with one dict per element of the Cartesian product of the
        candidate values. Each dict maps every parameter name to one chosen
        value. The last parameter varies fastest.
    """
    names = list(options.keys())
    candidate_values = [options[name] for name in names]

    combinations = []
    for chosen in product(*candidate_values):
        combinations.append(dict(zip(names, chosen)))
    return combinations

In [3]:
# Load the feature selection results from the data folder.
sfs_results_path = os.path.join(DATA_FOLDER, 'search_path_sfs.pickle')
with open(sfs_results_path, 'rb') as fp:
    search_path_sfs = pickle.load(fp)

In [4]:
# Sanity check: show the feature names stored for one feature set at a
# small selection size.
id = 'allfeat'
numsel = 3
list(search_path_sfs[id][numsel]['feature_names'])

['target_lag_2', 'item_category_id', 'target_category_tfidf_unigram_256_lag_5']

## CatBoost regression

In [None]:
# Grid search over CatBoost hyperparameters.
#
# For every feature set and every selection size, each combination of the
# grid below is fitted on the training data and scored with clipped RMSE on
# the validation data. The best combination per feature subset is stored in
# best_params and written to best_params_catboost.json.
grid = {'learning_rate': [0.03, 0.1, 0.3, 0.5],
        'iterations': [100, 300, 1000],
        'depth': [10, 12, 14, 15, 16],
        'l2_leaf_reg': [0.3, 1, 3, 10],
        'task_type': ['GPU']}

best_params = {}

X_train, y_train, X_trainval, y_trainval, X_val, y_val, X_test, submissionidx2testidx = load_feature_set('allfeat', data_folder=DATA_FOLDER)

# The grid is the same for every feature subset, so expand it only once.
comb_list = iterate_parameter_combinations(grid)

for feature_set_id in ['basic', 'basicv2', 'allfeat', 'text', 'within']:
    for numsel in [9, 16, 25]:
        # Column selection depends only on the feature subset, not on the
        # hyperparameters: do it once here instead of once per combination.
        keep_cols = list(search_path_sfs[feature_set_id][numsel]['feature_names'])
        keep_set = set(keep_cols)
        to_drop_cols = [col for col in X_train.columns.values if col not in keep_set]
        X_train_sel = X_train.drop(to_drop_cols, axis=1).to_numpy()
        X_val_sel = X_val.drop(to_drop_cols, axis=1).to_numpy()

        score_list = []
        for comb_idx, param in enumerate(comb_list, start=1):
            print('Feature set {}_sel{} combination {}/{}'.format(feature_set_id, numsel, comb_idx, len(comb_list)))
            print('keeping cols ', keep_cols)
            reg = CatBoostRegressor(metric_period=100, **param)
            reg.fit(X_train_sel, y_train)
            # Predictions are clipped to [0, 20] before scoring.
            pred_val = np.clip(reg.predict(X_val_sel), 0, 20)
            rmse = clipped_rmse(y_val, pred_val)
            print('Feature set {} params {} Clipped RMSE {}'.format(feature_set_id, param, rmse))
            score_list.append(rmse)

        best_param = comb_list[int(np.argmin(score_list))]
        print('** Best CatBoostRegressor params for feature set {}_{}: {}'.format(feature_set_id, numsel, best_param))
        best_params['{}_sel{}'.format(feature_set_id, numsel)] = best_param
        # Rewrite the results file after every feature subset so that partial
        # results survive an interrupted run.
        with open(os.path.join(DATA_FOLDER, 'best_params_catboost.json'), 'w') as fp:
            json.dump(best_params, fp)
        

300:	learn: 0.8184550	total: 25.4s	remaining: 59.1s
400:	learn: 0.8159064	total: 33.6s	remaining: 50.2s
500:	learn: 0.8154781	total: 42s	remaining: 41.8s
600:	learn: 0.8146326	total: 50.3s	remaining: 33.4s
700:	learn: 0.8137579	total: 58.5s	remaining: 25s
800:	learn: 0.8118350	total: 1m 6s	remaining: 16.6s
900:	learn: 0.8106735	total: 1m 14s	remaining: 8.23s
999:	learn: 0.8098315	total: 1m 23s	remaining: 0us
Feature set basic params {'learning_rate': 0.3, 'iterations': 1000, 'depth': 14, 'l2_leaf_reg': 10, 'task_type': 'GPU'} Clipped RMSE 0.47350557042181085
Feature set basic_sel25 combination 173/240
keeping cols  ['target_item_lag_4', 'target_shop_lag_6', 'target_lag_2', 'target_shop_lag_5', 'target_lag_12', 'item_category_id', 'target_category_lag_3', 'target_category_lag_2', 'target_category_lag_6', 'target_lag_4', 'target_item_lag_2', 'target_lag_6', 'target_shop_lag_12', 'item_name_cyrillic_fraction', 'target_lag_5', 'target_category_lag_4', 'target_item_lag_5', 'target_item_lag_

## Random forest regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Grid search over random forest hyperparameters.
#
# For every feature set and every selection size, each combination of the
# grid below is fitted on the training data and scored with clipped RMSE on
# the validation data. The best combination per feature subset is stored in
# best_params and written to best_params_rf.json.
grid = {'max_depth': [10, 12, 14, 16],
        'n_estimators': [30, 50, 100]}

best_params = {}

X_train, y_train, X_trainval, y_trainval, X_val, y_val, X_test, submissionidx2testidx = load_feature_set('allfeat', data_folder=DATA_FOLDER)

# The grid is the same for every feature subset, so expand it only once.
comb_list = iterate_parameter_combinations(grid)

for feature_set_id in ['basic', 'basicv2', 'allfeat', 'text', 'within']:
    for numsel in [9, 16, 25]:
        # Column selection depends only on the feature subset, not on the
        # hyperparameters: do it once here instead of once per combination.
        keep_cols = list(search_path_sfs[feature_set_id][numsel]['feature_names'])
        keep_set = set(keep_cols)
        to_drop_cols = [col for col in X_train.columns.values if col not in keep_set]
        X_train_sel = X_train.drop(to_drop_cols, axis=1).to_numpy()
        X_val_sel = X_val.drop(to_drop_cols, axis=1).to_numpy()

        score_list = []
        for comb_idx, param in enumerate(comb_list, start=1):
            print('Feature set {}_sel{} combination {}/{}'.format(feature_set_id, numsel, comb_idx, len(comb_list)))
            print('param dict: ', param)
            print('keeping cols ', keep_cols)
            # NOTE(review): no random_state is passed, so scores may vary
            # between runs -- consider fixing a seed if reproducibility of
            # the selected parameters matters.
            reg = RandomForestRegressor(verbose=2, n_jobs=-1, **param)
            reg.fit(X_train_sel, y_train)
            # Predictions are clipped to [0, 20] before scoring.
            pred_val = np.clip(reg.predict(X_val_sel), 0, 20)
            rmse = clipped_rmse(y_val, pred_val)
            print('Feature set {} params {} Clipped RMSE {}'.format(feature_set_id, param, rmse))
            score_list.append(rmse)

        best_param = comb_list[int(np.argmin(score_list))]
        print('** Best RandomForestRegressor params for feature set {}_{}: {}'.format(feature_set_id, numsel, best_param))
        best_params['{}_sel{}'.format(feature_set_id, numsel)] = best_param
        # Rewrite the results file after every feature subset so that partial
        # results survive an interrupted run.
        with open(os.path.join(DATA_FOLDER, 'best_params_rf.json'), 'w') as fp:
            json.dump(best_params, fp)
        