In [1]:
import sys
import os.path
import json
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
from itertools import product
import gc
import re
from catboost import CatBoostRegressor, Pool
from utils import load_feature_set, clipped_rmse, HoldOut
import pickle
import joblib

# Colab is detected by checking whether its package is already loaded.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')
    # When no SETTINGS.json exists yet, create one that points at the
    # hard-coded data directory on the mounted drive.
    if not os.path.isfile('SETTINGS.json'):
        config = {'DATA_DIR': '/content/gdrive/My Drive/kaggle-c1'}
        with open('SETTINGS.json', 'w') as settings_out:
            json.dump(config, settings_out)

# Read the settings file (pre-existing, or the one written just above).
with open('SETTINGS.json') as settings_in:
    config = json.load(settings_in)

# Both names refer to the same directory; the cells below use DATA_FOLDER.
DATA_FOLDER = DATA_DIR = config['DATA_DIR']

print('Using DATA_DIR ', DATA_DIR)

Using DATA_DIR  c:\repos\c1-final-test\datadir


In [2]:
# Load the feature-selection search path and the hyperparameter search
# results for both model families from DATA_FOLDER.

sfs_pickle_path = os.path.join(DATA_FOLDER, 'search_path_sfs.pickle')
with open(sfs_pickle_path, 'rb') as sfs_file:
    search_path_sfs = pickle.load(sfs_file)

catboost_params_path = os.path.join(DATA_FOLDER, 'best_params_catboost.json')
with open(catboost_params_path) as catboost_params_file:
    best_params_catboost = json.load(catboost_params_file)

rf_params_path = os.path.join(DATA_FOLDER, 'best_params_rf.json')
with open(rf_params_path) as rf_params_file:
    best_params_rf = json.load(rf_params_file)
        

In [3]:
# Load the 'allfeat' feature set; every model below is fitted on a column
# subset of X_trainval, with y_trainval passed as the target.
X_trainval, y_trainval = load_feature_set(
    'allfeat',
    data_folder=DATA_FOLDER,
    datasets='trainval',
)

reading from file c:\repos\c1-final-test\datadir\feature_set_allfeat.csv


## CatBoost model training

In [None]:
%%time
# Fit one CatBoost regressor for every (feature set, number of selected
# features) combination and store each fitted model in DATA_FOLDER as a
# bz2-compressed joblib file.
# The loop variable is deliberately not called `id`, which would shadow the
# builtin id() for all later cells of the notebook.
for feature_set_id in ['basic', 'basicv2', 'allfeat', 'text', 'within']:
    for numsel in [9, 16, 25]:
        params_key = '{}_sel{}'.format(feature_set_id, numsel)
        model_name = 'model_catboost_{}'.format(params_key)
        print('generating {}'.format(model_name))

        # Build the selected-feature list once per model instead of once per
        # column of X_trainval.
        keep_cols = list(search_path_sfs[feature_set_id][numsel]['feature_names'])
        print('keeping cols ', keep_cols)

        # Drop the unselected columns (rather than indexing by keep_cols) so
        # the remaining columns keep the order they have in X_trainval.
        keep_set = set(keep_cols)
        to_drop_cols = [col for col in X_trainval.columns.values if col not in keep_set]

        reg = CatBoostRegressor(**best_params_catboost[params_key])
        reg.fit(X_trainval.drop(to_drop_cols, axis=1).to_numpy(), y_trainval)

        joblib_filename = os.path.join(DATA_FOLDER, model_name + '.joblib')
        joblib.dump(reg, joblib_filename + '.bz2', compress=('bz2', 3))


generating model_catboost_basic_sel9
keeping cols  ['target_item_lag_4', 'target_lag_2', 'item_category_id', 'target_item_lag_2', 'target_shop_lag_12', 'target_category_lag_4', 'target_item_lag_12', 'item_name_category_tfidf_bigram_256', 'time_of_year']
0:	learn: 1.1862245	total: 328ms	remaining: 5m 28s
1:	learn: 1.1862241	total: 387ms	remaining: 3m 13s
2:	learn: 1.1760949	total: 672ms	remaining: 3m 43s
3:	learn: 1.1760945	total: 733ms	remaining: 3m 2s
4:	learn: 1.1661914	total: 1.01s	remaining: 3m 21s
5:	learn: 1.1661914	total: 1.07s	remaining: 2m 57s
6:	learn: 1.1563673	total: 1.34s	remaining: 3m 10s
7:	learn: 1.1563673	total: 1.4s	remaining: 2m 53s
8:	learn: 1.1472533	total: 1.69s	remaining: 3m 5s
9:	learn: 1.1472533	total: 1.75s	remaining: 2m 53s
10:	learn: 1.1382590	total: 2.02s	remaining: 3m 2s
11:	learn: 1.1382590	total: 2.08s	remaining: 2m 51s
12:	learn: 1.1302272	total: 2.37s	remaining: 2m 59s
13:	learn: 1.1302271	total: 2.43s	remaining: 2m 50s
14:	learn: 1.1226577	total: 2.71

### Training time of catboost models

The wall-clock time needed to train all the CatBoost models was 25 minutes on an NVIDIA GTX 1050 GPU (with an i5-3570 CPU).  
Training only the three models that were actually used in the final submission ensemble ('model_catboost_allfeat_sel16', 'model_catboost_basic_sel25' and 'model_catboost_basicv2_sel9'; see Notebook 8) took 1 min 59 s.

## Random forest model training

In [None]:
%%time
from sklearn.ensemble import RandomForestRegressor

# Fit one random forest regressor for every (feature set, number of selected
# features) combination and store each fitted model in DATA_FOLDER as a
# bz2-compressed joblib file.
# The loop variable is deliberately not called `id`, which would shadow the
# builtin id() for all later cells of the notebook.
for feature_set_id in ['basic', 'basicv2', 'allfeat', 'text', 'within']:
    for numsel in [9, 16, 25]:
        params_key = '{}_sel{}'.format(feature_set_id, numsel)
        model_name = 'model_rf_{}'.format(params_key)
        print('generating {}'.format(model_name))

        # Build the selected-feature list once per model instead of once per
        # column of X_trainval.
        keep_cols = list(search_path_sfs[feature_set_id][numsel]['feature_names'])
        print('keeping cols ', keep_cols)

        # Drop the unselected columns (rather than indexing by keep_cols) so
        # the remaining columns keep the order they have in X_trainval.
        keep_set = set(keep_cols)
        to_drop_cols = [col for col in X_trainval.columns.values if col not in keep_set]

        # verbose=2 prints per-tree progress; n_jobs=-1 uses all CPU cores.
        reg = RandomForestRegressor(verbose=2, n_jobs=-1, **best_params_rf[params_key])
        reg.fit(X_trainval.drop(to_drop_cols, axis=1).to_numpy(), y_trainval)

        joblib_filename = os.path.join(DATA_FOLDER, model_name + '.joblib')
        joblib.dump(reg, joblib_filename + '.bz2', compress=('bz2', 3))
   

Public leaderboard (LB) performance:

|Feature set / subset | sel9 | sel16 | sel25 |
|---|---|---|---|
|allfeat|  1.03695| 0.98305|0.993841 |
|basic| 1.012312 | 1.011059| 1.003010 |
|basicv2| 0.96716 | 0.99535| 0.99384 |

### Training time of random forest models

The wall-clock time needed to train all the random forest models was 2 h 28 min on an i5-3570 CPU with 4 cores running at 3.4 GHz (measured on an earlier run).
Training only the three models that were actually used in the final submission ensemble ('model_rf_allfeat_sel16', 'model_rf_basic_sel25' and 'model_rf_basicv2_sel25'; see Notebook 8) took 24.7 minutes.