In [1]:
import sys
import os.path
import json
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
from itertools import product
import gc
from tqdm import tqdm_notebook
import re
from catboost import CatBoostRegressor, Pool
from utils import load_feature_set, clipped_rmse, HoldOut
import pickle
from sklearn.externals import joblib

# Running inside Google Colab is detected by the presence of its package in sys.modules.
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    from google.colab import drive

    drive.mount('/content/gdrive')
    if not os.path.isfile('SETTINGS.json'):
        # No SETTINGS.json present: write one that points at the hard coded
        # data directory on the mounted drive, so the loading code below is
        # the same for Colab and local runs.
        config = {'DATA_DIR': '/content/gdrive/My Drive/kaggle-c1'}
        with open('SETTINGS.json', 'w') as settings_out:
            json.dump(config, settings_out)

# The data directory is always read back from SETTINGS.json.
with open('SETTINGS.json') as settings_in:
    config = json.load(settings_in)

# DATA_FOLDER is an alias of DATA_DIR used by the later cells.
DATA_DIR = DATA_FOLDER = config['DATA_DIR']

print('Using DATA_DIR ', DATA_DIR)

Using DATA_DIR  C:\Users\ville\easema\repos\c1-final\datadir


In [2]:
# Read the feature selection and hyperparameter search results from disk.

# Feature selection search path; later cells index it as
# search_path_sfs[<feature set>][<subset size>]['feature_names'].
# NOTE(review): pickle.load can execute arbitrary code, so only load a file
# that was produced by a trusted run of this project.
with open(os.path.join(DATA_FOLDER, 'search_path_sfs.pickle'), 'rb') as fp:
    search_path_sfs = pickle.load(fp)

# Best hyperparameters found for the CatBoost models.
with open(os.path.join(DATA_FOLDER, 'best_params_catboost.json')) as fp:
    best_params_catboost = json.load(fp)

# Best hyperparameters found for the random forest models.
with open(os.path.join(DATA_FOLDER, 'best_params_rf.json')) as fp:
    best_params_rf = json.load(fp)
        

In [3]:
# Show the loaded CatBoost hyperparameters; keys have the form '<feature set>_sel<N>'.
best_params_catboost

{'basic_sel9': {'learning_rate': 0.03,
  'iterations': 1000,
  'depth': 14,
  'l2_leaf_reg': 1,
  'task_type': 'GPU'},
 'basic_sel16': {'learning_rate': 0.5,
  'iterations': 300,
  'depth': 14,
  'l2_leaf_reg': 0.3,
  'task_type': 'GPU'},
 'basic_sel25': {'learning_rate': 0.3,
  'iterations': 300,
  'depth': 10,
  'l2_leaf_reg': 10,
  'task_type': 'GPU'},
 'basicv2_sel9': {'learning_rate': 0.3,
  'iterations': 300,
  'depth': 15,
  'l2_leaf_reg': 3,
  'task_type': 'GPU'},
 'basicv2_sel16': {'learning_rate': 0.1,
  'iterations': 300,
  'depth': 15,
  'l2_leaf_reg': 10,
  'task_type': 'GPU'},
 'basicv2_sel25': {'learning_rate': 0.03,
  'iterations': 1000,
  'depth': 12,
  'l2_leaf_reg': 3,
  'task_type': 'GPU'},
 'allfeat_sel9': {'learning_rate': 0.3,
  'iterations': 1000,
  'depth': 14,
  'l2_leaf_reg': 10,
  'task_type': 'GPU'},
 'allfeat_sel16': {'learning_rate': 0.5,
  'iterations': 300,
  'depth': 15,
  'l2_leaf_reg': 10,
  'task_type': 'GPU'},
 'allfeat_sel25': {'learning_rate': 0.

In [4]:
# Load the 'allfeat' feature set from DATA_FOLDER and unpack the eight returned objects.
(
    X_train, y_train,
    X_trainval, y_trainval,
    X_val, y_val,
    X_test,
    submissionidx2testidx,
) = load_feature_set('allfeat', data_folder=DATA_FOLDER)

reading from file C:\Users\ville\easema\repos\c1-final\datadir\feature_set_allfeat.csv


## CatBoost model training

In [None]:

# Train one CatBoost regressor per (feature set, subset size) pair on the
# train+validation data and store each fitted model as a bz2-compressed joblib
# file in DATA_FOLDER.
#
# The loop variable is named `feature_set` (not `id`) so the builtin `id` is
# not shadowed in the notebook namespace after this cell has run.
for feature_set in ['basic', 'basicv2', 'allfeat', 'text', 'within']:
    for numsel in [9, 16, 25]:
        # Shared '<feature set>_sel<N>' tag: key into best_params_catboost and
        # suffix of the model file name.
        model_tag = '{}_sel{}'.format(feature_set, numsel)
        print('generating model_catboost_{}'.format(model_tag))

        # Fetch the selected feature names once instead of rebuilding the list
        # for every column of X_train.
        keep_cols = list(search_path_sfs[feature_set][numsel]['feature_names'])
        print('keeping cols ', keep_cols)
        keep_lookup = set(keep_cols)

        # Columns are dropped (not selected) so the kept columns stay in the
        # order they have in the data frame before conversion to numpy.
        to_drop_cols = [col for col in X_train.columns.values if col not in keep_lookup]

        reg = CatBoostRegressor(**best_params_catboost[model_tag])
        reg.fit(X_trainval.drop(to_drop_cols, axis=1).to_numpy(), y_trainval)

        joblib_filename = os.path.join(DATA_FOLDER, 'model_catboost_{}.joblib'.format(model_tag))
        joblib.dump(reg, joblib_filename + '.bz2', compress=('bz2', 3))

        # Disabled submission export, kept for reference.
        # NOTE(review): the 'submission-rf-...' prefix appears to be copied from
        # the random forest cell, and write_predictions_by_array is not imported
        # in the visible cells -- confirm both before re-enabling.
        #pred_test = np.clip(reg.predict(X_test.drop(to_drop_cols,axis=1).to_numpy()), 0, 20)
        #output_name = 'submission-rf-bestparams-features_{}_combination_{}.csv'.format(feature_set,'sel{}'.format(numsel))
        #write_predictions_by_array(pred_test[submissionidx2testidx], output_name)


In [None]:
## Random forest model training

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Train one random forest regressor per (feature set, subset size) pair on the
# train+validation data and store each fitted model as a bz2-compressed joblib
# file in DATA_FOLDER.
#
# The loop variable is named `feature_set` (not `id`) so the builtin `id` is
# not shadowed in the notebook namespace after this cell has run.
for feature_set in ['basic', 'basicv2', 'allfeat', 'text', 'within']:
    for numsel in [9, 16, 25]:
        # Shared '<feature set>_sel<N>' tag: key into best_params_rf and suffix
        # of the model file name.
        model_tag = '{}_sel{}'.format(feature_set, numsel)
        print('generating model_rf_{}'.format(model_tag))

        # Fetch the selected feature names once instead of rebuilding the list
        # for every column of X_train.
        keep_cols = list(search_path_sfs[feature_set][numsel]['feature_names'])
        print('keeping cols ', keep_cols)
        keep_lookup = set(keep_cols)

        # Columns are dropped (not selected) so the kept columns stay in the
        # order they have in the data frame before conversion to numpy.
        to_drop_cols = [col for col in X_train.columns.values if col not in keep_lookup]

        # verbose=2 prints per-tree progress; n_jobs=-1 uses all available cores.
        reg = RandomForestRegressor(verbose=2, n_jobs=-1, **best_params_rf[model_tag])
        reg.fit(X_trainval.drop(to_drop_cols, axis=1).to_numpy(), y_trainval)

        joblib_filename = os.path.join(DATA_FOLDER, 'model_rf_{}.joblib'.format(model_tag))
        joblib.dump(reg, joblib_filename + '.bz2', compress=('bz2', 3))

        # Disabled submission export, kept for reference.
        # NOTE(review): write_predictions_by_array is not imported in the
        # visible cells -- confirm where it comes from before re-enabling.
        #pred_test = np.clip(reg.predict(X_test.drop(to_drop_cols,axis=1).to_numpy()), 0, 20)
        #output_name = 'submission-rf-bestparams-features_{}_combination_{}.csv'.format(feature_set,'sel{}'.format(numsel))
        #write_predictions_by_array(pred_test[submissionidx2testidx], output_name)


Public LB performance:

|Feature set / subset | sel9 | sel16 | sel25 |
|---|---|---|---|
|allfeat|  1.03695| 0.98305|0.993841 |
|basic| 1.012312 | 1.011059| 1.003010 |
|basicv2| 0.96716 | 0.99535| 0.99384 |
|text| | | |
|within| | | |
| mean fusion (3 best, all sel)|-|- | 0.961783| 
| mean fusion (all features sets, all sel)|-| - | 0.981348| 
| mean fusion (3 best, all sel, rf+catboost)|-|- | 0.961669| 
| mean fusion (3 best, rf_allsel+catboost_sel25)|-|- | 0.959445|
| mean fusion (3 best, rf_allsel+catboost_sel25_balanced)|-|- |0.960548|



In [None]:
# Mean fusion of the random forest submissions for the feature sets 'allfeat',
# 'basic' and 'basicv2' over all three subset sizes (sel25, sel16, sel9).
# A single nested comprehension replaces three copy-pasted list builds; the
# resulting file names and their order are unchanged.
subm = [
    'submission-rf-bestparams-features_{}_combination_sel{}.csv'.format(feature_set, numsel)
    for numsel in (25, 16, 9)
    for feature_set in ('allfeat', 'basic', 'basicv2')
]

# Tag corrected from 'rf_3best_estparams' to 'rf_3best_bestparams': the other
# fusion cells use the '_bestparams' suffix and the balanced fusion cell reads
# 'submission-average-rf_3best_bestparams.csv'.
# NOTE(review): assumes average_submissions derives the output file name from
# this tag -- confirm against its implementation.
average_submissions(subm, 'rf_3best_bestparams')

In [None]:
# Mean fusion of the sel25 random forest submissions for the feature sets
# 'allfeat', 'basic' and 'basicv2'.
subm = list(map(
    'submission-rf-bestparams-features_{}_combination_sel25.csv'.format,
    ['allfeat', 'basic', 'basicv2'],
))

# NOTE(review): the tag looks like a typo of '..._bestparams' and does not say
# that only sel25 is averaged -- confirm the intended output name.
average_submissions(subm, 'rf_3best_estparams')

In [None]:
# Mean fusion of random forest and CatBoost submissions for the feature sets
# 'allfeat', 'basic' and 'basicv2' over all subset sizes. Each file name
# template is expanded once per feature set, template by template.
# NOTE(review): the three CatBoost templates follow three different naming
# patterns (and the sel9 one puts the feature set after 'combination_') --
# verify them against the files that actually exist.
subm = [
    template.format(feature_set)
    for template in (
        'submission-rf-bestparams-features_{}_combination_sel25.csv',
        'submission-rf-bestparams-features_{}_combination_sel16.csv',
        'submission-rf-bestparams-features_{}_combination_sel9.csv',
        'submission-catboost-features_sel9_combination_{}.csv',
        'submission-catboost-bestparams-features_{}_combination_sel16.csv',
        'submission-catboost-features_{}_combination_sel25.csv',
    )
    for feature_set in ('allfeat', 'basic', 'basicv2')
]

average_submissions(subm, 'rf+catboost_3best_bestparams')

In [None]:
# Mean fusion of the random forest submissions over all subset sizes plus the
# CatBoost sel25 submissions, for the feature sets 'allfeat', 'basic' and
# 'basicv2'. Each file name template is expanded once per feature set.
subm = [
    template.format(feature_set)
    for template in (
        'submission-rf-bestparams-features_{}_combination_sel25.csv',
        'submission-rf-bestparams-features_{}_combination_sel16.csv',
        'submission-rf-bestparams-features_{}_combination_sel9.csv',
        'submission-catboost-features_{}_combination_sel25.csv',
    )
    for feature_set in ('allfeat', 'basic', 'basicv2')
]

average_submissions(subm, 'rf+catboost25_3best_bestparams')

In [None]:
# Average two already-averaged submission files.
# NOTE(review): presumably "balanced" means the two groups get equal weight
# regardless of how many individual submissions each contains -- confirm.
subm = [
    'submission-average-rf_3best_bestparams.csv',
    'submission-average-sel25_3best_bestparams.csv',
]

average_submissions(subm, 'rf_allsel+catboost25_3best_bestparams_balanced')

In [None]:
# Number of feature columns in the loaded training frame.
X_train.shape[1]