In [None]:
import sys
import os.path
import json
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
from itertools import product
import gc
import re
from catboost import CatBoostRegressor, Pool
from utils import load_feature_set, clipped_rmse, HoldOut

# True when the notebook runs inside Google Colab (the module is preloaded there).
IN_COLAB = 'google.colab' in sys.modules

# Name of the JSON settings file, looked up in the current working directory.
SETTINGS_FILE = 'SETTINGS.json'

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')
    if not os.path.isfile(SETTINGS_FILE):
        # Hard-coded data directory in Drive is used if SETTINGS.json is not present;
        # the file is written so that the loading code below works unchanged.
        config = {'DATA_DIR': '/content/gdrive/My Drive/kaggle-c1'}
        with open(SETTINGS_FILE, 'w') as outfile:
            json.dump(config, outfile)

if not os.path.isfile(SETTINGS_FILE):
    # Outside Colab no default is created, so fail with an actionable message
    # (same exception type that open() would raise).
    raise FileNotFoundError(
        "{} not found in the working directory; create it with a 'DATA_DIR' entry "
        "pointing to the data directory".format(SETTINGS_FILE))

with open(SETTINGS_FILE) as config_file:
    config = json.load(config_file)

# Directory that holds the pre-generated feature sets and receives the results.
DATA_DIR = config['DATA_DIR']

print('Using DATA_DIR ', DATA_DIR)

# Alias used by the cells below.
DATA_FOLDER = DATA_DIR

# Feature selection

The purpose of the code in this notebook is to run a feature selection experiment.
In the experiment, subsets of 9, 16 and 25 features are selected using a sequential wrapper method 
from each of the five pre-generated feature sets (basic, text, within, allfeat, basicv2) so that the performance 
is maximised in a validation experiment. As the indicator of performance, we employ the clipped RMSE metric 
of a trained CatBoost regressor (with fixed parameters).

## Choosing the validation setup

Choosing a good validation strategy is problematic. There may not be much that can be done, because the validation data necessarily has a different distribution from the actual test data. This is because of the temporal nature of the prediction problem: the distributions drift slowly over the course of time. Therefore, it would be good to have the validation period temporally close to the test period. On the other hand, data analysis shows strong seasonal (i.e. yearly) effects. For example, if we chose the last training month (Oct 2015) as the validation set, predicting October sales based on previous months would simply be a very different problem from predicting December sales, as sales figures seem to peak strongly in December and may otherwise have special characteristics.

Here we have chosen another alternative: predicting the sales of the last December in the training data (2014) based on the preceding data. In return for better matching the yearly cycle, we sacrifice quite a lot in temporal closeness of the validation and test periods, and we also end up with a smaller amount of validation data.

We are fully aware of the shortcomings of this validation scheme. Therefore, we shamelessly also advocate validation on the public leaderboard in this case, provided we have only a small number of choices to evaluate. This relies on a data leak from the test set via the leaderboard, but in a competition anything goes as long as the rules are not violated.



## Feature selection code

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import json
import pickle

# Sequential forward feature selection, run separately for every pre-generated
# feature set. For each set, subsets of increasing size are selected; the
# features chosen in one round are passed on as `fixed_features` of the next
# round, so the cell does not depend on result literals defined further down
# in the notebook and survives "Restart & Run All".
#
# Results are keyed by '<feature set id>_sel<target count>' and written to
# DATA_FOLDER after every round, so a partial run still leaves usable output.

best_features_floating_best = {}   # key -> tuple of selected feature names
search_path_floating_best = {}     # key -> full SFS search path (sfs.subsets_)

FEATURE_SET_IDS = ['allfeat', 'basic', 'basicv2', 'text', 'within']
TARGET_COUNTS = [9, 16, 25]

# Fixed CatBoost parameters used for every evaluation in the wrapper search.
REG_PARAMS = {'learning_rate': 0.5, 'iterations': 100, 'depth': 12,
              'l2_leaf_reg': 0.3, 'task_type': 'GPU', 'metric_period': 100}

for feature_set_id in FEATURE_SET_IDS:
    print('selecting features for feature set {}'.format(feature_set_id))
    (X_train, y_train, X_trainval, y_trainval,
     X_val, y_val, X_test, submissionidx2testidx) = load_feature_set(feature_set_id)

    # Stack train and validation rows; the HoldOut splitter below then refers
    # to them by position (train rows first, validation rows after).
    X_paramsearch = pd.concat([X_train, X_val], ignore_index=True)
    y_paramsearch = pd.concat([y_train, y_val], ignore_index=True)
    train_indices = np.arange(X_train.shape[0])
    val_indices = np.arange(X_val.shape[0]) + X_train.shape[0]

    # 'item_price' generation is wrongly performed -> drop that column, if present
    to_drop_cols = [col for col in X_paramsearch.columns.values
                    if re.search('item_price', col)]
    X_paramsearch = X_paramsearch.drop(to_drop_cols, axis=1)
    ncol = X_paramsearch.shape[1]

    # Features selected in the previous (smaller) round; None for the first round.
    fixed_features = None

    for target_count in TARGET_COUNTS:
        k_features = min(target_count, ncol - 1)
        if fixed_features is not None and k_features <= len(fixed_features):
            # Nothing left to add: the feature set is too small for this target.
            print('skipping target count {} for feature set {}: only {} features can be selected'
                  .format(target_count, feature_set_id, k_features))
            continue

        model = CatBoostRegressor(**REG_PARAMS)
        cv = HoldOut(train_indices=train_indices, test_indices=val_indices)

        # NOTE(review): the notebook text names clipped RMSE as the selection
        # criterion, but the scorer used here is plain negative MSE - confirm
        # which one is intended.
        sfs1 = SFS(model,
                   k_features=k_features,
                   forward=True,
                   floating=False,
                   scoring='neg_mean_squared_error',
                   verbose=3,
                   cv=cv,
                   fixed_features=fixed_features)

        sfs1 = sfs1.fit(X_paramsearch, y_paramsearch)
        print('Feature selection path for feature subset {}:'.format(feature_set_id))
        print(sfs1.subsets_)

        # One entry per (feature set, target count); keying by the feature set
        # alone would overwrite the smaller selections with the larger ones.
        key = '{}_sel{}'.format(feature_set_id, target_count)
        search_path_floating_best[key] = sfs1.subsets_
        best_features_floating_best[key] = sfs1.k_feature_names_

        # The next, larger round extends this selection.
        fixed_features = tuple(sfs1.k_feature_names_)

        with open(os.path.join(DATA_FOLDER, 'best_features_floating_best.json'), 'w') as fp:
            json.dump(best_features_floating_best, fp)
        with open(os.path.join(DATA_FOLDER, 'best_features_floating_best.pickle'), 'wb') as fp:
            pickle.dump(search_path_floating_best, fp)

        del model
        gc.collect()

    # Release the (large) data frames before loading the next feature set.
    del X_train
    del y_train
    del X_trainval
    del y_trainval
    del X_val
    del y_val
    del X_test
    del X_paramsearch
    del y_paramsearch
    gc.collect()

In [None]:
# The feature selection experiment runs for a long time, so its results are
# recorded here as literals.
#
# feature_lists_sel9 maps a feature set id ("text", "within", "allfeat",
# "basic", "basicv2") to the list of 9 feature names selected for that set.

feature_lists_sel9={
      "text": [
        "shop_id",
        "target_category_tfidf_unigram_256_lag_6",
        "item_name_cyrillic_fraction",
        "target_category_frequent_256_within_shop_lag_6",
        "target_category_tfidf_bigram_256_within_shop_lag_12",
        "item_name_category_frequent_256",
        "item_name_category_tfidf_bigram_256",
        "item_category_id",
        "target_category_tfidf_unigram_256_lag_5"
      ],
      "within": [
        "target_category_frequent_256_within_shop_lag_12",
        "shop_id",
        "target_category_tfidf_unigram_256_within_shop_lag_4",
        "target_category_tfidf_bigram_256_within_shop_lag_5",
        "target_category_tfidf_bigram_256_within_shop_lag_12",
        "target_category_within_shop_lag_2",
        "target_category_tfidf_bigram_256_within_shop_lag_2",
        "item_category_id",
        "target_category_frequent_256_within_shop_lag_2"
      ],
      "allfeat": [
        "target_item_lag_6",
        "target_shop_lag_12",
        "target_item_lag_2",
        "target_lag_2",
        "target_category_tfidf_unigram_256_lag_6",
        "target_item_lag_12",
        "target_category_within_shop_lag_12",
        "target_item_lag_4",
        "item_category_id"
      ],
      "basic": [
        "target_item_lag_6",
        "target_category_lag_5",
        "target_shop_lag_12",
        "target_lag_5",
        "item_category_id",
        "item_id",
        "target_item_lag_3",
        "target_lag_2",
        "time_of_year"
      ],
      "basicv2": [
        "target_category_within_shop_lag_3",
        "target_item_lag_4",
        "target_item_lag_12",
        "target_shop_lag_12",
        "target_shop_lag_4",
        "target_item_lag_2",
        "item_category_id",
        "target_item_lag_3",
        "target_lag_2"
      ]} 

# feature_lists_sel16 maps a feature set id ("text", "within", "allfeat",
# "basic", "basicv2") to the list of 16 feature names selected for that set.
feature_lists_sel16={
      "text": [
        'target_category_tfidf_unigram_256_within_shop_lag_5',
         'shop_id',
         'target_category_frequent_256_lag_6',
         'target_category_tfidf_unigram_256_lag_6',
         'item_name_cyrillic_fraction',
         'target_category_tfidf_bigram_256_lag_2',
         'target_category_frequent_256_within_shop_lag_6',
         'target_category_tfidf_bigram_256_within_shop_lag_12',
         'target_category_tfidf_unigram_256_within_shop_lag_3',
         'target_category_tfidf_bigram_256_lag_12',
         'item_name_category_frequent_256',
         'item_name_category_tfidf_bigram_256',
         'item_category_id',
         'target_category_tfidf_unigram_256_lag_5',
         'target_category_tfidf_unigram_256_within_shop_lag_12',
         'target_category_tfidf_bigram_256_within_shop_lag_6'
      ],
      "within": [
         'target_category_frequent_256_within_shop_lag_12',
         'target_category_within_shop_lag_5',
         'shop_id',
         'target_category_within_shop_lag_3',
         'target_category_tfidf_unigram_256_within_shop_lag_4',
         'target_category_tfidf_bigram_256_within_shop_lag_5',
         'target_category_within_shop_lag_4',
         'target_category_tfidf_bigram_256_within_shop_lag_12',
         'target_category_tfidf_unigram_256_within_shop_lag_3',
         'target_category_within_shop_lag_2',
         'target_category_tfidf_bigram_256_within_shop_lag_2',
         'item_category_id',
         'item_id',
         'target_category_frequent_256_within_shop_lag_2',
         'target_category_tfidf_unigram_256_within_shop_lag_12',
         'target_category_tfidf_bigram_256_within_shop_lag_6'
      ],
      "allfeat": [
         'target_category_tfidf_unigram_256_within_shop_lag_4',
         'target_item_lag_6',
         'target_category_lag_5',
         'target_category_tfidf_unigram_256_lag_5',
         'item_name_cyrillic_fraction',
         'target_shop_lag_12',
         'target_item_lag_2',
         'target_lag_2',
         'target_category_within_shop_lag_3',
         'target_category_tfidf_unigram_256_lag_6',
         'target_item_lag_12',
         'target_category_tfidf_unigram_256_within_shop_lag_5',
         'target_category_within_shop_lag_12',
         'target_item_lag_4',
         'item_category_id',
         'target_category_within_shop_lag_2'
      ],
      "basic": [
         'shop_id',
         'target_lag_6',
         'target_item_lag_6',
         'target_item_lag_12',
         'target_category_lag_5',
         'target_shop_lag_12',
         'item_name_category_tfidf_bigram_256',
         'target_lag_5',
         'target_category_lag_2',
         'target_item_lag_2',
         'item_category_id',
         'item_id',
         'target_item_lag_3',
         'target_lag_2',
         'target_item_lag_5',
         'time_of_year'
      ],
      "basicv2": [
         'target_category_tfidf_unigram_256_within_shop_lag_5',
         'target_category_tfidf_bigram_256_within_shop_lag_6',
         'target_category_within_shop_lag_3',
         'target_item_lag_4',
         'item_name_cyrillic_fraction',
         'target_item_lag_12',
         'target_lag_3',
         'target_shop_lag_12',
         'target_shop_lag_4',
         'target_lag_5',
         'target_item_lag_2',
         'item_category_id',
         'target_category_tfidf_bigram_256_within_shop_lag_2',
         'item_id',
         'target_item_lag_3',
         'target_lag_2'
      ]}

# feature_lists_sel25 maps a feature set id ("text", "within", "allfeat",
# "basic", "basicv2") to the list of 25 feature names selected for that set.
feature_lists_sel25={
      "text": [
         'target_category_tfidf_bigram_256_lag_5',
         'target_category_tfidf_unigram_256_within_shop_lag_5',
         'shop_id',
         'target_category_frequent_256_lag_6',
         'target_category_tfidf_bigram_256_within_shop_lag_4',
         'date_block_num',
         'target_category_tfidf_unigram_256_lag_6',
         'item_name_cyrillic_fraction',
         'target_category_tfidf_bigram_256_lag_2',
         'item_name_category_tfidf_unigram_256',
         'target_category_tfidf_unigram_256_within_shop_lag_6',
         'target_category_frequent_256_within_shop_lag_6',
         'target_category_tfidf_bigram_256_within_shop_lag_12',
         'target_category_tfidf_unigram_256_within_shop_lag_3',
         'target_category_tfidf_bigram_256_lag_12',
         'item_name_category_frequent_256',
         'item_name_category_tfidf_bigram_256',
         'target_category_tfidf_bigram_256_within_shop_lag_2',
         'item_category_id',
         'item_id',
         'target_category_tfidf_unigram_256_lag_5',
         'target_category_tfidf_bigram_256_lag_3',
         'target_category_tfidf_unigram_256_within_shop_lag_12',
         'target_category_tfidf_bigram_256_within_shop_lag_6',
         'time_of_year'
      ],
      "within": [
         'target_category_frequent_256_within_shop_lag_12',
         'target_category_tfidf_unigram_256_within_shop_lag_5',
         'target_category_within_shop_lag_5',
         'shop_id',
         'target_category_tfidf_bigram_256_within_shop_lag_4',
         'target_category_within_shop_lag_12',
         'date_block_num',
         'target_category_within_shop_lag_3',
         'target_category_frequent_256_within_shop_lag_5',
         'target_category_tfidf_unigram_256_within_shop_lag_4',
         'target_category_tfidf_bigram_256_within_shop_lag_5',
         'target_category_within_shop_lag_4',
         'target_category_frequent_256_within_shop_lag_6',
         'target_category_tfidf_bigram_256_within_shop_lag_3',
         'target_category_tfidf_bigram_256_within_shop_lag_12',
         'target_category_tfidf_unigram_256_within_shop_lag_3',
         'target_category_within_shop_lag_2',
         'target_category_tfidf_bigram_256_within_shop_lag_2',
         'item_category_id',
         'item_id',
         'target_category_frequent_256_within_shop_lag_4',
         'target_category_frequent_256_within_shop_lag_2',
         'target_category_tfidf_unigram_256_within_shop_lag_12',
         'target_category_tfidf_bigram_256_within_shop_lag_6',
         'time_of_year' 
      ],
      "allfeat": [
         'target_category_tfidf_unigram_256_within_shop_lag_4',
         'target_item_lag_6',
         'target_category_lag_5',
         'target_category_frequent_256_within_shop_lag_6',
         'target_category_within_shop_lag_6',
         'target_category_tfidf_unigram_256_lag_12',
         'target_category_lag_2',
         'target_category_tfidf_unigram_256_lag_5',
         'target_category_frequent_256_within_shop_lag_2',
         'target_category_tfidf_bigram_256_within_shop_lag_6',
         'item_name_cyrillic_fraction',
         'target_shop_lag_12',
         'target_item_lag_2',
         'target_lag_2',
         'shop_id',
         'target_category_within_shop_lag_3',
         'target_category_tfidf_unigram_256_lag_6',
         'target_item_lag_12',
         'target_item_lag_3',
         'target_category_tfidf_unigram_256_within_shop_lag_5',
         'target_category_frequent_256_lag_6',
         'target_category_within_shop_lag_12',
         'target_item_lag_4',
         'item_category_id',
         'target_category_within_shop_lag_2'
      ],
      "basic": [
         'shop_id',
         'target_lag_6',
         'is_internet_store',
         'item_name_cyrillic_fraction',
         'target_item_lag_6',
         'target_category_lag_3',
         'item_name_category_tfidf_unigram_256',
         'target_item_lag_12',
         'target_category_lag_5',
         'item_name_category_frequent_256',
         'target_category_lag_4',
         'target_shop_lag_12',
         'item_name_category_tfidf_bigram_256',
         'target_lag_5',
         'target_category_lag_2',
         'target_item_lag_2',
         'target_shop_lag_3',
         'item_category_id',
         'item_id',
         'target_lag_4',
         'target_shop_lag_6',
         'target_item_lag_3',
         'target_lag_2',
         'target_item_lag_5',
         'time_of_year'
      ],
      "basicv2": [
         'target_category_tfidf_unigram_256_within_shop_lag_5',
         'target_category_tfidf_bigram_256_within_shop_lag_6',
         'target_category_lag_12',
         'target_category_within_shop_lag_5',
         'shop_id',
         'is_internet_store',
         'target_category_within_shop_lag_3',
         'target_item_lag_4',
         'item_name_cyrillic_fraction',
         'target_item_lag_6',
         'target_item_lag_12',
         'target_category_lag_5',
         'item_name_category_frequent_256',
         'target_lag_3',
         'target_shop_lag_12',
         'target_shop_lag_4',
         'target_lag_5',
         'target_category_lag_2',
         'target_item_lag_2',
         'item_category_id',
         'target_category_tfidf_bigram_256_within_shop_lag_2',
         'item_id',
         'target_lag_4',
         'target_item_lag_3',
         'target_lag_2'
      ]} 

# the results seem to make sense! The short 9 feature lists really contain features that
# one would expect to be most important in prediction

