<a href="https://colab.research.google.com/github/vvivvi/kaggle-c1/blob/master/Kaggle_C1_text_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
import lightgbm as lgb
from sklearn.metrics import r2_score
import catboost
import gc

from catboost import CatBoostRegressor, Pool

import re
import os

for p in [np, pd, scipy, sklearn, lgb, catboost]:
    print (p.__name__, p.__version__)
    
DATA_FOLDER = 'competitive-data-science-predict-future-sales'
test_spec = pd.read_csv(os.path.join(DATA_FOLDER, 'test.csv'))

index_cols=['item_id','shop_id','date_block_num']
date_block_val = 33
date_block_test = 35 # Dec 2015

test2submission_mapping_generated = False

numpy 1.18.1
pandas 0.25.3
scipy 1.4.1
sklearn 0.22.1
lightgbm 2.3.1
catboost 0.22


In [2]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [3]:
!pip install catboost



In [4]:
def write_predictions_by_array(array, filename):
  df=pd.DataFrame(array)
  df.columns=['item_cnt_month']
  df.to_csv(os.path.join(DATA_FOLDER, filename), index_label='ID')

In [5]:
def clipped_rmse(gt, predicted,clip_min=0, clip_max=20):
  target=np.minimum(np.maximum(gt,clip_min), clip_max)
  return np.sqrt((target-predicted)**2).mean()

In [6]:
# plan:
# list all feature files

# create data structure that tells in which feature file a given feature is found

# create evaluation function that evaluates a given combination
# of features and optionally writes test set predictions to a csv file
# the necessary feature files are loade automatically in memory

# first create a function for regressors implementing the sklearn fit/predict interface
# plan to use CatBoost for first experiments







In [7]:
feature_files = ['features_category.csv','features_lagged_basic.csv','features_targets.csv','features_text.csv']
feature_files += ['features_mean_encoded.csv']
feature_files += ['target_category_frequent_256_lagged.csv','target_category_frequent_256_within_shop_lagged.csv']
feature_files += ['target_category_frequent_32_lagged.csv']
feature_files += ['target_category_frequent_32_within_shop_lagged.csv']
feature_files += ['target_category_lagged.csv']
feature_files += ['target_category_tfidf_bigram_256_lagged.csv']
feature_files += ['target_category_tfidf_bigram_256_within_shop_lagged.csv']
feature_files += ['target_category_tfidf_bigram_32_lagged.csv']
feature_files += ['target_category_tfidf_bigram_32_within_shop_lagged.csv']
feature_files += ['target_category_tfidf_unigram_256_lagged.csv']
feature_files += ['target_category_tfidf_unigram_256_within_shop_lagged.csv']
feature_files += ['target_category_tfidf_unigram_32_lagged.csv']
feature_files += ['target_category_tfidf_unigram_32_within_shop_lagged.csv']
feature_files += ['target_category_within_shop_lagged.csv']
feature_files += ['target_item_lagged.csv']
# feature_files += ['target_item_within_shop_lagged.csv']
feature_files += ['target_lagged.csv']
feature_files += ['target_shop_lagged.csv']



In [8]:
feature2filename={}

# create data structure that tells in which feature file a given feature is found
for f in feature_files:
    df =pd.read_csv(DATA_FOLDER + '/' + f)
    for col in df.columns.values:
        feature2filename[col]=f
        
                    

In [9]:
# list all features
sorted(feature2filename.keys())

['Unnamed: 0',
 'date_block_num',
 'item_category_id',
 'item_category_id_enc_train',
 'item_category_id_enc_trainval',
 'item_id',
 'item_id_enc_train',
 'item_id_enc_trainval',
 'item_name_category_frequent_256',
 'item_name_category_frequent_256_enc_train',
 'item_name_category_frequent_256_enc_trainval',
 'item_name_category_frequent_32',
 'item_name_category_frequent_32_enc_train',
 'item_name_category_frequent_32_enc_trainval',
 'item_name_category_tfidf_bigram_256',
 'item_name_category_tfidf_bigram_256_enc_train',
 'item_name_category_tfidf_bigram_256_enc_trainval',
 'item_name_category_tfidf_bigram_32',
 'item_name_category_tfidf_bigram_32_enc_train',
 'item_name_category_tfidf_bigram_32_enc_trainval',
 'item_name_category_tfidf_unigram_256',
 'item_name_category_tfidf_unigram_256_enc_train',
 'item_name_category_tfidf_unigram_256_enc_trainval',
 'item_name_category_tfidf_unigram_32',
 'item_name_category_tfidf_unigram_32_enc_train',
 'item_name_category_tfidf_unigram_32_enc_t

In [10]:
def evaluate_lgb(feature_lists, first_date_to_keep=None):
    
    # feature_lists : list of feature lists (= feature combinations) to be red into memory 
    # simultaneously and then evaluated in one go
    
    files_and_columns_to_read={}
    lagged=False
    
    flat_features = set([item for sublist in feature_lists for item in sublist])
    
    for feat in flat_features.union({'target'}):
        is_index=False
        for idx in index_cols:
            if feat == idx:
                is_index = True
        # index columns are present in every feature file, no need to read
        # them explicitly
        
        if is_index: 
            continue
            
        if re.search('lag', feat):
            lagged=True
        file = feature2filename[feat]
        featlist=files_and_columns_to_read.get(file,[])
        featlist += [feat]
        files_and_columns_to_read[file] = featlist
    print('file -> columns mapping: ', files_and_columns_to_read)   
    
    first=True
    for file in files_and_columns_to_read:
        to_read_cols = files_and_columns_to_read[file]
        to_keep_cols = list(set(to_read_cols + index_cols)) # add index columns, remove duplicates
        
        if first:
            all_data = pd.read_csv(DATA_FOLDER + '/' + file )[to_keep_cols]
            first=False
        else:
            df = pd.read_csv(DATA_FOLDER + '/' + file )[to_keep_cols]
            df = downcast_dtypes(df)
            all_data = pd.merge(all_data, df, on=index_cols)
            del df
            gc.collect()
        if lagged:
            all_data=all_data[all_data['date_block_num'] >= 14]
        if first_date_to_keep is not None:
            all_data=all_data[all_data['date_block_num'] >= first_date_to_keep]
        all_data = downcast_dtypes(all_data)
        gc.collect()
        
        
     
    print('read columns:', all_data.columns.values)
    
    dates = all_data['date_block_num']
    
    y_train = np.clip(all_data.loc[dates <  date_block_val, 'target'].values,0,20)
    y_trainval = np.clip(all_data.loc[dates <  date_block_test, 'target'].values,0,20)
    y_val =  np.clip(all_data.loc[dates == date_block_val, 'target'].values,0,20)
    
    print(y_train.shape)
    
    lgb_params = {
               'feature_fraction': 0.75,
               'metric': 'rmse',
               'nthread':1, 
               'min_data_in_leaf': 2**7, 
               'bagging_fraction': 0.75, 
               'learning_rate': 0.03, 
               'objective': 'mse', 
               'bagging_seed': 2**7, 
               'num_leaves': 2**7,
               'bagging_freq':1,
               'verbose':2
    }
              
    for features in feature_lists:    
        model = lgb.train(lgb_params, lgb.Dataset(all_data.loc[dates <  date_block_val, features], label=y_train), 100)
        pred_lgb_val = np.clip(model.predict(all_data.loc[dates ==  date_block_val, features]), 0, 20)
        #print('Validation R-squared for LightGBM is %f' % r2_score(y_val, pred_lgb_val))
        print('{}: Clipped RMSE {}'.format(features,clipped_rmse(y_val, pred_lgb_val)))
    
    generate_submission=False
    
    if generate_submission:
        
        shop_item2submissionid={}
        for idx, row in test_spec.iterrows():
            shop_item2submissionid[str(row['shop_id'])+'_'+str(row['item_id'])] = row['ID']
    
        test_data=all_data.loc[dates == date_block_test, ['shop_id','item_id']]    
    
        testidx2submissionidx=np.zeros(test_data.shape[0], dtype=np.int32)
        for idx in range(test_data.shape[0]):
            row =test_data.iloc[idx]
            testidx2submissionidx[idx] = shop_item2submissionid[str(row['shop_id'])+'_'+str(row['item_id'])]
    
        print(len(testidx2submissionidx))    

#invert the mapping
        submissionidx2testidx=np.zeros(test_data.shape[0], dtype=np.int32)
        for i in range(test_data.shape[0]):
            submissionidx2testidx[testidx2submissionidx[i]]=i
    
        del test_data
        gc.collect()    
    

In [11]:
def evaluate_by_model(model, feature_lists, first_date_to_keep=None):
    
    # feature_lists : list of feature lists (= feature combinations) to be red into memory 
    # simultaneously and then evaluated in one go
    
    files_and_columns_to_read={}
    lagged=False
    
    flat_features = set([item for sublist in feature_lists for item in sublist])
    
    for feat in flat_features.union({'target'}):
        is_index=False
        for idx in index_cols:
            if feat == idx:
                is_index = True
        # index columns are present in every feature file, no need to read
        # them explicitly
        
        if is_index: 
            continue
            
        if re.search('lag', feat):
            lagged=True
        file = feature2filename[feat]
        featlist=files_and_columns_to_read.get(file,[])
        featlist += [feat]
        files_and_columns_to_read[file] = featlist
    print('file -> columns mapping: ', files_and_columns_to_read)   
    
    first=True
    for file in files_and_columns_to_read:
        to_read_cols = files_and_columns_to_read[file]
        to_keep_cols = list(set(to_read_cols + index_cols)) # add index columns, remove duplicates
        
        if first:
            all_data = pd.read_csv(DATA_FOLDER + '/' + file )[to_keep_cols]
            first=False
        else:
            df = pd.read_csv(DATA_FOLDER + '/' + file )[to_keep_cols]
            df = downcast_dtypes(df)
            all_data = pd.merge(all_data, df, on=index_cols)
            del df
            gc.collect()
        if lagged:
            all_data=all_data[all_data['date_block_num'] >= 14]
        if first_date_to_keep is not None:
            all_data=all_data[all_data['date_block_num'] >= first_date_to_keep]
        all_data = downcast_dtypes(all_data)
        gc.collect()
        
        
     
    print('read columns:', all_data.columns.values)
    
    dates = all_data['date_block_num']
    
    y_train = np.clip(all_data.loc[dates <  date_block_val, 'target'].values,0,20)
    y_trainval = np.clip(all_data.loc[dates <  date_block_test, 'target'].values,0,20)
    y_val =  np.clip(all_data.loc[dates == date_block_val, 'target'].values,0,20)
    
    print(y_train.shape)
    
              
    for features in feature_lists:    
        model = model.fit(all_data.loc[dates <  date_block_val, features].to_numpy(), y_train)
        pred_val = np.clip(model.predict(all_data.loc[dates ==  date_block_val, features].to_numpy()), 0, 20)
        #print('Validation R-squared for LightGBM is %f' % r2_score(y_val, pred_lgb_val))
        print('{}: Clipped RMSE {}'.format(features,clipped_rmse(y_val, pred_val)))
    
    generate_submission=False
    
    if generate_submission:
        
        shop_item2submissionid={}
        for idx, row in test_spec.iterrows():
            shop_item2submissionid[str(row['shop_id'])+'_'+str(row['item_id'])] = row['ID']
    
        test_data=all_data.loc[dates == date_block_test, ['shop_id','item_id']]    
    
        testidx2submissionidx=np.zeros(test_data.shape[0], dtype=np.int32)
        for idx in range(test_data.shape[0]):
            row =test_data.iloc[idx]
            testidx2submissionidx[idx] = shop_item2submissionid[str(row['shop_id'])+'_'+str(row['item_id'])]
    
        print(len(testidx2submissionidx))    

#invert the mapping
        submissionidx2testidx=np.zeros(test_data.shape[0], dtype=np.int32)
        for i in range(test_data.shape[0]):
            submissionidx2testidx[testidx2submissionidx[i]]=i
    
        del test_data
        gc.collect()    
    

In [12]:
#evaluate(' ',['item_id_enc_train','shop_id','target_lag_2','target_lag_3','target_lag_4','target_lag_5','target_lag_12','target_lag_14'])

In [13]:
# evaluate text features

In [13]:
text_categories = [c for c in sorted(feature2filename.keys()) if re.search('item_name_category',c)]
text_categories = [c for c in text_categories if not re.search('trainval',c)]

In [14]:
# validate text categories in one-by-one setting 
evaluate_lgb([[feat] for feat in text_categories])

NameError: name 'evaluate_lgb' is not defined

In [None]:
lists = [['item_id','shop_id','item_category_id']]
lists += [['item_id_enc_train','shop_id','item_category_id']]
lists += [['item_id','shop_id','item_category_id_enc_train']]
lists += [['item_id_enc_train','shop_id_enc_train','item_category_id_enc_train']]
lists += [['item_id_enc_train','shop_id_enc_train','item_category_id_enc_train','item_id','shop_id','item_category_id']]
lists += [['item_category_id_enc_train','item_id','shop_id','item_category_id']]
evaluate_lgb(lists)

In [17]:
lists = [['item_category_id_enc_train','item_id','shop_id','item_category_id',feat] for feat in text_categories]
evaluate_lgb(lists)


file -> columns mapping:  {'features_mean_encoded.csv': ['item_name_category_tfidf_bigram_256_enc_train', 'item_name_category_tfidf_unigram_256_enc_train', 'item_name_category_tfidf_unigram_32_enc_train', 'item_category_id_enc_train', 'item_name_category_frequent_256_enc_train', 'item_name_category_tfidf_bigram_32_enc_train', 'item_name_category_frequent_32_enc_train'], 'features_text.csv': ['item_name_category_tfidf_unigram_32', 'item_name_category_frequent_32', 'item_name_category_frequent_256', 'item_name_category_tfidf_bigram_32', 'item_name_category_tfidf_unigram_256', 'item_name_category_tfidf_bigram_256'], 'target_lagged.csv': ['target'], 'features_category.csv': ['item_category_id']}
read columns: ['item_name_category_tfidf_bigram_256_enc_train'
 'item_name_category_tfidf_unigram_32_enc_train'
 'item_name_category_tfidf_unigram_256_enc_train'
 'item_category_id_enc_train' 'item_id' 'shop_id'
 'item_name_category_frequent_256_enc_train'
 'item_name_category_tfidf_bigram_32_enc_t

In [19]:
lists=[['target_lag_12', 'target_lag_13', 'target_lag_14', 'target_lag_2', 'target_lag_3', 'target_lag_4', 'target_lag_5', 'target_lag_6']]
lists+=[['target_lag_2', 'target_lag_3', 'target_lag_4', 'target_lag_5', 'target_lag_6']]
lists+=[['target_lag_2', 'target_lag_3', 'target_lag_6']]
lists+=[['target_lag_2', 'target_lag_3', 'target_lag_4']]
lists+=[['target_lag_12',  'target_lag_2', 'target_lag_3', 'target_lag_4', 'target_lag_5', 'target_lag_6']]
lists+=[['target_lag_12',  'target_lag_2', 'target_lag_3', 'target_lag_4']]

evaluate_lgb(lists)

file -> columns mapping:  {'target_lagged.csv': ['target_lag_14', 'target_lag_12', 'target_lag_3', 'target', 'target_lag_2', 'target_lag_5', 'target_lag_6', 'target_lag_13', 'target_lag_4']}
read columns: ['target_lag_14' 'target_lag_12' 'target' 'target_lag_3' 'item_id'
 'target_lag_2' 'target_lag_5' 'shop_id' 'target_lag_6' 'target_lag_13'
 'target_lag_4' 'date_block_num']
(4499858,)
['target_lag_12', 'target_lag_13', 'target_lag_14', 'target_lag_2', 'target_lag_3', 'target_lag_4', 'target_lag_5', 'target_lag_6']: Clipped RMSE 0.37270642671600185
['target_lag_2', 'target_lag_3', 'target_lag_4', 'target_lag_5', 'target_lag_6']: Clipped RMSE 0.37348156144421235
['target_lag_2', 'target_lag_3', 'target_lag_6']: Clipped RMSE 0.3773256767539508
['target_lag_2', 'target_lag_3', 'target_lag_4']: Clipped RMSE 0.37879492747742627
['target_lag_12', 'target_lag_2', 'target_lag_3', 'target_lag_4', 'target_lag_5', 'target_lag_6']: Clipped RMSE 0.37257159613227697
['target_lag_12', 'target_lag_2',

In [16]:
lists=[['target_lag_12', 'target_lag_2', 'target_lag_3', 'target_lag_4', 'target_lag_5', 'target_lag_6']]
lists+=[['target_shop_lag_12', 'target_shop_lag_2', 'target_shop_lag_3', 'target_shop_lag_4', 'target_shop_lag_5', 'target_shop_lag_6']]
lists+=[['target_item_lag_12', 'target_item_lag_2', 'target_item_lag_3', 'target_item_lag_4', 'target_item_lag_5', 'target_item_lag_6']]
lists+=[['target_category_lag_12', 'target_category_lag_2', 'target_category_lag_3', 'target_category_lag_4', 'target_category_lag_5', 'target_category_lag_6']]

evaluate_lgb(lists)

file -> columns mapping:  {'target_category_lagged.csv': ['target_category_lag_3', 'target_category_lag_5', 'target_category_lag_6', 'target_category_lag_2', 'target_category_lag_12', 'target_category_lag_4'], 'target_shop_lagged.csv': ['target_shop_lag_6', 'target_shop_lag_2', 'target_shop_lag_3', 'target_shop_lag_5', 'target_shop_lag_4', 'target_shop_lag_12'], 'target_item_lagged.csv': ['target_item_lag_12', 'target_item_lag_4', 'target_item_lag_6', 'target_item_lag_3', 'target_item_lag_2', 'target_item_lag_5'], 'target_lagged.csv': ['target_lag_5', 'target_lag_3', 'target_lag_12', 'target', 'target_lag_2', 'target_lag_6', 'target_lag_4']}
read columns: ['target_category_lag_3' 'target_category_lag_4' 'target_category_lag_5'
 'target_category_lag_6' 'target_category_lag_2' 'item_id' 'shop_id'
 'target_category_lag_12' 'date_block_num' 'target_shop_lag_6'
 'target_shop_lag_3' 'target_shop_lag_2' 'target_shop_lag_4'
 'target_shop_lag_12' 'target_shop_lag_5' 'target_item_lag_2'
 'target

In [18]:
lists=[['target_lag_12', 'target_lag_2', 'target_lag_3', 'target_lag_4', 'target_lag_5', 'target_lag_6','target_shop_lag_12', 'target_shop_lag_2', 'target_shop_lag_3', 'target_shop_lag_4', 'target_shop_lag_5', 'target_shop_lag_6','target_item_lag_12', 'target_item_lag_2', 'target_item_lag_3', 'target_item_lag_4', 'target_item_lag_5', 'target_item_lag_6']]

evaluate_lgb(lists)

file -> columns mapping:  {'target_shop_lagged.csv': ['target_shop_lag_6', 'target_shop_lag_2', 'target_shop_lag_3', 'target_shop_lag_5', 'target_shop_lag_4', 'target_shop_lag_12'], 'target_item_lagged.csv': ['target_item_lag_12', 'target_item_lag_4', 'target_item_lag_6', 'target_item_lag_3', 'target_item_lag_2', 'target_item_lag_5'], 'target_lagged.csv': ['target_lag_5', 'target_lag_6', 'target_lag_3', 'target_lag_12', 'target', 'target_lag_2', 'target_lag_4']}
read columns: ['date_block_num' 'target_shop_lag_6' 'item_id' 'shop_id'
 'target_shop_lag_3' 'target_shop_lag_2' 'target_shop_lag_4'
 'target_shop_lag_12' 'target_shop_lag_5' 'target_item_lag_2'
 'target_item_lag_4' 'target_item_lag_12' 'target_item_lag_6'
 'target_item_lag_3' 'target_item_lag_5' 'target_lag_3' 'target'
 'target_lag_12' 'target_lag_2' 'target_lag_5' 'target_lag_6'
 'target_lag_4']
(4499858,)
['target_lag_12', 'target_lag_2', 'target_lag_3', 'target_lag_4', 'target_lag_5', 'target_lag_6', 'target_shop_lag_12', '

In [29]:
feature_list=[f for f in sorted(feature2filename.keys()) if re.search('^target_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_shop_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_item_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
# feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_category_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
lists=[feature_list]

evaluate_lgb(lists)

file -> columns mapping:  {'target_category_lagged.csv': ['target_category_lag_3', 'target_category_lag_5', 'target_category_lag_6', 'target_category_lag_2', 'target_category_lag_12', 'target_category_lag_4'], 'target_shop_lagged.csv': ['target_shop_lag_6', 'target_shop_lag_2', 'target_shop_lag_3', 'target_shop_lag_5', 'target_shop_lag_4', 'target_shop_lag_12'], 'target_item_lagged.csv': ['target_item_lag_12', 'target_item_lag_4', 'target_item_lag_6', 'target_item_lag_3', 'target_item_lag_2', 'target_item_lag_5'], 'target_lagged.csv': ['target_lag_5', 'target_lag_3', 'target_lag_12', 'target', 'target_lag_2', 'target_lag_6', 'target_lag_4']}
read columns: ['target_category_lag_3' 'target_category_lag_4' 'target_category_lag_5'
 'target_category_lag_6' 'target_category_lag_2' 'item_id' 'shop_id'
 'target_category_lag_12' 'date_block_num' 'target_shop_lag_6'
 'target_shop_lag_3' 'target_shop_lag_2' 'target_shop_lag_4'
 'target_shop_lag_12' 'target_shop_lag_5' 'target_item_lag_2'
 'target

In [31]:
feature_list=[f for f in sorted(feature2filename.keys()) if re.search('within.*lag',f) and int(re.search('lag_(\d+)',f).group(1))<13]
lists=[feature_list]

evaluate_lgb(lists)

file -> columns mapping:  {'target_category_tfidf_bigram_32_within_shop_lagged.csv': ['target_category_tfidf_bigram_32_within_shop_lag_5', 'target_category_tfidf_bigram_32_within_shop_lag_3', 'target_category_tfidf_bigram_32_within_shop_lag_12', 'target_category_tfidf_bigram_32_within_shop_lag_4', 'target_category_tfidf_bigram_32_within_shop_lag_6', 'target_category_tfidf_bigram_32_within_shop_lag_2'], 'target_category_frequent_256_within_shop_lagged.csv': ['target_category_frequent_256_within_shop_lag_3', 'target_category_frequent_256_within_shop_lag_4', 'target_category_frequent_256_within_shop_lag_6', 'target_category_frequent_256_within_shop_lag_12', 'target_category_frequent_256_within_shop_lag_5', 'target_category_frequent_256_within_shop_lag_2'], 'target_category_tfidf_unigram_256_within_shop_lagged.csv': ['target_category_tfidf_unigram_256_within_shop_lag_2', 'target_category_tfidf_unigram_256_within_shop_lag_12', 'target_category_tfidf_unigram_256_within_shop_lag_6', 'target_c

In [33]:
feature_list=[f for f in sorted(feature2filename.keys()) if re.search('within.*lag',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list=[f for f in feature_list if not re.search('tfidf',f)]
lists=[feature_list]

evaluate_lgb(lists)

file -> columns mapping:  {'target_category_frequent_256_within_shop_lagged.csv': ['target_category_frequent_256_within_shop_lag_6', 'target_category_frequent_256_within_shop_lag_12', 'target_category_frequent_256_within_shop_lag_3', 'target_category_frequent_256_within_shop_lag_5', 'target_category_frequent_256_within_shop_lag_4', 'target_category_frequent_256_within_shop_lag_2'], 'target_category_within_shop_lagged.csv': ['target_category_within_shop_lag_6', 'target_category_within_shop_lag_12', 'target_category_within_shop_lag_4', 'target_category_within_shop_lag_3', 'target_category_within_shop_lag_5', 'target_category_within_shop_lag_2'], 'target_category_frequent_32_within_shop_lagged.csv': ['target_category_frequent_32_within_shop_lag_12', 'target_category_frequent_32_within_shop_lag_3', 'target_category_frequent_32_within_shop_lag_6', 'target_category_frequent_32_within_shop_lag_2', 'target_category_frequent_32_within_shop_lag_5', 'target_category_frequent_32_within_shop_lag_4'

In [34]:
feature_list=[f for f in sorted(feature2filename.keys()) if re.search('tfidf',f)]
lists=[feature_list]

evaluate_lgb(lists)

file -> columns mapping:  {'target_category_tfidf_bigram_32_within_shop_lagged.csv': ['target_category_tfidf_bigram_32_within_shop_lag_5', 'target_category_tfidf_bigram_32_within_shop_lag_3', 'target_category_tfidf_bigram_32_within_shop_lag_12', 'target_category_tfidf_bigram_32_within_shop_lag_14', 'target_category_tfidf_bigram_32_within_shop_lag_6', 'target_category_tfidf_bigram_32_within_shop', 'target_category_tfidf_bigram_32_within_shop_lag_2', 'target_category_tfidf_bigram_32_within_shop_lag_13', 'target_category_tfidf_bigram_32_within_shop_lag_4'], 'features_mean_encoded.csv': ['item_name_category_tfidf_bigram_256_enc_train', 'item_name_category_tfidf_unigram_32_enc_train', 'item_name_category_tfidf_unigram_32_enc_trainval', 'item_name_category_tfidf_unigram_256_enc_train', 'item_name_category_tfidf_unigram_256_enc_trainval', 'item_name_category_tfidf_bigram_32_enc_trainval', 'item_name_category_tfidf_bigram_256_enc_trainval', 'item_name_category_tfidf_bigram_32_enc_train'], 'fea

In [38]:
# textual features
feature_list=[f for f in sorted(feature2filename.keys()) if re.search('item_name',f)]
feature_list+=[f for f in sorted(feature2filename.keys()) if re.search('256',f)]
lists=[feature_list]

evaluate_lgb(lists)

file -> columns mapping:  {'features_mean_encoded.csv': ['item_name_category_tfidf_bigram_256_enc_train', 'item_name_category_tfidf_unigram_32_enc_train', 'item_name_category_tfidf_unigram_32_enc_trainval', 'item_name_category_frequent_32_enc_train', 'item_name_category_tfidf_unigram_256_enc_train', 'item_name_category_tfidf_unigram_256_enc_trainval', 'item_name_category_tfidf_bigram_32_enc_trainval', 'item_name_category_frequent_32_enc_trainval', 'item_name_category_frequent_256_enc_trainval', 'item_name_category_frequent_256_enc_train', 'item_name_category_tfidf_bigram_256_enc_trainval', 'item_name_category_tfidf_bigram_32_enc_train'], 'features_text.csv': ['item_name_category_tfidf_unigram_32', 'item_name_cyrillic_fraction', 'item_name_category_tfidf_bigram_32', 'item_name_category_tfidf_bigram_256', 'item_name_category_tfidf_unigram_256', 'item_name_category_frequent_32', 'item_name_category_frequent_256'], 'target_category_frequent_256_lagged.csv': ['target_category_frequent_256_l

In [36]:
feature_list=[f for f in sorted(feature2filename.keys()) if re.search('^target_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_shop_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_item_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += ['item_category_id_enc_train','item_id','shop_id','item_category_id']

lists=[feature_list]

evaluate_lgb(lists)

file -> columns mapping:  {'target_shop_lagged.csv': ['target_shop_lag_6', 'target_shop_lag_2', 'target_shop_lag_3', 'target_shop_lag_5', 'target_shop_lag_4', 'target_shop_lag_12'], 'target_item_lagged.csv': ['target_item_lag_12', 'target_item_lag_4', 'target_item_lag_6', 'target_item_lag_3', 'target_item_lag_2', 'target_item_lag_5'], 'target_lagged.csv': ['target_lag_5', 'target_lag_3', 'target_lag_12', 'target', 'target_lag_2', 'target_lag_6', 'target_lag_4'], 'features_mean_encoded.csv': ['item_category_id_enc_train'], 'features_category.csv': ['item_category_id']}
read columns: ['date_block_num' 'target_shop_lag_6' 'item_id' 'shop_id'
 'target_shop_lag_3' 'target_shop_lag_2' 'target_shop_lag_4'
 'target_shop_lag_12' 'target_shop_lag_5' 'target_item_lag_2'
 'target_item_lag_4' 'target_item_lag_12' 'target_item_lag_6'
 'target_item_lag_3' 'target_item_lag_5' 'target_lag_3' 'target'
 'target_lag_12' 'target_lag_2' 'target_lag_5' 'target_lag_6'
 'target_lag_4' 'item_category_id_enc_tra

In [None]:
from sklearn import linear_model

feature_list=[f for f in sorted(feature2filename.keys()) if re.search('^target_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_shop_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_item_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += ['item_category_id_enc_train','item_id','shop_id','item_category_id']

lists=[feature_list]

model = CatBoostRegressor(task_type="GPU")

evaluate_by_model(model,lists)

In [62]:
feature_list=[f for f in sorted(feature2filename.keys()) if re.search('^target_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_shop_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_item_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += ['item_category_id_enc_train','item_id','shop_id','item_category_id']
lists=[feature_list]
model = linear_model.RidgeCV()

evaluate_by_model(model,lists)

file -> columns mapping:  {'target_shop_lagged.csv': ['target_shop_lag_6', 'target_shop_lag_2', 'target_shop_lag_3', 'target_shop_lag_5', 'target_shop_lag_4', 'target_shop_lag_12'], 'target_item_lagged.csv': ['target_item_lag_12', 'target_item_lag_4', 'target_item_lag_6', 'target_item_lag_3', 'target_item_lag_2', 'target_item_lag_5'], 'target_lagged.csv': ['target_lag_5', 'target_lag_3', 'target_lag_12', 'target', 'target_lag_2', 'target_lag_6', 'target_lag_4'], 'features_mean_encoded.csv': ['item_category_id_enc_train'], 'features_category.csv': ['item_category_id']}
read columns: ['date_block_num' 'target_shop_lag_6' 'item_id' 'shop_id'
 'target_shop_lag_3' 'target_shop_lag_2' 'target_shop_lag_4'
 'target_shop_lag_12' 'target_shop_lag_5' 'target_item_lag_2'
 'target_item_lag_4' 'target_item_lag_12' 'target_item_lag_6'
 'target_item_lag_3' 'target_item_lag_5' 'target_lag_3' 'target'
 'target_lag_12' 'target_lag_2' 'target_lag_5' 'target_lag_6'
 'target_lag_4' 'item_category_id_enc_tra

In [16]:
from sklearn import svm
from sklearn import linear_model

model = svm.LinearSVR()

feature_list=[f for f in sorted(feature2filename.keys()) if re.search('^target_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_shop_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_item_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += ['item_category_id_enc_train','item_id','shop_id','item_category_id']
lists=[feature_list]

evaluate_by_model(model,lists)

file -> columns mapping:  {'target_lagged.csv': ['target_lag_4', 'target_lag_5', 'target', 'target_lag_2', 'target_lag_3', 'target_lag_12', 'target_lag_6'], 'target_shop_lagged.csv': ['target_shop_lag_3', 'target_shop_lag_12', 'target_shop_lag_2', 'target_shop_lag_5', 'target_shop_lag_4', 'target_shop_lag_6'], 'features_category.csv': ['item_category_id'], 'features_mean_encoded.csv': ['item_category_id_enc_train'], 'target_item_lagged.csv': ['target_item_lag_3', 'target_item_lag_6', 'target_item_lag_2', 'target_item_lag_5', 'target_item_lag_4', 'target_item_lag_12']}
read columns: ['target_lag_4' 'target' 'target_lag_2' 'item_id' 'shop_id'
 'date_block_num' 'target_lag_3' 'target_lag_12' 'target_lag_6'
 'target_lag_5' 'target_shop_lag_3' 'target_shop_lag_12'
 'target_shop_lag_2' 'target_shop_lag_4' 'target_shop_lag_5'
 'target_shop_lag_6' 'item_category_id' 'item_category_id_enc_train'
 'target_item_lag_5' 'target_item_lag_12' 'target_item_lag_3'
 'target_item_lag_6' 'target_item_lag_



# plan: select feature subsets:
1) non-lagged + lagged textual features
2) lagged {target,item,shop} + non-lagged basic categories
3) lagged features within shop

Train three classifiers for each: CatBoost, RidgeCV and KNN 

Combine predictions:

1) simple averaging
2) heuristical weighted averaging proportional to validation score
3) full blown stacking scheme with two levels. Use classifier scores 
of first level classifiers as second-level features using the temporal cross-validation scheme 2f)
( this may not be ideal)

First, however, determine how long history really is beneficial.





In [23]:
from sklearn import linear_model

model = linear_model.RidgeCV()
# here normalizing data did not help

feature_list=[f for f in sorted(feature2filename.keys()) if re.search('^target_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_shop_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_item_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += ['item_category_id_enc_train','item_id','shop_id','item_category_id']
lists=[feature_list]

evaluate_by_model(model,lists)

file -> columns mapping:  {'target_item_lagged.csv': ['target_item_lag_3', 'target_item_lag_6', 'target_item_lag_2', 'target_item_lag_4', 'target_item_lag_5', 'target_item_lag_12'], 'features_mean_encoded.csv': ['item_category_id_enc_train'], 'target_shop_lagged.csv': ['target_shop_lag_2', 'target_shop_lag_6', 'target_shop_lag_4', 'target_shop_lag_3', 'target_shop_lag_12', 'target_shop_lag_5'], 'features_category.csv': ['item_category_id'], 'target_lagged.csv': ['target_lag_4', 'target', 'target_lag_3', 'target_lag_12', 'target_lag_2', 'target_lag_6', 'target_lag_5']}
read columns: ['shop_id' 'target_item_lag_2' 'target_item_lag_4' 'target_item_lag_3'
 'target_item_lag_5' 'target_item_lag_6' 'date_block_num'
 'target_item_lag_12' 'item_id' 'item_category_id_enc_train'
 'target_shop_lag_3' 'target_shop_lag_12' 'target_shop_lag_5'
 'target_shop_lag_2' 'target_shop_lag_6' 'target_shop_lag_4'
 'item_category_id' 'target' 'target_lag_3' 'target_lag_12' 'target_lag_6'
 'target_lag_2' 'target

In [15]:
from sklearn.ensemble import RandomForestRegressor

model =  RandomForestRegressor(n_estimators=100, verbose=2)

feature_list=[f for f in sorted(feature2filename.keys()) if re.search('^target_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_shop_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_item_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += ['item_category_id_enc_train','item_id','shop_id','item_category_id']
lists=[feature_list]

evaluate_by_model(model,lists)

file -> columns mapping:  {'target_item_lagged.csv': ['target_item_lag_6', 'target_item_lag_4', 'target_item_lag_5', 'target_item_lag_3', 'target_item_lag_2', 'target_item_lag_12'], 'target_lagged.csv': ['target_lag_5', 'target_lag_6', 'target', 'target_lag_12', 'target_lag_4', 'target_lag_2', 'target_lag_3'], 'target_shop_lagged.csv': ['target_shop_lag_3', 'target_shop_lag_6', 'target_shop_lag_2', 'target_shop_lag_5', 'target_shop_lag_12', 'target_shop_lag_4'], 'features_mean_encoded.csv': ['item_category_id_enc_train'], 'features_category.csv': ['item_category_id']}
read columns: ['item_id' 'target_item_lag_6' 'target_item_lag_5' 'target_item_lag_4'
 'date_block_num' 'target_item_lag_3' 'target_item_lag_2' 'shop_id'
 'target_item_lag_12' 'target_lag_5' 'target_lag_2' 'target_lag_6'
 'target' 'target_lag_12' 'target_lag_4' 'target_lag_3'
 'target_shop_lag_6' 'target_shop_lag_3' 'target_shop_lag_2'
 'target_shop_lag_5' 'target_shop_lag_4' 'target_shop_lag_12'
 'item_category_id_enc_tra

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   46.7s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 77.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


['target_lag_12', 'target_lag_2', 'target_lag_3', 'target_lag_4', 'target_lag_5', 'target_lag_6', 'target_shop_lag_12', 'target_shop_lag_2', 'target_shop_lag_3', 'target_shop_lag_4', 'target_shop_lag_5', 'target_shop_lag_6', 'target_item_lag_12', 'target_item_lag_2', 'target_item_lag_3', 'target_item_lag_4', 'target_item_lag_5', 'target_item_lag_6', 'item_category_id_enc_train', 'item_id', 'shop_id', 'item_category_id']: Clipped RMSE 0.3570196527933023


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.7s finished


In [18]:
import lightgbm as lgb

model =  lgb.LGBMClassifier(boosting_type="rf", subsample=.632, # Standard RF bagging fraction
                            subsample_freq=1,
                         n_jobs=3, verbose=100)

feature_list=[f for f in sorted(feature2filename.keys()) if re.search('^target_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_shop_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += [f for f in sorted(feature2filename.keys()) if re.search('^target_item_lag_(\d+)',f) and int(re.search('lag_(\d+)',f).group(1))<13]
feature_list += ['item_category_id_enc_train','item_id','shop_id','item_category_id']
lists=[feature_list]

evaluate_by_model(model,lists)

file -> columns mapping:  {'target_item_lagged.csv': ['target_item_lag_6', 'target_item_lag_4', 'target_item_lag_5', 'target_item_lag_3', 'target_item_lag_2', 'target_item_lag_12'], 'target_lagged.csv': ['target_lag_5', 'target_lag_6', 'target', 'target_lag_12', 'target_lag_4', 'target_lag_2', 'target_lag_3'], 'target_shop_lagged.csv': ['target_shop_lag_3', 'target_shop_lag_6', 'target_shop_lag_2', 'target_shop_lag_5', 'target_shop_lag_12', 'target_shop_lag_4'], 'features_mean_encoded.csv': ['item_category_id_enc_train'], 'features_category.csv': ['item_category_id']}
read columns: ['item_id' 'target_item_lag_6' 'target_item_lag_5' 'target_item_lag_4'
 'date_block_num' 'target_item_lag_3' 'target_item_lag_2' 'shop_id'
 'target_item_lag_12' 'target_lag_5' 'target_lag_2' 'target_lag_6'
 'target' 'target_lag_12' 'target_lag_4' 'target_lag_3'
 'target_shop_lag_6' 'target_shop_lag_3' 'target_shop_lag_2'
 'target_shop_lag_5' 'target_shop_lag_4' 'target_shop_lag_12'
 'item_category_id_enc_tra