<a href="https://colab.research.google.com/github/vvivvi/kaggle-c1/blob/master/Kaggle_C1_mean_encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd 
import sklearn
import scipy.sparse 
import lightgbm 
import re

for p in [np, pd, scipy, sklearn, lightgbm]:
    print (p.__name__, p.__version__)

numpy 1.18.1
pandas 0.25.3
scipy 1.4.1
sklearn 0.22.1
lightgbm 2.3.1


In [2]:
from itertools import product
import gc
from tqdm import tqdm_notebook


def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

In [3]:
from sklearn.model_selection import KFold

def mean_encode_kfold(all_data, index_bool, coded_columns, target_column, target_suffix):
    index_ints= all_data[index_bool].index.values
    kf = KFold(5,shuffle=True, random_state=123) 
    target_mean=all_data.loc[index_bool, target_column].mean()
    
    for train_index, fill_index in kf.split(index_ints):
        X_tr, X_fill = all_data.loc[index_ints[train_index]], all_data.loc[index_ints[fill_index]]
        for col in coded_columns:
            print(col)
            encoded = X_fill[col].map(X_tr.groupby(col)[target_column].mean())
            all_data.loc[index_ints[fill_index],col + target_suffix] = encoded
       
    for col in coded_columns:
        all_data.loc[index_bool, col+target_suffix]=all_data.loc[index_bool, col+target_suffix].fillna(target_mean)
        
def mean_encode_simple(all_data,train_index, test_index, coded_columns, target_column, target_suffix):
    train_data=all_data[train_index]
    test_data=all_data[test_index]
    target_mean=train_data[target_column].mean()
    for col in coded_columns:
        print(col)
        col_target_mean_train = train_data.groupby(col)[target_column].mean()
        all_data.loc[test_index,col+target_suffix] = test_data[col].map(col_target_mean_train)
        all_data.loc[test_index,col+target_suffix] = all_data.loc[test_index,col+target_suffix].fillna(target_mean)
        

In [4]:
DATA_FOLDER = 'competitive-data-science-predict-future-sales'

date_block_val = 33
date_block_test = 35 # Dec 2015

index_cols=['item_id','shop_id','date_block_num']
category_data=pd.read_csv(DATA_FOLDER + '/category.csv')
targets = pd.read_csv(DATA_FOLDER + '/targets.csv') 
all_data = pd.merge(category_data, targets, on=index_cols)

to_drop_cols = [c for c in targets.columns.values if re.search('target_', c)]
all_data=all_data.drop(to_drop_cols,axis=1).reset_index()

del category_data, targets

In [5]:
columns_to_encode = [c for c in all_data.columns.values if re.search('category', c)]
columns_to_encode += ['shop_id','item_id'] 

idx_train=all_data['date_block_num']  < date_block_val
idx_trainval=all_data['date_block_num']  < date_block_test

idx_val=all_data['date_block_num']  == date_block_val
idx_test=all_data['date_block_num'] == date_block_test

for c in columns_to_encode:
    all_data[c+'_enc_train']=0
    all_data[c+'_enc_trainval']=0

# add mean encodings to training parts of data
# using 5-fold regularization scheme
mean_encode_kfold(all_data, idx_train, columns_to_encode,'target','_enc_train')    
mean_encode_kfold(all_data, idx_trainval, columns_to_encode,'target','_enc_trainval')    

# add unregularized mean encodings to predicted part of data
mean_encode_simple(all_data,idx_train,idx_val, columns_to_encode,'target','_enc_train')    
mean_encode_simple(all_data,idx_trainval,idx_test, columns_to_encode,'target','_enc_trainval')    


item_category_id
item_name_category_tfidf_unigram_32
item_name_category_tfidf_unigram_256
item_name_category_tfidf_bigram_32
item_name_category_tfidf_bigram_256
item_name_category_frequent_32
item_name_category_frequent_256
shop_and_category
shop_id
item_id
item_category_id
item_name_category_tfidf_unigram_32
item_name_category_tfidf_unigram_256
item_name_category_tfidf_bigram_32
item_name_category_tfidf_bigram_256
item_name_category_frequent_32
item_name_category_frequent_256
shop_and_category
shop_id
item_id
item_category_id
item_name_category_tfidf_unigram_32
item_name_category_tfidf_unigram_256
item_name_category_tfidf_bigram_32
item_name_category_tfidf_bigram_256
item_name_category_frequent_32
item_name_category_frequent_256
shop_and_category
shop_id
item_id
item_category_id
item_name_category_tfidf_unigram_32
item_name_category_tfidf_unigram_256
item_name_category_tfidf_bigram_32
item_name_category_tfidf_bigram_256
item_name_category_frequent_32
item_name_category_frequent_256
sh

In [6]:
all_data.columns

Index(['index', 'Unnamed: 0_x', 'shop_id', 'item_id', 'date_block_num',
       'item_category_id', 'item_name_category_tfidf_unigram_32',
       'item_name_category_tfidf_unigram_256',
       'item_name_category_tfidf_bigram_32',
       'item_name_category_tfidf_bigram_256', 'item_name_category_frequent_32',
       'item_name_category_frequent_256', 'shop_and_category', 'Unnamed: 0_y',
       'target', 'item_category_id_enc_train', 'item_category_id_enc_trainval',
       'item_name_category_tfidf_unigram_32_enc_train',
       'item_name_category_tfidf_unigram_32_enc_trainval',
       'item_name_category_tfidf_unigram_256_enc_train',
       'item_name_category_tfidf_unigram_256_enc_trainval',
       'item_name_category_tfidf_bigram_32_enc_train',
       'item_name_category_tfidf_bigram_32_enc_trainval',
       'item_name_category_tfidf_bigram_256_enc_train',
       'item_name_category_tfidf_bigram_256_enc_trainval',
       'item_name_category_frequent_32_enc_train',
       'item_name_ca

In [7]:
encoded_cols = [c for c in all_data.columns.values if re.search('_enc', c)]
all_data[index_cols + encoded_cols].to_csv(DATA_FOLDER + '/mean_encoded.csv')

In [8]:
encoded_cols

['item_category_id_enc_train',
 'item_category_id_enc_trainval',
 'item_name_category_tfidf_unigram_32_enc_train',
 'item_name_category_tfidf_unigram_32_enc_trainval',
 'item_name_category_tfidf_unigram_256_enc_train',
 'item_name_category_tfidf_unigram_256_enc_trainval',
 'item_name_category_tfidf_bigram_32_enc_train',
 'item_name_category_tfidf_bigram_32_enc_trainval',
 'item_name_category_tfidf_bigram_256_enc_train',
 'item_name_category_tfidf_bigram_256_enc_trainval',
 'item_name_category_frequent_32_enc_train',
 'item_name_category_frequent_32_enc_trainval',
 'item_name_category_frequent_256_enc_train',
 'item_name_category_frequent_256_enc_trainval',
 'shop_and_category_enc_train',
 'shop_and_category_enc_trainval',
 'shop_id_enc_train',
 'shop_id_enc_trainval',
 'item_id_enc_train',
 'item_id_enc_trainval']