In [10]:
import pandas as pd
import numpy as np
from itertools import product
import os
import pickle


import matplotlib.pyplot as plt
%matplotlib inline 

In [3]:
# WIP: creating mean encodings as features

In [4]:
transactions = pd.read_csv('sales_train.csv.gz')
items = pd.read_csv('items.csv')
item_categories = pd.read_csv('item_categories.csv')
shops = pd.read_csv('shops.csv')

print(transactions.shape)
print(items.shape)
print(item_categories.shape)
print(shops.shape)
transactions.head(3)

(2935849, 6)
(22170, 3)
(84, 2)
(60, 2)


Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0


In [5]:
transactions[['Day','Month','Year']]=transactions.date.str.split('.',expand=True)
train = transactions.copy()
del train['date']

In [6]:
train.head(5)

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day,Day,Month,Year
0,0,59,22154,999.0,1.0,2,1,2013
1,0,25,2552,899.0,1.0,3,1,2013
2,0,25,2552,899.0,-1.0,5,1,2013
3,0,25,2554,1709.05,1.0,6,1,2013
4,0,25,2555,1099.0,1.0,15,1,2013


In [30]:
def create_grid(sales):
    index_cols = ['shop_id', 'item_id', 'date_block_num']

    # For every month we create a grid from all shops/items combinations from that month
    grid = [] 
    for block_num in sales['date_block_num'].unique():
        cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
        cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
        grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

    # Turn the grid into pandas dataframe
    grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

    # Get aggregated values for (shop_id, item_id, month)
    gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})

    # Fix column names
    gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
    # Join aggregated data to the grid
    all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)
    # Sort the data
    all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)
    return all_data

In [38]:
path = "transactions_all_data.csv"

if os.path.isfile(path):
    data = pd.read_csv(path)
else:
    data = create_grid(transactions)
    data.to_csv(path, index=False)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [39]:
data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target
139255,0,19,0,0.0
141495,0,27,0,0.0
144968,0,28,0,0.0
142661,0,29,0,0.0
138947,0,32,0,6.0


In [41]:
data.date_block_num.unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33])

Set data blocks 31,32,33 as validation sets

In [45]:
train_df = data[data.date_block_num < 31]
valid_df = data[data.date_block_num >= 31]

In [47]:
print("train_data blocks: {}".format(train_df.date_block_num.unique()))
print("test_data blocks: {}".format(valid_df.date_block_num.unique()))

train_data blocks: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30]
test_data blocks: [31 32 33]


In [48]:
# Encode based on train data
target_mean = train_df.groupby('item_id').target.mean()
target_mean.head()

item_id
0    0.020000
1    0.023810
2    0.019802
3    0.019802
4    0.020000
Name: target, dtype: float64

In [54]:
# map encodings to train and valid (Takes time to complete)
train_df.loc['item_id_'+ 'mean_target'] =  train_df['item_id'].map(target_mean)
valid_df.loc['item_id_'+ 'mean_target'] =  valid_df['item_id'].map(target_mean)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [58]:
train_df.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,item_id_mean_target
139255,0.0,19.0,0.0,0.0,0.022222
141495,0.0,27.0,0.0,0.0,0.056834
144968,0.0,28.0,0.0,0.0,0.141176
142661,0.0,29.0,0.0,0.0,0.037383
138947,0.0,32.0,0.0,6.0,1.387097


In [57]:
valid_df.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,item_id_mean_target
10412237,2.0,30.0,31.0,0.0,1.47238
10411838,2.0,31.0,31.0,0.0,0.959632
10412190,2.0,32.0,31.0,1.0,1.387097
10411864,2.0,33.0,31.0,0.0,0.54221
10411865,2.0,38.0,31.0,0.0,0.098958
