In [1]:
from mapreduce import FilteringMapReduce, map_reduce_df
import pandas as pd
from utils import get_date_index, get_days_in_range
from metric import NWRMSLE_log, get_weights
import numpy as np

In [2]:
# fill with median from previous two weeks
days = get_days_in_range('2017-06-16', '2017-08-15')
cols = ['id', 'date', 'store_nbr', 'item_nbr', 'unit_sales', 'onpromotion']
types = {'id': 'int32',
         'date': 'int16',
         'item_nbr': 'int32',
         'store_nbr': 'int16',
         'unit_sales': 'float32',
         'onpromotion': bool}
mapreduce = FilteringMapReduce(lambda df: df[df.date.isin(days)])
train = map_reduce_df('./data/train_processed.csv', mapreduce, types=types, position=110000000, cols=cols, verbose=True)

Reading batch from position 110000000, batch size 10000000...
Filtering 10000000...
Filtered 930790, mapping...
Mapped, reducing...
Batch done.
Reading batch from position 120000000, batch size 10000000...
Filtering 5497041...
Filtered 5497041, mapping...
Mapped, reducing...
Batch done.
End of dataset is found.


In [3]:
# filling empty dates
train.ix[train.unit_sales < 0, 'unit_sales'] = 0
u_dates = train.date.unique()
u_stores = train.store_nbr.unique()
u_items = train.item_nbr.unique()
train.set_index(["date", "store_nbr", "item_nbr"], inplace=True)
train = train.reindex(
    pd.MultiIndex.from_product(
        (u_dates, u_stores, u_items),
        names=["date", "store_nbr", "item_nbr"]
    )
)
train.loc[:, "unit_sales"].fillna(0, inplace=True)
train.loc[:, "onpromotion"].fillna(False, inplace=True)
train.reset_index(inplace=True)

In [4]:
# log
train['unit_sales'] = np.log1p(train['unit_sales'])

In [5]:
days_train1 = get_days_in_range('2017-06-16', '2017-06-30')
days_train2 = get_days_in_range('2017-07-01', '2017-07-15')
days_train3 = get_days_in_range('2017-07-16', '2017-07-31')
days_test = get_days_in_range('2017-08-01', '2017-08-15')

mean_train1 = train[train.date.isin(days_train1)]
mean_train2 = train[train.date.isin(days_train2)]
mean_train3 = train[train.date.isin(days_train3)]
mean_test = train[train.date.isin(days_test)]
del train

In [8]:
def ev(train, test):
    train = train.groupby(['item_nbr','store_nbr', 'onpromotion'], as_index=False).agg({'unit_sales':'mean'})
    train.rename(columns={'unit_sales':'unit_sales_mean'}, inplace=True)  
    test = test.merge(train, on=['item_nbr','store_nbr', 'onpromotion'], how='left')
    test.fillna(0.0, inplace=True)
    y_true = test['unit_sales']
    y_pred = test['unit_sales_mean']
    weights = get_weights(test['item_nbr'])
    error = NWRMSLE_log(y_true, y_pred, weights)
    return error

errors = []
errors.append(ev(mean_train1, mean_train2))
errors.append(ev(mean_train2, mean_train3))
errors.append(ev(mean_train3, mean_test))
np.mean(errors)

0.61607487413109474

In [25]:
metric = np.mean(errors)
!telegram-send "Validation finished $metric"

In [10]:
errors

[0.6312275316337234, 0.6048062287676395, 0.6121908619919213]

In [12]:
train = mean_test.groupby(by=['item_nbr','store_nbr', 'onpromotion'])['unit_sales'].mean().to_frame('unit_sales')
train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unit_sales
item_nbr,store_nbr,onpromotion,Unnamed: 3_level_1
96995,1,False,0.312142
96995,2,False,0.0
96995,3,False,0.3505
96995,4,False,0.138629
96995,5,False,0.09242


In [17]:
train['unit_sales'] = np.expm1(train['unit_sales'])
train.reset_index(inplace=True)

test = pd.read_csv('./data/test_processed.csv', dtype=types)
test = test.merge(train, on=['item_nbr', 'store_nbr', 'onpromotion'], how='left')
test.fillna(0.0, inplace=True)
test.ix[test.unit_sales < 0, 'unit_sales'] = 0

  if __name__ == '__main__':


In [18]:
#0.673
test[['id', 'unit_sales']].to_csv('./submissions/previous_week_log_mean_0.61.csv.gz', index=False, compression="gzip")

In [19]:
test.head()

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion,index,unit_sales
0,125497040,1688,1,96995,False,0.0,0.442458
1,125497041,1688,1,99197,False,54.0,0.236777
2,125497042,1688,1,103501,False,108.0,0.0
3,125497043,1688,1,103520,False,162.0,1.395056
4,125497044,1688,1,103665,False,216.0,5.788029


In [20]:
!telegram-send "Submission done"