In [4]:
import numpy as np
import pandas as pd
import cPickle
from functions import labelPermutation

In [5]:
# Load the pickled `dateVar_list`.  Later cells iterate over it and use each
# element as a DataFrame with a `date` column.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so only
# load a dateVar.pickle that was produced by a trusted run of this project.
with open(r"dateVar.pickle", "rb") as input_file:
    dateVar_list = cPickle.load(input_file)

In [6]:
# Explicit column dtypes, shared by the train.csv and test.csv reads.
types = {'id': 'int32',
     'item_nbr': 'int32',
     'store_nbr': 'int8',
     'unit_sales': 'float32',
     'onpromotion': bool}
# Only the five listed columns are read; `date` is parsed into datetimes.
train = pd.read_csv('train.csv',usecols=['date','item_nbr','store_nbr','unit_sales','onpromotion'],\
                    parse_dates=['date'],dtype=types, infer_datetime_format=True)
# Every missing value is replaced by the sentinel 2 before the int8 cast below.
# NOTE(review): presumably the only column with gaps is `onpromotion`, making
# 2 mean "promotion status unknown" next to 0/1 -- confirm against the data.
train = train.fillna(2,axis=1)
train.onpromotion = train.onpromotion.astype(np.int8)
train.loc[train.unit_sales<0,'unit_sales'] = .0 # clip negative sales to zero

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Read every column of test.csv with the dtypes declared in `types`, then apply
# the same missing-value treatment as for `train`: fill gaps with the sentinel 2
# and store `onpromotion` as int8.
test = pd.read_csv('test.csv',parse_dates=['date'],dtype=types, infer_datetime_format=True)
test = test.fillna(2,axis=1)
test.onpromotion = test.onpromotion.astype(np.int8)

In [8]:
# Relabel the five test columns by position so that `test` exposes the same
# column names as `train`; whatever is in the first column becomes 'unit_sales'.
# NOTE(review): presumably the first column of test.csv is the row `id`, so the
# "sales" of the test period would actually carry ids -- confirm the column
# order of test.csv, since this assignment silently depends on it.
test.columns = [u'unit_sales', u'date', u'store_nbr', u'item_nbr', u'onpromotion']

In [9]:
# Store 52 is removed from both frames because it is a new store.
# Check used to spot it: train.groupby('store_nbr')['date'].min().max()
train = train.loc[train.store_nbr != 52] # new store
test = test.loc[test.store_nbr != 52] # new store

In [7]:
# train.groupby('store_nbr')['date'].min().max()
# --> Timestamp('2015-10-09 00:00:00')

*** Store-, item- and time-dependent (S, I, T) variables, i.e. sales and promotions ***

In [10]:
# Keep only rows dated strictly after 2016-07-14; earlier history is discarded.
train = train.loc[train.date >'2016-07-14']

In [11]:
# Pivot to wide form: one row per (store_nbr, item_nbr) and, for each of
# `unit_sales` and `onpromotion`, one column per date (the innermost index
# level is the one moved into the columns).  Combinations with no row for a
# given date come out as NaN.
train = train.set_index(["store_nbr", "item_nbr", "date"])\
                [["unit_sales","onpromotion"]].unstack(level=-1)

In [12]:
# Same wide pivot as for `train`: rows are (store_nbr, item_nbr) pairs, columns
# are (variable, date) with variable in {unit_sales, onpromotion}.
test = test.set_index(["store_nbr", "item_nbr", "date"])\
                [["unit_sales","onpromotion"]].unstack(level=-1)

In [13]:
# a = train.columns.get_level_values(1)
# assert train.shape[1] == (a.max()- a.min()).days # make sure no gap

In [14]:
# 0-based position of the last training date.  `train` holds two column groups
# (unit_sales, onpromotion) with one column per date each, hence the halving.
# Floor division keeps t0 an int on Python 3 as well; true division would give
# a float there, which cannot be used in the slice arithmetic of later cells.
t0 = train.shape[1]//2 - 1

In [15]:
# Inner join on the (store_nbr, item_nbr) index: only pairs present in both the
# test and the filtered training frame are kept.  `test` is the left frame, so
# its columns precede those of `train` within the result.
test2 = test.merge(train,'inner',left_index=True,right_index=True)

In [16]:
# Sales matrix: one row per (store, item) pair, one column per date, gaps
# treated as zero sales.
sales = test2.loc[:,'unit_sales'].fillna(0)
# Move the first 16 columns to the end so the matrix runs in time order.
# NOTE(review): 16 is presumably the number of test dates, which sit first
# because `test` was the left frame of the merge -- confirm.
sales = np.concatenate([sales.iloc[:,16:].values,sales.iloc[:,:16].values],1)

In [17]:
# Promotion matrix with the same layout as `sales`.  Gaps left by the pivot are
# filled with 0.5; cells already holding the sentinel 2 from the load step are
# not NaN and therefore stay 2.
promo = test2.loc[:,'onpromotion'].fillna(0.5)
# Same reordering as for `sales`: the first 16 columns go to the end.
promo = np.concatenate([promo.iloc[:,16:].values,promo.iloc[:,:16].values],1)

*** Non-time-dependent variables, i.e. store- and item-related features ***

In [18]:
# Static metadata tables, read with pandas' default dtypes.  Later cells join
# them on `item_nbr` and `store_nbr` respectively.
items = pd.read_csv('items.csv')
stores = pd.read_csv('stores.csv')

In [19]:
# Encode the categorical store / item columns with the project helper
# `labelPermutation`.
# NOTE(review): the helper is not visible here; presumably the third argument
# gives one size per listed column -- confirm in functions.py.
# `state` is dropped from the store table first.  The axis is passed by keyword
# because pandas >= 2.0 no longer accepts it positionally in DataFrame.drop.
stores2 = labelPermutation(stores.drop('state',axis=1),['city','type','cluster'],[10,4,8])
items2 = labelPermutation(items,['family','class'],[5,20])

In [20]:
# Static feature matrix `X`: for every row of `test2` (one per store/item pair)
# the encoded store columns followed by the encoded item columns, as float32.
# The key columns are rebuilt as plain single-level frames before merging
# because `test2` carries two-level (variable, date) column labels.  A left
# merge keeps the rows in `test2` order, so `X` lines up with `sales`/`promo`.
# The join keys are dropped afterwards; the axis is given by keyword because
# pandas >= 2.0 rejects a positional axis in DataFrame.drop.
row_keys = test2.reset_index()
store_feats = pd.merge(pd.DataFrame(row_keys[['store_nbr']].values,columns=['store_nbr']),
                       stores2,'left','store_nbr').drop('store_nbr',axis=1).values
item_feats = pd.merge(pd.DataFrame(row_keys[['item_nbr']].values,columns=['item_nbr']),
                      items2,'left','item_nbr').drop('item_nbr',axis=1).values
X = np.concatenate([store_feats,item_feats],1).astype(np.float32)

*** time dependent variables ***

In [19]:
# First and last date present in the column labels of `test2` (level 1 of the
# column index holds the dates); used below to trim the date features.
maxDate = test2.columns.get_level_values(1).max()
minDate = test2.columns.get_level_values(1).min()

In [75]:
# 2016-12-25 is not in training dataset
dateVar_list = [labelPermutation(dateVar.loc[(dateVar.date>=minDate) & \
                                             (dateVar.date<=maxDate) & (dateVar.date!='2016-12-25')]\
                       .drop(['date','locale_name'],1),
                        ['type','locale'],[4,2]).values.T\
                for dateVar in dateVar_list]

*** Create training, validation & test datasets ***

In [103]:
def _creatX(t0):
    """Build the feature matrix for the forecast origin at column ``t0``.

    Reads the notebook-level arrays ``sales``, ``promo``, ``X`` and
    ``dateVar_list``.  The returned 2-D array has one row per row of ``sales``
    and these column groups, in order:

    * 9 sales features: trailing means over 7/14/30/60/90/120 columns, the
      30-vs-previous-30 and 7-vs-previous-7 mean differences, and the value at
      ``t0`` itself;
    * 7 promo features: trailing means over 7/14/30/60/90 columns and the same
      two differences;
    * the 17 promo columns ``t0 .. t0+16``;
    * all columns of ``X``;
    * the flattened ``[:, t0:t0+17]`` window of one randomly chosen entry of
      ``dateVar_list``, repeated on every row.

    Parameters
    ----------
    t0 : int
        Column index of the forecast origin.  The windows reach back 120
        columns; with ``t0 < 120`` the negative slice starts wrap around
        instead of raising, so callers are expected to pass ``t0 >= 120``.

    Returns
    -------
    numpy.ndarray
        Feature matrix with ``sales.shape[0]`` rows.
    """
    # t0+1 is the first prediction date
    n = sales.shape[0]

    def window_mean(arr, start_back, stop_back=0):
        # Row-wise mean of the columns [t0-start_back, t0-stop_back).
        return arr[:, t0-start_back:t0-stop_back].mean(1)

    # Each trailing mean is computed once and reused for the difference features.
    s7, s14, s30, s60, s90, s120 = [window_mean(sales, d) for d in (7, 14, 30, 60, 90, 120)]
    sales_feats = np.stack(
        [s7, s14, s30, s60, s90, s120,
         s30 - window_mean(sales, 60, 30),
         s7 - window_mean(sales, 14, 7),
         sales[:, t0]], 1)

    p7, p14, p30, p60, p90 = [window_mean(promo, d) for d in (7, 14, 30, 60, 90)]
    promo_feats = np.stack(
        [p7, p14, p30, p60, p90,
         p30 - window_mean(promo, 60, 30),
         p7 - window_mean(promo, 14, 7)], 1)

    # One of the first ten entries of dateVar_list is picked at random on every
    # call (unseeded, so repeated calls can differ).
    # NOTE(review): presumably the list holds exactly ten alternative encodings.
    date_feats = dateVar_list[np.random.randint(0,10)][:, t0:t0+17].flatten()

    # The broadcast width follows the real length of the date vector rather
    # than a hard-coded 204, so any number of date features per day works.
    return np.concatenate(
        [sales_feats,
         promo_feats,
         promo[:, t0:t0+17],
         X,
         np.broadcast_to(date_feats, (n, date_feats.size))], 1)

In [127]:
def CreateGBMTrain(timePoints=range(0,255,16),startT=t0-32):
    """Stack training samples taken at several forecast origins.

    For every offset ``t`` in ``timePoints`` the origin ``startT - t`` yields
    one feature block from ``_creatX`` and one target block made of the 16
    ``sales`` columns that follow the origin.  The blocks are concatenated
    row-wise in the order of ``timePoints``.

    Note that both defaults are evaluated once, when this cell is run; the
    default ``startT`` therefore uses the value ``t0`` had at that moment.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Features and targets with matching row order.
    """
    origins = [startT - offset for offset in timePoints]
    feature_blocks = [_creatX(origin) for origin in origins]
    target_blocks = [sales[:, origin+1:origin+17] for origin in origins]
    return np.concatenate(feature_blocks, 0), np.concatenate(target_blocks, 0)

In [104]:
def CreateGBMTest(startT):
    """Return ``(features, targets)`` for the single forecast origin ``startT``.

    Features come from ``_creatX(startT)``; targets are the 16 columns of
    ``sales`` immediately after the origin, i.e. ``startT+1 .. startT+16``.
    """
    features = _creatX(startT)
    first_col, stop_col = startT + 1, startT + 17
    targets = sales[:, first_col:stop_col]
    return features, targets

In [105]:
# Test set: forecast origin at t0, so the targets are the 16 columns after it.
# Xt/Yt are reused as scratch names for the validation and training sets below.
Xt,Yt = CreateGBMTest(t0)

In [108]:
# Write the current Xt/Yt as comma-separated text with "%f" formatting
# (six decimals) to the extension-less files test_GBM_Xt and test_GBM_Yt.
prefix = 'test_GBM'
np.savetxt(prefix+'_Xt',Xt,fmt="%f",delimiter=",") 
np.savetxt(prefix+'_Yt',Yt,fmt="%f",delimiter=",") 

In [110]:
# Validation set: the forecast origin is moved 16 columns before t0, so its
# 16 target columns end at t0.  Overwrites the Xt/Yt of the test set.
Xt,Yt = CreateGBMTest(t0-16)

In [112]:
# Write the current Xt/Yt as comma-separated text with "%f" formatting
# (six decimals) to the extension-less files val_GBM_Xt and val_GBM_Yt.
prefix = 'val_GBM'
np.savetxt(prefix+'_Xt',Xt,fmt="%f",delimiter=",") 
np.savetxt(prefix+'_Yt',Yt,fmt="%f",delimiter=",") 

In [128]:
# Training set built with the defaults of CreateGBMTrain (origins t0-32,
# t0-48, ... stepping back 16 columns at a time).  Overwrites Xt/Yt again.
Xt,Yt = CreateGBMTrain()

In [133]:
# Write the current Xt/Yt as comma-separated text with "%f" formatting
# (six decimals) to the extension-less files train_GBM_Xt and train_GBM_Yt.
prefix = 'train_GBM'
np.savetxt(prefix+'_Xt',Xt,fmt="%f",delimiter=",") 
np.savetxt(prefix+'_Yt',Yt,fmt="%f",delimiter=",") 