In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
from pandas.tseries.offsets import MonthEnd

In [2]:
def isConsecutive(seq):
    """Report whether the distinct values of ``seq`` form an unbroken run.

    The number of distinct values is compared with the width of the closed
    interval [min, max]; for integer-valued data the two agree only when no
    value in between is missing. Sequences that fail this check need to be
    re-mapped to consecutive numbers starting from 1.
    """
    distinct = pd.unique(seq)
    lowest, highest = distinct.min(), distinct.max()
    return len(distinct) == (highest - lowest + 1)

In [3]:
def isUniqBigger(seq1,seq2):
    """Return True when every distinct value of ``seq2`` also occurs in ``seq1``."""
    larger = set(seq1)
    smaller = set(seq2)
    return smaller.issubset(larger)

In [4]:
def reMapDF(df,cols):
    """Return a copy of ``df`` with ``cols`` re-encoded as consecutive integers.

    Each listed column gets its own mapping: distinct values are numbered
    1, 2, 3, ... in order of first appearance. Columns not listed in
    ``cols`` are left untouched, and the caller's frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    cols : iterable of column labels
        Columns to re-encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df``.
    """
    out = df.copy()
    for col in cols:
        uniq = pd.unique(out[col])
        codes = {item: i + 1 for i, item in enumerate(uniq)}
        # Series.map looks every cell up once in the dict. That keeps the
        # encoding well defined even when a new code equals one of the raw
        # values (e.g. remapping {2: 1, 1: 2}), and avoids the cost of
        # DataFrame.replace with a large nested dict.
        out[col] = out[col].map(codes)
    return out

In [5]:
def dimentionDF(df,cols):
    """Map each column in ``cols`` to the size of the set of its values in ``df``."""
    sizes = {}
    for name in cols:
        sizes[name] = len(set(df[name]))
    return sizes

In [6]:
def mergeFillCast(df1,df2,key):
    """Left-join ``df2`` onto ``df1``, zero-fill missing values, restore ``df2`` dtypes.

    Steps:
      1. Left merge on ``key``; clashing non-key columns coming from ``df2``
         receive the ``_y`` suffix.
      2. Replace every NaN in the merged frame with 0 (this covers the whole
         frame, not only the cells introduced by the join).
      3. Cast the columns named in ``df2`` back to the dtypes they have in
         ``df2``, since the NaN fill may have widened them.

    Returns the merged frame.
    """
    right_cols = df2.columns.values
    right_dtypes = dict(zip(right_cols, df2.dtypes.values))
    merged = pd.merge(df1, df2, how='left', on=key,
                      suffixes=('', '_y'), copy=True, indicator=False)
    merged = merged.fillna(0)
    merged[right_cols] = merged[right_cols].astype(right_dtypes)
    return merged

In [7]:
def mergeFillCastsss(df0,dfs,keys):
    """Apply ``mergeFillCast`` repeatedly, folding each frame of ``dfs`` into ``df0``.

    ``dfs`` and ``keys`` are walked in lockstep: the i-th frame is merged on
    the i-th key. Returns the accumulated frame (``df0`` itself when ``dfs``
    is empty).
    """
    merged = df0
    for right, on in zip(dfs, keys):
        merged = mergeFillCast(merged, right, on)
    return merged

In [8]:
# Per-column dtype overrides passed to read_csv when loading the train/test files.
types = dict(
    id='int32',
    item_nbr='int32',
    store_nbr='int8',
    unit_sales='float32',
    onpromotion=bool,
)

In [9]:
# Load the training data; usecols restricts the read to the five listed columns
# and 'date' is parsed into datetimes.
train = pd.read_csv(
    'train.csv',
    usecols=['date', 'item_nbr', 'store_nbr', 'unit_sales', 'onpromotion'],
    parse_dates=['date'],
    dtype=types,
    infer_datetime_format=True,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
# Encode missing values as 2, then store onpromotion as int8.
# axis=1 is dropped: it has no effect on the result of a scalar fill, and on
# some pandas versions it routes the call through a transpose of the whole
# frame, which is slow and can upcast mixed-dtype columns.
train = train.fillna(2)
train['onpromotion'] = train['onpromotion'].astype(np.int8)

In [11]:
# Preview the first rows of the training frame after the fill/cast step.
train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
0,2013-01-01,25,103665,7.0,2
1,2013-01-01,25,105574,1.0,2
2,2013-01-01,25,105575,2.0,2
3,2013-01-01,25,108079,1.0,2
4,2013-01-01,25,108701,1.0,2


In [12]:
# Load the test set with the same dtype overrides as train, then apply the same
# encoding: missing values become 2 and onpromotion is stored as int8.
test = pd.read_csv('test.csv',parse_dates=['date'],dtype=types, infer_datetime_format=True)
# axis=1 is dropped: it has no effect on a scalar fill and, on some pandas
# versions, forces a transpose of the whole frame.
test = test.fillna(2)
test['onpromotion'] = test['onpromotion'].astype(np.int8)

In [198]:
# Number of non-null dates per (store_nbr, item_nbr) pair in the test set.
a = test.groupby(['store_nbr','item_nbr'])['date'].count()

In [200]:
# How many (store, item) pairs have exactly 16 dated rows in the test set.
a.eq(16).sum()

210654

In [201]:
# Total number of (store, item) pairs in the test set, for comparison with the
# count of pairs that have exactly 16 rows.
a.shape

(210654,)

In [13]:
# Item and store metadata tables, read with pandas' default dtype inference.
items = pd.read_csv('items.csv')
stores = pd.read_csv('stores.csv')

Note: `items2` is keyed by the original `item_nbr`, so it must be built and merged before `item_nbr` is remapped to consecutive codes.

In [14]:
# Encode 'family' and 'class' as consecutive integer codes, then store the three
# item attribute columns as int16.
items2 = reMapDF(items, ['family', 'class'])
items2 = items2.astype({'family': 'int16', 'class': 'int16', 'perishable': 'int16'})

In [15]:
# Encode the categorical store attributes as integer codes and store every
# column of the result as int8.
stores2 = reMapDF(stores, ['city', 'state', 'type']).astype('int8')

In [16]:
# Hold out everything dated 2017-07-31 or later as the validation set.
val = train.loc[train['date'] >= '2017-07-31']

In [17]:
# Keep only rows dated before 2017-07-31 for training (the complement of `val`).
train = train.loc[train['date'] < '2017-07-31']

In [18]:
# Number the items seen in training 1..N in order of first appearance.
item_uniq = pd.unique(train.item_nbr)
item_dict = {item: code for code, item in enumerate(item_uniq, start=1)}


def iter_mapping(x):
    """Return the training code for item ``x``, or 0 for an item not seen in training."""
    return item_dict.get(x, 0)

In [171]:
# First date, last date and number of rows for every (store, item) pair.
# The aggregations are requested by name and the output columns are renamed
# explicitly: later cells read `amin`, `amax` and `count_nonzero`, and those
# labels were previously derived from the NumPy function names, which are not
# stable across NumPy/pandas versions.
SI_timeMinMax = (
    train.groupby(['store_nbr', 'item_nbr'])['date']
    .agg(['min', 'max', 'size'])
    .rename(columns={'min': 'amin', 'max': 'amax', 'size': 'count_nonzero'})
    .reset_index()
)

In [172]:
# Collapse each (store, item) pair to one row whose 'date' and 'unit_sales'
# cells hold tuples of that pair's values.
SI_train_sales = (
    train.groupby(['store_nbr', 'item_nbr'])[['date', 'unit_sales']]
    .agg(lambda values: tuple(values))
    .reset_index()
)

In [176]:
# Frames to fold into the per-pair summary, and the join key for each one
# (same position in both lists).
dfs = [
    items2,
    stores2,
    SI_train_sales,
]
keys = [
    'item_nbr',
    'store_nbr',
    ['item_nbr', 'store_nbr'],
]

In [177]:
# Attach item attributes, store attributes and the per-pair sales tuples to the
# per-pair date summary.
SI_train = mergeFillCastsss(df0=SI_timeMinMax, dfs=dfs, keys=keys)

In [182]:
# Replace raw item numbers with their training codes (0 for unknown items).
SI_train['item_nbr'] = SI_train['item_nbr'].map(iter_mapping)

In [184]:
# Order pairs from fewest to most training rows.
SI_train = SI_train.sort_values('count_nonzero')

In [195]:
# After the ascending sort on count_nonzero, the head shows the pairs with the
# fewest training rows.
SI_train.head()

Unnamed: 0,store_nbr,item_nbr,amin,amax,count_nonzero,family,class,perishable,city,state,type,cluster,date,unit_sales
174139,54,3821,2016-11-27,2016-11-27,1,18,140,0,22,16,3,3,"(2016-11-27 00:00:00,)","(8.0,)"
107355,35,1074,2014-12-14,2014-12-14,1,9,33,0,15,9,3,3,"(2014-12-14 00:00:00,)","(4.0,)"
107964,35,2955,2016-11-30,2016-11-30,1,1,27,0,15,9,3,3,"(2016-11-30 00:00:00,)","(72.0,)"
108214,35,2113,2016-09-23,2016-09-23,1,22,259,1,15,9,3,3,"(2016-09-23 00:00:00,)","(0.621,)"
108224,35,2122,2014-07-25,2014-07-25,1,22,259,1,15,9,3,3,"(2014-07-25 00:00:00,)","(1.425,)"


In [194]:
# Number of pairs whose first and last training dates are less than 16 days apart.
(SI_train['amax'] - SI_train['amin']).lt(pd.Timedelta(days=16)).sum()

2372

In [154]:
# Small sample (first 10 pairs) for prototyping the tuple expansion below.
# Note: this rebinds `a`, which earlier held the per-pair test counts.
a = SI_train_sales.iloc[:10]

In [163]:
# Expand the per-pair tuples back into long form: one row per (pair, position),
# with the date in column 0 and the unit sales in column 1.
# `axis` is passed by keyword; recent pandas versions no longer accept it
# positionally in pd.concat.
pd.concat(
    [
        a.apply(lambda x: pd.Series(x.date), axis=1).stack(),
        a.apply(lambda x: pd.Series(x.unit_sales), axis=1).stack(),
    ],
    axis=1,
)

Unnamed: 0,Unnamed: 1,0,1
0,0,2013-01-10,1.0
0,1,2013-01-11,1.0
0,2,2013-01-14,1.0
0,3,2013-01-18,2.0
0,4,2013-01-21,1.0
0,5,2013-01-25,1.0
1,0,2013-01-04,2.0
1,1,2013-01-05,3.0
1,2,2013-01-07,2.0
1,3,2013-01-08,6.0


In [168]:
# Preview the per-pair summary: first date, last date and row count.
SI_timeMinMax.head()

Unnamed: 0,store_nbr,item_nbr,amin,amax,count_nonzero
0,1,96995,2013-01-10,2017-07-22,181
1,1,99197,2014-08-20,2017-07-28,182
2,1,103520,2013-01-04,2017-07-29,1109
3,1,103665,2013-01-02,2017-07-30,1345
4,1,105574,2013-01-02,2017-07-30,1532


In [170]:
# For the first 1000 pairs, generate the 16 consecutive days starting at each
# pair's first sale date, stacked into one long Series.
(
    SI_timeMinMax.head(1000)
    .apply(lambda row: pd.Series(pd.date_range(row.amin, periods=16)), axis=1)
    .stack()
)

0    0    2013-01-10
     1    2013-01-11
     2    2013-01-12
     3    2013-01-13
     4    2013-01-14
     5    2013-01-15
     6    2013-01-16
     7    2013-01-17
     8    2013-01-18
     9    2013-01-19
     10   2013-01-20
     11   2013-01-21
     12   2013-01-22
     13   2013-01-23
     14   2013-01-24
     15   2013-01-25
1    0    2014-08-20
     1    2014-08-21
     2    2014-08-22
     3    2014-08-23
     4    2014-08-24
     5    2014-08-25
     6    2014-08-26
     7    2014-08-27
     8    2014-08-28
     9    2014-08-29
     10   2014-08-30
     11   2014-08-31
     12   2014-09-01
     13   2014-09-02
             ...    
998  2    2013-01-05
     3    2013-01-06
     4    2013-01-07
     5    2013-01-08
     6    2013-01-09
     7    2013-01-10
     8    2013-01-11
     9    2013-01-12
     10   2013-01-13
     11   2013-01-14
     12   2013-01-15
     13   2013-01-16
     14   2013-01-17
     15   2013-01-18
999  0    2013-01-02
     1    2013-01-03
     2    201

**Data Processing**

In [12]:
# Holiday/event calendar, with 'date' parsed into datetimes.
holidays_events = pd.read_csv(
    'holidays_events.csv',
    parse_dates=['date'],
    infer_datetime_format=True,
)

In [13]:
# Drop the free-text description and encode the categorical holiday columns as
# integer codes. `axis` is passed by keyword; recent pandas versions no longer
# accept it positionally in DataFrame.drop.
holidays_events2 = reMapDF(holidays_events.drop('description', axis=1),
                           ['type', 'locale', 'locale_name'])

In [14]:
# Store the encoded holiday columns and the 'transferred' flag as int8.
holidays_events2 = holidays_events2.astype(
    dict.fromkeys(['type', 'locale', 'locale_name', 'transferred'], 'int8')
)

In [15]:
# Item metadata table (the same file is also read earlier in this notebook).
items = pd.read_csv('items.csv')

In [16]:
# Encode 'family' and 'class' as consecutive integer codes starting from 1.
items2 = reMapDF(items,['family','class'])

In [17]:
# Store the item attribute columns as int16, matching the earlier items2 cell.
# int8 is too narrow here: the SI_train preview shows encoded 'class' values of
# 140 and 259, which exceed the int8 maximum of 127.
items2[['family','class','perishable']] = \
        items2[['family','class','perishable']].astype('int16')

In [18]:
# Store metadata table (the same file is also read earlier in this notebook).
stores = pd.read_csv('stores.csv')

In [21]:
# One row per calendar day from 2013-01-01 through 2017-08-31; the date
# features below are added as extra columns.
dateVar = pd.DataFrame({'date': pd.date_range('2013-01-01', '2017-08-31')})

In [22]:
# Day of week as an integer (pandas convention: Monday=0 ... Sunday=6).
dateVar['dayOfWeek'] = dateVar['date'].dt.dayofweek

In [23]:
# 1 on the 15th and on the last day of each month, 0 otherwise.
dateVar['payDay'] = (
    dateVar['date'].dt.is_month_end | dateVar['date'].dt.day.eq(15)
) * 1

In [24]:
# Calendar month number (1-12).
dateVar['month'] = dateVar['date'].dt.month

In [25]:
# Elapsed time since 2013-01-01, in months (fractional; a later cell casts it
# to int8).
# - pd.Timestamp replaces the pd.datetime alias, which recent pandas versions
#   no longer provide.
# - The divisor is a fixed mean Gregorian month (365.2425 days / 12 =
#   2,629,746 s) instead of np.timedelta64(1, 'M'): the 'M' unit has no fixed
#   length and recent pandas versions reject it in timedelta arithmetic.
#   NOTE(review): older pandas appears to have used this same mean length for
#   'M' -- confirm if exact parity with earlier runs matters.
dateVar['monthSinceT0'] = (dateVar['date'] - pd.Timestamp(2013, 1, 1)) / pd.Timedelta(seconds=2629746)

In [26]:
# Flag the days after 2016-04-16 up to and including 2016-04-24.
dateVar['earthquake'] = dateVar['date'].gt('2016-04-16') & dateVar['date'].le('2016-04-24')

In [27]:
# List the date-feature columns built so far.
dateVar.columns.values

array(['date', 'dayOfWeek', 'payDay', 'month', 'monthSinceT0', 'earthquake'], dtype=object)

In [28]:
# Store every derived date feature as int8; fractional monthSinceT0 values are
# truncated to whole numbers by the cast.
dateVar = dateVar.astype(
    dict.fromkeys(['dayOfWeek', 'payDay', 'month', 'monthSinceT0', 'earthquake'], 'int8')
)

In [29]:
# Oil price table, with 'date' parsed into datetimes.
oil = pd.read_csv(
    'oil.csv',
    parse_dates=['date'],
    infer_datetime_format=True,
)

In [30]:
# Attach the oil table to every calendar day (left join on the shared columns),
# then back-fill: each missing cell takes the next available value below it.
# .bfill() replaces fillna(method='bfill'), which is deprecated in recent
# pandas; `how` is passed by keyword for clarity.
# NOTE(review): back-fill cannot fill gaps at the very end of the frame --
# check for trailing NaNs if the oil data stops before 2017-08-31.
dateVar = pd.merge(dateVar, oil, how='left').bfill()