In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
from datetime import date, timedelta
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

### Unit sales and promotion

In [2]:
def _log1p_positive(raw_units):
    """Parse one raw ``unit_sales`` field: log1p(value) when positive, otherwise 0."""
    units = float(raw_units)
    return np.log1p(units) if units > 0 else 0


# Training data: five columns picked by position, `date` parsed to datetimes
# and `unit_sales` transformed by the converter while the file is read.
df_train = pd.read_csv(
    '../input/train.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log1p_positive},
    parse_dates=["date"],
    # comment the following line for full dataset processing
    # (the range starts at 1, so the header row 0 is kept)
    skiprows=range(1, 101688780),  # 2017-01-01 sharp = 101688780! (115688780 for test)
)

# Test data, keyed by (store_nbr, item_nbr, date) so it can be unstacked by date later.
df_test = pd.read_csv(
    "../input/test.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],  # , date_parser=parser
).set_index(['store_nbr', 'item_nbr', 'date'])

In [3]:
# Keep only the 7 * 32 = 224 consecutive days starting at 2017-01-04.
train_window = pd.date_range("2017-01-04", periods=7 * 32)
df_train = df_train[df_train.date.isin(train_window)]

# Pivot to a wide table: one row per (store_nbr, item_nbr), one column per date.
# Missing (store, item, date) combinations are filled with 0.
df_unit_sales = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# The outer column level only holds the name "unit_sales"; keep just the dates.
df_unit_sales.columns = df_unit_sales.columns.get_level_values(1)

In [4]:
# Promotion flags for the training window, pivoted to one row per
# (store_nbr, item_nbr) and one column per date; gaps count as "not on promotion".
df_promo_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
df_promo_train.columns = df_promo_train.columns.get_level_values(1)

# Same pivot for the test period, then re-aligned to the training rows.
# Rows present in training but absent from the test file become all-False.
df_promo_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
df_promo_test.columns = df_promo_test.columns.get_level_values(1)
df_promo_test = df_promo_test.reindex(df_promo_train.index).fillna(False)

# Training dates followed by test dates, side by side.
df_promo = pd.concat([df_promo_train, df_promo_test], axis=1)

# Release the large intermediates; only the wide tables are used from here on.
del df_promo_train, df_promo_test, df_train

### Item information

In [5]:
# Repeat the item attributes once per row of df_unit_sales (same order), so
# the columns can later be attached positionally.
sample_item_nbrs = df_unit_sales.index.get_level_values(1)
df_items = pd.read_csv("../input/items.csv", index_col="item_nbr").reindex(sample_item_nbrs)
df_items.head()

Unnamed: 0_level_0,family,class,perishable
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,GROCERY I,1093,0
99197,GROCERY I,1067,0
103520,GROCERY I,1028,0
103665,BREAD/BAKERY,2712,1
105574,GROCERY I,1045,0


Features to use: `class` and `perishable`. `class` is already numerically coded, which should work well in tree models. For other models, binary encoding can be used instead.

### Store information

In [6]:
# One row per store, keyed by store number; keep an untouched copy as well.
df_stores = pd.read_csv("../input/stores.csv", index_col="store_nbr")
df_stores_original = df_stores.copy()

# Repeat the store attributes once per row of df_unit_sales (same order).
sample_store_nbrs = df_unit_sales.index.get_level_values(0)
df_stores_expanded = df_stores.reindex(sample_store_nbrs)
df_stores_expanded.head()

Unnamed: 0_level_0,city,state,type,cluster
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Quito,Pichincha,D,13
1,Quito,Pichincha,D,13
1,Quito,Pichincha,D,13
1,Quito,Pichincha,D,13
1,Quito,Pichincha,D,13


We use `type` and `cluster`. `type` is categorical, so it is numerically coded below.

In [7]:
# Replace the store `type` labels with integer codes; the fitted encoder is
# kept so the mapping can be inspected or inverted later.
encoder = LabelEncoder()
df_stores_expanded['type'] = encoder.fit_transform(df_stores_expanded['type'])

### Oil price by date

In [8]:
# Oil price as a Series indexed by date. Missing quotes are filled with the
# next available value (backward fill); a missing value at the very end of the
# file would stay NaN. `.bfill()` replaces the deprecated
# `fillna(method='backfill')` spelling and produces the same result.
df_oil = (
    pd.read_csv("../input/oil.csv", parse_dates=['date'])
    .set_index('date')['dcoilwtico']
    .bfill()
)

### Holiday information

In [9]:
# Parse `date` while reading, as the other load cells do. The previous
# unit-less `astype(np.datetime64)` cast emitted a warning and is not accepted
# by recent pandas releases; `parse_dates` yields a datetime64[ns] column.
df_holidays = pd.read_csv("../input/holidays_events.csv", parse_dates=['date'])
df_holidays.head()

  return self.apply('astype', dtype=dtype, **kwargs)


Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [10]:
# Holiday indicator table with the same rows and date columns as df_promo,
# initialised to 0.0 everywhere. `df_promo.shape` is used instead of
# `df_promo.values.shape` so the full value array is not built just to read
# its dimensions.
df_holidays_expanded = pd.DataFrame(
    np.zeros(df_promo.shape),
    index=df_promo.index.copy(),
    columns=df_promo.columns.copy(),
)

Populate the holiday table: for each national holiday, set the matching date column to 1 for every (store, item) row.

In [11]:
# Walk the holiday calendar once and flag national holidays in
# df_holidays_expanded. The column dates do not change inside the loop, so
# they are looked up a single time.
holiday_dates = df_holidays_expanded.columns.get_level_values(0)

for entry in df_holidays.itertuples():
    if entry.transferred == True or entry.type == 'Work Day':
        # Transferred entries and 'Work Day' entries are reported as non-holidays.
        print(entry.date, "is not a holiday")
    elif entry.locale == 'National':
        # A national holiday applies to every row: set the whole date column.
        # Dates outside the table's columns match nothing and leave it unchanged.
        print(entry.date, "is a National holiday")
        df_holidays_expanded.iloc[:, holiday_dates == entry.date] = 1
    else:
        # Regional and Local entries are not applied. An earlier, disabled
        # version matched `locale_name` against the store `state` (Regional)
        # or `city` (Local) and flagged only those stores' rows.
        print(entry.date, "skipped.")

2012-03-02 00:00:00 skipped.
2012-04-01 00:00:00 skipped.
2012-04-12 00:00:00 skipped.
2012-04-14 00:00:00 skipped.
2012-04-21 00:00:00 skipped.
2012-05-12 00:00:00 skipped.
2012-06-23 00:00:00 skipped.
2012-06-25 00:00:00 skipped.
2012-06-25 00:00:00 skipped.
2012-06-25 00:00:00 skipped.
2012-07-03 00:00:00 skipped.
2012-07-03 00:00:00 skipped.
2012-07-23 00:00:00 skipped.
2012-08-05 00:00:00 skipped.
2012-08-10 00:00:00 is a National holiday
2012-08-15 00:00:00 skipped.
2012-08-24 00:00:00 skipped.
2012-09-28 00:00:00 skipped.
2012-10-07 00:00:00 skipped.
2012-10-09 00:00:00 is not a holiday
2012-10-12 00:00:00 is a National holiday
2012-11-02 00:00:00 is a National holiday
2012-11-03 00:00:00 is a National holiday
2012-11-06 00:00:00 skipped.
2012-11-07 00:00:00 skipped.
2012-11-10 00:00:00 skipped.
2012-11-11 00:00:00 skipped.
2012-11-12 00:00:00 skipped.
2012-12-05 00:00:00 skipped.
2012-12-06 00:00:00 skipped.
2012-12-08 00:00:00 skipped.
2012-12-21 00:00:00 is a National holiday

## Generate samples

In [12]:
def days_since_last_salary(sample_dt):
    """Return the smallest non-negative day offset from three reference points.

    The reference points are the start of the month (offset ``day``), the 15th
    (offset ``day - 15``) and the last day of the current month (offset
    ``day - last_day``, which is 0 only on the last day itself). Negative
    offsets are replaced by the sentinel 99 before the minimum is taken.

    ``sample_dt`` must expose ``year``, ``month`` and ``day`` attributes.
    """
    # First day of the following month, rolling over to January after December.
    if sample_dt.month == 12:
        next_year, next_month = sample_dt.year + 1, 1
    else:
        next_year, next_month = sample_dt.year, sample_dt.month + 1
    month_end = date(next_year, next_month, 1) - timedelta(days=1)

    day = sample_dt.day
    offsets = (day, day - 15, day - month_end.day)
    return min(offset if offset >= 0 else 99 for offset in offsets)

In [13]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the columns of ``df`` for a window of dates relative to ``dt``.

    The window starts ``minus`` days before ``dt`` and contains ``periods``
    dates spaced ``freq`` apart. ``df`` must have date-labelled columns that
    include every date in the window.
    """
    window_start = dt - timedelta(days=minus)
    window = pd.date_range(window_start, periods=periods, freq=freq)
    return df[window]

In [14]:
def prepare_sample(sales_raw, promo_raw, oil_raw, holiday_raw, store_raw, sample_dt,
                   is_train=True, item_raw=None):
    """Build the feature matrix (and optionally the targets) anchored at ``sample_dt``.

    All inputs are combined positionally (via ``.values``), so every table must
    have the same number of rows in the same order.

    Parameters
    ----------
    sales_raw : DataFrame
        Wide sales table with one column per date.
    promo_raw : DataFrame
        Wide promotion-flag table with one column per date; must also cover
        the 16 days starting at ``sample_dt``.
    oil_raw : Series
        Currently unused; kept so existing calls keep working.
    holiday_raw : DataFrame
        Wide holiday-indicator table with the same layout as ``promo_raw``.
    store_raw : DataFrame
        Per-row store attributes with ``type`` and ``cluster`` columns.
    sample_dt : datetime.date or Timestamp
        First day of the 16-day prediction horizon.
    is_train : bool
        When True, also return the sales of the 16 horizon days as targets.
    item_raw : DataFrame, optional
        Per-row item attributes with ``perishable`` and ``class`` columns.
        Defaults to the notebook-level ``df_items`` (the previous behaviour).

    Returns
    -------
    X : DataFrame, or (X, y) when ``is_train`` is True, where ``y`` is an
    array with one column per horizon day.
    """
    if item_raw is None:
        # Backward-compatible fallback to the notebook-level table.
        item_raw = df_items

    # Look up single date columns with Timestamp keys: recent pandas versions
    # do not accept plain datetime.date keys on a DatetimeIndex.
    anchor = pd.Timestamp(sample_dt)

    temp_dict = {}
    for i in [3, 7, 14, 21, 28, 35, 70, 140]:
        # Trailing windows of i days ending the day before sample_dt.
        temp_dict["mean_{}".format(i)] = get_timespan(sales_raw, sample_dt, i, i).mean(axis=1).values
        temp_dict["promo_{}_sum".format(i)] = get_timespan(promo_raw, sample_dt, i, i).sum(axis=1).values
        temp_dict["holiday_{}_sum".format(i)] = get_timespan(holiday_raw, sample_dt, i, i).sum(axis=1).values
    X = pd.DataFrame(temp_dict)

    for i in range(7):
        # Averages over dates spaced 7 days apart (i.e. the same weekday):
        # the last 4 and the last 20 occurrences before sample_dt.
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(sales_raw, sample_dt, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(sales_raw, sample_dt, 140-i, 20, freq='7D').mean(axis=1).values

    for i in range(16):
        # Known-in-advance flags for each day of the horizon.
        horizon_day = anchor + timedelta(days=i)
        X["promo_{}".format(i)] = promo_raw[horizon_day].values.astype(np.uint8)
        X["holiday_{}".format(i)] = holiday_raw[horizon_day].values.astype(np.uint8)

    X['store_type'] = store_raw['type'].values
    X['store_cluster'] = store_raw['cluster'].values
    X['item_perishable'] = item_raw['perishable'].values
    X['item_class'] = item_raw['class'].values
    # A single scalar per sample date, broadcast to every row.
    X['days_since_last_salary'] = days_since_last_salary(sample_dt)

    if is_train:
        y = sales_raw[
            pd.date_range(sample_dt, periods=16)
        ].values
        return X, y
    return X

# Make train/val/test sets

In [15]:
# Nine training samples, anchored one week apart starting at some_date.
some_date = date(2017, 5, 31)
weekly_samples = [
    prepare_sample(
        df_unit_sales, df_promo, df_oil, df_holidays_expanded, df_stores_expanded,
        some_date + timedelta(days=week * 7)
    )
    for week in range(9)
]
X_train_raw = pd.concat([features for features, _ in weekly_samples], axis=0)
y_train_raw = np.concatenate([targets for _, targets in weekly_samples], axis=0)
del weekly_samples

# NOTE(review): this is a random 90/10 row split across all nine weekly
# samples. A date-based hold-out (a single sample anchored at 2017-07-26) was
# sketched here earlier and may be worth reconsidering.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_raw, y_train_raw, test_size=0.1, random_state=42
)

# Features for the test horizon starting 2017-08-16; no targets exist for it.
X_test = prepare_sample(
    df_unit_sales, df_promo, df_oil, df_holidays_expanded, df_stores_expanded,
    date(2017, 8, 16), is_train=False
)

# Save output

In [16]:
# Write every split with four decimal places. Feature frames are saved
# without their index; target arrays are saved as plain comma-separated rows.
frame_csv_options = dict(float_format='%.4f', index=False)

X_train.to_csv('../input/X_train.csv', **frame_csv_options)
np.savetxt("../input/y_train.csv", y_train, fmt='%.4f', delimiter=",")

X_val.to_csv('../input/X_val.csv', **frame_csv_options)
np.savetxt("../input/y_val.csv", y_val, fmt='%.4f', delimiter=",")

X_test.to_csv('../input/X_test.csv', **frame_csv_options)