In [1]:
"""
This is an upgraded version of Ceshine's LGBM starter script that simply adds
more average features and weekly average features.
"""
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [2]:
import boto3
import pandas as pd
import io
import gzip
# S3 resource handle shared by the load/save helpers defined in this notebook.
s3 = boto3.resource('s3')
# Bucket that the helpers read the input CSVs from and write results to.
bucket = 'twde-datalab'
# Object keys of the raw input files inside the bucket.
train_key = 'raw/train.csv'
test_key = 'raw/test.csv'
items_key = 'raw/items.csv'

In [4]:
# Column dtypes handed to pd.read_csv when loading the training data (see get_traindf).
dtypes = {'id':'uint32', 'item_nbr':'int32', 'store_nbr':'int8', 'unit_sales':'float32'}

def get_testdf(key):
    """Download the test CSV stored under `key` and return it as a DataFrame.

    Only the first five columns are read; `date` is parsed as a datetime and
    `onpromotion` is read as bool. The returned frame is indexed by
    (store_nbr, item_nbr, date).
    """
    raw_bytes = s3.Object(bucket, key).get()['Body'].read()
    test_frame = pd.read_csv(
        io.BytesIO(raw_bytes),
        usecols=[0, 1, 2, 3, 4],
        dtype={'onpromotion': bool},
        parse_dates=["date"],
    )
    index_levels = ['store_nbr', 'item_nbr', 'date']
    return test_frame.set_index(index_levels)
def get_traindf(key, skiprows=range(1, 86672217)):
    """Download the training CSV stored under `key` and return it as a DataFrame.

    Columns 1-4 of the file are read using the module-level `dtypes` mapping,
    and `date` is parsed as a datetime.

    Parameters
    ----------
    key : str
        S3 object key of the training CSV inside `bucket`.
    skiprows : list-like, int or callable, optional
        Forwarded to `pd.read_csv`. The default keeps the header (line 0) and
        skips file lines 1..86672216, i.e. the rows dated before 2016-08-01 in
        the file this notebook was written against. Pass a different value to
        load another time window.

    Returns
    -------
    pandas.DataFrame
        The selected training rows with a default integer index.
    """
    raw_bytes = s3.Object(bucket, key).get()['Body'].read()
    return pd.read_csv(
        io.BytesIO(raw_bytes),
        usecols=[1, 2, 3, 4],
        dtype=dtypes,
        parse_dates=["date"],
        skiprows=skiprows,
    )

def get_itemsdf(key):
    """Download the items CSV stored under `key` and return it indexed by `item_nbr`."""
    payload = s3.Object(bucket, key).get()['Body'].read()
    items_frame = pd.read_csv(io.BytesIO(payload))
    return items_frame.set_index("item_nbr")

def save_s3(df, key, **to_csv_kwargs):
    """Serialize `df` to CSV, gzip it in memory and upload it to `bucket` under `key`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to upload.
    key : str
        Destination S3 object key.
    **to_csv_kwargs
        Extra keyword arguments forwarded to `DataFrame.to_csv`
        (e.g. `float_format='%.3f'`). `index` defaults to False, matching the
        previous behaviour. `compression` is accepted for convenience but not
        forwarded, because this function always gzips the payload itself.

    Raises
    ------
    ValueError
        If a `compression` other than None, 'infer' or 'gzip' is requested.
    """
    compression = to_csv_kwargs.pop('compression', None)
    if compression not in (None, 'infer', 'gzip'):
        raise ValueError(
            "save_s3 always writes gzip; unsupported compression: %r" % (compression,))
    to_csv_kwargs.setdefault('index', False)

    # With no target buffer, to_csv returns the CSV text directly.
    csv_text = df.to_csv(**to_csv_kwargs)

    gz_buffer = io.BytesIO()
    with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file:
        gz_file.write(csv_text.encode('utf-8'))

    s3.Object(bucket, key).put(Body=gz_buffer.getvalue())

In [5]:
# Pull the three input tables from S3 using the helpers and keys defined above.
train = get_traindf(train_key)
test = get_testdf(test_key)
items = get_itemsdf(items_key)

(3370464, 2)


In [6]:
# Negative unit sales are clamped to zero so the log transform below is defined.
train.loc[train['unit_sales'] < 0, 'unit_sales'] = 0
# Work in log1p space; predictions are mapped back with expm1 at the end.
# `np` is used directly because the `pd.np` alias was removed in pandas 2.0.
train['unit_sales'] = np.log1p(train['unit_sales'])
# Day of week of each row, used for the day-of-week means.
train['dow'] = train['date'].dt.dayofweek

In [7]:
# Day-of-week means: mean (log) unit sales per (item, store, day of week),
# computed on the rows present in `train` before the gaps are filled in below.
ma_dw = train[['item_nbr','store_nbr','dow','unit_sales']].groupby(
        ['item_nbr','store_nbr','dow'])['unit_sales'].mean().to_frame('madw').reset_index()
# Weekly mean: average of the day-of-week means per (store, item).
ma_wk = ma_dw[['item_nbr','store_nbr','madw']].groupby(
        ['store_nbr', 'item_nbr'])['madw'].mean().to_frame('mawk').reset_index()

# `dow` was only needed for the means above. The keyword form is used because
# pandas 2.0 no longer accepts `axis` positionally in drop().
train = train.drop(columns='dow')

# creating records for all items, in all stores on all dates
# for correct calculation of daily unit sales averages.
u_dates = train.date.unique()
u_stores = train.store_nbr.unique()
u_items = train.item_nbr.unique()
train = (
    train.set_index(['date', 'store_nbr', 'item_nbr'])
    .reindex(
        pd.MultiIndex.from_product(
            (u_dates, u_stores, u_items),
            names=['date','store_nbr','item_nbr']
        )
    )
    .reset_index()
)

del u_dates, u_stores, u_items

# Combinations created by the reindex have no sales record: count them as zero.
# Assigned back explicitly; an in-place fillna on a `.loc` selection may act on
# a temporary and leave `train` unchanged.
train['unit_sales'] = train['unit_sales'].fillna(0)
# Most recent date in the data, independent of row order.
lastdate = train['date'].max()

#Moving Averages
# Overall mean per (item, store); replaced further down by the median across windows.
ma_is = train[['item_nbr','store_nbr','unit_sales']].groupby(
        ['item_nbr','store_nbr'])['unit_sales'].mean().to_frame('mais')

# Trailing-window means over the last 112, 56, 28, 14, 7, 3 and 1 days.
for i in [112,56,28,14,7,3,1]:
    tmp = train[train.date>lastdate-timedelta(int(i))]
    tmpg = tmp.groupby(['item_nbr','store_nbr'])['unit_sales'].mean().to_frame('mais'+str(i))
    ma_is = ma_is.join(tmpg, how='left')

# Free the large intermediates; `train` is not used after this point.
del tmp,tmpg,train

# Final feature: median across the overall mean and all window means.
ma_is['mais']=ma_is.median(axis=1)
ma_is = ma_is.reset_index()

In [8]:
# Preview the first rows of the moving-average table.
ma_is.head()

Unnamed: 0,item_nbr,store_nbr,mais,mais112,mais56,mais28,mais14,mais7,mais3,mais1
0,96995,1,0.126638,0.154255,0.172356,0.295202,0.334438,0.099021,0.0,0.0
1,96995,2,0.024755,0.161961,0.123776,0.049511,0.0,0.0,0.0,0.0
2,96995,3,0.355917,0.208903,0.286789,0.336299,0.375535,0.454008,0.462098,0.693147
3,96995,4,0.124828,0.093884,0.150635,0.099021,0.099021,0.198042,0.231049,0.693147
4,96995,5,0.118639,0.138257,0.202249,0.237278,0.099021,0.198042,0.0,0.0


In [None]:
# Keep only the keys and the combined moving-average feature; the per-window
# columns (mais112 ... mais1) are no longer needed. Selecting by name makes
# the cell safe to re-run and avoids the positional `axis` argument to drop(),
# which pandas 2.0 removed.
ma_is = ma_is[['item_nbr', 'store_nbr', 'mais']]

In [None]:

#Make test features
# get_testdf returns the frame indexed by (store_nbr, item_nbr, date); the code
# below needs those as ordinary columns (`test['date']` raises KeyError otherwise).
if 'date' not in test.columns:
    test = test.reset_index()
test['dow'] = test['date'].dt.dayofweek
test = pd.merge(test, ma_is, how='left', on=['item_nbr','store_nbr'])
test = pd.merge(test, ma_wk, how='left', on=['item_nbr','store_nbr'])
test = pd.merge(test, ma_dw, how='left', on=['item_nbr','store_nbr','dow'])

del ma_is, ma_wk, ma_dw

#Predicting Test
# Baseline prediction is the moving-average feature (still in log1p space).
test['unit_sales'] = test['mais']
# Where a weekly mean exists and is positive, scale the baseline by the
# day-of-week mean relative to the weekly mean.
pos_idx = test['mawk'] > 0
test.loc[pos_idx, 'unit_sales'] = (
    test.loc[pos_idx, 'mais'] * test.loc[pos_idx, 'madw'] / test.loc[pos_idx, 'mawk']
)
# Rows without any matching history predict zero. Assigned back explicitly; an
# in-place fillna on a `.loc` selection may act on a temporary.
test['unit_sales'] = test['unit_sales'].fillna(0)
# restoring unit values (`np` directly: the `pd.np` alias was removed in pandas 2.0)
test['unit_sales'] = np.expm1(test['unit_sales'])

#50% more for promotion items
test.loc[test['onpromotion'] == True, 'unit_sales'] *= 1.5


In [None]:
# Format the predictions to 3 decimals up front and call save_s3 with just
# (df, key): save_s3 already omits the index and gzips the payload itself.
submission = test[['id', 'unit_sales']].copy()
submission['unit_sales'] = submission['unit_sales'].map('{:.3f}'.format)
save_s3(submission, 'submission/ma8dwof.csv.gz')