In [1]:
"""
LSTM trained on rolling-average sales features, promotion counts,
and per-day-of-week average sales.
"""
from datetime import date, timedelta
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import LSTM
from keras import callbacks
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
"""
For reading data from AWS S3
"""
import boto3
import pandas as pd
import io
import gzip
# Shared S3 resource handle used by the loader/saver helpers below.
s3 = boto3.resource('s3')
# Bucket that the helpers read from and write to.
bucket = 'twde-datalab'
# Object keys of the raw CSV inputs inside `bucket`.
train_key = 'raw/train.csv'
test_key = 'raw/test.csv'
items_key = 'raw/items.csv'

In [3]:
def get_testdf(key):
    """Download the test CSV stored under `key` and return it as a DataFrame.

    The object is fetched from the module-level `bucket` through the
    module-level `s3` resource. Only the first five columns are read,
    `onpromotion` is read as bool and `date` is parsed as a date. The
    frame is indexed by (store_nbr, item_nbr, date) and its shape is
    printed before it is returned.
    """
    raw_bytes = s3.Object(bucket, key).get()['Body'].read()
    test_frame = pd.read_csv(
        io.BytesIO(raw_bytes),
        usecols=[0, 1, 2, 3, 4],
        dtype={'onpromotion': bool},
        parse_dates=["date"],
    )
    test_frame = test_frame.set_index(['store_nbr', 'item_nbr', 'date'])
    print(test_frame.shape)
    return test_frame
def get_traindf(key):
    """Download the training CSV stored under `key` and return it as a DataFrame.

    The object is fetched from the module-level `bucket` through the
    module-level `s3` resource. Columns 1-5 are read, `onpromotion` is read
    as bool, `date` is parsed as a date, and `unit_sales` is converted while
    parsing: positive values become log1p(value), everything else becomes 0.
    Data rows 1..66458908 are skipped (the header row is kept).
    """
    def _log1p_if_positive(raw):
        # Parse once, then apply the same rule as the notebook's converter.
        value = float(raw)
        return np.log1p(value) if value > 0 else 0

    raw_bytes = s3.Object(bucket, key).get()['Body'].read()
    return pd.read_csv(
        io.BytesIO(raw_bytes),
        usecols=[1, 2, 3, 4, 5],
        dtype={'onpromotion': bool},
        converters={'unit_sales': _log1p_if_positive},
        parse_dates=["date"],
        skiprows=range(1, 66458909),  # 2016-01-01
    )

def get_itemsdf(key):
    """Download the items CSV stored under `key`, indexed by `item_nbr`.

    The object is fetched from the module-level `bucket` through the
    module-level `s3` resource and parsed with pandas' default settings.
    """
    raw_bytes = s3.Object(bucket, key).get()['Body'].read()
    items_frame = pd.read_csv(io.BytesIO(raw_bytes))
    return items_frame.set_index("item_nbr")

def save_s3(df, key):
    """Upload `df` to `bucket` under `key` as a gzip-compressed CSV.

    The frame is written as CSV without its index, encoded as UTF-8,
    gzip-compressed in memory, and stored through the module-level `s3`
    resource. Nothing is returned.
    """
    # Render the CSV text in memory.
    csv_text = io.StringIO()
    df.to_csv(csv_text, index=False)
    payload = csv_text.getvalue().encode('utf-8')

    # Compress into an in-memory buffer; leaving the `with` block flushes
    # the gzip trailer before the bytes are read back.
    compressed = io.BytesIO()
    with gzip.GzipFile(mode='w', fileobj=compressed) as gz_stream:
        gz_stream.write(payload)

    s3.Object(bucket, key).put(Body=compressed.getvalue())

In [4]:
# Load the three raw inputs from S3 (get_testdf also prints the test frame's shape).
df_train = get_traindf(train_key)
df_test = get_testdf(test_key)
items = get_itemsdf(items_key)

(3370464, 2)


In [5]:
# Keep only rows dated 2017-01-01 or later. pd.Timestamp is used because the
# pd.datetime alias was deprecated in pandas 1.0 and removed in pandas 2.0.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2017, 1, 1)]
# Free the full training frame. NOTE: re-running this cell afterwards requires
# reloading df_train first.
del df_train

# Promotion flags pivoted to one row per (store_nbr, item_nbr) and one column
# per date; combinations with no row for a date are filled with False.
promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
# Drop the outer "onpromotion" column level so the columns are just the dates.
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
# Align the test-period promotions to the training rows; pairs that are absent
# from the test set are filled with False.
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
# Training and test promotion columns side by side.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

# Unit sales pivoted the same way; missing (store, item, date) cells become 0.
df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

# Repeat the item metadata so it has one row per row of df_2017, in the same order.
items = items.reindex(df_2017.index.get_level_values(1))

def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the columns of `df` for a window starting `minus` days before `dt`.

    The window holds `periods` dates spaced by `freq`, beginning at
    `dt - minus days`. `df` must have those dates as column labels.
    Returns the corresponding column subset of `df`.
    """
    window_start = dt - timedelta(days=minus)
    window = pd.date_range(window_start, periods=periods, freq=freq)
    return df[window]

def prepare_dataset(t2017, is_train=True):
    """Build the feature frame (and optionally the targets) for anchor date `t2017`.

    Features are computed from the module-level `df_2017` (sales) and
    `promo_2017` (promotion flags) frames, one row per row of those frames:

    * the previous day's sales,
    * mean sales over the 3/7/14/30/60/140 days before `t2017`,
    * promotion counts over the 14/60/140 days before `t2017`,
    * for each of 7 day offsets, mean sales over 4 and 20 dates spaced 7 days apart,
    * the promotion flag for each of the 16 days starting at `t2017`.

    Returns `(X, y)` when `is_train` is true, where `y` holds the sales of the
    16 days starting at `t2017`; otherwise returns only `X`.
    """
    # Collected in the column order the rest of the notebook relies on.
    features = {
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
    }
    for span in (3, 7, 14, 30, 60, 140):
        features["mean_{}_2017".format(span)] = get_timespan(
            df_2017, t2017, span, span).mean(axis=1).values
    for span in (14, 60, 140):
        features["promo_{}_2017".format(span)] = get_timespan(
            promo_2017, t2017, span, span).sum(axis=1).values
    X = pd.DataFrame(features)

    # Averages over dates spaced 7 days apart, one pair of columns per offset.
    for dow in range(7):
        short_window = get_timespan(df_2017, t2017, 28 - dow, 4, freq='7D')
        long_window = get_timespan(df_2017, t2017, 140 - dow, 20, freq='7D')
        X['mean_4_dow{}_2017'.format(dow)] = short_window.mean(axis=1).values
        X['mean_20_dow{}_2017'.format(dow)] = long_window.mean(axis=1).values

    # Promotion flags for each day of the 16-day window starting at t2017.
    for ahead in range(16):
        promo_day = promo_2017[t2017 + timedelta(days=ahead)]
        X["promo_{}".format(ahead)] = promo_day.values.astype(np.uint8)

    if not is_train:
        return X
    y = df_2017[pd.date_range(t2017, periods=16)].values
    return X, y

print("Preparing dataset")
# Number of weekly-shifted training windows. The same value sizes the
# per-sample weights below, so it is defined once to keep them in sync.
N_TRAIN_WEEKS = 6

t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(N_TRAIN_WEEKS):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(t2017 + delta)
    X_l.append(X_tmp)
    y_l.append(y_tmp)
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

stores_items = pd.DataFrame(index=df_2017.index)
test_ids = df_test[['id']]

# Align item metadata with the (store, item) rows used for the features.
items = items.reindex(stores_items.index.get_level_values(1))

# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same array.
X_train = X_train.values
X_test = X_test.values
X_val = X_val.values
# The LSTM layer expects 3-D input: (samples, timesteps, features), with one timestep here.
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
X_val = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))

model = Sequential()
model.add(LSTM(32, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(.1))
model.add(Dense(32))
model.add(Dropout(.2))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam', metrics=['mse'])

N_EPOCHS = 5

val_pred = []
test_pred = []
# Per-sample training weights: 1.25 where items["perishable"] is 1, otherwise 1.0,
# tiled once per training window to match the rows of X_train.
sample_weights = np.array(
    pd.concat([items["perishable"]] * N_TRAIN_WEEKS) * 0.25 + 1)
# NOTE(review): the same model instance is fitted for every horizon step, so
# each step starts from the weights learned for the previous target column.
# Confirm this warm start is intended rather than one fresh model per step.
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    y = y_train[:, i]
    xv = X_val
    yv = y_val[:, i]
    model.fit(X_train, y, batch_size=512, epochs=N_EPOCHS, verbose=2,
              sample_weight=sample_weights, validation_data=(xv, yv))
    val_pred.append(model.predict(X_val))
    test_pred.append(model.predict(X_test))

n_public = 5  # Number of days in public test set
weights = items["perishable"] * 0.25 + 1
# (steps, samples, 1) -> (samples, steps); computed once and reused by every metric.
val_pred_matrix = np.array(val_pred).squeeze(axis=2).transpose()
print("Unweighted validation mse: ", mean_squared_error(
    y_val, val_pred_matrix))
print("Full validation mse:       ", mean_squared_error(
    y_val, val_pred_matrix, sample_weight=weights))
print("'Public' validation mse:   ", mean_squared_error(
    y_val[:, :n_public], val_pred_matrix[:, :n_public],
    sample_weight=weights))
print("'Private' validation mse:  ", mean_squared_error(
    y_val[:, n_public:], val_pred_matrix[:, n_public:],
    sample_weight=weights))

# One prediction column per day, then stacked to one row per (store, item, date).
y_test = np.array(test_pred).squeeze(axis=2).transpose()
df_preds = pd.DataFrame(
    y_test, index=stores_items.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

# Test rows with no prediction get 0. Predictions are mapped back with expm1
# (training targets were log1p-transformed on load) and clipped to [0, 1000].
submission = test_ids.join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)

Preparing dataset...
Step 1
Train on 1005090 samples, validate on 167515 samples
Epoch 1/5
 - 9s - loss: 0.3982 - mean_squared_error: 0.3752 - val_loss: 0.3063 - val_mean_squared_error: 0.3063
Epoch 2/5
 - 9s - loss: 0.3523 - mean_squared_error: 0.3334 - val_loss: 0.3011 - val_mean_squared_error: 0.3011
Epoch 3/5
 - 9s - loss: 0.3455 - mean_squared_error: 0.3273 - val_loss: 0.2990 - val_mean_squared_error: 0.2990
Epoch 4/5
 - 9s - loss: 0.3414 - mean_squared_error: 0.3235 - val_loss: 0.2989 - val_mean_squared_error: 0.2989
Epoch 5/5
 - 9s - loss: 0.3392 - mean_squared_error: 0.3216 - val_loss: 0.2982 - val_mean_squared_error: 0.2982
Step 2
Train on 1005090 samples, validate on 167515 samples
Epoch 1/5
 - 9s - loss: 0.3665 - mean_squared_error: 0.3457 - val_loss: 0.3305 - val_mean_squared_error: 0.3305
Epoch 2/5
 - 9s - loss: 0.3627 - mean_squared_error: 0.3422 - val_loss: 0.3271 - val_mean_squared_error: 0.3271
Epoch 3/5
 - 9s - loss: 0.3614 - mean_squared_error: 0.3411 - val_loss: 0.3

Epoch 2/5
 - 9s - loss: 0.4233 - mean_squared_error: 0.4011 - val_loss: 0.3883 - val_mean_squared_error: 0.3883
Epoch 3/5
 - 9s - loss: 0.4219 - mean_squared_error: 0.3998 - val_loss: 0.3879 - val_mean_squared_error: 0.3879
Epoch 4/5
 - 9s - loss: 0.4211 - mean_squared_error: 0.3991 - val_loss: 0.3889 - val_mean_squared_error: 0.3889
Epoch 5/5
 - 9s - loss: 0.4206 - mean_squared_error: 0.3986 - val_loss: 0.3880 - val_mean_squared_error: 0.3880
Step 13
Train on 1005090 samples, validate on 167515 samples
Epoch 1/5
 - 9s - loss: 0.4141 - mean_squared_error: 0.3909 - val_loss: 0.3784 - val_mean_squared_error: 0.3784
Epoch 2/5
 - 9s - loss: 0.4065 - mean_squared_error: 0.3838 - val_loss: 0.3860 - val_mean_squared_error: 0.3860
Epoch 3/5
 - 9s - loss: 0.4052 - mean_squared_error: 0.3825 - val_loss: 0.3793 - val_mean_squared_error: 0.3793
Epoch 4/5
 - 9s - loss: 0.4043 - mean_squared_error: 0.3818 - val_loss: 0.3770 - val_mean_squared_error: 0.3770
Epoch 5/5
 - 9s - loss: 0.4038 - mean_squar

In [6]:
# Upload the submission frame to S3 as a gzip-compressed CSV.
save_s3(submission, 'submission/lstm.csv.gz')