In [22]:
#import warnings
#warnings.filterwarnings('ignore')
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import optimizers
pd.set_option('display.max_columns', None)
# CUDA_VISIBLE_DEVICES is a comma-separated list of device ids, so the number
# of devices is the number of entries, not the length of the string
# (len('0,1') would report 3). The variable may also be missing entirely.
cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES')
if cuda_visible is None:
    print('CUDA_VISIBLE_DEVICES is not set')
else:
    gpu_ids = [dev for dev in cuda_visible.split(',') if dev.strip()]
    print('available GPU devices:', len(gpu_ids), 
          ' | device num:', cuda_visible)

available GPU devices: 1  | device num: 0


In [14]:
# Where the input CSVs are read from and where model checkpoints are written.
DATA_DIR, MODELS_DIR = './data', './models'
MODEL_VER = 'v0'

# CUT_DATE: first date kept by get_df for training.
# END_DATE: reference date get_df counts back from when is_train=False.
CUT_DATE, END_DATE = '2015-10-24', '2016-04-24'
print(datetime.strptime(END_DATE, '%Y-%m-%d'))

# Sequence geometry (in days) and number of rows per generator chunk.
LOOK_FWD = 28
LOOK_BACK = 2 * 28
BATCH_SIZE = 1000

2016-04-24 00:00:00


In [3]:
# Column -> dtype map passed to pd.read_csv for calendar.csv.
CALENDAR_DTYPES = dict(
    date='str',
    wm_yr_wk='int16',
    weekday='object',
    wday='int16',
    month='int16',
    year='int16',
    d='object',
    event_name_1='object',
    event_type_1='object',
    event_name_2='object',
    event_type_2='object',
    snap_CA='int16',
    snap_TX='int16',
    snap_WI='int16',
)
# Calendar columns handed to read_csv(parse_dates=...).
PARSE_DATES = ['date']
# Column -> dtype map passed to pd.read_csv for sell_prices.csv.
SPRICES_DTYPES = dict(
    store_id='object',
    item_id='object',
    wm_yr_wk='int16',
    sell_price='float32',
)

In [4]:
def get_df(is_train=True, backward_lags=None):
    """Build the long-format sales frame joined with calendar and price data.

    Reads ``sales_train_validation.csv``, ``calendar.csv`` and
    ``sell_prices.csv`` from ``DATA_DIR``, melts the wide ``d_<n>`` columns
    into one row per (series, day), inner-joins calendar and prices, restricts
    the date range and adds calendar-derived features.

    Parameters
    ----------
    is_train : bool
        If True, keep rows dated on or after ``CUT_DATE``. If False, first
        append 56 (28 + 28) empty future day columns after the last observed
        day, then keep rows dated on or after ``END_DATE - backward_lags`` days.
    backward_lags : int or None
        Number of days before ``END_DATE`` to keep. Required when
        ``is_train`` is False, unused otherwise.

    Returns
    -------
    pandas.DataFrame
        Sorted by ``id`` then ``d``. ``d`` holds the integer day number,
        ``d_`` is a copy of it, ``week_num`` / ``quarter`` / ``mday`` are
        int16 features derived from the date; ``date``, ``wm_yr_wk`` and
        ``weekday`` are dropped.

    Raises
    ------
    ValueError
        If ``is_train`` is False and ``backward_lags`` is None.
    """
    # Fail before the expensive reads instead of deep inside the date filter.
    if not is_train and backward_lags is None:
        raise ValueError('backward_lags must be given when is_train=False')

    strain = pd.read_csv('{}/sales_train_validation.csv'.format(DATA_DIR))
    print('read train:', strain.shape)
    cat_cols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    last_day = int(strain.columns[-1].replace('d_', ''))
    print('last day is:', last_day)
    if not is_train:
        # Add the future day columns in one step (all NaN) rather than
        # inserting 56 columns one at a time.
        future_days = ['d_{}'.format(day)
                       for day in range(last_day + 1, last_day + 28 + 28 + 1)]
        strain = strain.reindex(columns=list(strain.columns) + future_days)
    strain = pd.melt(
        strain,
        id_vars = cat_cols,
        value_vars = [col for col in strain.columns if col.startswith('d_')],
        var_name = 'd',
        value_name = 'sales'
    )
    print('melted train:', strain.shape)
    calendar = pd.read_csv('{}/calendar.csv'.format(DATA_DIR), dtype=CALENDAR_DTYPES, parse_dates=PARSE_DATES)
    print('read calendar:', calendar.shape)
    strain = strain.merge(calendar, on='d', copy=False)
    print('calendar merge done')
    sprices = pd.read_csv('{}/sell_prices.csv'.format(DATA_DIR), dtype=SPRICES_DTYPES)
    print('read prices:', sprices.shape)
    strain = strain.merge(
        sprices, 
        on=['store_id', 'item_id', 'wm_yr_wk'], 
        copy=False
    )
    print('prices merge done')
    print('begin train date:', strain['date'].min())
    print('end train date:', strain['date'].max())
    if not is_train:
        # pandas date arithmetic: the original referenced `timedelta`, which
        # this notebook never imports, so this branch raised NameError.
        first_kept = pd.Timestamp(END_DATE) - pd.Timedelta(days=backward_lags)
        keep = strain['date'] >= first_kept
    else:
        keep = strain['date'] >= CUT_DATE
    # Explicit copy so the column assignments below operate on an independent
    # frame rather than on a filtered view of the full merged frame.
    strain = strain.loc[keep].copy()
    print('date cut train:', strain.shape)
    print('cut train date:', strain['date'].min())
    print('end train date:', strain['date'].max())
    date_features = {
        'week_num': 'weekofyear',
        'quarter': 'quarter',
        'mday': 'day'
    }
    dates = strain['date'].dt
    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_func == 'weekofyear' and hasattr(dates, 'isocalendar'):
            # `.dt.weekofyear` is gone from current pandas; the ISO calendar
            # week is its replacement. Older pandas falls through to getattr.
            values = dates.isocalendar().week
        else:
            values = getattr(dates, date_feat_func)
        strain[date_feat_name] = values.astype('int16')
    print('date features done')
    # 'd_123' -> 123
    strain['d'] = strain['d'].str.replace('d_', '', regex=False).astype('int64')
    drop_cols = ['date', 'wm_yr_wk', 'weekday']
    strain = strain.drop(columns=drop_cols).sort_values(by=['id', 'd'])
    print('trash cols deleted, sorted')
    # Duplicate of the day number under a second column name.
    strain['d_'] = strain['d']
    print('out dataframe:', strain.shape)
    return strain

In [5]:
def get_dummies(strain, cols_dummy):
    """Replace the given columns by their one-hot encoding.

    Parameters
    ----------
    strain : pandas.DataFrame
        Input frame. It is left unmodified; the original implementation
        dropped ``cols_dummy`` from the caller's frame in place.
    cols_dummy : list of str
        Columns to encode. Every column also gets a ``<col>_nan`` indicator
        (``dummy_na=True``) and no level is dropped (``drop_first=False``).

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``strain`` followed by the indicator columns.
    """
    print('got shape for dummies:', strain.shape)
    strain_dummies = pd.get_dummies(
        strain[cols_dummy],
        drop_first=False,
        dummy_na=True
    )
    # drop() without inplace returns a new frame, so the argument is untouched.
    encoded = pd.concat([strain.drop(columns=cols_dummy), strain_dummies], axis=1)
    print('out shape for dummies:', encoded.shape)
    return encoded

In [6]:
%%time
# Build the training frame: with is_train=True get_df keeps rows dated on or
# after CUT_DATE, and backward_lags is not used on that path.
strain = get_df(is_train=True, backward_lags=None)

read train: (30490, 1919)
last day is: 1913
melted train: (58327370, 8)
read calendar: (1969, 14)
calendar merge done
read prices: (6841121, 4)
prices merge done
begin train date: 2011-01-29 00:00:00
end train date: 2016-04-24 00:00:00
date cut train: (5607717, 22)
cut train date: 2015-10-24 00:00:00
end train date: 2016-04-24 00:00:00
date features done
trash cols deleted, sorted
out dataframe: (5607717, 23)
CPU times: user 40.8 s, sys: 6.6 s, total: 47.4 s
Wall time: 47.4 s


In [7]:
# Numeric columns that get min-max scaled. The list includes the target
# ('sales') and the day index ('d'); the 'd_' copy created in get_df is not in
# the list and therefore keeps the raw day numbers.
# NOTE(review): the scaler is fitted on the whole frame before the train/val
# split done in a later cell - confirm that is intended.
num_cols = [
    'd', 'sales', 'wday', 'month', 'year',
    'snap_CA', 'snap_TX', 'snap_WI',
    'sell_price', 'week_num', 'quarter', 'mday',
]
scaler = MinMaxScaler().fit(strain[num_cols])
strain[num_cols] = scaler.transform(strain[num_cols])
print('min-max scaled')

min-max scaled


In [8]:
# Split boundaries are derived from the data instead of a hard-coded 1913:
# 'd_' still holds the raw (unscaled) day numbers at this point.
last_day = int(strain['d_'].max())
val_start = last_day - LOOK_BACK - LOOK_FWD
train_end = last_day - LOOK_FWD

# Validation: the trailing LOOK_BACK + LOOK_FWD days (bounds inclusive).
sval = strain[strain.d_ >= val_start]
print('val | start:', sval.d_.min(), 
      'end:', sval.d_.max(),
      'duration:', sval.d_.max() - sval.d_.min(), 
      'deep:', LOOK_BACK + LOOK_FWD)
# Training: everything up to LOOK_FWD days before the last day.
strain = strain[strain.d_ <= train_end]
print('train | start:', strain.d_.min(), 
      'end:', strain.d_.max(),
      'duration:', strain.d_.max() - strain.d_.min(), 
      'deep:', LOOK_BACK + LOOK_FWD)
event_cols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
# One row per distinct combination of the remaining (identifier) columns.
sdummies_id = strain.drop(columns=num_cols + event_cols + ['d_']).drop_duplicates()
# Non-inplace drops: strain and sval are filtered selections, and mutating
# them in place triggers pandas' SettingWithCopy machinery.
strain = strain.drop(columns=['d_'])
sval = sval.drop(columns=['d_'])
print('dummies df done:', sdummies_id.shape)
for col in sdummies_id.columns:
    print('dummies:', col, sdummies_id[col].unique().shape)

val | start: 1829 end: 1913 duration: 84 deep: 84
train | start: 1730 end: 1885 duration: 155 deep: 84
dummies: id (30490,)
dummies: item_id (3049,)
dummies: dept_id (7,)
dummies: store_id (10,)
dummies: cat_id (3,)
dummies: state_id (3,)


In [9]:
%%time
# One-hot encode every identifier column except 'id', which stays as the
# lookup key used by get_sequence_batch.
cols_dummy = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id']
sdummies_id = get_dummies(sdummies_id, cols_dummy)

got shape for dummies: (30490, 6)
out shape for dummies: (30490, 3078)
CPU times: user 168 ms, sys: 4 ms, total: 172 ms
Wall time: 172 ms


In [10]:
def get_sequence_batch(df, dummies, col_id, col_look, look_back, look_fwd, cols_features):
    """Turn per-series rows into sliding-window samples for the LSTM.

    For every distinct value of ``df[col_id]`` (in order of first appearance)
    and every window start ``i`` in ``range(n_rows - look_back - look_fwd)``,
    one sample is built whose per-timestep features are, in this order:

    1. ``col_look`` over rows ``[i, i + look_back)``;
    2. each column of ``cols_features`` over rows
       ``[i + look_fwd, i + look_back + look_fwd)``;
    3. the static dummy vector of the series, repeated for every timestep.

    The target is ``col_look`` over rows
    ``[i + look_back, i + look_back + look_fwd)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one or more series; within a series the rows are used in the
        order they appear.
    dummies : pandas.DataFrame
        One row per series: ``col_id`` plus numeric indicator columns.
    col_id : str
        Name of the series identifier column in both ``df`` and ``dummies``.
    col_look : str
        Name of the column to look back on and to predict.
    look_back, look_fwd : int
        Input window length and forecast horizon, in rows.
    cols_features : list of str
        Columns used as forward-shifted features.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` of shape ``(samples, look_back, 1 + len(cols_features) + n_dummies)``
        and ``y`` of shape ``(samples, look_fwd)``, both float32. With no
        complete window the shapes are the same with ``samples == 0``.

    Raises
    ------
    KeyError
        If a series that yields at least one window has no row in ``dummies``.

    Notes
    -----
    NOTE(review): if ``col_look`` is also listed in ``cols_features``, its
    forward-shifted window overlaps the target rows, i.e. the target values
    are present in ``X``. Confirm whether callers intend that.
    NOTE(review): the window starting at ``n_rows - look_back - look_fwd``
    would still fit in the series but is not generated; kept as in the
    original because the train/validation boundaries interact with it.
    """
    window = look_back + look_fwd
    feature_cols = list(cols_features)
    # Drop the id column by name rather than assuming it is the first column.
    static_cols = [col for col in dummies.columns if col != col_id]
    n_features = 1 + len(feature_cols) + len(static_cols)

    X, y = [], []
    # Honour col_id here too (the original iterated over df['id']).
    for idx in df[col_id].unique():
        # Select the series once instead of re-evaluating the mask per window.
        group = df.loc[df[col_id] == idx]
        n_windows = len(group) - window
        if n_windows <= 0:
            continue
        target = group[col_look].to_numpy(dtype=np.float32)
        feats = group[feature_cols].to_numpy(dtype=np.float32)
        static_rows = dummies.loc[dummies[col_id] == idx, static_cols]
        if static_rows.empty:
            raise KeyError('no dummies row for {}={!r}'.format(col_id, idx))
        # (look_back, n_dummies): the same static vector at every timestep.
        static_block = np.tile(static_rows.to_numpy(dtype=np.float32)[0], (look_back, 1))
        for i in range(n_windows):
            past = target[i : i + look_back].reshape(-1, 1)
            shifted = feats[i + look_fwd : i + window]
            # Already laid out as [time steps, features].
            X.append(np.hstack((past, shifted, static_block)))
            y.append(target[i + look_back : i + window])

    if not X:
        return (np.empty((0, look_back, n_features), dtype=np.float32),
                np.empty((0, look_fwd), dtype=np.float32))
    return np.stack(X), np.stack(y)

In [11]:
def SeqGenerator(df, dummies, col_id, col_look, look_back, look_fwd, cols_features, batch_size):
    """Endlessly yield ``(X, y)`` batches built from consecutive row chunks.

    ``df`` is cut into chunks of ``batch_size`` rows by position and each
    chunk is passed to ``get_sequence_batch``. After the last chunk the
    generator starts again from the first row.

    The original loop ran over ``range(len(df) // batch_size)``, which made
    its "last, shorter chunk" branch unreachable, so the trailing
    ``len(df) % batch_size`` rows were never used. Here the trailing chunk is
    visited, and chunks that produce no sample are skipped instead of being
    yielded as empty arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows.
    dummies, col_id, col_look, look_back, look_fwd, cols_features
        Forwarded unchanged to ``get_sequence_batch``.
    batch_size : int
        Number of ``df`` rows per chunk (not the number of samples per batch).

    Yields
    ------
    tuple
        The ``(X, y)`` pair returned by ``get_sequence_batch`` for one chunk.

    Raises
    ------
    ValueError
        If a full pass over ``df`` produces no sample at all, which would
        otherwise loop forever without yielding.

    Notes
    -----
    Chunks are cut by row position, so a series whose rows straddle a chunk
    boundary is split between two chunks and windows spanning the boundary
    are not produced.
    """
    while True:
        produced_any = False
        # Slicing past the end is safe, so the final chunk is simply shorter.
        for start in range(0, len(df), batch_size):
            X, y = get_sequence_batch(
                df[start : start + batch_size],
                dummies,
                col_id,
                col_look,
                look_back,
                look_fwd,
                cols_features
            )
            if len(X) == 0:
                continue
            produced_any = True
            yield X, y
        if not produced_any:
            raise ValueError(
                'no chunk of {} rows contains a full window of {} rows'.format(
                    batch_size, look_back + look_fwd
                )
            )

In [17]:
%%time
# Pull a single batch from a fresh generator to inspect the array shapes that
# will be fed to the model.
# NOTE(review): num_cols contains 'sales', which is also col_look. Feature
# columns are taken from a window shifted forward by look_fwd, so the target
# rows of 'sales' end up inside X - this looks like target leakage; confirm.
X, y = next(
    SeqGenerator(
        df=strain, 
        dummies=sdummies_id, 
        col_id='id', 
        col_look='sales', 
        look_back=LOOK_BACK, 
        look_fwd=LOOK_FWD, 
        cols_features=num_cols, 
        batch_size=BATCH_SIZE
    )
)
print(X.shape, y.shape)

(432, 56, 3090) (432, 28)
CPU times: user 3.49 s, sys: 320 ms, total: 3.81 s
Wall time: 3.81 s


In [18]:
# Row-chunk counts (floor division plus one); the same expressions are passed
# to fit_generator as steps_per_epoch and validation_steps.
print('batches in train:', len(strain) // BATCH_SIZE + 1)
print('batches in val:', len(sval) // BATCH_SIZE + 1)

batches in train: 4753
batches in val: 2591


In [19]:
# Per-timestep feature width produced by get_sequence_batch, derived from the
# inputs instead of a hard-coded 3090:
#   1 look-back target column + len(num_cols) shifted feature columns
#   + every sdummies_id column except 'id'.
n_features = 1 + len(num_cols) + (sdummies_id.shape[1] - 1)
# Two stacked LSTM layers with dropout, then a linear head that emits
# LOOK_FWD values per sample.
model = Sequential()
model.add(LSTM(units=512, input_shape=(LOOK_BACK, n_features), return_sequences=True))
model.add(Dropout(.4))
model.add(LSTM(units=512))
model.add(Dropout(.4))
model.add(Dense(LOOK_FWD))
model.add(Activation('linear'))
# NOTE(review): `lr` is the legacy spelling of `learning_rate`, and passing
# both clipvalue and clipnorm is rejected by some newer Keras releases -
# confirm against the installed TensorFlow before upgrading.
adam = optimizers.Adam(lr=.001, clipvalue=.5, clipnorm=1)
model.compile(loss='mse', optimizer=adam)
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 56, 512)           7378944   
_________________________________________________________________
dropout (Dropout)            (None, 56, 512)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               2099200   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 28)                14364     
_________________________________________________________________
activation (Activation)      (None, 28)                0         
Tota

In [None]:
%%time
# Create the checkpoint directory up front: ModelCheckpoint only writes after
# an epoch has finished, so a missing directory would otherwise surface as a
# save error after a full epoch of training.
os.makedirs(MODELS_DIR, exist_ok=True)
model_file = '{}/model_{}.h5'.format(MODELS_DIR, MODEL_VER)
# Keep only the weights with the lowest validation loss seen so far.
modelsaver = ModelCheckpoint(
    model_file, 
    monitor='val_loss', 
    verbose=1, 
    save_best_only=True,
    mode='min'
)
# Stop once val_loss has not improved for 10 consecutive epochs.
earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
# Arguments shared by the training and validation generators; only the source
# frame differs between the two.
seq_kwargs = dict(
    dummies=sdummies_id,
    col_id='id',
    col_look='sales',
    look_back=LOOK_BACK,
    look_fwd=LOOK_FWD,
    cols_features=num_cols,
    batch_size=BATCH_SIZE
)
history = model.fit_generator(
    SeqGenerator(df=strain, **seq_kwargs),
    steps_per_epoch=len(strain) // BATCH_SIZE + 1,
    validation_data=SeqGenerator(df=sval, **seq_kwargs),
    validation_steps=len(sval) // BATCH_SIZE + 1,
    epochs=100,
    callbacks=[earlystopper, modelsaver],
    verbose=1,
    shuffle=False
)

Epoch 1/100
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 00001: val_loss improved from inf to 0.00016, saving model to ./models/model_v0.h5
Epoch 2/100

In [None]:
# Training vs. validation loss per epoch, as recorded by fit_generator.
plt.figure(figsize=(8, 4))
for metric_key, curve_label in (('loss', 'train'), ('val_loss', 'val')):
    plt.plot(history.history[metric_key], label=curve_label)
plt.legend()
plt.show()