In [None]:
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import optimizers
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# CUDA_VISIBLE_DEVICES is a comma-separated list of device ids and may be
# unset, so count its entries rather than the characters of the raw string.
cuda_visible = os.environ.get('CUDA_VISIBLE_DEVICES')
if cuda_visible is None:
    print('CUDA_VISIBLE_DEVICES is not set')
else:
    cuda_ids = [dev for dev in cuda_visible.split(',') if dev.strip()]
    print('available GPU devices:', len(cuda_ids),
          '| device num:', cuda_visible)

In [None]:
# Folder the input CSVs are read from, and folder the model file is written to.
DATA_DIR, MODELS_DIR = './data', './models'
# Suffix used in the saved model's file name.
MODEL_VER = 'v0'

# Rows dated before CUT_DATE are dropped when the training frame is built.
CUT_DATE = '2015-07-24'
END_DATE = '2016-04-24'
print(datetime.strptime(END_DATE, '%Y-%m-%d'))

# Window sizes in days: the network reads LOOK_BACK steps and emits LOOK_FWD values.
LOOK_FWD = 28
LOOK_BACK = 2 * LOOK_FWD
# Number of frame rows handed to the sequence builder per generator step.
BATCH_SIZE = 500

In [None]:
# dtype map passed to read_csv for calendar.csv. `date` is read as text here
# and converted through PARSE_DATES.
CALENDAR_DTYPES = dict(
    date='str',
    wm_yr_wk='int16',
    weekday='object',
    wday='int16',
    month='int16',
    year='int16',
    d='object',
    event_name_1='object',
    event_type_1='object',
    event_name_2='object',
    event_type_2='object',
    snap_CA='int16',
    snap_TX='int16',
    snap_WI='int16',
)
# Columns of calendar.csv that read_csv should parse as dates.
PARSE_DATES = ['date']
# dtype map passed to read_csv for sell_prices.csv.
SPRICES_DTYPES = dict(
    store_id='object',
    item_id='object',
    wm_yr_wk='int16',
    sell_price='float32',
)

In [None]:
def get_df(is_train=True, backward_lags=None):
    """Build the long-format frame (one row per id and day) used for modelling.

    Reads ``sales_train_validation.csv``, ``calendar.csv`` and
    ``sell_prices.csv`` from ``DATA_DIR``, melts the wide ``d_*`` sales
    columns into rows, joins the calendar on ``d`` and the prices on
    ``store_id`` / ``item_id`` / ``wm_yr_wk``, keeps the rows dated
    ``CUT_DATE`` or later and adds the ``week_num``, ``quarter`` and ``mday``
    date features.

    Parameters
    ----------
    is_train : bool, default True
        When False, ``LOOK_FWD`` extra day columns following the last observed
        day are appended with NaN sales before melting, so the returned frame
        also contains the rows that are to be forecast.
    backward_lags : int or None, default None
        Minimum number of observed days that must remain after the date cut;
        a ValueError is raised when fewer remain. ``None`` disables the check.

    Returns
    -------
    pandas.DataFrame
        Sorted by ``id`` and ``d``. ``d`` holds the integer day number and
        ``d_`` is a copy of it; ``date``, ``wm_yr_wk`` and ``weekday`` are
        dropped.

    Raises
    ------
    ValueError
        If ``backward_lags`` is given and the cut frame spans fewer days.

    NOTE(review): the prediction cell calls
    ``get_df(is_train=False, backward_lags=LOOK_BACK)`` while this function
    used to take no arguments. The meaning given to both parameters here is
    inferred from how the result is consumed downstream -- confirm it matches
    the intended behaviour.
    """
    strain = pd.read_csv('{}/sales_train_validation.csv'.format(DATA_DIR))
    print('read train:', strain.shape)
    cat_cols = ['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id']
    last_day = int(strain.columns[-1].replace('d_', ''))
    print('last day is:', last_day)
    if not is_train:
        # Placeholder columns for the days to forecast; they melt into rows
        # with NaN sales and pick up calendar/price features from the merges.
        future_cols = ['d_{}'.format(day)
                       for day in range(last_day + 1, last_day + LOOK_FWD + 1)]
        future = pd.DataFrame(np.nan, index=strain.index, columns=future_cols)
        strain = pd.concat([strain, future], axis=1)
    strain = pd.melt(
        strain,
        id_vars=cat_cols,
        value_vars=[col for col in strain.columns if col.startswith('d_')],
        var_name='d',
        value_name='sales'
    )
    print('melted train:', strain.shape)
    calendar = pd.read_csv(
        '{}/calendar.csv'.format(DATA_DIR),
        dtype=CALENDAR_DTYPES,
        parse_dates=PARSE_DATES
    )
    print('read calendar:', calendar.shape)
    strain = strain.merge(calendar, on='d', copy=False)
    print('calendar merge done:', strain.shape)
    sprices = pd.read_csv('{}/sell_prices.csv'.format(DATA_DIR), dtype=SPRICES_DTYPES)
    print('read prices:', sprices.shape)
    # Left join: rows without a matching price keep a NaN sell_price.
    strain = strain.merge(
        sprices,
        on=['store_id', 'item_id', 'wm_yr_wk'],
        how='left'
    )
    print('prices merge done:', strain.shape)
    print('begin train date:', strain['date'].min())
    print('end train date:', strain['date'].max())
    # Explicit copy so the column assignments below do not write into a slice.
    strain = strain.loc[strain['date'] >= CUT_DATE].copy()
    print('date cut train:', strain.shape)
    print('cut train date:', strain['date'].min())
    print('end train date:', strain['date'].max())
    dates = strain['date'].dt
    # `Series.dt.weekofyear` was removed in pandas 2.0; prefer isocalendar()
    # when the installed pandas provides it.
    if hasattr(dates, 'isocalendar'):
        week_num = dates.isocalendar().week
    else:
        week_num = dates.weekofyear
    strain['week_num'] = week_num.astype('int16')
    strain['quarter'] = dates.quarter.astype('int16')
    strain['mday'] = dates.day.astype('int16')
    print('date features done')
    # 'd_123' -> 123
    strain['d'] = strain['d'].str.replace('d_', '', regex=False).astype('int64')
    if backward_lags is not None:
        first_day = strain['d'].min()
        observed = 0 if pd.isna(first_day) else last_day - int(first_day) + 1
        if observed < backward_lags:
            raise ValueError(
                'only {} observed days remain after CUT_DATE, {} required'.format(
                    observed, backward_lags
                )
            )
    strain = (
        strain
        .drop(columns=['date', 'wm_yr_wk', 'weekday'])
        .sort_values(by=['id', 'd'])
    )
    print('trash cols deleted, sorted')
    # Unscaled copy of the day number; `d` itself is min-max scaled later on.
    strain['d_'] = strain['d']
    print('out dataframe:', strain.shape)
    return strain

In [None]:
def get_dummies(strain, cols_dummy):
    """Replace the columns in ``cols_dummy`` by their one-hot encoding.

    Parameters
    ----------
    strain : pandas.DataFrame
        Input frame; it is left unmodified.
    cols_dummy : list of str
        Columns to encode. Every level gets an indicator column and, because
        ``dummy_na=True``, one more ``<col>_nan`` indicator is added per column.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``strain`` followed by the indicator columns.
    """
    print('got shape for dummies:', strain.shape)
    strain_dummies = pd.get_dummies(
        strain[cols_dummy],
        drop_first=False,
        dummy_na=True
    )
    # Non-mutating drop: the caller's frame keeps its original columns, so the
    # function can be called again on the same input.
    strain = pd.concat([strain.drop(columns=cols_dummy), strain_dummies], axis=1)
    print('out shape for dummies:', strain.shape)
    return strain

In [None]:
%%time
# Build the long training frame; get_df reads the sales, calendar and price CSVs.
strain = get_df()

In [None]:
# Alternative kept for reference: draw 500 ids at random (with replacement).
#list_ids = np.random.choice(strain.id.unique(), 500)
# Work on the first 500 distinct ids in the frame's current row order.
list_ids = strain.id.unique()[:500]

In [None]:
# Keep only the selected ids. Take an explicit copy so the column assignments
# made in later cells write to an independent frame, not to a slice.
strain = strain[strain.id.isin(list_ids)].copy()

In [None]:
# Plot the sales series of one randomly drawn id.
id_name = np.random.choice(strain.id.unique())
id_sales = strain.loc[strain.id == id_name, 'sales']
print('from', strain['d'].min(), 'to', strain['d'].max())
fig, ax = plt.subplots(figsize=(18, 4))
id_sales.plot(ax=ax, label='sales')
ax.set_title(id_name)
ax.legend()
plt.show()

In [None]:
# Numeric columns that are min-max scaled. upscale() relies on this order
# ('sales' is the second entry).
num_cols = [
    'd', 'sales', 'wday', 'month', 'year',
    'snap_CA', 'snap_TX', 'snap_WI',
    'sell_price', 'week_num', 'quarter', 'mday',
]
# The scaler is fitted on the frame as it stands here, i.e. before the
# train/validation split performed in a later cell.
SCALER = MinMaxScaler().fit(strain[num_cols])
strain[num_cols] = SCALER.transform(strain[num_cols])
print('min-max scaled')

In [None]:
# 1913 is the day number this notebook treats as the last one with known sales.
split_day = 1913
window_len = LOOK_BACK + LOOK_FWD

# Validation: the final LOOK_BACK + LOOK_FWD (+1) days of every id.
sval = strain[strain.d_ >= (split_day - window_len)]
print('val | start:', sval.d_.min(),
      'end:', sval.d_.max(),
      'duration:', sval.d_.max() - sval.d_.min(),
      'deep:', window_len)

# Training: everything up to LOOK_FWD days before the split day.
strain = strain[strain.d_ <= (split_day - LOOK_FWD)]
print('train | start:', strain.d_.min(),
      'end:', strain.d_.max(),
      'duration:', strain.d_.max() - strain.d_.min(),
      'deep:', window_len)

# Static per-id frame: what remains once the numeric, event and `d_` columns
# are removed, reduced to its distinct rows.
event_cols = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
sdummies_id = (
    strain
    .drop(columns=num_cols + event_cols + ['d_'])
    .drop_duplicates()
)
print('dummies df done:', sdummies_id.shape)
for col in sdummies_id.columns:
    print('dummies:', col, sdummies_id[col].unique().shape)

In [None]:
%%time
cols_dummy = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id']
cols_features = ['d', 'wday', 'month', 'year', 
                 'snap_CA', 'snap_TX', 'snap_WI', 
                 'sell_price', 'week_num', 'quarter', 'mday']
sdummies_id = get_dummies(sdummies_id, cols_dummy)

In [None]:
def get_sequence_batch(df, dummies, col_id, col_look, look_back, look_fwd, cols_features):
    """Turn a slice of the long frame into sliding-window samples.

    For every id in ``df[col_id]`` and every start offset ``i`` that leaves
    room for a full window, one sample is built from

    * ``col_look`` at rows ``[i, i + look_back)``,
    * each column of ``cols_features`` at rows
      ``[i + look_fwd, i + look_back + look_fwd)``,
    * the id's row of ``dummies`` (without its first column) repeated
      ``look_back`` times,

    and the target is ``col_look`` at rows
    ``[i + look_back, i + look_back + look_fwd)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one or more ids, in time order within each id.
    dummies : pandas.DataFrame
        One row per id; the first column is skipped (it is expected to be the
        id column), the remaining ones must be numeric.
    col_id, col_look : str
        Names of the id column and of the target column.
    look_back, look_fwd : int
        Input window length and number of target steps.
    cols_features : list of str
        Feature columns of ``df``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` as float, shaped ``(samples, look_back, features)``, and ``y``
        shaped ``(samples, look_fwd)``. Both have shape ``(0,)`` when no id in
        ``df`` has more than ``look_back + look_fwd`` rows.
    """
    X, y = [], []
    feature_names = list(cols_features)
    # Group by the column named in `col_id` rather than a hard-coded 'id'.
    for idx in df[col_id].unique():
        rows = df[col_id] == idx
        # Pull the id's series once instead of re-filtering `df` per window.
        target = df.loc[rows, col_look].values
        n_windows = len(target) - look_back - look_fwd
        if n_windows <= 0:
            continue
        features = df.loc[rows, feature_names].values.T  # (features, rows)
        # `.values` of the mixed-type dummies row is an object array; convert
        # it so the stacked sample is numeric.
        dummy_row = np.asarray(
            dummies.loc[dummies[col_id] == idx].values[0][1:], dtype=float
        )
        dummy_block = np.repeat(dummy_row[:, np.newaxis], look_back, axis=1)
        for i in range(n_windows):
            window = np.vstack((
                target[i : i + look_back][np.newaxis, :],
                features[:, i + look_fwd : i + look_back + look_fwd],
                dummy_block,
            ))
            # Transpose to [time steps, features], the layout the LSTM expects.
            X.append(window.T)
            y.append(target[i + look_back : i + look_back + look_fwd])
    return np.array(X, dtype=float), np.array(y)

In [None]:
def SeqGenerator(df, dummies, col_id, col_look, look_back, look_fwd, cols_features, batch_size):
    """Endlessly yield ``(X, y)`` batches built from consecutive row slices.

    ``df`` is walked in slices of ``batch_size`` rows, including the trailing
    partial slice, and each slice is passed to ``get_sequence_batch``. Slices
    that produce no sample are skipped. After the last slice the walk starts
    again from the top.

    Parameters
    ----------
    df, dummies, col_id, col_look, look_back, look_fwd, cols_features
        Forwarded unchanged to ``get_sequence_batch``.
    batch_size : int
        Number of rows of ``df`` per slice (not the number of samples).

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        The non-empty result of ``get_sequence_batch`` for one slice.

    Raises
    ------
    ValueError
        If a complete pass over ``df`` yields no sample at all; this replaces
        what would otherwise be an endless loop.
    """
    n_rows = len(df)
    while True:
        yielded = False
        for start in range(0, n_rows, batch_size):
            X, y = get_sequence_batch(
                df[start : start + batch_size],
                dummies,
                col_id,
                col_look,
                look_back,
                look_fwd,
                cols_features
            )
            if len(X) == 0:
                # No id in this slice is long enough for a full window.
                continue
            yielded = True
            yield X, y
        if not yielded:
            raise ValueError(
                'no slice of {} rows contains an id with more than {} rows'.format(
                    batch_size, look_back + look_fwd
                )
            )

In [None]:
%%time
X, y = next(
    SeqGenerator(
        df=strain, 
        dummies=sdummies_id, 
        col_id='id', 
        col_look='sales', 
        look_back=LOOK_BACK, 
        look_fwd=LOOK_FWD, 
        cols_features=cols_features, 
        batch_size=BATCH_SIZE
    )
)
print(X.shape, y.shape)

In [None]:
# Number of BATCH_SIZE-row slices needed to cover each frame. Ceiling division
# avoids counting one slice too many when the length is an exact multiple.
print('batches in train:', -(-len(strain) // BATCH_SIZE))
print('batches in val:', -(-len(sval) // BATCH_SIZE))

In [None]:
# Width of one time step: the target column plus the feature columns
# (together len(num_cols)) plus the indicator columns of sdummies_id, whose
# `id` column is not fed to the model (hence the -1).
n_features = sdummies_id.shape[1] + len(num_cols) - 1
model = Sequential()
model.add(LSTM(units=512, input_shape=(LOOK_BACK, n_features), return_sequences=True))
model.add(Dropout(.4))
model.add(LSTM(units=512))
model.add(Dropout(.4))
# One linear output per forecast day.
model.add(Dense(LOOK_FWD))
model.add(Activation('linear'))
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# NOTE(review): newer Keras optimizers accept only one of clipnorm / clipvalue;
# confirm both are allowed by the installed version.
adam = optimizers.Adam(learning_rate=.001, clipvalue=.5, clipnorm=1)
model.compile(loss='mse', optimizer=adam)
model.summary()

In [None]:
%%time
model_file = '{}/model_{}.h5'.format(MODELS_DIR, MODEL_VER)
modelsaver = ModelCheckpoint(
    model_file, 
    monitor='val_loss', 
    verbose=1, 
    save_best_only=True,
    mode='min'
)
earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
history = model.fit_generator(
    SeqGenerator(
        df=strain, 
        dummies=sdummies_id, 
        col_id='id', 
        col_look='sales', 
        look_back=LOOK_BACK, 
        look_fwd=LOOK_FWD, 
        cols_features=cols_features, 
        batch_size=BATCH_SIZE
    ),
    steps_per_epoch=len(strain) // BATCH_SIZE + 1,
    validation_data=SeqGenerator(
        df=sval, 
        dummies=sdummies_id, 
        col_id='id', 
        col_look='sales', 
        look_back=LOOK_BACK, 
        look_fwd=LOOK_FWD, 
        cols_features=cols_features, 
        batch_size=BATCH_SIZE
    ),
    validation_steps=len(sval) // BATCH_SIZE + 1,
    epochs=100,
    callbacks=[earlystopper, modelsaver],
    verbose=1,
    shuffle=False
)

In [None]:
# Training and validation loss per epoch.
fig, ax = plt.subplots(figsize=(8, 4))
for key, label in (('loss', 'train'), ('val_loss', 'val')):
    ax.plot(history.history[key], label=label)
ax.legend()
plt.show()

In [None]:
# Reload the checkpoint written during training (the best epoch by val_loss).
model_file = '/'.join([MODELS_DIR, 'model_{}.h5'.format(MODEL_VER)])
model = load_model(model_file)
model.summary()

In [None]:
# First validation batch. It must be built from the same feature columns as
# the training batches (cols_features, not num_cols, which also contains the
# target) so that its last dimension matches the model input.
X_val, y_val = next(
    SeqGenerator(
        df=sval,
        dummies=sdummies_id,
        col_id='id',
        col_look='sales',
        look_back=LOOK_BACK,
        look_fwd=LOOK_FWD,
        cols_features=cols_features,
        batch_size=BATCH_SIZE
    )
)
print(X_val.shape, y_val.shape)

In [None]:
def get_sequence_predict(df, pred_len, dummies, col_id, col_look, look_back, look_fwd, cols_features):
    """Build the single input window used to predict for one id.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of exactly one id, in time order.
    pred_len : int
        Number of trailing rows of ``df`` to leave out when positioning the
        window.
    dummies : pandas.DataFrame
        One row per id; the first column is skipped (it is expected to be the
        id column), the remaining ones must be numeric.
    col_id, col_look : str
        Names of the id column and of the target column.
    look_back, look_fwd : int
        Input window length and feature offset.
    cols_features : list of str
        Feature columns of ``df``.

    Returns
    -------
    numpy.ndarray
        Float array shaped ``(1, look_back, features)``.

    NOTE(review): the window starts at
    ``len(df) - pred_len - look_back - look_fwd - 1``, i.e. ``look_fwd + 1``
    rows earlier than a window whose target would be the last ``pred_len``
    rows. The offset is kept as it was -- confirm that it is intended.
    """
    i_end = len(df) - pred_len - look_back - look_fwd - 1
    rows = [df[col_look].values[i_end : i_end + look_back]]
    for col in cols_features:
        rows.append(df[col].values[i_end + look_fwd : i_end + look_back + look_fwd])
    # Look the id up through `col_id` rather than a hard-coded `id` attribute.
    series_id = df[col_id].values[0]
    # `.values` of the mixed-type dummies row is an object array; convert it
    # so the stacked window is numeric.
    dummy_row = np.asarray(
        dummies.loc[dummies[col_id] == series_id].values[0][1:], dtype=float
    )
    dummy_block = np.repeat(dummy_row[:, np.newaxis], look_back, axis=1)
    window = np.vstack(rows + [dummy_block])
    # Transpose to [time steps, features] and add the leading sample axis.
    return np.array([window.T], dtype=float)
def upscale(series):
    """Map min-max scaled sales values back to their original scale.

    ``SCALER`` was fitted on all of ``num_cols`` at once, so ``series`` is
    copied into every column of a temporary matrix, the whole matrix is
    inverse-transformed and only the ``'sales'`` column is returned.

    Parameters
    ----------
    series : 1-D array-like
        Scaled sales values.

    Returns
    -------
    numpy.ndarray
        1-D array with the same length as ``series``.
    """
    # Derive the matrix width and the sales position from num_cols instead of
    # hard-coding them.
    sales_pos = num_cols.index('sales')
    tiled = np.repeat([series], len(num_cols), axis=0).T
    return SCALER.inverse_transform(tiled)[:, sales_pos]

In [None]:
# For 10 randomly drawn ids: predict from the training rows and plot the
# result next to the last LOOK_FWD validation values and the recent history.
plt.figure(figsize=(16, 18))
for i in range(10):
    pred_id = np.random.choice(strain.id.unique())
    train_id = strain[strain.id == pred_id]
    val_id = sval[sval.id == pred_id]
    X_pred = get_sequence_predict(
        df=train_id,
        pred_len=0,
        dummies=sdummies_id,
        col_id='id',
        col_look='sales',
        look_back=LOOK_BACK,
        look_fwd=LOOK_FWD,
        cols_features=cols_features
    )
    preds = model.predict(X_pred)
    val_days = val_id['d_'].values[-LOOK_FWD:]
    history_len = 2 * LOOK_BACK
    plt.subplot(5, 2, i + 1)
    plt.plot(val_days, upscale(preds[0]), label='preds')
    plt.plot(val_days, upscale(val_id['sales'].values[-LOOK_FWD:]), label='true')
    plt.plot(train_id['d_'].values[-history_len:],
             upscale(train_id['sales'].values[-history_len:]),
             label='train')
    plt.title(pred_id)
    plt.legend()
plt.show()

In [None]:
%%time
# Build the frame used for forecasting.
# NOTE(review): this call only works if get_df accepts `is_train` and
# `backward_lags`; confirm that the definition in use supports them.
spred = get_df(is_train=False, backward_lags=LOOK_BACK)
# transform (not fit_transform): reuse the ranges SCALER learned when fitted.
spred[num_cols] = SCALER.transform(spred[num_cols])
print('min-max scaled')
#spred.drop(columns=['d_'], inplace=True)

In [None]:
# Sanity checks on the prediction frame, using one fixed id as a sample.
sample_rows = spred.id == 'FOODS_1_001_CA_1_validation'
future_rows = spred.d_ > 1913
print('len per one id:', len(spred[sample_rows]))
print('unique ids:', len(spred.id.unique()))
print('len forward:', len(spred[sample_rows & future_rows]))
print('min max day:', spred.d_.min(), spred.d_.max())
spred[spred.d_ >= 1913].head()

In [None]:
# Forecast only the ids that have a row in sdummies_id: get_sequence_predict
# looks the id up there and fails for an unknown one. Ids that are skipped
# keep NaN sales on their forecast rows.
known_ids = set(sdummies_id['id'])
pred_rows = spred[spred.id.isin(known_ids)]
# One groupby pass instead of re-filtering the whole frame for every id;
# sort=False keeps the ids in their order of first appearance.
for pred_id, df_id in tqdm(pred_rows.groupby('id', sort=False)):
    X_pred = get_sequence_predict(
        df=df_id,
        pred_len=28,
        dummies=sdummies_id,
        col_id='id',
        col_look='sales',
        look_back=LOOK_BACK,
        look_fwd=LOOK_FWD,
        cols_features=cols_features
    )
    preds = model.predict(X_pred)
    # Index labels of this id's 28 forecast rows. This relies on spred having
    # unique index labels, as a frame built by merge/filter/sort does.
    horizon = df_id.index[(df_id.d_ > 1913) & (df_id.d_ <= 1913 + 28)]
    # Predictions are written back in original units; history stays scaled.
    spred.loc[horizon, 'sales'] = upscale(preds[0])

In [None]:
# Plot history and forecast of one randomly drawn id. The history rows are
# still scaled at this point, so they are converted back before plotting.
pred_id = np.random.choice(spred.id.unique())
spred_plt = spred.loc[spred.id == pred_id, ['d_', 'sales']].set_index('d_')
is_fact = spred_plt.index <= 1913
is_pred = spred_plt.index > 1913
spred_plt.loc[is_fact, 'sales'] = upscale(spred_plt.loc[is_fact, 'sales'])
fig, ax = plt.subplots(figsize=(16, 4))
ax.plot(spred_plt[is_fact], label='fact')
ax.plot(spred_plt[is_pred], label='pred')
ax.set_title(pred_id)
ax.legend()
plt.show()

In [None]:
# Forecast rows only, with the day number turned into the F1..F28 label and
# negative predictions replaced by zero.
is_future = spred.d_ > 1913
spred_subm = spred.loc[is_future, ['id', 'd_', 'sales']].copy()
spred_subm['d_'] = ['F{}'.format(day - 1913) for day in spred_subm['d_']]
spred_subm['sales'] = spred_subm['sales'].mask(spred_subm['sales'] < 0, 0)
spred_subm.head()

In [None]:
# Reshape to one row per id with the columns F1..F28. Missing values are set
# to 0 and the rows are ordered by id.
f_cols = ['F{}'.format(x) for x in range(1, 28 + 1)]
spred_subm = (
    spred_subm
    .set_index(['id', 'd_'])['sales']
    .unstack()[f_cols]
    .reset_index()
    .fillna(0)
    .sort_values('id')
    .reset_index(drop=True)
)
spred_subm.head()

In [None]:
# The same forecasts are written a second time under ids in which
# 'validation' is replaced by 'evaluation'; both halves go into one file.
spred_subm_eval = spred_subm.assign(
    id=spred_subm['id'].str.replace('validation', 'evaluation')
)
spred_subm = pd.concat([spred_subm, spred_subm_eval], axis=0, sort=False)
print(spred_subm.shape)
spred_subm.to_csv('submission.csv', index=False)