## Import Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from math import sqrt

Using TensorFlow backend.


### Read Data

In [2]:
# Load the calendar, sales and price tables from the shared data directory.
# The three files are read in the same order as the names they are bound to.
calendar, sales, price = map(
    pd.read_csv,
    (
        '../data/calendar.csv',
        '../data/sales_train_evaluation.csv',
        '../data/sell_prices.csv',
    ),
)

### Fill in the columns to be predicted
History data : d_1 ~ d_1913

Predict:

- Validation : d_1914 ~ d_1941
- Evaluation : d_1942 ~ d_1969

In [3]:
# Names of the day columns to be predicted: d_1942 .. d_1969.
date_list = ['d_' + str(day) for day in range(1942, 1970)]

# An empty frame that carries only those column names; concatenating it
# column-wise onto a copy of `sales` appends the columns with no values.
fill_date = pd.DataFrame(columns = date_list)
sales_fill_date = pd.concat([sales.copy(), fill_date], axis = 1)
sales_fill_date

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,,,,,,,,,,
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,,,,,,,,,,
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,,,,,,,,,,
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,,,,,,,,,,
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,,,,,,,,,,
30486,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,,,,,,,,,,
30487,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,,,,,,,,,,
30488,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,,,,,,,,,,


## Slice dataset into smaller groups for EDA & prediction demo
- cat_id : FOODS
- store_id : CA_1

In [4]:
def create_group_df(group_sales, df_calendar, df_price):
    """Reshape one group's wide sales table into a long, calendar-enriched frame.

    Parameters
    ----------
    group_sales : pd.DataFrame
        Wide sales rows. The first six columns are treated as identifier
        columns and every remaining column as a per-day ``d_*`` sales column.
    df_calendar : pd.DataFrame
        Calendar table; left-joined on the ``d`` column.
    df_price : pd.DataFrame
        Price table; left-joined on every column it shares with the
        sales/calendar frame.

    Returns
    -------
    pd.DataFrame
        One row per (id, day) with the selected id, sales, week and event/SNAP
        columns plus two derived columns: ``d_new`` (the integer after the
        ``d_`` prefix) and ``week`` (``wm_yr_wk - 11100``). Rows are sorted by
        ``id`` then ``d_new`` and the index is reset.
    """
    # Wide -> long: one row per (item, day).
    id_cols = list(group_sales.columns[0:6])
    day_cols = list(group_sales.columns[6:])
    long_sales = pd.melt(group_sales, id_vars=id_cols, value_vars=day_cols,
                         var_name='d', value_name='sales')

    # Merge with the tables passed in by the caller. The previous version
    # silently read the notebook-level `calendar` / `price` globals here,
    # so the df_calendar / df_price arguments had no effect.
    df_group = pd.merge(long_sales, df_calendar, how='left', on='d')
    df_group = pd.merge(df_group, df_price, how='left')

    # Extract necessary columns
    df_group = df_group[['id','item_id','dept_id','cat_id','d','sales','wm_yr_wk','event_name_1',
                 'event_type_1','event_name_2','event_type_2','snap_CA','snap_TX','snap_WI']].copy()
    # 'd_123' -> 123 so days sort numerically instead of lexicographically.
    df_group['d_new'] = df_group['d'].str[2:].astype('int')
    # NOTE(review): 11100 looks like an offset that rebases wm_yr_wk to small
    # week numbers -- confirm against the calendar table.
    df_group['week'] = df_group['wm_yr_wk']-11100
    df_group = df_group.sort_values(['id', 'd_new']).reset_index(drop=True)
    return df_group

### Create dataset for prediction

* Reference for the LSTM model:
  https://machinelearningmastery.com/time-series-forecasting-long-short-term-memory-network-python/

In [5]:
# create a differenced series
def difference(dataset, interval=1):
    """Return the lag-``interval`` differences of ``dataset`` as a Series.

    Element ``k`` of the result is ``dataset[k + interval] - dataset[k]``,
    so the result is ``interval`` items shorter than the input.
    """
    deltas = [
        dataset[pos] - dataset[pos - interval]
        for pos in range(interval, len(dataset))
    ]
    return pd.Series(deltas)

In [6]:
# frame a sequence as a supervised learning problem(fill NaN with 0)
def timeseries_to_supervised_1(data, lag=365):
    """Build a lagged feature table from a series, zero-filling the gaps.

    The result has ``lag`` shifted copies of ``data`` (shift 1 first, shift
    ``lag`` last) followed by the unshifted data as the final column. Cells
    left empty by the shifting are replaced with 0, so no rows are lost.
    """
    frame = pd.DataFrame(data)
    lagged = [frame.shift(step) for step in range(1, lag + 1)]
    return pd.concat(lagged + [frame], axis=1).fillna(0)

In [7]:
# frame a sequence as a supervised learning problem(drop rows with NaN)
def timeseries_to_supervised_2(data, lag=365):
    """Build a lagged feature table from a series, dropping incomplete rows.

    The result has ``lag`` shifted copies of ``data`` (shift 1 first, shift
    ``lag`` last) followed by the unshifted data as the final column. Every
    row that contains a missing value is removed, which discards the first
    ``lag`` rows created by the shifting.
    """
    frame = pd.DataFrame(data)
    lagged = [frame.shift(step) for step in range(1, lag + 1)]
    return pd.concat(lagged + [frame], axis=1).dropna()

In [8]:
# scale train and test data to [-1, 1]
def scale(train, test=np.array([])):
    """Fit a [-1, 1] min-max scaler on ``train`` and apply it to both inputs.

    Parameters
    ----------
    train : 2-D array
        Data the scaler is fitted on and which is then transformed.
    test : 2-D array, optional
        Data transformed with the same fitted scaler. When it is empty
        (the default) it is passed through untouched.

    Returns
    -------
    tuple
        ``(scaler, train_scaled, test_scaled)``.
    """
    scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train)

    n_rows, n_cols = train.shape[0], train.shape[1]
    train_scaled = scaler.transform(train.reshape(n_rows, n_cols))

    # Nothing to transform: hand the empty test array back unchanged.
    if test.size == 0:
        return scaler, train_scaled, test

    test_scaled = scaler.transform(test.reshape(test.shape[0], test.shape[1]))
    return scaler, train_scaled, test_scaled

### Build the LSTM model

In [9]:
# fit an LSTM network to training data
def fit_lstm(train, batch_size, nb_epoch, neurons):
    """Train a single-layer stateful LSTM regressor on a supervised table.

    Parameters
    ----------
    train : 2-D array
        Every column except the last is an input feature; the last column
        is the target.
    batch_size : int
        Batch size baked into the network's input shape and used for fitting.
    nb_epoch : int
        Number of passes over the data.
    neurons : int
        Number of LSTM units.

    Returns
    -------
    The fitted Keras model.
    """
    features, target = train[:, 0:-1], train[:, -1]
    # Keras LSTM layers take 3-D input; each row becomes a single time step.
    features = features.reshape(features.shape[0], 1, features.shape[1])

    model = Sequential([
        LSTM(neurons,
             batch_input_shape=(batch_size, features.shape[1], features.shape[2]),
             stateful=True),
        Dense(1),
    ])
    model.compile(loss='mean_squared_error', optimizer='adam')

    # Run the epochs one at a time so the LSTM state can be cleared between
    # them; shuffle is off so samples are presented in their original order.
    for _ in range(nb_epoch):
        model.fit(features, target, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)
        model.reset_states()
    return model

### Prediction based on the model

In [10]:
# make a one-step forecast
def forecast_lstm(model, batch_size, X):
    """Predict one value from a single feature vector.

    ``X`` is reshaped to a batch of one sample with one time step, handed to
    ``model.predict``, and the first element of the prediction is returned.
    """
    single_sample = X.reshape(1, 1, len(X))
    prediction = model.predict(single_sample, batch_size=batch_size)
    return prediction[0, 0]

In [11]:
# inverse scaling for a forecasted value
def invert_scale(scaler, X, value):
    """Map a scaled prediction back to the original scale.

    The scaler works on whole rows, so ``value`` is appended to the feature
    values ``X`` to rebuild a full row, the row is inverse-transformed, and
    only its last entry (the de-scaled ``value``) is returned.
    """
    row = np.array([*X, value])
    row = row.reshape(1, len(row))
    return scaler.inverse_transform(row)[0, -1]

In [12]:
# invert differenced value
def inverse_difference(history, yhat, interval=1):
    """Undo differencing: add the observation ``interval`` steps from the
    end of ``history`` back onto the differenced value ``yhat``."""
    baseline = history[-interval]
    return yhat + baseline

### Predict all item sales

Total: 30 models (3 categories * 10 stores)

Predict each product (each id) by using its group's model.

Time: about 4 hours

Score: 1.1214

In [None]:
# Forecast layout and model hyper-parameters used throughout this cell.
HORIZON = 28                    # days per submission block (F1..F28)
N_FORECAST_ROWS = 2 * HORIZON   # validation block + evaluation block
N_LAGS = 365                    # lagged differences fed to the model
BATCH_SIZE = 1
N_EPOCHS = 100
N_NEURONS = 4

# Rows are collected in plain lists and turned into DataFrames once at the
# end: DataFrame.append was removed in pandas 2.0 and re-copies the whole
# frame on every call.
val_rows = []
eval_rows = []

# slice dataset into smaller groups: one model per (category, store)
for cat in sales_fill_date['cat_id'].unique():
    for store in sales_fill_date['store_id'].unique():
        print(store, cat)
        df_sales = sales_fill_date[(sales_fill_date['cat_id']==cat) & (sales_fill_date['store_id']==store)]
        df_group = create_group_df(df_sales, calendar, price)

        # form the sales dataset: daily total over the group, without the
        # trailing HORIZON days (the empty placeholder columns to be predicted)
        df_group_ttl_sales = df_group[['d_new','sales']].groupby(['d_new'])['sales'].sum().reset_index()[:-HORIZON]
        group_ttl_sales = df_group_ttl_sales.sales.values

        # transform data to be stationary
        group_diff = difference(group_ttl_sales, 1)

        # transform data to be supervised learning by timeseries_to_supervised_2
        # (rows with incomplete lag history are dropped)
        group_supervised = timeseries_to_supervised_2(group_diff, N_LAGS).values

        # training set: everything except the last N_FORECAST_ROWS rows
        train = group_supervised[0:-N_FORECAST_ROWS]

        # transform the scale of the data
        group_scaler, train_scaled, _ = scale(train)

        # fit the model
        lstm_model = fit_lstm(train_scaled, BATCH_SIZE, N_EPOCHS, N_NEURONS)

        # forecast the entire training dataset to build up state for forecasting
        train_reshaped = train_scaled[:,:-1].reshape(len(train_scaled), 1, N_LAGS)
        lstm_model.predict(train_reshaped, batch_size=BATCH_SIZE)

        # slice dataset into item data; df_group is already sorted by id and
        # day, so a single groupby visits the items in the same order as
        # df_group['id'].unique() without rescanning the frame per item
        for ID, item_rows in df_group.groupby('id', sort=False):
            item_sales = item_rows['sales'].values[:-HORIZON]

            # transform data to be stationary
            item_diff = difference(item_sales, 1)

            # transform data to be supervised learning by timeseries_to_supervised_2
            item_supervised = timeseries_to_supervised_2(item_diff, N_LAGS).values

            # slice data for forecasting
            forecast = item_supervised[-N_FORECAST_ROWS:]

            # transform the scale of the data
            # NOTE(review): the scaler is refit on this item's forecast window
            # rather than reusing the group scaler the model was trained with.
            # Kept as in the original; confirm this is intended.
            item_scaler, forecast_scaled, _ = scale(forecast)

            # walk-forward validation on the forecast data
            # NOTE(review): the model is stateful and its state is not reset
            # between items, so each item's predictions depend on the items
            # processed before it. Kept as in the original; confirm intended.
            n_steps = len(forecast_scaled)
            predictions = []
            for i in range(n_steps): # make one forecast for each day
                # make one-step forecast
                X = forecast_scaled[i,:-1]
                yhat = forecast_lstm(lstm_model, BATCH_SIZE, X)
                # invert scaling
                yhat = invert_scale(item_scaler, X, yhat)
                # invert differencing: add back the observation that precedes
                # the day this row's difference refers to
                yhat = inverse_difference(item_sales, yhat, n_steps+1-i)
                # store forecast, truncated to a whole number of units
                predictions.append(int(yhat))

            # first HORIZON values -> validation row, the rest -> evaluation row
            val_rows.append([ID] + predictions[:HORIZON])
            eval_rows.append([ID] + predictions[HORIZON:])

# One row per item: column 0 is the id, columns 1..28 are the forecasts.
final_val = pd.DataFrame(val_rows)
final_eval = pd.DataFrame(eval_rows)

CA_1 HOBBIES
CA_2 HOBBIES
CA_3 HOBBIES
CA_4 HOBBIES
TX_1 HOBBIES
TX_2 HOBBIES
TX_3 HOBBIES
WI_1 HOBBIES
WI_2 HOBBIES
WI_3 HOBBIES
CA_1 HOUSEHOLD
CA_2 HOUSEHOLD
CA_3 HOUSEHOLD
CA_4 HOUSEHOLD
TX_1 HOUSEHOLD


### Final submission processing

In [27]:
# Column layout of the submission file: 'id' followed by F1..F28.
forecast_colnames = ['F' + str(i) for i in range(1, 29)]
submission_colnames = ['id'] + forecast_colnames

final_val.columns = submission_colnames
final_eval.columns = submission_colnames
# The ids carry the 'evaluation' suffix; the validation rows need 'validation'.
# regex=False: this is a literal substring replacement.
final_val['id'] = final_val['id'].str.replace('evaluation', 'validation', regex=False)

submission = pd.concat([final_val, final_eval], axis = 0).reset_index(drop = True)

# Negative sales are impossible, so floor the forecasts at 0. Only the F
# columns are clipped: 'id' holds strings, and clipping the whole frame
# compares those strings with 0, which raises a TypeError in current pandas.
submission[forecast_colnames] = submission[forecast_colnames].apply(pd.to_numeric).clip(lower=0)

# Select the columns explicitly to fix their order in the output file.
submission_all = submission[submission_colnames]
submission_all.to_csv('submission_lstm_ver2.csv', index = False)