In [1]:
################################
#
# Dataset: https://www.kaggle.com/tevecsystems/retail-sales-forecasting
#
# The data contains the sales and price of a product at the end of each day
# We will forecast the sales for next day, given the sales for the last few days
# Using LSTMs cells and RNN
#
# Author: Anurag Dwarakanath
################################

In [2]:
import tensorflow as tf
import pandas
import numpy as np

In [3]:
# Constants
DATAFILE_TRAIN = "/home/ubuntu/anurag/rnn/mock_kaggle_edit_train.csv" 

DATAFILE_VALIDATE = "/home/ubuntu/anurag/rnn/mock_kaggle_edit_validate.csv"

# The original data is here: https://www.kaggle.com/tevecsystems/retail-sales-forecasting
# The original data was simplified to consider only 'date', 'sales', 'price'
# However, we will consider only 'sales' & 'date'

TIME_STEPS = 10 # i.e. look at the past 10 days and forecast
NUMBER_OF_DAYS_TO_FORECAST = 1 # for now we will only forecast the next day's sales

BATCH_SIZE=100

NUM_EPOCHS=100

LSTM_UNITS = 250

In [4]:
#ingest the data
data_train = pandas.read_csv(DATAFILE_TRAIN) # data will be an array of dict. E.g. data['sales'][0]
data_validate = pandas.read_csv(DATAFILE_VALIDATE)

In [5]:
# let's print the first 5 rows of the data
data_train.head()

Unnamed: 0,date,sales,price
0,1/1/2014,0,1.29
1,1/2/2014,70,1.29
2,1/3/2014,59,1.29
3,1/4/2014,93,1.29
4,1/5/2014,96,1.29


In [6]:
# Create the training & validation data

numTrainingData = len(data_train)
numValidationData = len(data_validate)

trainingData_date = data_train['date'][0:numTrainingData]
trainingData_sales = data_train['sales'][0:numTrainingData]
trainindData_price = data_train['price'][0:numTrainingData]

validationData_date = data_validate['date'][0: numValidationData]
validationData_sales = data_validate['sales'][0: numValidationData]
validationData_price = data_validate['price'][0: numValidationData]

In [7]:
trainingData_sales

0        0
1       70
2       59
3       93
4       96
5      145
6      179
7      321
8      125
9       88
10     188
11     121
12     134
13      80
14      82
15      94
16     159
17     199
18     104
19      70
20     127
21      96
22      75
23     198
24     168
25     125
26      86
27     222
28     272
29     209
      ... 
720    100
721      9
722      0
723      0
724      0
725      3
726      0
727      0
728      3
729      3
730      0
731      2
732      0
733      3
734      0
735    103
736    164
737    118
738     65
739     24
740      0
741      1
742      0
743      1
744     60
745    103
746    263
747    189
748    197
749    105
Name: sales, Length: 750, dtype: int64

In [8]:
print(len(trainingData_sales))
print(len(validationData_sales))

750
187


In [9]:
# Normalise the training and validation data.
# We use the (x - min)/(range) to normalise, where range = (max - min)
# This will put the values between 0 & 1
trainingData_sales_min = min(trainingData_sales)
trainingData_sales_max = max(trainingData_sales)
trainingData_sales_range = trainingData_sales_max - trainingData_sales_min
trainingData_sales_normalised = [(i - trainingData_sales_min)/trainingData_sales_range for i in trainingData_sales]

validationData_sales_normalised = [(i - trainingData_sales_min)/trainingData_sales_range for i in validationData_sales]
# Note the validation data uses the max & min values of Training Data

In [10]:
trainingData_sales_normalised

[0.0,
 0.12915129151291513,
 0.1088560885608856,
 0.17158671586715868,
 0.17712177121771217,
 0.26752767527675275,
 0.33025830258302585,
 0.5922509225092251,
 0.23062730627306274,
 0.16236162361623616,
 0.34686346863468637,
 0.22324723247232472,
 0.24723247232472326,
 0.14760147601476015,
 0.15129151291512916,
 0.17343173431734318,
 0.2933579335793358,
 0.3671586715867159,
 0.1918819188191882,
 0.12915129151291513,
 0.23431734317343172,
 0.17712177121771217,
 0.13837638376383765,
 0.36531365313653136,
 0.30996309963099633,
 0.23062730627306274,
 0.15867158671586715,
 0.4095940959409594,
 0.5018450184501845,
 0.3856088560885609,
 0.6808118081180812,
 0.4003690036900369,
 0.17896678966789667,
 0.2158671586715867,
 0.18450184501845018,
 0.11808118081180811,
 0.06457564575645756,
 0.0996309963099631,
 0.1014760147601476,
 0.014760147601476014,
 0.07933579335793357,
 0.1974169741697417,
 0.2140221402214022,
 0.16051660516605165,
 0.12177121771217712,
 0.08118081180811808,
 0.060885608856088

In [11]:
# we will create the set of sequences
trainingDataSequence_sales = np.zeros(shape=(len(trainingData_sales)-TIME_STEPS-NUMBER_OF_DAYS_TO_FORECAST+1, TIME_STEPS, 1))
targetDataSequence_sales = np.zeros(shape=(len(trainingData_sales)-TIME_STEPS-NUMBER_OF_DAYS_TO_FORECAST+1, NUMBER_OF_DAYS_TO_FORECAST))
start = 0
for i in range(TIME_STEPS, len(trainingData_sales)-NUMBER_OF_DAYS_TO_FORECAST+1):
    trainingDataSequence_sales[start,:,0]=trainingData_sales_normalised[start:i]
    targetDataSequence_sales[start] = trainingData_sales_normalised[i:i+NUMBER_OF_DAYS_TO_FORECAST]
    start=start+1

## MT: make sure the sequences do not miss out any data
# maybe create predictable data (1, 2, 3, 4) then put completely rubbish data (23434) at the end.. this should be picked up in training


## The format of the data sequences is as follows:

trainingDataSequence_sales = \[sales_day1 sales_day2 sales_day3 ... sales_day10\] 

targetDataSequence_sales = \[sales_day11\]

Although the trainingDataSequence_sales can be captured through a 2D array, we use 3D (i.e. shape = (num of sequence, time steps, 1) ) because the tensor input for the LSTM cell expects a 3D input. The third dimension is the *num of features*  of each data point (in our case, it is a single dimension).

In [12]:
# Print the first few sequences of the sales that have been created
[trainingDataSequence_sales[i, :, 0] for i in range(3)]

[array([0.        , 0.12915129, 0.10885609, 0.17158672, 0.17712177,
        0.26752768, 0.3302583 , 0.59225092, 0.23062731, 0.16236162]),
 array([0.12915129, 0.10885609, 0.17158672, 0.17712177, 0.26752768,
        0.3302583 , 0.59225092, 0.23062731, 0.16236162, 0.34686347]),
 array([0.10885609, 0.17158672, 0.17712177, 0.26752768, 0.3302583 ,
        0.59225092, 0.23062731, 0.16236162, 0.34686347, 0.22324723])]

In [13]:
#Print the first few target sales
[targetDataSequence_sales[i] for i in range(3)]

[array([0.34686347]), array([0.22324723]), array([0.24723247])]

###### Observe that the first sales target (0.34686347) becomes the last day sales of the next triainingDataSequence

i.e. targetDataSequence_sales\[i\] = targetDataSequence_sales\[i+1,TIME_STEPS-1,0\]

In [14]:
# since we are making each sequence independent to each other,
# the order of the sequences can be shuffled.
# Note that we need to ensure the target and the train sequences are in the same order
a = np.arange(len(targetDataSequence_sales))
np.random.shuffle(a)
trainingDataSequence_sales_shuffle = np.zeros(shape=(len(trainingData_sales)-TIME_STEPS-NUMBER_OF_DAYS_TO_FORECAST+1, TIME_STEPS, 1))
targetDataSequence_sales_shuffle = np.zeros(shape=(len(trainingData_sales)-TIME_STEPS-NUMBER_OF_DAYS_TO_FORECAST+1, NUMBER_OF_DAYS_TO_FORECAST))

loc=0
for i in a:
    trainingDataSequence_sales_shuffle[loc] = trainingDataSequence_sales[i]
    targetDataSequence_sales_shuffle[loc] = targetDataSequence_sales[i]
    loc+=1

trainingDataSequence_sales = trainingDataSequence_sales_shuffle
targetDataSequence_sales = targetDataSequence_sales_shuffle

In [15]:
# Build the seqeunces for validation
validationDataSequence_sales = np.zeros(shape=(len(validationData_sales) - TIME_STEPS - NUMBER_OF_DAYS_TO_FORECAST + 1, TIME_STEPS, 1 ))
validationDataSequence_sales_target = np.zeros(shape=(len(validationData_sales) - TIME_STEPS - NUMBER_OF_DAYS_TO_FORECAST + 1, NUMBER_OF_DAYS_TO_FORECAST))

start = 0
for i in range(TIME_STEPS, len(validationData_sales)-NUMBER_OF_DAYS_TO_FORECAST + 1):
    validationDataSequence_sales[start, :, 0] = validationData_sales_normalised[start:i]
    validationDataSequence_sales_target[start] = validationData_sales_normalised[i:i+NUMBER_OF_DAYS_TO_FORECAST]
    start+=1

In [16]:
#build the RNN model
tf.reset_default_graph()

inputSequencePlaceholder = tf.placeholder(dtype=tf.float32, shape=(None, TIME_STEPS, 1), name='inputSequencePlaceholder')
targetPlaceholder = tf.placeholder(dtype=tf.float32, shape=(None, NUMBER_OF_DAYS_TO_FORECAST), name='targetPlaceholder')

#cell = tf.contrib.cudnn_rnn.CudnnLSTM(num_layers=1, num_units=5, name='LSTM_cell')
cell = tf.nn.rnn_cell.LSTMCell(num_units=LSTM_UNITS, name='LSTM_cell')


output, state = tf.nn.dynamic_rnn(cell=cell, inputs=inputSequencePlaceholder, dtype=tf.float32)
#output is of shape (Batch, time, num_units)

lastCellOutput = output[:, -1,:]

In [17]:
print('output:',output)
print('state:', state)
print('lastCellOutput:', lastCellOutput)

output: Tensor("rnn/transpose_1:0", shape=(?, 10, 250), dtype=float32)
state: LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 250) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(?, 250) dtype=float32>)
lastCellOutput: Tensor("strided_slice:0", shape=(?, 250), dtype=float32)


## Understanding the type and shape of **output** & **state**
the RNN gives two outputs for every sequence of input: a) output & b) state.

**output** is the per time step output (i.e. output from each time step and which is represented as a_t in our notes). Thus it has a shape of (Batch, 10, 25). We are interested only in the final time step output (i.e. Batch, 9, 250) and this is captured by lastCellOutput

the **state** is the value of the internal parameters of the LSTM that can be propogated to the next sequence if need be. The state is of type LSTMStateTuple which contains two tensors of shape (batch, 250) each.

In [18]:
#build the output layer from the RNN outputs
weights = tf.Variable(initial_value=tf.truncated_normal(shape=(LSTM_UNITS, NUMBER_OF_DAYS_TO_FORECAST)))
bias = tf.Variable(initial_value=tf.ones(shape=(NUMBER_OF_DAYS_TO_FORECAST)))

forecast = tf.matmul(a=lastCellOutput, b=weights) + bias

# bring back the forecast to the original scale
forecast_originalScale = forecast * trainingData_sales_range + trainingData_sales_min

In [19]:
print(forecast)
print(forecast_originalScale)

Tensor("add:0", shape=(?, 1), dtype=float32)
Tensor("add_1:0", shape=(?, 1), dtype=float32)


the **forecast** is the layer that computes the final forecast. this is the y_hat in our notes. **forecast** will have a shape of (Batch, 1)

In [20]:
loss = tf.reduce_mean(tf.squared_difference(x=forecast, y=targetPlaceholder))

In [21]:
optimizer = tf.train.AdamOptimizer(learning_rate=0.1)
minimize_step = optimizer.minimize(loss)

In [22]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    
    #start the training
    for e in range(NUM_EPOCHS):
        print('starting training for epoch:', e+1)
        
        startLocation=0
        iteration=0
        for iteration in range(int(len(targetDataSequence_sales)/BATCH_SIZE)):
            print('epoch:', e+1, ' iteration:', iteration+1)
            trainingBatchInput = trainingDataSequence_sales[startLocation:startLocation+BATCH_SIZE, :, :]
            trainingBatchTarget = targetDataSequence_sales[startLocation:startLocation+BATCH_SIZE]
            
            _, lsBatch, forecastBatch, forecastBatch_originalScale = sess.run([minimize_step, loss, forecast, forecast_originalScale], feed_dict={inputSequencePlaceholder: trainingBatchInput,
                                                                                            targetPlaceholder: trainingBatchTarget})
            
            if ((iteration+1) % 1 == 0):
                print('got a loss of:', lsBatch)
                print('the forecast of first 5 normalised are:', forecastBatch[0:5])
                print('while the actuals were normalised     :', trainingBatchTarget[0:5])
                print('the forecast of first 5 orignal scale are:', forecastBatch_originalScale[0:5])
                print('while the actuals were original scale     :', trainingBatchTarget[0:5]* trainingData_sales_range + trainingData_sales_min)
            
            startLocation+=BATCH_SIZE
        
        #pick up the training data that does not fit into BATCH_SIZE
        if (len(targetDataSequence_sales) > startLocation):
            print('epoch:', e+1, ' iteration:', iteration+1)
            trainingBatchInput = trainingDataSequence_sales[startLocation:len(targetDataSequence_sales), :, :]
            trainingBatchTarget = targetDataSequence_sales[startLocation:len(targetDataSequence_sales)]
            
            _, lsBatch, forecastBatch, forecastBatch_originalScale = sess.run([minimize_step, loss, forecast, forecast_originalScale], feed_dict={inputSequencePlaceholder: trainingBatchInput,
                                                                                            targetPlaceholder: trainingBatchTarget})
            
            print('got a loss of:', lsBatch)
            print('the forecast of first 5 normalised are:', forecastBatch[0:5])
            print('while the actuals were normalised     :', trainingBatchTarget[0:5])
            print('the forecast of first 5 orignal scale are:', forecastBatch_originalScale[0:5])
            print('while the actuals were original scale     :', trainingBatchTarget[0:5]* trainingData_sales_range + trainingData_sales_min)
    
    
        #end of 1 epoch. Perform validation
        totalValidationLoss = 0
        startLocation=0
        print('starting validation')
        for iter in range(len(validationDataSequence_sales)//BATCH_SIZE):
            validationBatchInput = validationDataSequence_sales[startLocation: startLocation+BATCH_SIZE, :, :]
            validationBatchTarget = validationDataSequence_sales_target[startLocation: startLocation+BATCH_SIZE]
            
            validationLsBatch, validationForecastBatch, validationForecastBatch_originalScale = sess.run([loss, forecast, forecast_originalScale], feed_dict={inputSequencePlaceholder: validationBatchInput,
                                                                                              targetPlaceholder: validationBatchTarget})
            
           
            startLocation+=BATCH_SIZE
            totalValidationLoss+= validationLsBatch
            
            print('first five predictions:', validationForecastBatch[0:5])
            print('first five actuals    :', validationBatchTarget[0:5])
            print('the forecast of first 5 orignal scale are:', validationForecastBatch_originalScale[0:5])
            print('while the actuals were original scale     :', validationBatchTarget[0:5]* trainingData_sales_range + trainingData_sales_min)
    
        
        if(startLocation < len(validationDataSequence_sales)):
            validationBatchInput = validationDataSequence_sales[startLocation: len(validationDataSequence_sales)]
            validationBatchTarget = validationDataSequence_sales_target[startLocation: len(validationDataSequence_sales)]
            
            validationLsBatch, validationForecastBatch = sess.run([loss, forecast], feed_dict={inputSequencePlaceholder: validationBatchInput,
                                                                                              targetPlaceholder: validationBatchTarget})
            
            totalValidationLoss+= validationLsBatch
        
        #validation completed.
        print('Validation completed after epoch:',e+1 ,'. Total validation loss:', totalValidationLoss)
            
    

    

starting training for epoch: 1
epoch: 1  iteration: 1
got a loss of: 0.7606255
the forecast of first 5 normalised are: [[1.0147064]
 [1.0091066]
 [1.0190531]
 [1.0073123]
 [1.0238214]]
while the actuals were normalised     : [[0.18634686]
 [0.19926199]
 [0.25830258]
 [0.23800738]
 [0.42804428]]
the forecast of first 5 orignal scale are: [[549.9708 ]
 [546.9358 ]
 [552.3268 ]
 [545.96326]
 [554.9112 ]]
while the actuals were original scale     : [[101.]
 [108.]
 [140.]
 [129.]
 [232.]]
epoch: 1  iteration: 2
got a loss of: 7710.0713
the forecast of first 5 normalised are: [[-87.68513]
 [-87.68503]
 [-87.5487 ]
 [-87.68245]
 [-87.54081]]
while the actuals were normalised     : [[0.20110701]
 [0.17158672]
 [0.        ]
 [0.17343173]
 [0.2195572 ]]
the forecast of first 5 orignal scale are: [[-47525.34 ]
 [-47525.285]
 [-47451.395]
 [-47523.887]
 [-47447.117]]
while the actuals were original scale     : [[109.]
 [ 93.]
 [  0.]
 [ 94.]
 [119.]]
epoch: 1  iteration: 3
got a loss of: 280.9935