## Transform TimeSeries Data to Supervised Learning Problem

In [1]:
import numpy as np

In [2]:
# Define univariate time series data
series = np.arange(1, 11)
print(series.shape)

(10,)


#### Test the sub procedure

In [3]:
n_step = 3
pred_step = 1

In [4]:
x_list, y_list = [], [] 
for s in range(series.shape[0]-pred_step-n_step+1):
    x_list.append(series[s:s+n_step])
    y_list.append(series[s+n_step])
X, y = np.array(x_list), np.array(y_list)
print(X, y)

[[1 2 3]
 [2 3 4]
 [3 4 5]
 [4 5 6]
 [5 6 7]
 [6 7 8]
 [7 8 9]] [ 4  5  6  7  8  9 10]


In [5]:
def split_sequence(series, n_step):
    x_list, y_list = [], [] 
    for s in range(series.shape[0]-pred_step-n_step+1):
        x_list.append(series[s:s+n_step])
        y_list.append(series[s+n_step])
    X, y = np.array(x_list), np.array(y_list)
    return X, y

In [6]:
X, y = split_sequence(series, n_step)
print(X, y)

[[1 2 3]
 [2 3 4]
 [3 4 5]
 [4 5 6]
 [5 6 7]
 [6 7 8]
 [7 8 9]] [ 4  5  6  7  8  9 10]


#### Dissecting the formal anwser

In [7]:
X, y = list(), list()
sequence = series
for i in range(len(sequence)):
    end_ix = i + n_step
    if end_ix > len(sequence) - 1:
        break
    seq_x, seq_y = sequence[i:end_ix], sequence[end_ix]
    print(seq_x, seq_y)

[1 2 3] 4
[2 3 4] 5
[3 4 5] 6
[4 5 6] 7
[5 6 7] 8
[6 7 8] 9
[7 8 9] 10


#### Formal answer

In [8]:
# transform univariate time series to supervised learning problem
from numpy import array

# split a univariate sequence into samples
def split_sequence(sequence, n_steps):
    X, y = list(), list()
    for i in range(len(sequence)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the sequence
        if end_ix > len(sequence)-1:
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)

# define univariate time series
series = array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
print(series.shape)
# transform to a supervised learning problem
X, y = split_sequence(series, 3)
print(X.shape, y.shape)
# show each sample
for i in range(len(X)):
    print(X[i], y[i])
print(y)

(10,)
(7, 3) (7,)
[1 2 3] 4
[2 3 4] 5
[3 4 5] 6
[4 5 6] 7
[5 6 7] 8
[6 7 8] 9
[7 8 9] 10
[ 4  5  6  7  8  9 10]


#### Reshape the 2D supv learning data into 3D (samples, time steps, features)

In [9]:
batch_size = 7
X.reshape((batch_size, n_step, 1))

array([[[1],
        [2],
        [3]],

       [[2],
        [3],
        [4]],

       [[3],
        [4],
        [5]],

       [[4],
        [5],
        [6]],

       [[5],
        [6],
        [7]],

       [[6],
        [7],
        [8]],

       [[7],
        [8],
        [9]]])

### 6.4 Data Preparation Example

I have two columns in my data file with 5,000 rows, column 1 is time (with 1 hour interval) and column 2 is the number of sales and I am trying to forecast the number of sales for future time steps. Help me to set the number of samples, time steps and features in this data for an LSTM?

In [10]:
data = np.array([np.array([i+1, (i+1)*10]) for i in range(5000)])
print(data[:3])
print(data.shape)
### drop the time column if there is no missing row
data = data[:, 0]
data

[[ 1 10]
 [ 2 20]
 [ 3 30]]
(5000, 2)


array([   1,    2,    3, ..., 4998, 4999, 5000])

In [11]:
# 6.4.3 Split Into Samples
n = 5000
n_step = 200
feature = 1
sample = int(n / n_step)
samples = list()
for i in range(0, n, n_step):
    samples.append(data[i:i+n_steps])
print(len(samples))
supv_data = np.array(samples)
supv_data

NameError: name 'n_steps' is not defined

In [None]:
data.reshape(sample, n_step, feature)