In [1]:
# https://machinelearningmastery.com/multi-step-time-series-forecasting-long-short-term-memory-networks-python/

import pandas as pd
from datetime import datetime, timedelta, date
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

import math
import matplotlib.pyplot as plt

np.random.seed(1337)

# Creating functions

In [2]:
# transform series into train and test sets for supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()

    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [("var%d(t-%d)" % (j + 1, i)) for j in range(n_vars)]

    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [("var%d(t)" % (j + 1)) for j in range(n_vars)]
        else:
            names += [("var%d(t+%d)" % (j + 1, i)) for j in range(n_vars)]

    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names

    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)

    return agg


In [3]:
# transform series into train and test sets for supervised learning
def prepare_data(series, n_test, n_lag, n_seq):
    # extract raw values
    raw_values = series.values

    # transform data to be stationary
    diff_values = raw_values
    diff_values = diff_values.reshape(len(diff_values), 1)

    # rescale values to -1, 1
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled_values = scaler.fit_transform(diff_values)
    scaled_values = scaled_values.reshape(len(scaled_values), 1)

    # transform into supervised learning problem X, y
    supervised = series_to_supervised(scaled_values, n_lag, n_seq)
    supervised_values = supervised.values

    # split into train and test sets
    train, test = supervised_values[0:-n_test], supervised_values[-n_test:]
    return scaler, train, test

In [4]:
def fit_lstm(train, n_lag, n_seq, n_batch, nb_epoch, n_neurons):
    # reshape training into [samples, timesteps, features]
    X, y = train[:, 0:n_lag], train[:, n_lag:]

    X = X.reshape(X.shape[0], 1, X.shape[1])

    # design network
    model = Sequential()
    model.add(LSTM(n_neurons, batch_input_shape=(n_batch, X.shape[1], X.shape[2]), stateful=True))
    model.add(Dense(y.shape[1]))
    model.compile(loss="mean_squared_error", optimizer="adam")

    model.fit(X, y, epochs=nb_epoch, batch_size=n_batch, verbose=2, shuffle=False)

    return model

In [5]:
# make one forecast with an LSTM,
def forecast_lstm(model, X, n_batch):
    # reshape input pattern to [samples, timesteps, features]
    X = X.reshape(1, 1, len(X))

    # make forecast
    forecast = model.predict(X, batch_size=n_batch)

    # convert to array
    return [x for x in forecast[0, :]]

In [11]:
# evaluate the persistence model
def make_forecasts(model, n_batch, train, test, n_lag, n_seq, forecast_len):
    forecasts = list()
    print(f'Forecast x of {forecast_len}:', end=" ")
    for i in range(forecast_len):
        X, y = test[i, 0:n_lag], test[i, n_lag:]
        # make forecast
        forecast = forecast_lstm(model, X, n_batch)
        # store the forecast
        forecasts.append(forecast)

        # Printing current status in hundreds
        step = i % 100
        if step == 0:
            print(i, end=" ")

    return forecasts

In [12]:
# inverse data transform on forecasts
def inverse_transform(series, forecasts, scaler):
    inverted = list()
    for i in range(len(forecasts)):

        # create array from forecast
        forecast = np.array(forecasts[i])
        forecast = forecast.reshape(1, len(forecast))

        # invert scaling
        inv_scale = scaler.inverse_transform(forecast)
        inv_scale = inv_scale[0, :]

        inverted.append(inv_scale)

    return inverted

# Fitting and predicting

In [13]:
# load dataset
logreturns = "data/final.csv"
series = pd.read_csv(logreturns, usecols=["Exchange.Date", "logreturns"], header=0, index_col=0, squeeze=True)

# configure
n_lag = 5 # same as ARMA-GARCH
n_seq = 63  #  number of periods forecast
test_share = 0.25
n_test = int(len(series) * test_share)
n_epochs = 5
n_batch = 1
n_neurons = 50
forecast_len = 1000

print("Preparing data...")
scaler, train, test = prepare_data(series, n_test, n_lag, n_seq)

print("Fitting model...")
model = fit_lstm(train, n_lag, n_seq, n_batch, n_epochs, n_neurons)

print("Making forecasts...")
forecasts = make_forecasts(model, n_batch, train, test, n_lag, n_seq, forecast_len)

print("\nInverting forecasts...")
forecasts = inverse_transform(series, forecasts, scaler)
print("Done!")

Preparing data...
Fitting model...
Epoch 1/5
3356/3356 - 4s - loss: 0.0087
Epoch 2/5
3356/3356 - 3s - loss: 0.0078
Epoch 3/5
3356/3356 - 4s - loss: 0.0077
Epoch 4/5
3356/3356 - 4s - loss: 0.0077
Epoch 5/5
3356/3356 - 3s - loss: 0.0077
Making forecasts...
Forecast x of 1000: 0 100 200 300 400 500 600 700 800 900 
Inverting forecasts...
Done!


# Evaluating from t=1


## Creating dataframe for evaluation
In essence, creating a new DF combining training data (historic) and forecasts

In [14]:
# Getting dataframe with Close as well and the creating a training df same size as used in the model
original_df = pd.read_csv("data/final.csv", usecols=["Exchange.Date", "logreturns", "Close"])

# Setting as date
original_df['Exchange.Date'] = original_df['Exchange.Date'].apply(lambda x: date(1900, 1, 1) + timedelta(int(x)))
original_df.index = original_df['Exchange.Date']

train_df = original_df[:-n_test].copy()

# Assigning all rows in train df (before forecast) to closing value
# This is because this column cannot be empty (and we have no forecasts since it's training data)
train_df["forecast"] = train_df["Close"]

In [15]:
# Transforming logreturns back to price
last_train = train_df["Close"].values[-1]
price_forecasts = np.exp(np.cumsum(forecasts[0]) + math.log(last_train))

In [16]:
# Creating a separate dataframe only for forecasts (i.e. "outside train df")
forecast_df = pd.DataFrame(columns=["Exchange.Date", "Close", "logreturns", "forecast"])
forecast_df["Close"] = original_df["Close"].values[-n_test : -n_test + n_seq]
forecast_df["logreturns"] = original_df["logreturns"].values[-n_test : -n_test + n_seq]
forecast_df["forecast"] = price_forecasts
forecast_df.index

forecast_df["Exchange.Date"] = forecast_df.index.map(lambda x: date(2016, 8, 1) + timedelta(int(x)))
forecast_df.index = forecast_df["Exchange.Date"]

In [17]:
# Merging train and forecast dataframe
merged_df = train_df.append(forecast_df, ignore_index=True)

# Creating error, absolute error, actual price going up (True/False) and forecast going up (True/False)
merged_df["error"] = merged_df["forecast"] - merged_df["Close"]
merged_df["abs_error"] = np.abs(merged_df["forecast"] - merged_df["Close"])
merged_df["actual_up"] = merged_df["Close"].diff(1) > 0
merged_df["forecast_up"] = merged_df["forecast"].diff(1) > 0
merged_df.index = merged_df["Exchange.Date"]

# Formula for creating confusion value, used below
def confusion(actual, forecast):
    if actual and forecast:
        return "TP"

    if actual and not forecast:
        return "FN"

    if not actual and forecast:
        return "FP"

    if not actual and not forecast:
        return "TN"

    # Just common programming sense to return something, could have written "blabla"
    return False


# The lambda stuff applies the above function on every row of data
merged_df["confusion"] = merged_df.apply(lambda x: confusion(x["actual_up"], x["forecast_up"]), axis=1)

# Printing the tail of the data
merged_df.tail()

Unnamed: 0_level_0,Exchange.Date,Close,logreturns,forecast,error,abs_error,actual_up,forecast_up,confusion
Exchange.Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-09-28,2016-09-28,671.89,0.005567,650.663147,-21.226853,21.226853,True,True,TP
2016-09-29,2016-09-29,671.08,-0.001206,651.891724,-19.188276,19.188276,False,True,FP
2016-09-30,2016-09-30,668.82,-0.003373,653.108582,-15.711418,15.711418,False,True,FP
2016-10-01,2016-10-01,668.02,-0.001197,654.217529,-13.802471,13.802471,False,True,FP
2016-10-02,2016-10-02,663.33,-0.007046,655.444702,-7.885298,7.885298,False,True,FP


## Evaluating

In [18]:
# New dataframe that only contains the number of periods to evaluate (1,3,5,21,63)
def new_df(n_periods):
    df = merged_df[len(train_df) : len(train_df) + n_periods]
    return df

In [19]:
# Creating RMSE AND MAE
def evaluate(n_periods):
    df = new_df(n_periods)
    mape = ((df["abs_error"] / df["Close"]).sum() / n_periods) * 100
    rmse = math.sqrt(pow(df["error"].sum(), 2) / n_periods)
    print(f"{n_periods}, RMSE: {round(rmse, 3)}, MAPE: {round(mape, 3)}%")


evaluate(1)  # 1 day
evaluate(3)  # half a week
evaluate(5)  # week
evaluate(21)  # month
evaluate(63)  # quarter

1, RMSE: 0.709, MAPE: 0.114%
3, RMSE: 6.076, MAPE: 0.644%
5, RMSE: 8.219, MAPE: 0.639%
21, RMSE: 0.511, MAPE: 0.537%
63, RMSE: 66.282, MAPE: 1.476%


In [20]:
# Creating confusion matrix
def confusion_matrix(df):
    conf = pd.DataFrame(columns=["P", "N"], index=["P", "N"])
    conf.loc["P", "P"] = len(df[df["confusion"] == "TP"])
    conf.loc["P", "N"] = len(df[df["confusion"] == "FN"])
    conf.loc["N", "P"] = len(df[df["confusion"] == "FP"])
    conf.loc["N", "N"] = len(df[df["confusion"] == "TN"])
    return conf


confusion = confusion_matrix(new_df(63))
precision = confusion.iloc[0, 0] / (confusion.iloc[0, 0] + confusion.iloc[1, 0])
recall = confusion.iloc[0, 0] / (confusion.iloc[0, 0] + confusion.iloc[0, 1])
f_score = 2 * precision * recall / (precision + recall)

print(confusion)
print(f"precision: {int(precision*100)}%, recall: {int(recall*100)}%, f-score: {round(f_score, 3)}")

    P  N
P  31  6
N  25  1
precision: 83%, recall: 55%, f-score: 0.667


# Plotting

In [None]:
plot_df = merged_df[-n_seq - (n_seq * 2) :]
plt.figure(figsize=(10, 5))
plt.plot(plot_df["forecast"], label="forecast")
plt.plot(plot_df["Close"], label="actual")
plt.legend()

# Cross-validating

In [1]:
def cross_evaluate(df, n_periods):
    df = df[-63:-63+n_periods] if n_periods < 63 else df.tail(63)
    mape = ((df["abs_error"] / df["Close"]).sum() / n_periods) * 100
    rmse = math.sqrt(pow(df["error"].sum(), 2) / n_periods)
    return mape, rmse

cross_df = pd.DataFrame(columns=[
    "mape_1", 
    "mape_3",
    "mape_5",
    "mape_21",
    "mape_63",
    "rmse_1",
    "rmse_3",
    "rmse_5",
    "rmse_21",
    "rmse_63"
])

for i in range(len(forecasts)):
    train_df = original_df[:-n_test + i].copy()
    train_df["forecast"] = train_df["Close"]

    last_train = train_df["Close"].values[-1]
    price_forecasts = np.exp(np.cumsum(forecasts[i]) + math.log(last_train))

    cross_merged_df = original_df[-n_test + i:-n_test + i + 63].copy()
    cross_merged_df["forecast"] = price_forecasts

    cross_merged_df["error"] = cross_merged_df["forecast"] - cross_merged_df["Close"]
    cross_merged_df["abs_error"] = np.abs(cross_merged_df["forecast"] - cross_merged_df["Close"])

    one = cross_evaluate(cross_merged_df, 1)
    three = cross_evaluate(cross_merged_df, 3)
    five = cross_evaluate(cross_merged_df, 5)
    twentyone = cross_evaluate(cross_merged_df, 21)
    sixtythree = cross_evaluate(cross_merged_df, 63)

    cross_df = cross_df.append({
        'mape_1': one[0],
        'mape_3': three[0],
        'mape_5': five[0],
        'mape_21': twentyone[0],
        'mape_63': sixtythree[0],
        'rmse_1': one[1],
        'rmse_3': three[1],
        'rmse_5': five[1],
        'rmse_21': twentyone[1],
        'rmse_63': sixtythree[1],
    }, ignore_index=True)

cross_df

NameError: name 'pd' is not defined

In [27]:
cross_df.describe()

Unnamed: 0,mape_1,mape_3,mape_5,mape_21,mape_63,rmse_1,rmse_3,rmse_5,rmse_21,rmse_63
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.619687,0.943839,1.222891,2.815316,5.451576,5.021035,12.395882,20.414207,95.02971,325.2603
std,0.819616,1.102143,1.464113,3.286426,4.312766,6.365685,15.22301,25.937659,116.688066,293.582137
min,0.001937,0.03698,0.126373,0.329151,0.751996,0.015474,0.029424,0.003182,0.332651,0.35379
25%,0.188671,0.370524,0.495427,1.263913,2.625553,1.52507,3.890628,6.21066,30.827729,125.210458
50%,0.428625,0.642222,0.836517,2.024687,4.258856,3.394917,8.147912,13.84007,67.895755,252.833447
75%,0.753446,1.138278,1.440175,3.052294,6.573215,6.204407,15.495337,25.677703,110.530774,418.426647
max,14.05227,15.741533,18.920082,27.296362,27.49192,99.719121,189.1416,285.228183,919.271311,2018.404728


In [28]:
n = cross_df.count()[0]
mean = cross_df.mean()
upper = cross_df.mean() + 1.96 * cross_df.std() / math.sqrt(n)
lower = cross_df.mean() - 1.96 * cross_df.std() / math.sqrt(n)

ci_df = pd.DataFrame(columns=['measure', 'mean', 'lower', 'upper'])

for i in range(10):
    ci_df = ci_df.append({
        'measure': cross_df.columns[i],
        'mean': mean[i],
        'lower': lower[i],
        'upper': upper[i]
    }, ignore_index=True)

ci_df

Unnamed: 0,measure,mean,lower,upper
0,mape_1,0.619687,0.568886,0.670487
1,mape_3,0.943839,0.875527,1.01215
2,mape_5,1.222891,1.132144,1.313638
3,mape_21,2.815316,2.611621,3.019011
4,mape_63,5.451576,5.184268,5.718884
5,rmse_1,5.021035,4.626486,5.415585
6,rmse_3,12.395882,11.45235,13.339414
7,rmse_5,20.414207,18.806575,22.02184
8,rmse_21,95.02971,87.797308,102.262111
9,rmse_63,325.2603,307.063891,343.45671
