In [1]:
import pandas as pd
import numpy as np
import os
from prophet import Prophet
import plotly.graph_objects as go
from sklearn.metrics import r2_score # R^2 score
from sklearn.metrics import mean_squared_error # squared = True for MSE, False for RMSE
from sklearn.metrics import mean_absolute_error # mean absolute error

import tensorflow as tf
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Iterating over the device list makes this a no-op on CPU-only machines, where
# indexing element 0 of an empty list would raise IndexError, and applies the
# same setting to every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from collections import deque
import random
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Pin every random number generator used below (Python, NumPy, TensorFlow)
# to one value so that re-running the notebook reproduces the same results.
SEED = 314
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [147]:
class RNN:
    """Sliding-window recurrent regressor for a future ``close`` value.

    Typical use: ``RNN(df)`` -> ``load_data()`` -> ``create_model()`` ->
    ``train()`` -> ``predict()``. Everything ``load_data`` produces is stored
    in the ``self.data`` dictionary.
    """

    def __init__(self, df, n_steps=None, lookup_step=None, scale=None, shuffle=None,
                 split_by_date=None, test_size=None, features=None):
        """Store the configuration and make sure the output folders exist.

        Every option defaults to ``None``, which means "use the built-in
        default"; any explicit value is honoured.

        Args:
            df: pandas DataFrame holding at least the feature columns and ``close``.
            n_steps: window length (rows per input sequence); default 30.
            lookup_step: how many rows ahead the target lies; default 1.
            scale: whether to MinMax-scale each feature column; default True.
            shuffle: whether to shuffle the samples; default True.
            split_by_date: split train/test by position in the frame instead of
                randomly; default False.
            test_size: fraction of samples held out for testing; default 0.2.
            features: column names fed to the network; defaults to the eight
                price/indicator columns listed below.
        """
        self.df = df
        self.data = {}

        # `is None` (not `== None`) so array-like arguments are compared safely,
        # and every attribute is assigned whether or not a value was supplied.
        self.n_steps = 30 if n_steps is None else n_steps
        self.lookup_step = 1 if lookup_step is None else lookup_step

        self.scale = True if scale is None else scale
        self.scale_str = f"sc-{int(self.scale)}"

        self.shuffle = True if shuffle is None else shuffle
        self.shuffle_str = f"sh-{int(self.shuffle)}"

        self.split_by_date = False if split_by_date is None else split_by_date
        self.split_by_date_str = f"sbd-{int(self.split_by_date)}"

        self.test_size = 0.2 if test_size is None else test_size

        if features is None:
            self.features = ["open", "high", "low", "close", "macd", "rsi", "adx", "cci"]
        else:
            # load_data concatenates this with ["date"], so it has to be a list.
            self.features = list(features)

        # Create the output folders if they do not exist yet.
        for folder in ("results", "logs", "data"):
            os.makedirs(folder, exist_ok=True)

    def shuffle_in_unison(self, a, b):
        """Shuffle ``a`` and ``b`` in place with the same permutation.

        The global NumPy RNG state is captured before the first shuffle and
        restored before the second, so both arrays receive identical draws.
        """
        state = np.random.get_state()
        np.random.shuffle(a)
        np.random.set_state(state)
        np.random.shuffle(b)

    def load_data(self):
        """Turn ``self.df`` into scaled sliding windows and a train/test split.

        Populates ``self.data`` with:
            ``df``             deep copy of the input frame, untouched
            ``column_scaler``  fitted MinMaxScaler per feature (only when scaling)
            ``last_sequence``  trailing rows used by ``predict``
            ``X_train``/``X_test``  float32 arrays (samples, n_steps, n_features)
            ``y_train``/``y_test``  targets (``close`` shifted by ``lookup_step``)
            ``test_df``        rows of ``df`` at the last date of each test window

        Also writes the untouched frame to ``./rnn-example.csv``.

        All transformations happen on a private copy, so the caller's frame is
        never modified and calling this method again starts from the raw data
        (re-fitting scalers on already-scaled values would otherwise break the
        inverse transform used by ``predict``).

        Raises:
            AssertionError: if a feature column is missing from the frame.
            ValueError: if there are too few rows to build a single window.
        """
        self.data['df'] = self.df.copy(deep=True)  # pristine snapshot
        for col in self.features:  # validate features exist in the dataframe
            assert col in self.df.columns, f"'{col}' does not exist in the dataframe."

        frame = self.df.copy(deep=True)  # working copy; self.df stays as given

        if "date" not in frame.columns:  # carry the index along as a column
            frame["date"] = frame.index

        if self.scale:  # scale every feature column to the 0..1 range
            column_scaler = {}
            for column in self.features:
                scaler = preprocessing.MinMaxScaler()
                frame[column] = scaler.fit_transform(np.expand_dims(frame[column].values, axis=1))
                column_scaler[column] = scaler
            self.data["column_scaler"] = column_scaler  # kept to undo the scaling later

        frame['future'] = frame['close'].shift(-self.lookup_step)  # target column

        # The last `lookup_step` rows have NaN in `future`; keep their features
        # before dropping them.
        last_sequence = np.array(frame[self.features].tail(self.lookup_step))
        frame.dropna(inplace=True)

        X, y = [], []
        sequences = deque(maxlen=self.n_steps)  # rolling window of the last n_steps rows
        for entry, target in zip(frame[self.features + ["date"]].values, frame['future'].values):
            sequences.append(entry)
            if len(sequences) == self.n_steps:
                X.append(np.array(sequences))
                y.append(target)

        # Final window (date column stripped) followed by the `lookup_step` rows
        # that have no target yet, e.g. 30 + 1 rows for the defaults. `predict`
        # takes the last n_steps of these.
        last_sequence = list([s[:len(self.features)] for s in sequences]) + list(last_sequence)
        self.data['last_sequence'] = np.array(last_sequence).astype(np.float32)

        X = np.array(X)
        y = np.array(y)
        if len(X) == 0:
            raise ValueError(
                f"Not enough rows to build a single window of n_steps={self.n_steps}."
            )

        if self.split_by_date:
            # Positional split: the first (1 - test_size) share is the training set.
            train_samples = int((1 - self.test_size) * len(X))
            self.data["X_train"] = X[:train_samples]
            self.data["y_train"] = y[:train_samples]
            self.data["X_test"] = X[train_samples:]
            self.data["y_test"] = y[train_samples:]
            if self.shuffle:
                # Shuffle inside each partition only, so the split itself is preserved.
                self.shuffle_in_unison(self.data["X_train"], self.data["y_train"])
                self.shuffle_in_unison(self.data["X_test"], self.data["y_test"])
        else:
            # Random split.
            self.data["X_train"], self.data["X_test"], self.data["y_train"], self.data["y_test"] = \
                train_test_split(X, y, test_size=self.test_size, shuffle=self.shuffle)

        dates = self.data["X_test"][:, -1, -1]  # date of the last row of every test window
        self.data["test_df"] = self.data["df"].loc[dates]  # matching rows of the original frame
        self.data["test_df"] = self.data["test_df"][~self.data["test_df"].index.duplicated(keep='first')]
        # Strip the date column from the windows and convert to float32.
        self.data["X_train"] = self.data["X_train"][:, :, :len(self.features)].astype(np.float32)
        self.data["X_test"] = self.data["X_test"][:, :, :len(self.features)].astype(np.float32)

        self.data['df'].to_csv('./rnn-example.csv')

    def create_model(self, units=256, cell=LSTM, n_layers=3, dropout=0.3, loss="huber_loss", optimizer="adam",
                     bidirectional=False):
        """Build and compile the stacked recurrent network into ``self.model``.

        Args:
            units: units per recurrent layer.
            cell: recurrent layer class to stack.
            n_layers: number of recurrent layers; each is followed by Dropout.
            dropout: dropout rate applied after every recurrent layer.
            loss: loss passed to ``compile``.
            optimizer: optimizer passed to ``compile``.
            bidirectional: wrap every recurrent layer in ``Bidirectional``.

        Only the last recurrent layer collapses the sequence
        (``return_sequences=False``). That now also holds when ``n_layers == 1``,
        so the final ``Dense(1)`` always yields one value per sample.
        """
        sequence_length = self.n_steps
        n_features = len(self.features)
        self.model = Sequential()
        for i in range(n_layers):
            return_sequences = i < n_layers - 1
            # Only the first layer declares the input shape.
            input_kwargs = {}
            if i == 0:
                input_kwargs["batch_input_shape"] = (None, sequence_length, n_features)
            if bidirectional:
                layer = Bidirectional(cell(units, return_sequences=return_sequences), **input_kwargs)
            else:
                layer = cell(units, return_sequences=return_sequences, **input_kwargs)
            self.model.add(layer)
            self.model.add(Dropout(dropout))  # dropout after each recurrent layer
        self.model.add(Dense(1, activation="linear"))
        self.model.compile(loss=loss, metrics=["mean_absolute_error"], optimizer=optimizer)

    def train(self, batch_size=64, epochs=500):
        """Fit ``self.model`` on the training windows; the result is kept in ``self.history``.

        The test split is passed as validation data. Weights are checkpointed to
        ``results/rnn-example.h5`` (``save_best_only=True``) and TensorBoard logs
        go to ``logs/rnn-example``.
        """
        checkpointer = ModelCheckpoint(os.path.join("results", 'rnn-example' + ".h5"),
                                       save_weights_only=True, save_best_only=True, verbose=0)
        tensorboard = TensorBoard(log_dir=os.path.join("logs", 'rnn-example'))
        self.history = self.model.fit(self.data["X_train"], self.data["y_train"],
                                      batch_size=batch_size,
                                      epochs=epochs,
                                      validation_data=(self.data["X_test"], self.data["y_test"]),
                                      callbacks=[checkpointer, tensorboard], verbose=0)

    def plot_graph(self, test_df):
        """Plot the true and the predicted close series of ``test_df``.

        Expects the columns ``true_close_<lookup_step>`` and
        ``close_<lookup_step>``, where ``<lookup_step>`` is this instance's
        ``lookup_step`` (previously an undefined global was read here).
        """
        fig = go.Figure()
        fig.add_trace(go.Scatter(y=test_df[f'true_close_{self.lookup_step}'], x=test_df.index, name='Train'))
        fig.add_trace(go.Scatter(y=test_df[f'close_{self.lookup_step}'], x=test_df.index, name='Forecasts'))
        fig.show()

    def predict(self):
        """Return the model's forecast for the row after the end of the data.

        Feeds the last ``n_steps`` rows of ``self.data['last_sequence']`` to the
        model. When scaling is enabled, the output is mapped back through the
        ``close`` scaler.
        """
        last_sequence = self.data["last_sequence"][-self.n_steps:]
        last_sequence = np.expand_dims(last_sequence, axis=0)  # add the batch dimension
        prediction = self.model.predict(last_sequence)  # scaled when self.scale is set
        if self.scale:  # get the price by inverting the scaling
            predicted_price = self.data["column_scaler"]["close"].inverse_transform(prediction)[0][0]
        else:
            predicted_price = prediction[0][0]
        return predicted_price

In [194]:
# Load the prepared Bitcoin CSV; the first column becomes the index and is parsed as dates.
df_btc = pd.read_csv('../../data/ready/1-day-data/coin_Bitcoin.csv', index_col=0, parse_dates=True)

In [195]:
# Positional (by-date) train/test split, with each partition shuffled afterwards.
rnn = RNN(df_btc, shuffle=True, split_by_date=True)

In [196]:
# Build the sliding windows and the train/test arrays; results are stored in rnn.data.
rnn.load_data()

In [197]:
# List the entries load_data() stored on the instance.
rnn.data.keys()

dict_keys(['df', 'column_scaler', 'last_sequence', 'X_train', 'y_train', 'X_test', 'y_test', 'test_df'])

In [198]:
# Unscaled rows of the original frame at the last date of each test window (shuffled order).
rnn.data['test_df']

Unnamed: 0_level_0,open,high,low,close,macd,rsi,adx,cci
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-09-04 23:59:59,10230.365161,10663.920045,10207.940219,10511.813881,-78.711201,38.189130,28.484973,-272.471054
2021-02-04 23:59:59,37475.104029,38592.176377,36317.498807,36926.064465,872.621359,59.546088,20.355251,122.874586
2020-10-11 23:59:59,11296.081784,11428.813770,11288.627661,11384.181953,89.368976,64.009506,18.073818,259.512229
2020-09-03 23:59:59,11407.191077,11443.022795,10182.465086,10245.296686,-7.167567,33.543164,28.094500,-317.311693
2020-12-27 23:59:59,26439.373289,28288.840022,25922.769765,26272.294567,1912.261068,76.670115,45.242003,139.850374
...,...,...,...,...,...,...,...,...
2020-10-12 23:59:59,11392.636215,11698.467736,11240.686996,11555.362888,132.589755,66.924996,19.761232,236.516845
2021-01-09 23:59:59,40788.640523,41436.350006,38980.876906,40254.546498,4671.046568,87.525003,57.437068,153.128164
2020-08-18 23:59:59,12251.895970,12335.707070,11954.525749,11991.233246,468.045886,64.107326,56.711760,139.614782
2020-11-13 23:59:59,16276.440477,16463.177840,15992.152801,16317.808190,1009.187764,77.307544,53.838426,124.853368


In [199]:
# Training windows: (samples, n_steps, number of features).
rnn.data['X_train'].shape

(972, 30, 8)

In [200]:
# Test windows: (samples, n_steps, number of features).
rnn.data['X_test'].shape

(243, 30, 8)

In [201]:
# Shape of the copy of the input frame that load_data() saved before transforming anything.
rnn.data['df'].shape

(1245, 8)

In [202]:
# Unscaled rows of the original frame at the last date of each test window (shuffled order).
rnn.data['test_df']

Unnamed: 0_level_0,open,high,low,close,macd,rsi,adx,cci
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-09-04 23:59:59,10230.365161,10663.920045,10207.940219,10511.813881,-78.711201,38.189130,28.484973,-272.471054
2021-02-04 23:59:59,37475.104029,38592.176377,36317.498807,36926.064465,872.621359,59.546088,20.355251,122.874586
2020-10-11 23:59:59,11296.081784,11428.813770,11288.627661,11384.181953,89.368976,64.009506,18.073818,259.512229
2020-09-03 23:59:59,11407.191077,11443.022795,10182.465086,10245.296686,-7.167567,33.543164,28.094500,-317.311693
2020-12-27 23:59:59,26439.373289,28288.840022,25922.769765,26272.294567,1912.261068,76.670115,45.242003,139.850374
...,...,...,...,...,...,...,...,...
2020-10-12 23:59:59,11392.636215,11698.467736,11240.686996,11555.362888,132.589755,66.924996,19.761232,236.516845
2021-01-09 23:59:59,40788.640523,41436.350006,38980.876906,40254.546498,4671.046568,87.525003,57.437068,153.128164
2020-08-18 23:59:59,12251.895970,12335.707070,11954.525749,11991.233246,468.045886,64.107326,56.711760,139.614782
2020-11-13 23:59:59,16276.440477,16463.177840,15992.152801,16317.808190,1009.187764,77.307544,53.838426,124.853368


In [203]:
# Build and compile with the method defaults: 3 LSTM layers of 256 units, dropout 0.3, Huber loss, Adam.
rnn.create_model()

In [204]:
# Fit for 500 epochs at the default batch size of 64; the test split doubles as validation data.
rnn.train(epochs=500)

In [205]:
# Forecast from the last n_steps rows of the stored last_sequence, mapped back through the close scaler.
rnn.predict()

34944.63

In [206]:
# Model output for every test window, still in the scaled space the model was trained on.
y_hat = rnn.model.predict(rnn.data['X_test'])

In [207]:
# Matching test targets (scaled close, lookup_step rows ahead of each window's end).
y = rnn.data['y_test']

In [208]:
# R^2 on the scaled values. NOTE(review): the same test split was passed as
# validation data during training, so this is not a fully held-out score.
r2_score(y, y_hat)

0.8802433228582179

In [209]:
# The unscaled copy of the input frame kept by load_data().
rnn.data['df']

Unnamed: 0_level_0,open,high,low,close,macd,rsi,adx,cci
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-10-02 23:59:59,4395.810059,4470.229980,4377.459961,4409.319824,76.404274,59.992753,19.974615,151.403920
2017-10-03 23:59:59,4408.459961,4432.470215,4258.890137,4317.479980,85.320503,57.036863,18.787460,112.788346
2017-10-04 23:59:59,4319.370117,4352.310059,4210.419922,4229.359863,84.304315,54.273695,17.460863,83.873827
2017-10-05 23:59:59,4229.879883,4362.640137,4164.049805,4328.410156,90.448870,56.806686,16.415504,79.977985
2017-10-06 23:59:59,4324.459961,4413.270020,4320.529785,4370.810059,97.614553,57.882226,15.294155,92.800380
...,...,...,...,...,...,...,...,...
2021-02-23 23:59:59,54204.929756,54204.929756,45290.590268,48824.426869,4566.049082,55.360778,38.924607,30.133858
2021-02-24 23:59:59,48835.087661,51290.136695,47213.498162,49705.333316,4126.268801,56.875394,36.602798,22.800770
2021-02-25 23:59:59,49709.082425,51948.966982,47093.853019,47093.853019,3526.365381,51.316544,34.627863,2.374072
2021-02-26 23:59:59,47180.464054,48370.785260,44454.842114,46339.760083,2956.013517,49.802852,32.328988,-54.009679


In [210]:
# Plot the full, unscaled closing-price history kept in rnn.data['df'].
fig = go.Figure(
    data=[go.Scatter(x=rnn.data['df'].index, y=rnn.data['df']['close'], name='Actual')]
)
fig.show()

In [211]:
# Test windows: (samples, n_steps, number of features).
rnn.data['X_test'].shape

(243, 30, 8)

In [212]:
# Undo the MinMax scaling of the test targets. The close scaler was fitted on a
# single column, so hand it an (n_samples, 1) column vector; the previous
# (1, n_samples) row only worked through NumPy broadcasting.
y_test = np.squeeze(
    rnn.data["column_scaler"]["close"].inverse_transform(np.expand_dims(rnn.data['y_test'], axis=1))
)
y_test

array([10169.567221  , 38144.30686267, 11555.3628878 , 10511.8138807 ,
       27084.80788628, 10280.3517032 , 11094.3462761 ,  9240.34632654,
       46196.46371944, 18802.99829969, 39187.32812109, 10549.32889962,
        9525.36344997, 13075.24769656, 10750.72357903, 10775.26937624,
       51679.79669449,  9581.07201141, 32366.39304887, 39371.04235311,
        9905.16724705, 11064.45759247, 11991.2332456 , 36926.06446515,
       10938.2712894 , 11970.4787405 , 13780.99470249,  9243.61385509,
       11711.506161  , 11542.4997333 , 33922.9605815 , 11747.0228312 ,
       10442.1706031 ,  9192.83736784, 37316.35939997, 23241.34486501,
       11384.1819535 , 33992.42934365, 11916.33514058,  9185.81691242,
       13437.8832414 , 11100.4681253 , 19246.64434137, 11246.3487749 ,
       15579.84846029, 11878.3716211 , 33537.1768196 , 10619.45190766,
       11774.5958414 , 47909.33119483,  9164.2313647 , 11495.34965037,
       11506.8653177 , 17804.00563217, 13950.30084729, 11878.1113253 ,
      

In [213]:
# Map the scaled test predictions back to price units and flatten them to 1-D.
y_pred = rnn.data["column_scaler"]["close"].inverse_transform(y_hat).squeeze()
y_pred


array([10776.423 , 29444.633 , 11325.581 , 10621.23  , 24782.807 ,
       10533.393 , 10756.501 ,  9221.879 , 29593.752 , 19268.445 ,
       33471.04  , 10550.096 ,  9526.367 , 13329.648 , 10694.842 ,
       10712.605 , 40266.434 ,  9661.799 , 26619.492 , 33300.86  ,
        9822.6875, 10909.661 , 12219.665 , 30423.736 , 10931.699 ,
       11885.21  , 14016.447 ,  9258.907 , 11751.451 , 11605.847 ,
       30362.924 , 11562.319 , 10407.505 ,  9166.063 , 31238.156 ,
       22994.016 , 11206.7   , 28181.15  , 11764.5   ,  9150.771 ,
       13424.932 , 10903.867 , 18279.596 , 10992.513 , 14453.024 ,
       11623.646 , 27843.467 , 10843.992 , 11844.684 , 32653.71  ,
        9181.065 , 11451.893 , 11810.055 , 17480.783 , 13884.937 ,
       12145.141 ,  9141.5205, 17844.035 , 18053.953 , 12058.719 ,
       32211.568 , 35093.2   , 27835.518 , 11934.545 , 11401.584 ,
       16008.035 , 38054.746 , 20347.512 , 19310.63  , 16974.668 ,
       10425.066 , 35879.234 , 15660.899 , 12275.936 ,  9045.3

In [214]:
# Unscaled rows of the original frame at the last date of each test window (shuffled order).
rnn.data['test_df']

Unnamed: 0_level_0,open,high,low,close,macd,rsi,adx,cci
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-09-04 23:59:59,10230.365161,10663.920045,10207.940219,10511.813881,-78.711201,38.189130,28.484973,-272.471054
2021-02-04 23:59:59,37475.104029,38592.176377,36317.498807,36926.064465,872.621359,59.546088,20.355251,122.874586
2020-10-11 23:59:59,11296.081784,11428.813770,11288.627661,11384.181953,89.368976,64.009506,18.073818,259.512229
2020-09-03 23:59:59,11407.191077,11443.022795,10182.465086,10245.296686,-7.167567,33.543164,28.094500,-317.311693
2020-12-27 23:59:59,26439.373289,28288.840022,25922.769765,26272.294567,1912.261068,76.670115,45.242003,139.850374
...,...,...,...,...,...,...,...,...
2020-10-12 23:59:59,11392.636215,11698.467736,11240.686996,11555.362888,132.589755,66.924996,19.761232,236.516845
2021-01-09 23:59:59,40788.640523,41436.350006,38980.876906,40254.546498,4671.046568,87.525003,57.437068,153.128164
2020-08-18 23:59:59,12251.895970,12335.707070,11954.525749,11991.233246,468.045886,64.107326,56.711760,139.614782
2020-11-13 23:59:59,16276.440477,16463.177840,15992.152801,16317.808190,1009.187764,77.307544,53.838426,124.853368


In [215]:
# Attach the predictions to the test rows. test_df was de-duplicated by index in
# load_data(), so check that it still lines up one-to-one with the predictions.
# assign() returns a new frame, which avoids writing into a frame that was
# produced by boolean-mask indexing (SettingWithCopyWarning).
# Row i is the last date of test window i, so each yhat sits lookup_step rows
# before the date it actually forecasts.
assert len(y_pred) == len(rnn.data['test_df']), "test_df and y_pred differ in length"
rnn.data['test_df'] = rnn.data['test_df'].assign(yhat=y_pred)

In [216]:
# Put the (shuffled) test rows back into chronological order for plotting.
rnn.data['test_df'] = rnn.data['test_df'].sort_index()

In [217]:
# Overlay the test-set forecasts on the full closing-price history.
fig = go.Figure(data=[
    go.Scatter(x=rnn.data['df'].index, y=rnn.data['df']['close'], name='Actual'),
    go.Scatter(x=rnn.data['test_df'].index, y=rnn.data['test_df']['yhat'], name='Forecasts'),
])
fig.show()