In [None]:
# Load the TensorBoard notebook extension

#Uncomment to use it in colab and monitor to tendorboard
""""%load_ext tensorboard

import tensorflow as tf
import datetime"""

##Data preparation

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

Define the path to load all locally stored csv with data from yahoo:

In [5]:
path = os.getcwd()+"/exper_files/datasets/" #Define where are the datasets

Create the whole dataset: 

In [144]:
def dataset_creation(crypto_list, pth):
    datasets = {} #Store all datasets here
    dates = {} #Store all minimum dates here
    datasets_list = os.listdir(pth) #Find all dataset
    
    crypto_list = [elem+'.csv' for elem in crypto_list]

    for dataset in datasets_list:
        if dataset.endswith(".csv") and (dataset) in crypto_list:

            name = dataset.split(".")
            dataset_name = name[0]

            datasets[dataset_name] = pd.read_csv(path + dataset) 

            datasets[dataset_name]['Date'] = pd.to_datetime(datasets[dataset_name]['Date'])
            datasets[dataset_name].fillna(method='ffill', inplace=True)

            #Create new columns 'close_off_high' and 'volatility' in order to make predictions more accurate:
            kwards = {'close_off_high': lambda x: 2 * (x['High'] - x['Close']) / (x['High'] - x['Low']) - 1,
              'volatility': lambda x: (x['High'] - x['Low']) / (x['Open'])
              }

            datasets[dataset_name] = datasets[dataset_name].assign(**kwards)

            first_date = pd.to_datetime(datasets[dataset_name]['Date'][0])
            dates[dataset_name]=first_date

    max_date = max(dates.values(), key=lambda v: v)

    #Drop all the data which are prior to max_date
    for dataset in datasets:
        datasets[dataset] = datasets[dataset][datasets[dataset]['Date'] >= max_date]


    #Compute the average and standard deviation of 'Close' value for the last 7-days and 30-days(month): 
    for dataset in datasets:

        temp = datasets[dataset].copy()

        #Drop the first 30 days to be able to compute average and standard deviation of month for the rows of the table
        temp = temp[29:]

        temp['mean_7days_Close'] = datasets[dataset]['Close'].rolling(window=7).mean()

        temp['mean_month_Close'] = datasets[dataset]['Close'].rolling(window=30).mean()

        temp['std_7days_Close'] = datasets[dataset]['Close'].rolling(window=7).std()

        temp['std_month_Close'] = datasets[dataset]['Close'].rolling(window=30).std()

        datasets[dataset] = temp.copy()
        



    #Rename the columns
    count = 0
    for dataset in datasets:

        datasets[dataset] = datasets[dataset].rename(columns={'Close':dataset+'_Close', 'Open':dataset+'_Open', 
                                                              'High':dataset+'_High', 'Low':dataset+'_Low', 
                                                              'Adj Close':dataset+'_Adj_Close', 
                                                              'Volume':dataset+'_Volume', 
                                                              'close_off_high':dataset+'_close_off_high',
                                                              'volatility':dataset+'_volatility',
                                                              'mean_7days_Close':dataset+'_mean_7days_Close',
                                                              'mean_month_Close':dataset+'_mean_month_Close',
                                                              'std_7days_Close':dataset+'_std_7days_Close',
                                                              'std_month_Close':dataset+'_std_month_Close'})
        
        
        if count == 0:
            
            date_col = (datasets[dataset]['Date'].reset_index()).drop(['index'], axis=1)
            
            
        datasets[dataset] = ((datasets[dataset].drop(['Date'], axis=1)).reset_index()).drop(['index'], axis=1)    
                    
        

    whole_dataset = pd.concat([datasets[dataset] for dataset in datasets], axis=1)
    whole_dataset = pd.concat([date_col, whole_dataset], axis=1)
    
   
    return whole_dataset

Split data into training, validation and test set:

In [57]:
def split_data(perc_train_set, perc_val_set, currency_data):
        
        #Compute the date to split the dataset into training and validation_test set based on 'perc_train_set'
        splt_date_train = currency_data.iloc[round(currency_data.shape[0] * perc_train_set)]['Date']
        
        #Split the dataset into trainning and validation_test set
        tr_set, val_tst_set = currency_data[currency_data['Date'] < splt_date_train], \
                         currency_data[currency_data['Date'] >= splt_date_train]
        
        #Compute the date to split the val_tst_set into validation and test set based on 'perc_val_set'
        splt_date_val = val_tst_set.iloc[round(val_tst_set.shape[0] * perc_val_set)]['Date']

        #Split the val_tst_set into validation and test set        
        val_set, tst_set = val_tst_set[val_tst_set['Date'] < splt_date_val], \
                            val_tst_set[val_tst_set['Date'] >= splt_date_val]
        
        return tr_set, val_set, tst_set, splt_date_train, splt_date_val

Normalize training, validation and test inputs and outputs with sliding window:

In [104]:
def normalize_in_out(prd_range, wind_len, tr_set, val_set, tst_set, feats, coin_targ, crypto_list):
    
    all_feats = tr_set.columns #Get all features
    feats = [crypto+"_"+feat for crypto in crypto_list for feat in feats] #Get the features in the appropriate format 
                                                                          #(e.g 'Close' --> 'BTC-USD_Close')
    
    #Normalize training inputs
    LSTM_tr_in = []
    for i in range(len(tr_set) - wind_len):
        tmp_set = tr_set[i:(i + wind_len)].copy()
        
        for col in all_feats:
            if col not in feats:
                tmp_set = tmp_set.drop([col], axis=1) #Drop the feature that will not be used

        for col in feats:
            tmp_set[:][col] = tmp_set[col] / tmp_set[col].iloc[0] - 1 #Normalize the feature that will be used

        LSTM_tr_in.append(tmp_set)
    
    #Transform from DataFrame to numpy array
    LSTM_tr_in = [np.array(LSTM_tr_i) for LSTM_tr_i in LSTM_tr_in]
    LSTM_tr_in = np.array(LSTM_tr_in)
    
    
    #Normalize validation inputs
    LSTM_val_in = []
    for i in range(len(val_set) - wind_len):
        tmp_set = val_set[i:(i + wind_len)].copy()
        
        for col in all_feats:
            if col not in feats:
                tmp_set = tmp_set.drop([col], axis=1) #Drop the feature that will not be used

        for col in feats:
            tmp_set[:][col] = tmp_set[col] / tmp_set[col].iloc[0] - 1 #Normalize the feature that will be used
    
        LSTM_val_in.append(tmp_set)
        
    #Transform from DataFrame to numpy array
    LSTM_val_in = [np.array(LSTM_val_i) for LSTM_val_i in LSTM_val_in]
    LSTM_val_in = np.array(LSTM_val_in)
    
    
    #Normalize test inputs
    LSTM_test_in = []
    for i in range(len(tst_set) - wind_len):
        tmp_set = tst_set[i:(i + wind_len)].copy() 
        
        for col in all_feats:
            if col not in feats:
                
                tmp_set = tmp_set.drop([col], axis=1) #Drop the feature that will not be used

        for col in feats:
            tmp_set[:][col] = tmp_set[col] / tmp_set[col].iloc[0] - 1 #Normalize the feature that will be used

        LSTM_test_in.append(tmp_set)
    
    
    #Transform from DataFrame to numpy array
    LSTM_test_in = [np.array(LSTM_test_i) for LSTM_test_i in LSTM_test_in]
    LSTM_test_in = np.array(LSTM_test_in)
    
    
    #Normalize training outputs
    LSTM_rangd_train_out = []
    for i in range(wind_len, len(tr_set[coin_targ+'_Close']) - prd_range):
        LSTM_rangd_train_out.append((tr_set[coin_targ+'_Close'][i:i+prd_range].values/tr_set[coin_targ+'_Close'][tr_set.index[0]+i-wind_len]) - 1)

    LSTM_rangd_train_out = np.array(LSTM_rangd_train_out)
    
    
    #Normalize validation outputs
    LSTM_rangd_val_out = []
    for i in range(wind_len, len(val_set[coin_targ+'_Close']) - prd_range):
        LSTM_rangd_val_out.append((val_set[coin_targ+'_Close'][i:i+prd_range].values/val_set[coin_targ+'_Close'][val_set.index[0]+i-wind_len]) - 1)
    
    LSTM_rangd_val_out = np.array(LSTM_rangd_val_out)
    
    return LSTM_rangd_train_out, LSTM_rangd_val_out, LSTM_tr_in, LSTM_val_in, LSTM_test_in

Define model

In [116]:
from keras.models import Sequential
from keras.layers import Activation, Dense
from keras.layers import LSTM
from keras.layers import Dropout
import tensorflow as tf


def build_model(inputs, output_size, neurons, activ_func="linear",
                dropout=0.25, loss="mae", optimizer="adam"):
    model = Sequential()
    model.add(LSTM(neurons, input_shape=(inputs.shape[1], inputs.shape[2])))
   
    
    model.add(Dropout(dropout))
    
    model.add(Dense(units=output_size))
    model.add(Activation(activ_func))

    model.compile(loss=loss, optimizer=optimizer)
    return model

Bulid and train model:

In [133]:
def build_and_train_model(epchs, btch_size, neurs, dropout, prd_range, LSTM_train_in, LSTM_rangd_train_out, LSTM_valid_in, LSTM_rangd_valid_out, shffl, verb, early_st_pat""", tbd_ck"""):

    rnged_btcoin_model = build_model(LSTM_train_in, output_size=prd_range, neurons=neurs, dropout=dropout)
    
    np.random.seed(202)
    
    callbacks = ["""tbd_ck, """tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=early_st_pat)]

    rnged_hist = rnged_btcoin_model.fit(LSTM_train_in[:-prd_range], LSTM_rangd_train_out,
                                      validation_data=(LSTM_valid_in[:-prd_range], LSTM_rangd_valid_out), 
                                        epochs=epchs, batch_size=btch_size, verbose=verb,  callback=callbacks, 
                                        shuffle=shffl)
    
    return rnged_btcoin_model, rnged_hist

In [None]:
#Get data
crypto_list = [['BTC-USD', 'ETH-USD'], ['BTC-USD']] #Define the sets of cryptocurrencies to be tested
for cryptocurrency_list in crypto_list:
      
    print('Using cryptocurrencies: '+str(cryptocurrency_list))
    data = dataset_creation(cryptocurrency_list, path)

    #Split data
    percent_train_set = 0.8
    percent_val_set = 0.5

    training_set, validation_set, test_set, split_date_train, split_date_valid = split_data(percent_train_set, 
                                                                                           percent_val_set, data)

    
    features_list = [['Close'], ['Close', 'Volume'], ['Close', 'Open', 'High'],
                     ['Close', 'close_off_high', 'volatility'],
                     ['Close', 'mean_7days_Close', 'mean_month_Close'],
                     ['Close', 'std_7days_Close', 'std_month_Close']] #Define the sets of features to be tested 
    
    for featurs in features_list: 
    
        print('\tUsing the features: '+str(featurs))
        #Create inputs and outputs for the model training, validation and testing
        pred_range = 5
        window_len = 10
        features = featurs
        coin_target = 'BTC-USD'

        LSTM_ranged_training_outputs, LSTM_ranged_validation_outputs, LSTM_training_inputs, LSTM_validation_inputs, LSTM_test_inputs = normalize_in_out(
                                                                                                      pred_range, window_len, 
                                                                                                      training_set, 
                                                                                                      validation_set, 
                                                                                                      test_set, 
                                                                                                      features, coin_target,
                                                                                                      cryptocurrency_list)

        batch_size_list = [1, 32, 64]
        neuron_list = [20, 40, 60, 100]
        dropout_list = [0.2, 0.25, 0.3, 0.4]

        for bat_s, neur, drop in [(bat_s, neur, drop) for bat_s in batch_size_list for neur in neuron_list for drop in dropout_list]:

            #Build and train model
            epochs = 100
            batch_size = bat_s
            neurons = neur
            dropout = drop
            early_stop_patience = 10
            shuffle = True
            verbose = 0

            print('\t\tBatch_size: '+str(batch_size)+" Neurons: "+str(neurons)+" Dropout: "+str(dropout))
            
            #Unocomment to monitor in tensorboard
            """log_dir = "logs/fit/" + str(cryptocurrency_list) + '/' + str(featurs) + '/' + 'Batch_size:'+str(batch_size)+"_Neurons:"+str(neurons)+"_Dropout:"+str(dropout)
            tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
            """
            
            ranged_btcoin_model, ranged_hist = build_and_train_model(epochs, batch_size, neurons, dropout, pred_range, 
                                                                     LSTM_training_inputs, LSTM_ranged_training_outputs, 
                                                                     LSTM_validation_inputs, LSTM_ranged_validation_outputs, 
                                                                     shuffle, verbose, early_stop_patience", tensorboard_callback")

            print('\t\tBest model found in epoch ' + str(ranged_hist.epoch[-early_stop_patience]) +' with validation loss: '+str(ranged_hist.history['val_loss'][-early_stop_patience]))

Using cryptocurrencies: ['BTC-USD', 'ETH-USD']
	Using the features: ['Close']
		Batch_size: 1 Neurons: 20 Dropout: 0.2
		Best model found in epoch 7 with validation loss: 0.03555181995034218
		Batch_size: 1 Neurons: 20 Dropout: 0.25
		Best model found in epoch 7 with validation loss: 0.03686963766813278
		Batch_size: 1 Neurons: 20 Dropout: 0.3
		Best model found in epoch 8 with validation loss: 0.03547072410583496
		Batch_size: 1 Neurons: 20 Dropout: 0.4
		Best model found in epoch 11 with validation loss: 0.035500068217515945
		Batch_size: 1 Neurons: 40 Dropout: 0.2
		Best model found in epoch 5 with validation loss: 0.03589186072349548
		Batch_size: 1 Neurons: 40 Dropout: 0.25
		Best model found in epoch 4 with validation loss: 0.03716958314180374
		Batch_size: 1 Neurons: 40 Dropout: 0.3
		Best model found in epoch 14 with validation loss: 0.03564949333667755
		Batch_size: 1 Neurons: 40 Dropout: 0.4
		Best model found in epoch 6 with validation loss: 0.036535829305648804
		Batch_size

		Best model found in epoch 45 with validation loss: 0.03539042919874191
		Batch_size: 32 Neurons: 40 Dropout: 0.4
		Best model found in epoch 41 with validation loss: 0.03535907715559006
		Batch_size: 32 Neurons: 60 Dropout: 0.2
		Best model found in epoch 36 with validation loss: 0.035388536751270294
		Batch_size: 32 Neurons: 60 Dropout: 0.25
		Best model found in epoch 30 with validation loss: 0.0356292799115181
		Batch_size: 32 Neurons: 60 Dropout: 0.3
		Best model found in epoch 30 with validation loss: 0.03562966734170914
		Batch_size: 32 Neurons: 60 Dropout: 0.4
		Best model found in epoch 45 with validation loss: 0.03570662811398506
		Batch_size: 32 Neurons: 100 Dropout: 0.2
		Best model found in epoch 28 with validation loss: 0.035730741918087006
		Batch_size: 32 Neurons: 100 Dropout: 0.25
		Best model found in epoch 28 with validation loss: 0.03568040207028389
		Batch_size: 32 Neurons: 100 Dropout: 0.3
		Best model found in epoch 41 with validation loss: 0.0355195589363575
		

In [None]:
#Uncomment to monitor in tensorboard
#%tensorboard --logdir log_dir