In [1]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import gzip
from datetime import datetime, timedelta
from statistics import mean, median
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow
import tensorflow.keras as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LeakyReLU, BatchNormalization, ReLU, LSTM, Conv1D, Conv2D
from tensorflow.keras.activations import sigmoid, tanh
from tensorflow.keras.utils import to_categorical

from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm
import csv
import random

from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

In [2]:
def retrieve_data(varname, filename):
#     df = pd.read_csv(filename, index_col=0)
    df = pd.read_csv(filename)
#     display(df)
    df["Date"] = pd.to_datetime(df["Date"])
    return df

def create_classification_data(df, lookback):
    rows = []
    columns = ['Date', 'SP500_relative_change_perc_1'] # Date and SP500_relative_change_perc_1 from t-0 are added first as target variables 
    
    # create column names based on original with the addition of t-i where i is lookback
    for i in range(1, lookback + 1): # starts at 1 since we do not want t-0 variables apart from 'Date' and 'SP500_relative_change_perc_1'
        new_columns = df.columns.tolist()[1:] # starts at 1 to exclude 'Date' column
        for x in range(len(new_columns)):
            new_columns[x] = new_columns[x] + "_t-" + str(i)
        columns = columns + new_columns
    
    # create lookback data
    for i, row in enumerate(df.iterrows()):
        if i > lookback: # lookback cannot be determined for earlier rows
            new_row = [row[1][0], row[1][1]] # add target 'Date' and 'SP500_relative_change_perc_1 '
            for x in range(1, lookback + 1): # starts at 1 since we do not want t-0 variables apart from 'Date' and 'SP500_relative_change_perc_1'
                add_row = df.iloc[i - x].tolist()[1:] # starts at 1 to exclude 'Date' column
                new_row = new_row + add_row
            rows.append(new_row)
    df2 = pd.DataFrame(rows)
    df2.columns = columns
    return df2

def create_train_val_test(df, year_val, year_test, perc_train=None):
    if perc_train == None:
        # assumes years_train < year_val < year_test
        df["Date"] = pd.to_datetime(df["Date"])
        
        val = df[df['Date'].dt.year == year_val]
        test = df[df['Date'].dt.year == year_test]
        train = df[df['Date'].dt.year < year_val]
    else:
        train = df.head(round(len(df) * perc_train))
        val = df.tail(len(df) - len(train))
        test = val.tail(round(0.5 * len(val)))
        val = val.head(len(val) - len(test))
    y_train = train['SP500_relative_change_perc_1']
    x_train = train.drop(['SP500_relative_change_perc_1'], axis=1)
    
    y_val = val['SP500_relative_change_perc_1']
    x_val = val.drop(['SP500_relative_change_perc_1'], axis=1)
    
    y_test = test['SP500_relative_change_perc_1']
    x_test = test.drop(['SP500_relative_change_perc_1'], axis=1)
    
    return x_train, y_train, x_val, y_val, x_test, y_test

def scale_data(x):
    standard_scaler = MinMaxScaler()
    x = x.drop(["Date"], axis=1)
    x_scaled = pd.DataFrame(standard_scaler.fit_transform(x), columns=x.columns)
    return x_scaled

In [3]:
def label_data(y):
    positives = []
    negatives = []
    y = list(y)
    
    labels = []
    for dev in y:
        if dev >= 0:
            labels.append(1)
        else:
            labels.append(0)
    return labels

In [4]:
class LSTM_model(object):
    def __init__(self, x, activation_functions, batch_sizes):        
        self.activation_function = activation_functions[x[0]]
        
        self.lstm1 = x[1]
        self.lstm2 = x[2]
        
        self.dense1 = x[3]
        self.dense2 = x[4]

        self.dropout1 = x[5]
        self.dropout2 = x[6]
        
        self.epochs = x[7]
        self.batch_size = batch_sizes[x[8]]
        
        self.lookback = x[9]
        
    def fit(self):
        global df

        val_year = 2018
        test_year = 2019
        df_class = create_classification_data(df, self.lookback)
        x_train, y_train, x_val, y_val, x_test, y_test = create_train_val_test(df_class, val_year, test_year)


        y_train = label_data(y_train)
        y_val = label_data(y_val)
        y_test = label_data(y_test)

        train_date = x_train[['Date']]
        x_train = x_train.drop(['Date'], axis=1)
        val_date = x_val[['Date']]
        x_val = x_val.drop(['Date'], axis=1)
        test_date = x_test[['Date']]
        x_test = x_test.drop(['Date'], axis=1)

        x_train = np.asarray(x_train)
        x_val = np.asarray(x_val)
        x_test = np.asarray(x_test)
        y_train = np.asarray(y_train)
        y_val = np.asarray(y_val)
        y_test = np.asarray(y_test)

        x_train = x_train.reshape((x_train.shape[0], 1, x_train.shape[1]))
        x_val = x_val.reshape((x_val.shape[0], 1, x_val.shape[1]))
        x_test = x_test.reshape((x_test.shape[0], 1, x_test.shape[1]))

        y_train = y_train.reshape((y_train.shape[0], 1))
        y_val = y_val.reshape((y_val.shape[0], 1))
        y_test = y_test.reshape((y_test.shape[0], 1))

        tensorflow.random.set_seed(111)
        np.random.seed(111)
        random.seed(111)
        
        model = Sequential()
        model.add(LSTM(self.lstm1, dropout=self.dropout1, input_shape=(x_train.shape[1], x_train.shape[2]), return_sequences=True))
        if self.lstm2 > 0:
            model.add(LSTM(self.lstm2, dropout=self.dropout2, return_sequences=True))
        if self.dense1 > 0:
            model.add(Dense(self.dense1, activation=self.activation_function))
        if self.dense2 > 0:
            model.add(Dense(self.dense2, activation=self.activation_function))
        model.add(Dense(1, activation='sigmoid'))
        
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["acc"])
        history = model.fit(x_train, y_train, epochs=self.epochs, batch_size=self.batch_size, verbose=0, validation_data=(x_val, y_val), shuffle=False)
        
        val_loss = mean(history.history['val_acc'][-5:])
        return val_loss

In [5]:
def calculate_fitness(val_loss):
    fitness = val_loss
    return fitness

In [6]:
class EA(object):
    def __init__(self, population_size, activation_functions, batch_sizes):
        self.population_size = population_size
        self.a = 0.2
        self.activation_functions = activation_functions
        self.batch_sizes = batch_sizes
        
    def evaluate(self, x):
        """
        include in fitness function
            relative difference between train_loss and val_loss (smaller is better)
            number of layers (smaller is better)
            bottleneck size (smaller is better)
            val_loss (smaller is better)
        
        """
        lstm = LSTM_model(x, self.activation_functions, self.batch_sizes)
        val_loss = lstm.fit()
        fitness = calculate_fitness(val_loss)
        return fitness
    
    def select_triple(self, candidate, population):
        # select three random instances for differential evolution
        x1, x2, x3 = np.random.choice(range(len(population))), np.random.choice(range(len(population))), np.random.choice(range(len(population)))
        while candidate == x1 or candidate == x2 or candidate == x3 or x1 == x2 or x2 == x3 or x1 == x3:
            # keep selecting new ones until candidate != x1 != x2 != x3
            x1, x2, x3 = np.random.choice(range(len(population))), np.random.choice(range(len(population))), np.random.choice(range(len(population)))
        return population[x1], population[x2], population[x3]
    
    def mutate(self, x1, x2, x3):
        mutated = x1 + (self.a * (x3 - x2))
#         print(f"mutated {mutated}")
        # activation function
        mutated[0] = round(mutated[0])
        mutated[0] = min(mutated[0], len(self.activation_functions) - 1)
        mutated[0] = max(0, mutated[0])
        
        # lstm layer 1
        mutated[1] = round(mutated[1])
        mutated[1] = max(1, mutated[1]) # must be at least one

        # lstm layer 2
        mutated[2] = round(mutated[2])
        mutated[2] = max(0, mutated[2])

        # dense layer 1
        mutated[3] = round(mutated[3])
        mutated[3] = max(4, mutated[3])
        
        # dense layer 2
        mutated[4] = round(mutated[4])
        mutated[4] = max(4, mutated[4])

        # dropout lstm layer 1
        mutated[5] = max(0.05, mutated[5])
        mutated[5] = min(0.95, mutated[5])
        mutated[5] = round(mutated[5],2)
        
        # dropout lstm layer 2
        mutated[6] = max(0.05, mutated[6])
        mutated[6] = min(0.95, mutated[6])
        mutated[6] = round(mutated[6],2)
        
        # epochs
        mutated[7] = round(mutated[7])
        mutated[7] = max(6, mutated[7])

        # batch size
        mutated[8] = round(mutated[8])
        mutated[8] = min(mutated[8], len(self.batch_sizes) - 1)
        mutated[8] = max(0, mutated[8])
        
        mutated[9] = round(mutated[9])
        mutated[9] = max(1, mutated[9])
        
        return mutated
        
    def recombine(self, candidate, mutation):
        for i in range(candidate.shape[0]):
            prob = np.random.randint(0, 2)
            if prob == 1:
                candidate[i] = mutation[i]
        return candidate

    def select(self, x_new, f_new, x_old, f_old):
        x_cat = np.concatenate([x_new, x_old], 0)
        f_cat = np.concatenate([f_new, f_old])
        ind = np.argsort(f_cat)
        x = x_cat[ind]
        f = f_cat[ind]
        return x[-self.population_size:], f[-self.population_size:]
    
    def step(self, x_old, f_old):
        x = np.copy(x_old)
        f = np.copy(f_old)
        for i in tqdm(range(self.population_size), total=self.population_size):
            # choose candidate
            candidate = x[i]
            # select 3 instances for differential evolution
            x1, x2, x3 = self.select_triple(i, x)
            # mutate 3 instances
            mutated_triple = self.mutate(x1, x2, x3)
            # recombine candidate with mutation
            candidate = self.recombine(candidate, mutated_triple)
            x[i] = candidate
            # evaluate candidate solution
            f_candidate = self.evaluate(candidate)
            f[i] = f_candidate
        # select survivors
        x, f = self.select(x, f, x_old, f_old)
        return x, f

In [7]:
def init_population(population_size, activation_functions, batch_sizes):
    # generate initial population
    population = []
    print("Creating initial population...")
    for i in tqdm(range(population_size), total=population_size):
        activation_function = random.randint(0, len(activation_functions) - 1)
        
        lookback = random.randint(1, 20)
        
        variables = lookback * 18
        
        lstm1 = random.randint(1, 1.5 * variables)
        lstm2 = random.randint(0, 1.5 * variables)
        dense1 = random.randint(4, 1.5 * variables)
        dense2 = random.randint(4, 1.5 * variables)
        
        dropout1 = round(random.uniform(0.05, 0.95),2)
        dropout2 = round(random.uniform(0.05, 0.95),2)

        epochs = random.randint(10, 500)
        
        batch_size = random.randint(0, len(batch_sizes) - 1)
    
        population.append(np.asarray([activation_function, lstm1, lstm2, dense1, dense2, dropout1, dropout2, epochs, batch_size, lookback], dtype='object'))
    print("Initial population ready")
    return np.asarray(population)

def evaluate_init_population(ea, x):
    # evaluate initial population
    f = []
    print("Evaluating initial population...")
    for i in tqdm(range(x.shape[0]), total=x.shape[0]):
        instance = x[i]
        f.append(ea.evaluate(instance))
    print("Evaluation initial population completed")
    return np.asarray(f)

def print_best(x, activation_functions, batch_sizes, fitness):
    print(f"\nMost suitable parameters -- Accuracy of {fitness}:")
    print(f"\tActivation function:           \t{activation_functions[x[0]]}")
    print(f"\tLSTM nodes layer 1:            \t{x[1]}")
    print(f"\tLSTM nodes layer 2:            \t{x[2]}")
    print(f"\tDense nodes layer 1:           \t{x[3]}")
    print(f"\tDense nodes layer 2:           \t{x[4]}")
    print(f"\tDropout LSTM layer 1:          \t{x[5]}")
    print(f"\tDropout LSTM layer 2:          \t{x[6]}")
    print(f"\tEpochs trained:                \t{x[7]}")
    print(f"\tBatch Size:                    \t{batch_sizes[x[8]]}")
    print(f"\tLookback:                      \t{x[9]}")

def plot_convergence(f_best):
    fig1 = make_subplots(rows=1, cols=1, specs=[[{'type':'xy'}]])
    
    x_values = []
    for i in range(len(f_best)):
        x_values.append(i)
    fig1.add_trace(go.Scatter(x=x_values, y=f_best, mode="lines"), row=1, col=1)

    fig1.update_layout(
        title = f'Validation Accuracy Over LSTM Tuning Generations', 
        xaxis1 = dict(title_text = 'Generation'),
        yaxis1 = dict(title_text = "Validation Accuracy")
    )
    fig1.write_image("Plots/opt lstm 5 binary updown SP500 all.png")
    fig1.show()

def validate_best(x, ea):
    print("\nValidating solution...")
    ea.evaluate(x)
    print("Solution validated")

In [None]:
df = retrieve_data("SP500", "Dataset v3/SP500_combined_data_20220422.csv")
cols = []
relevant = 'SP500'
for col in df.columns.tolist():
    if col[:len(relevant)] == relevant:
        cols.append(col)
        
print(cols)

population_size = 30
generations = 30
activation_functions = ['sigmoid', 'tanh']
batch_sizes = [64, 128, 256]

ea = EA(population_size, activation_functions, batch_sizes)
x = init_population(population_size, activation_functions, batch_sizes)
f = evaluate_init_population(ea, x)

populations = []
populations.append(x)
f_best = [f.max()]

start_time = datetime.now()

print("--> STARTING EVOLUTION")
early_stop = 0
for i in range(generations):
    print(f'Generation: {i}\tBest fitness: {f.max()}')
    x, f = ea.step(x, f)
    print(x)
    populations.append(x)

    if f.max() > f_best[-1]:
        f_best.append(f.max())
        early_stop = 0
    else:
        f_best.append(f_best[-1])
        early_stop += 1
    if early_stop == 5:
        print("Early stop triggered at generation {i} after not improving fitness for three generations")
        break
print("--> EVOLUTION FINISHED")

end_time = datetime.now()
evolution_time = end_time - start_time
evolution_time_seconds = evolution_time.total_seconds()
print(f"\nElapsed time in minutes: {evolution_time_seconds/60}")

print(f)
print(f.min())
index_best_parameters = np.where(f == f.max())[0][0]
print(index_best_parameters)
print_best(x[index_best_parameters], activation_functions, batch_sizes, f.max())
validate_best(x[index_best_parameters], ea)
plot_convergence(f_best)

100%|███████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 44971.09it/s]
  0%|                                                                                           | 0/30 [00:00<?, ?it/s]

['SP500_AD_MACD_12_26', 'SP500_AD_oscillator', 'SP500_ATR10', 'SP500_ATR20', 'SP500_ATR5', 'SP500_ATR50', 'SP500_EMA10', 'SP500_EMA20', 'SP500_EMA5', 'SP500_EMA50', 'SP500_F_Volume', 'SP500_F_relative_change_perc_1', 'SP500_F_relative_change_perc_10', 'SP500_F_relative_change_perc_20', 'SP500_F_relative_change_perc_5', 'SP500_F_relative_change_perc_50', 'SP500_MA10', 'SP500_MA20', 'SP500_MA5', 'SP500_MA50', 'SP500_OBV', 'SP500_OBV_stdev_10', 'SP500_OBV_stdev_20', 'SP500_OBV_stdev_5', 'SP500_OBV_stdev_50', 'SP500_RSI_14', 'SP500_RSI_28', 'SP500_Volume', 'SP500_bollinger_high_10', 'SP500_bollinger_high_20', 'SP500_bollinger_high_5', 'SP500_bollinger_high_50', 'SP500_bollinger_low_10', 'SP500_bollinger_low_20', 'SP500_bollinger_low_5', 'SP500_bollinger_low_50', 'SP500_bollinger_middle_10', 'SP500_bollinger_middle_20', 'SP500_bollinger_middle_5', 'SP500_bollinger_middle_50', 'SP500_disparity_10', 'SP500_disparity_20', 'SP500_disparity_5', 'SP500_disparity_50', 'SP500_momentum_16', 'SP500_m

2022-05-02 11:21:15.018120: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-05-02 11:21:15.018603: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-02 11:21:15.545311: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
100%|███████████████████████████████████████████████████████████████████████████████| 30/30 [1:28:03<00:00, 176.11s/it]
  0%|                                                                                           | 0/30 [00:00<?, ?it/s]

Evaluation initial population completed
--> STARTING EVOLUTION
Generation: 0	Best fitness: 1.0


100%|███████████████████████████████████████████████████████████████████████████████| 30/30 [3:32:42<00:00, 425.41s/it]
  0%|                                                                                           | 0/30 [00:00<?, ?it/s]

[[1 466 386 178 69 0.85 0.41 388 2 18]
 [1 466 137 153 69 0.59 0.06 402 1 18]
 [1 466 155 132 69 0.19 0.06 459 0 18]
 [0 466 391 54 69 0.52 0.06 56 2 18]
 [1 466 181 204 69 0.37 0.06 403 0 18]
 [0 19 7 231 311 0.68 0.42 321 1 12]
 [0 466 35 10 69 0.38 0.05 81 0 18]
 [1 466 62 35 69 0.5 0.05 98 2 18]
 [1 466 428 318 69 0.35 0.05 204 2 18]
 [1 466 294 452 69 0.46 0.05 487 0 18]
 [0 466 65 16 69 0.59 0.05 112 0 18]
 [1 466 7 4 69 0.47 0.05 491 2 18]
 [0 501 7 500 38 0.68 0.41 321 1 19]
 [0 192 7 304 157 0.68 0.41 321 1 17]
 [0 466 0 39 69 0.77 0.41 356 1 18]
 [1 199 47 147 106 0.43 0.14 59 2 9]
 [1 466 91 39 69 0.58 0.41 445 2 18]
 [0 466 103 427 69 0.77 0.41 313 2 18]
 [1 466 168 99 69 0.8 0.41 451 2 18]
 [1 466 248 312 69 0.17 0.41 271 0 18]
 [1 466 118 134 69 0.3 0.41 435 2 18]
 [1 466 53 24 69 0.59 0.41 159 1 18]
 [0 466 105 45 69 0.29 0.41 362 1 18]
 [1 466 32 124 69 0.52 0.41 473 0 18]
 [0 466 2 7 69 0.85 0.41 257 1 18]
 [1 466 10 86 69 0.47 0.41 415 2 18]
 [1 466 169 131 69 0.11 0.

  7%|█████▎                                                                          | 2/30 [14:51<3:34:50, 460.36s/it]

In [None]:
plot_convergence(f_best)