In [1]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import gzip
from datetime import datetime, timedelta
from statistics import mean, median
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow
import tensorflow.keras as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LSTM, Conv1D, Conv2D, Dropout
from tensorflow.keras.activations import sigmoid, tanh
from tensorflow.keras.utils import to_categorical

from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm
import csv
import random

from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

In [2]:
def extract_model(filename):
    """Load a previously saved Keras model.

    Parameters
    ----------
    filename : str
        Path handed straight to ``tf.models.load_model`` (``tf`` is the
        notebook's alias for ``tensorflow.keras``).

    Returns
    -------
    The model object produced by ``load_model``.
    """
    return tf.models.load_model(filename)
    
def retrieve_data(filename):
    """Read a CSV dataset and parse its ``Date`` column.

    Parameters
    ----------
    filename : str
        Path of the CSV file; its first column is used as the row index.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``Date`` converted by ``pd.to_datetime``.
    """
    return pd.read_csv(filename, index_col=0).assign(
        Date=lambda frame: pd.to_datetime(frame["Date"])
    )

def create_classification_data(df, lookback, column):
    """Build a lagged ("lookback") feature table for one target column.

    Every output row contains the date and the ``column`` value of one source
    row (t-0), followed by all non-date columns of the ``lookback`` preceding
    rows, nearest lag first. Lagged columns are named ``<original>_t-<i>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. The first column is treated as the date and every other
        column is used as a lag feature. Lags are taken by row position, so
        rows are presumably in chronological order (TODO confirm with the
        dataset).
    lookback : int
        Number of preceding rows to append to each output row.
    column : str
        Name of the target column whose t-0 value is kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``['Date', column, <lag columns...>]`` with a fresh integer
        index. When ``df`` has too few rows the result is empty but still
        carries the full column list.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    """
    feature_names = df.columns.tolist()[1:]  # everything except the leading date column

    columns = ['Date', column]
    for lag in range(1, lookback + 1):  # t-0 features are deliberately excluded
        columns.extend(name + "_t-" + str(lag) for name in feature_names)

    # Pull the data out of pandas once; per-row iterrows()/iloc access is slow and
    # integer keys on a label-indexed row Series are deprecated in pandas 2.x.
    dates = df.iloc[:, 0].tolist()
    targets = df[column].tolist()  # by name, so the values always match the header
    features = [
        list(values)
        for values in df.iloc[:, 1:].itertuples(index=False, name=None)
    ]

    # NOTE(review): the first usable row is ``lookback + 1`` (the original
    # ``i > lookback`` test), although row ``lookback`` already has a complete
    # history. Kept unchanged so datasets match the ones built previously.
    first_row = max(lookback + 1, 0)

    rows = []
    for i in range(first_row, len(df)):
        new_row = [dates[i], targets[i]]
        for lag in range(1, lookback + 1):
            new_row.extend(features[i - lag])
        rows.append(new_row)

    # Passing ``columns`` here (instead of assigning afterwards) also works when
    # ``rows`` is empty.
    return pd.DataFrame(rows, columns=columns)

def create_train(df, year_val, year_test, column, perc_train=None):
    """Return the training split: all rows dated strictly before ``year_val``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column and the target ``column``. It is not
        modified; the date conversion happens on a copy.
    year_val : int
        Validation year; rows from earlier years form the training split.
    year_test : int
        Unused here, kept for a signature parallel to the other split helpers.
    column : str
        Target column name; only checked for presence.
    perc_train : optional
        Unused, kept for interface compatibility.

    Returns
    -------
    pandas.DataFrame
        The selected rows (original index labels, all columns) with ``Date``
        converted by ``pd.to_datetime``.

    Raises
    ------
    KeyError
        If ``Date`` or ``column`` is missing from ``df``.
    """
    if column not in df.columns:
        raise KeyError(column)
    # assign() works on a copy, so the caller's frame is left untouched and no
    # SettingWithCopyWarning is triggered when ``df`` is itself a slice.
    dated = df.assign(Date=pd.to_datetime(df["Date"]))
    return dated[dated["Date"].dt.year < year_val]


def create_val(df, year_val, year_test, column, perc_train=None):
    """Return the validation split: all rows whose date falls in ``year_val``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column and the target ``column``. It is not
        modified; the date conversion happens on a copy.
    year_val : int
        Calendar year selected for validation.
    year_test : int
        Unused here, kept for a signature parallel to the other split helpers.
    column : str
        Target column name; only checked for presence.
    perc_train : optional
        Unused, kept for interface compatibility.

    Returns
    -------
    pandas.DataFrame
        The selected rows (original index labels, all columns) with ``Date``
        converted by ``pd.to_datetime``.

    Raises
    ------
    KeyError
        If ``Date`` or ``column`` is missing from ``df``.
    """
    if column not in df.columns:
        raise KeyError(column)
    # assign() works on a copy, so the caller's frame is left untouched and no
    # SettingWithCopyWarning is triggered when ``df`` is itself a slice.
    dated = df.assign(Date=pd.to_datetime(df["Date"]))
    return dated[dated["Date"].dt.year == year_val]

def create_test(df, year_val, year_test, column, perc_train=None):
    """Return the test split: all rows whose date falls in ``year_test``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column and the target ``column``. It is not
        modified; the date conversion happens on a copy.
    year_val : int
        Unused here, kept for a signature parallel to the other split helpers.
    year_test : int
        Calendar year selected for testing.
    column : str
        Target column name; only checked for presence.
    perc_train : optional
        Unused, kept for interface compatibility.

    Returns
    -------
    pandas.DataFrame
        The selected rows (original index labels, all columns) with ``Date``
        converted by ``pd.to_datetime``.

    Raises
    ------
    KeyError
        If ``Date`` or ``column`` is missing from ``df``.
    """
    if column not in df.columns:
        raise KeyError(column)
    # assign() works on a copy, so the caller's frame is left untouched and no
    # SettingWithCopyWarning is triggered when ``df`` is itself a slice.
    dated = df.assign(Date=pd.to_datetime(df["Date"]))
    return dated[dated["Date"].dt.year == year_test]

def create_full(df, year_val, year_test, column, perc_train=None):
    """Return the whole dataset (no year filtering) with ``Date`` parsed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column and the target ``column``. It is not
        modified; a converted copy is returned instead.
    year_val, year_test : int
        Unused, kept for a signature parallel to the other split helpers.
    column : str
        Target column name; only checked for presence.
    perc_train : optional
        Unused, kept for interface compatibility.

    Returns
    -------
    pandas.DataFrame
        A new frame with every row and column of ``df`` and ``Date`` converted
        by ``pd.to_datetime``.

    Raises
    ------
    KeyError
        If ``Date`` or ``column`` is missing from ``df``.
    """
    if column not in df.columns:
        raise KeyError(column)
    # assign() returns a new frame, so the caller's object is never mutated.
    return df.assign(Date=pd.to_datetime(df["Date"]))

In [3]:
def extract_data(instruments, data, mode):
    """Replace each instrument's datasets in ``data`` with the requested split.

    Parameters
    ----------
    instruments : iterable of str
        Symbols; for each one the keys ``<symbol>_dir`` and ``<symbol>_mag``
        must exist in ``data``.
    data : dict
        Mapping of dataset name to DataFrame. It is updated in place.
    mode : str
        One of ``"train"``, ``"val"``, ``"test"`` or ``"full"``. Any other
        value leaves ``data`` unchanged.

    Returns
    -------
    dict
        The same ``data`` object that was passed in.
    """
    year_val, year_test = 2018, 2019  # fixed validation / test years

    def split(frame, target):
        # The splitter name is looked up only for the branch that is taken.
        if mode == "train":
            return create_train(frame, year_val, year_test, target)
        if mode == "val":
            return create_val(frame, year_val, year_test, target)
        if mode == "test":
            return create_test(frame, year_val, year_test, target)
        return create_full(frame, year_val, year_test, target)

    known_modes = ("train", "val", "test", "full")
    for symbol in instruments:
        target = symbol + "_relative_change_perc_1"
        keys = (symbol + "_dir", symbol + "_mag")
        # Both frames are read before anything is written back.
        frames = [data[key] for key in keys]
        if mode not in known_modes:
            continue
        for key, frame in zip(keys, frames):
            data[key] = split(frame, target)
    return data

In [4]:
def init_models_data():
    """Load the six saved models and build their lagged input datasets.

    For each of SP500, US30 and NASDAQ there is a "mag" (large/small move)
    model and a "dir" (up/down) model, each with its own lookback length.
    Every CSV is read once and reused for both datasets of that instrument.

    Returns
    -------
    tuple
        ``(instruments, data, models)`` where ``instruments`` is the list of
        symbols, and ``data`` / ``models`` are dicts keyed
        ``<symbol>_mag`` / ``<symbol>_dir``.
    """
    instruments = ['SP500', 'NASDAQ', 'US30']

    # Order in which models/datasets are loaded and inserted into the dicts.
    load_order = ['SP500', 'US30', 'NASDAQ']

    # (key suffix, model file suffix, lookback per symbol)
    specs = (
        ('mag', 'LSTM_large-small_model', {'SP500': 2, 'US30': 12, 'NASDAQ': 18}),
        ('dir', 'NN_up-down_model', {'SP500': 12, 'US30': 10, 'NASDAQ': 10}),
    )

    models = {
        f"{symbol}_{kind}": extract_model(f"Models/{symbol}_{file_suffix}")
        for kind, file_suffix, _ in specs
        for symbol in load_order
    }

    # Parse each CSV a single time; create_classification_data only reads from
    # the frame, so the same object can feed both the mag and dir datasets.
    raw = {
        symbol: retrieve_data(f"Dataset v3/{symbol}_reduced_data_20220425.csv")
        for symbol in load_order
    }

    data = {
        f"{symbol}_{kind}": create_classification_data(
            raw[symbol], lookbacks[symbol], f"{symbol}_relative_change_perc_1"
        )
        for kind, _, lookbacks in specs
        for symbol in load_order
    }
    return instruments, data, models

In [5]:
# Load the saved models and build the lagged datasets used by the cells below.
instruments, data, models = init_models_data()

2022-05-10 17:39:28.046944: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-05-10 17:39:28.047977: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
def label_data_dir(y):
    """Label each value by its sign.

    Parameters
    ----------
    y : iterable
        Values convertible with ``float``.

    Returns
    -------
    list of int
        ``1`` where the value is greater than or equal to zero, otherwise ``0``.
    """
    return [1 if float(dev) >= 0 else 0 for dev in list(y)]

def label_data_mag(y):
    """Label each value as a large (1) or small (0) move.

    The non-negative values and the negative values are split into two groups
    and the median of each group is taken. A value is "large" when it is at
    least the positive median or at most the negative median.

    Parameters
    ----------
    y : iterable of numbers
        Values to label.

    Returns
    -------
    list of int
        One label per input value, in input order. An empty input gives an
        empty list.
    """
    y = list(y)
    positives = [dev for dev in y if dev >= 0]
    negatives = [dev for dev in y if not dev >= 0]

    # median() raises StatisticsError on an empty list. When one side has no
    # samples, use an unreachable threshold so that side never yields a 1.
    med_pos = median(positives) if positives else float("inf")
    med_neg = median(negatives) if negatives else float("-inf")

    return [1 if dev >= med_pos or dev <= med_neg else 0 for dev in y]


def _validation_accuracy(model, frame, column, label_fn, year_val, year_test):
    """Return the accuracy of one model on the validation-year rows of ``frame``."""
    val = create_val(frame, year_val, year_test, column)
    y_true = [int(label) for label in label_fn(val[column])]

    # Everything except the date and the t-0 target is a model input. The
    # array is reshaped to (rows, 1, features) before being passed to predict.
    features = np.asarray(val.drop(['Date', column], axis=1))
    features = features.reshape((features.shape[0], 1, features.shape[1]))

    # The [0][0] indexing takes the first scalar of each sample's output, so
    # predict() is expected to return an array with at least three dimensions.
    rounded = model.predict(features).round().tolist()
    y_pred = [int(sample[0][0]) for sample in rounded]
    return accuracy(y_true, y_pred)


def predict(instruments, data, models, year_val=2018, year_test=2019):
    """Print validation accuracy of the direction and magnitude models.

    Parameters
    ----------
    instruments : iterable of str
        Symbols to evaluate.
    data : dict
        Lagged datasets keyed ``<symbol>_dir`` / ``<symbol>_mag``.
    models : dict
        Models under the same keys as ``data``; each must offer ``predict``.
    year_val : int, default 2018
        Year whose rows are scored.
    year_test : int, default 2019
        Passed through to ``create_val``.

    Returns
    -------
    None
        Results are printed, one line per model.
    """
    tasks = (
        ("dir", "Direction", label_data_dir),
        ("mag", "Magnitude", label_data_mag),
    )
    for symbol in instruments:
        column = symbol + "_relative_change_perc_1"
        for suffix, title, label_fn in tasks:
            key = symbol + "_" + suffix
            acc = _validation_accuracy(
                models[key], data[key], column, label_fn, year_val, year_test
            )
            print(f"{symbol} {title} Accuracy: {acc}")
        
        
# Score every loaded model on the validation year and print the accuracies.
predict(instruments, data, models)

SP500 Direction Accuracy: 0.5896414342629482
SP500 Magnitude Accuracy: 0.6852589641434262
NASDAQ Direction Accuracy: 0.6215139442231076
NASDAQ Magnitude Accuracy: 0.6414342629482072
US30 Direction Accuracy: 0.5816733067729084
US30 Magnitude Accuracy: 0.6294820717131474
