In [1]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import gzip
from datetime import datetime, timedelta
from statistics import mean, median
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow
import tensorflow.keras as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LeakyReLU, BatchNormalization, ReLU, LSTM, Conv1D, Conv2D
from tensorflow.keras.activations import sigmoid, tanh
from tensorflow.keras.utils import to_categorical

from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm
import csv
import random
import math

from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

import pickle

In [2]:
def extract_model(filename):
    """Load and return the saved Keras model stored at ``filename``.

    Thin wrapper around ``tf.models.load_model`` (``tf`` is the
    ``tensorflow.keras`` alias imported at the top of the notebook).
    """
    return tf.models.load_model(filename)
    
def retrieve_data(filename):
    """Read a CSV file into a DataFrame and parse its ``Date`` column.

    Files whose path contains ``"combined"`` are read as-is; every other file
    is read with its first column used as the index.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        DataFrame whose ``Date`` column has been passed through
        ``pd.to_datetime``.
    """
    # Only the non-"combined" files carry an index in their first column.
    read_options = {} if "combined" in filename else {"index_col": 0}
    frame = pd.read_csv(filename, **read_options)
    frame["Date"] = pd.to_datetime(frame["Date"])
    return frame

In [3]:
def predict_init(data, model, column):
    """Run ``model`` on the feature columns of ``data`` and store rounded predictions.

    Every column except ``'Date'`` and ``column`` is used as a feature. The
    feature matrix is reshaped to ``(rows, 1, features)`` before it is passed
    to ``model.predict``.

    Args:
        data: DataFrame containing ``'Date'``, ``column`` and the feature columns.
        model: Object exposing ``predict(array)``; the first output value of
            each sample is used as the prediction.
        column: Name of the target column, which is excluded from the model input.

    Returns:
        The same ``data`` object, with an added ``'pred'`` column holding the
        predictions rounded to integers. The column is written in place.
    """
    features = data.drop(columns=['Date']).drop(columns=[column])
    x = np.asarray(features)
    x = x.reshape((x.shape[0], 1, x.shape[1]))
    pred = model.predict(x)

    # Take the first output value of every sample regardless of the output
    # rank, so (n, 1, 1), (n, 1) and (n,) shaped predictions are all accepted.
    pred_new = [int(round(float(np.ravel(sample)[0]))) for sample in pred]

    data['pred'] = pred_new
    return data

def create_classification_data(df, lookback, column):
    """Build a frame of targets at time t plus lagged copies of every column.

    Each output row holds the ``'Date'`` and ``column`` values of one source
    row, followed by all columns of the ``lookback`` preceding rows. Lagged
    columns are named ``<original>_t-<lag>``, ordered lag 1 first. Lagged
    copies of the date (columns starting with ``"Date"`` other than ``"Date"``
    itself) are dropped from the result.

    Args:
        df: Source DataFrame; must contain ``'Date'`` and ``column``.
        lookback: Number of preceding rows appended to each output row.
        column: Name of the target column taken from the current row.

    Returns:
        New DataFrame with one row per source position greater than
        ``lookback``. When the source has too few rows, an empty frame with the
        expected columns is returned.
    """
    base_columns = df.columns.tolist()

    # Target columns first, then one suffixed copy of every source column per lag.
    columns = ['Date', column]
    for lag in range(1, lookback + 1):
        columns = columns + [name + "_t-" + str(lag) for name in base_columns]

    n_rows = len(df)
    # Materialise each source row once; every row is reused for several lags.
    source_rows = [df.iloc[pos].tolist() for pos in range(n_rows)]

    rows = []
    # NOTE(review): the strict "greater than lookback" start skips position
    # ``lookback`` even though it already has a full history; kept unchanged so
    # row counts match previously generated data - confirm whether intended.
    for pos in range(lookback + 1, n_rows):
        current = df.iloc[pos]
        new_row = [current['Date'], current[column]]
        for lag in range(1, lookback + 1):
            new_row.extend(source_rows[pos - lag])
        rows.append(new_row)

    # Passing ``columns`` here keeps the schema even when ``rows`` is empty.
    df2 = pd.DataFrame(rows, columns=columns)

    lagged_dates = [col for col in columns if col[:4] == "Date" and col != "Date"]
    return df2.drop(columns=lagged_dates)

def create_train(df, year_val, year_test, column, perc_train=None):
    """Return the rows of ``df`` dated before ``year_val`` (the training split).

    The ``Date`` column of ``df`` is converted with ``pd.to_datetime`` in place
    before filtering.

    Args:
        df: DataFrame with a ``Date`` column.
        year_val: Validation year; rows from earlier years are returned.
        year_test: Unused; kept so all split helpers share one signature.
        column: Unused; kept so all split helpers share one signature.
        perc_train: Unused.

    Returns:
        DataFrame with every row whose year is strictly less than ``year_val``.
    """
    # assumes years_train < year_val < year_test
    df["Date"] = pd.to_datetime(df["Date"])
    return df[df['Date'].dt.year < year_val]


def create_val(df, year_val, year_test, column, perc_train=None):
    """Return the rows of ``df`` dated in ``year_val`` (the validation split).

    The ``Date`` column of ``df`` is converted with ``pd.to_datetime`` in place
    before filtering.

    Args:
        df: DataFrame with a ``Date`` column.
        year_val: Year whose rows are returned.
        year_test: Unused; kept so all split helpers share one signature.
        column: Unused; kept so all split helpers share one signature.
        perc_train: Unused.

    Returns:
        DataFrame with every row whose year equals ``year_val``.
    """
    # assumes years_train < year_val < year_test
    df["Date"] = pd.to_datetime(df["Date"])
    return df[df['Date'].dt.year == year_val]

def create_test(df, year_val, year_test, column, perc_train=None):
    """Return the rows of ``df`` dated in ``year_test`` (the test split).

    Prints the size of the incoming frame, then converts the ``Date`` column of
    ``df`` with ``pd.to_datetime`` in place before filtering.

    Args:
        df: DataFrame with a ``Date`` column.
        year_val: Unused; kept so all split helpers share one signature.
        year_test: Year whose rows are returned.
        column: Unused; kept so all split helpers share one signature.
        perc_train: Unused.

    Returns:
        DataFrame with every row whose year equals ``year_test``.
    """
    print("test", len(df))
    # assumes years_train < year_val < year_test
    df["Date"] = pd.to_datetime(df["Date"])
    return df[df['Date'].dt.year == year_test]

def create_full(df, year_val, year_test, column, perc_train=None):
    """Return ``df`` unfiltered after parsing its ``Date`` column.

    The ``Date`` column is converted with ``pd.to_datetime`` in place and the
    same DataFrame object is returned.

    Args:
        df: DataFrame with a ``Date`` column.
        year_val: Unused; kept so all split helpers share one signature.
        year_test: Unused; kept so all split helpers share one signature.
        column: Unused; kept so all split helpers share one signature.
        perc_train: Unused.

    Returns:
        The input DataFrame itself.
    """
    df["Date"] = pd.to_datetime(df["Date"])
    return df

def extract_data(instruments, data, mode):
    """Replace each instrument's frames in ``data`` by the requested split.

    For every symbol in ``instruments`` the entries ``'<symbol>_dir'`` and
    ``'<symbol>_mag'`` of ``data`` are passed through the split helper selected
    by ``mode`` (``"train"``, ``"val"``, ``"test"`` or ``"full"``), using 2018
    as validation year and 2019 as test year. Any other ``mode`` leaves the
    entries untouched.

    Args:
        instruments: Iterable of symbol names.
        data: Dict holding a ``_dir`` and a ``_mag`` frame per symbol; updated
            in place.
        mode: Name of the split to extract.

    Returns:
        The same ``data`` dict.
    """
    year_val = 2018
    year_test = 2019

    # Pick the split helper once; an unrecognised mode selects nothing.
    if mode == "train":
        split = create_train
    elif mode == "val":
        split = create_val
    elif mode == "test":
        split = create_test
    elif mode == "full":
        split = create_full
    else:
        split = None

    for symbol in instruments:
        column = symbol + "_relative_change_perc_1"
        keys = (symbol + "_dir", symbol + "_mag")
        # Both frames are read before either entry is overwritten.
        frames = [data[key] for key in keys]
        if split is None:
            continue
        for key, frame in zip(keys, frames):
            data[key] = split(frame, year_val, year_test, column)
    return data

In [14]:
def label_dir(y):
    """Turn a sequence of changes into direction labels.

    Args:
        y: Iterable of numeric changes.

    Returns:
        List with ``1`` for every value that is ``>= 0`` and ``0`` otherwise,
        in input order.
    """
    return [1 if dev >= 0 else 0 for dev in list(y)]

def label_mag(y):
    """Label each change as a large (``1``) or small (``0``) move.

    Values ``>= 0`` are compared with the median of all such values and are
    labelled ``1`` when at or above it. The remaining values are compared with
    their own median and are labelled ``1`` when at or below it.

    Args:
        y: Iterable of numeric changes.

    Returns:
        List of ``0``/``1`` labels in input order. An empty input gives an
        empty list.
    """
    y = list(y)
    positives = [dev for dev in y if dev >= 0]
    negatives = [dev for dev in y if not dev >= 0]

    # ``median`` raises StatisticsError on an empty list, so each side's median
    # is only computed when that side has values. A median left as None is
    # never read, because no value falls on that side.
    med_pos = median(positives) if positives else None
    med_neg = median(negatives) if negatives else None

    labels = []
    for dev in y:
        if dev >= 0:
            labels.append(1 if dev >= med_pos else 0)
        else:
            labels.append(1 if dev <= med_neg else 0)
    return labels

def init_models_data():
    """Load the models, build their input frames and attach predictions and labels.

    For each of SP500, US30 and NASDAQ a magnitude ("large-small") model and a
    direction ("up-down") model are loaded, lagged input frames are built from
    the CSV files under ``Dataset v3/``, and every frame receives a ``'pred'``
    column (via ``predict_init``) and a ``'label'`` column (via ``label_dir``
    or ``label_mag``).

    Returns:
        tuple: ``(instruments, data, models)`` where ``instruments`` is the list
        of symbols and ``data`` / ``models`` are dicts keyed by
        ``'<SYMBOL>_mag'`` and ``'<SYMBOL>_dir'``.
    """
    instruments = ['SP500', 'NASDAQ', 'US30']

    target_sp500 = 'SP500_relative_change_perc_1'
    target_us30 = 'US30_relative_change_perc_1'
    target_nasdaq = 'NASDAQ_relative_change_perc_1'

    # Only the NN models are used. The LSTM large-small models
    # ("Models/<SYMBOL>_LSTM_large-small_model") are not loaded here because
    # nothing in this function consumes them.
    model_mag_sp500 = extract_model("Models/SP500_NN_large-small_model_shap_l3")
    model_mag_us30 = extract_model("Models/US30_NN_large-small_model_shap_l3")
    model_mag_nasdaq = extract_model("Models/NASDAQ_NN_large-small_model_shap_l3")

    model_dir_sp500 = extract_model("Models/SP500_NN_up-down_model_shap_l3")
    model_dir_us30 = extract_model("Models/US30_NN_up-down_model_shap_l3")
    model_dir_nasdaq = extract_model("Models/NASDAQ_NN_up-down_model_shap_l3")

    models = {
        'SP500_mag': model_mag_sp500,
        'US30_mag': model_mag_us30,
        'NASDAQ_mag': model_mag_nasdaq,
        'SP500_dir': model_dir_sp500,
        'US30_dir': model_dir_us30,
        'NASDAQ_dir': model_dir_nasdaq
    }

    # Selected input features per model; 'Date' and the target are prepended
    # so they survive the column selection below.
    features_dir_sp500 = ['SP500_stochastic_K_50_t-5', 'SP500_williams_R_10_t-3', 'SP500_relative_change_perc_1_t-4', 'SP500_williams_R_50_t-3', 'Copper_F_relative_change_perc_1_t-7', 'SP500_stochastic_K_20_t-4', 'Gold_F_relative_change_perc_1_t-10', 'SP500_stochastic_K_5_t-4', 'SP500_momentum_16_t-7', 'SP500_stochastic_K_10_t-10', 'Silver_F_relative_change_perc_1_t-5', 'Gold_F_relative_change_perc_1_t-2', 'SP500_stochastic_K_50_t-4', 'SP500_williams_R_50_t-4', 'SP500_stochastic_K_10_t-1', 'SP500_stochastic_K_10_t-7', 'SP500_AD_oscillator_t-9', 'SP500_AD_oscillator_t-1', 'SP500_stochastic_K_5_t-2', 'Silver_F_relative_change_perc_1_t-10', 'SP500_stochastic_D_5_5_t-5', 'SP500_williams_R_10_t-7']
    # Feature selection for the US30 and NASDAQ direction models is disabled;
    # their frames keep every lagged column.
    # NOTE(review): confirm these two models expect the unfiltered input.
#     features_dir_us30 = ['USDCAD_relative_change_perc_5_t-2', 'USDCAD_relative_change_perc_10_t-3', 'Corn_F_relative_change_perc_5_t-2', 'SSE50_relative_change_perc_5_t-2', 'NaturalGas_F_Volume_t-2', 'USDCHF_relative_change_perc_20_t-2', 'HS50_relative_change_perc_50_t-3', 'UK100_relative_change_perc_1_t-2', 'NZDUSD_relative_change_perc_10_t-2', 'UK100_F_relative_change_perc_10_t-3', 'US30_momentum_4_t-3', 'US30_relative_change_perc_20_t-1', 'MSFT_relative_change_perc_1_t-1', 'HS50_relative_change_perc_5_t-3', 'BrentOil_F_relative_change_perc_1_t-3', 'USDJPY_relative_change_perc_5_t-2', 'NIKKEI225_F_relative_change_perc_20_t-2', 'MSFT_relative_change_perc_5_t-2', 'NIKKEI225_relative_change_perc_10_t-1', 'US30_OBV_t-2', 'USDJPY_relative_change_perc_1_t-1', 'NIKKEI225_relative_change_perc_5_t-1', 'GS_relative_change_perc_1_t-1', 'NIKKEI225_F_relative_change_perc_10_t-3']
#     features_dir_nasdaq = ['WTIOil_F_relative_change_perc_5_t-2', 'MSFT_relative_change_perc_5_t-2', 'CAC40_F_Volume_t-2', 'Copper_F_relative_change_perc_1_t-3', 'NIKKEI225_F_relative_change_perc_20_t-1', 'NASDAQ_momentum_8_t-2', 'USDJPY_relative_change_perc_1_t-1', 'NASDAQ_F_relative_change_perc_50_t-1', 'NaturalGas_F_relative_change_perc_20_t-3', 'NIKKEI225_relative_change_perc_50_t-3', 'SSE50_F_relative_change_perc_20_t-1', 'HS50_relative_change_perc_5_t-1', 'USDCHF_relative_change_perc_1_t-2']
    features_dir_sp500 = ['Date', target_sp500] + features_dir_sp500

    features_mag_sp500 = ['SP500_week_high_1_t-2', 'SP500_stochastic_K_50_t-1', 'SP500_week_low_1_t-1']
    features_mag_us30 = ['US30_stochastic_K_20_t-1', 'US30_week_high_1_t-3', 'US30_ATR5_t-2']
    features_mag_nasdaq = ['NASDAQ_F_Volume_t-1', 'NASDAQ_stochastic_K_50_t-3', 'HS50_F_relative_change_perc_50_t-1', 'CAC40_relative_change_perc_50_t-1', 'NASDAQ_stochastic_K_50_t-1', 'NASDAQ_week_low_1_t-1', 'USDCHF_relative_change_perc_5_t-1', 'NASDAQ_stochastic_K_10_t-1', 'EURUSD_relative_change_perc_1_t-3', 'US30_relative_change_perc_5_t-3']
    features_mag_sp500 = ['Date', target_sp500] + features_mag_sp500
    features_mag_us30 = ['Date', target_us30] + features_mag_us30
    features_mag_nasdaq = ['Date', target_nasdaq] + features_mag_nasdaq

    # Lagged frames. US30 and NASDAQ use the same file and lookback for both
    # their magnitude and direction inputs, so each is built only once.
    lagged_sp500_mag = create_classification_data(retrieve_data("Dataset v3/SP500_combined_data_20220422.csv"), 3, target_sp500)
    lagged_us30 = create_classification_data(retrieve_data("Dataset v3/US30_combined_data_20220422.csv"), 3, target_us30)
    lagged_nasdaq = create_classification_data(retrieve_data("Dataset v3/nasdaq_combined_data_20220422.csv"), 3, target_nasdaq)
    lagged_sp500_dir = create_classification_data(retrieve_data("Dataset v3/SP500_reduced_data_20220425.csv"), 10, target_sp500)

    # Column selections are copied so the 'pred' / 'label' columns added below
    # are written to independent frames.
    data_dir_sp500 = lagged_sp500_dir[features_dir_sp500].copy()
    data_mag_sp500 = lagged_sp500_mag[features_mag_sp500].copy()
    data_mag_us30 = lagged_us30[features_mag_us30].copy()
    data_mag_nasdaq = lagged_nasdaq[features_mag_nasdaq].copy()
    data_dir_us30 = lagged_us30
    data_dir_nasdaq = lagged_nasdaq

    data_dir_sp500 = predict_init(data_dir_sp500, model_dir_sp500, target_sp500)
    data_mag_sp500 = predict_init(data_mag_sp500, model_mag_sp500, target_sp500)

    data_dir_us30 = predict_init(data_dir_us30, model_dir_us30, target_us30)
    data_mag_us30 = predict_init(data_mag_us30, model_mag_us30, target_us30)

    data_dir_nasdaq = predict_init(data_dir_nasdaq, model_dir_nasdaq, target_nasdaq)
    data_mag_nasdaq = predict_init(data_mag_nasdaq, model_mag_nasdaq, target_nasdaq)

    data_dir_sp500['label'] = label_dir(data_dir_sp500[target_sp500])
    data_mag_sp500['label'] = label_mag(data_mag_sp500[target_sp500])

    data_dir_us30['label'] = label_dir(data_dir_us30[target_us30])
    data_mag_us30['label'] = label_mag(data_mag_us30[target_us30])

    data_dir_nasdaq['label'] = label_dir(data_dir_nasdaq[target_nasdaq])
    data_mag_nasdaq['label'] = label_mag(data_mag_nasdaq[target_nasdaq])

    # Key order matters: later cells iterate over this dict.
    data = {
        'SP500_mag': data_mag_sp500,
        'US30_mag': data_mag_us30,
        'NASDAQ_mag': data_mag_nasdaq,
        'SP500_dir': data_dir_sp500,
        'US30_dir': data_dir_us30,
        'NASDAQ_dir': data_dir_nasdaq
    }
    return instruments, data, models

In [15]:
# Load every model and build the per-model frames (with 'pred' and 'label'
# columns) that the following cells evaluate.
instruments, data, models = init_models_data()

In [24]:
def plot_accuracy(dates_mag, accuracies, name, l):
    """Plot one accuracy series over time, save it as PNG and display it.

    Args:
        dates_mag: x values of the line (one entry per accuracy value).
        accuracies: y values of the line.
        name: Label used in the figure title and in the output file name.
        l: Window length in trading days, shown in the title.

    The figure is written to ``Plots/Modelling/Accuracy Over Time {name}.png``.
    """
    fig1 = make_subplots(rows=1, cols=1, specs=[[{'type':'xy'}]])

    # Use the dates passed in, not whatever global ``dates`` happens to exist.
    fig1.add_trace(go.Scatter(x=dates_mag, y=accuracies, mode="lines"), row=1, col=1)

    fig1.update_layout(
        title = f'Average Accuracy over {l} Trading Days for {name}',
        xaxis1 = dict(title_text = 'Day'),
        yaxis1 = dict(title_text = 'Accuracy (%)')
    )
    fig1.write_image(f"Plots/Modelling/Accuracy Over Time {name}.png")
    fig1.show()
    
def plot_accuracy2(dates1, accs1, dates2, accs2, name, l):
    """Plot direction and magnitude accuracy together, save as PNG and display.

    Args:
        dates1: x values of the "Direction" line.
        accs1: y values of the "Direction" line.
        dates2: x values of the "Magnitude" line.
        accs2: y values of the "Magnitude" line.
        name: Label used in the figure title and in the output file name.
        l: Window length in trading days, shown in the title.

    The figure is written to ``Plots/Modelling/Accuracy Over Time {name}.png``.
    """
    fig1 = make_subplots(rows=1, cols=1, specs=[[{'type':'xy'}]])

    # One line per series, added in a fixed order: direction, then magnitude.
    series = (("Direction", dates1, accs1), ("Magnitude", dates2, accs2))
    for trace_name, xs, ys in series:
        trace = go.Scatter(x=xs, y=ys, mode="lines", name=trace_name)
        fig1.add_trace(trace, row=1, col=1)

    layout = {
        'title': f'Average Accuracy over {l} Trading Days for {name}',
        'xaxis1': {'title_text': 'Day'},
        'yaxis1': {'title_text': 'Accuracy (%)'},
    }
    fig1.update_layout(**layout)

    fig1.write_image(f"Plots/Modelling/Accuracy Over Time {name}.png")
    fig1.show()

# Rolling accuracy per model: for every window of ``l`` consecutive rows the
# accuracy of 'pred' against 'label' is recorded together with the date of the
# window's last row. Results are plotted and kept for the comparison plots.
accuracies_dict = {}
dates_dict = {}

for l in [250]:
    for d in data:
        kind = "Magnitude" if d[-4:] == "_mag" else "Direction"
        name = f"{d[:-4]} {kind}"

        frame = data[d]
        accuracies, dates = [], []
        window_labels, window_preds = [], []
        for date, label, pred in zip(frame['Date'], frame['label'], frame['pred']):
            window_labels.append(label)
            window_preds.append(pred)

            # Nothing to report until the window holds ``l`` rows.
            if len(window_preds) < l:
                continue

            accuracies.append(accuracy(window_labels, window_preds))
            dates.append(date)
            # Slide the window forward by dropping its oldest row.
            del window_preds[0]
            del window_labels[0]

        plot_accuracy(dates, accuracies, name, l)
        accuracies_dict[name] = accuracies
        dates_dict[name] = dates

# One combined plot per instrument: direction and magnitude accuracy together.
# ``l`` still holds the window length from the rolling-accuracy loop.
instruments = ['SP500', 'US30', 'NASDAQ']
for instrument in instruments:
    mag_key = instrument + " Magnitude"
    dir_key = instrument + " Direction"

    plot_accuracy2(
        dates_dict[dir_key],
        accuracies_dict[dir_key],
        dates_dict[mag_key],
        accuracies_dict[mag_key],
        instrument,
        l,
    )

In [22]:
# Overall accuracy of every model over its whole frame.
for d, frame in data.items():
    print(d, accuracy(frame['pred'], frame['label']))

SP500_mag 0.5
US30_mag 0.5
NASDAQ_mag 0.5
SP500_dir 0.5218382073680212
US30_dir 0.45151515151515154
NASDAQ_dir 0.4484848484848485
