In [4]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import gzip
from datetime import datetime, timedelta
from statistics import mean, median
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow
import tensorflow.keras as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, LeakyReLU, BatchNormalization, ReLU, LSTM, Conv1D, Conv2D
from tensorflow.keras.activations import sigmoid, tanh
from tensorflow.keras.utils import to_categorical

from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm
import csv
import random
import math

from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

import pickle

In [5]:
def extract_model(filename):
    """Load and return the model saved at ``filename``.

    Delegates to ``tf.models.load_model``, where ``tf`` is this notebook's
    alias for ``tensorflow.keras``.
    """
    return tf.models.load_model(filename)
    
def retrieve_data(filename):
    """Read a CSV file into a DataFrame and parse its ``Date`` column.

    Files whose path contains the substring ``"combined"`` are read with the
    default index; every other file is read with its first column used as
    the index. In both cases ``Date`` is converted with ``pd.to_datetime``.
    """
    read_options = {} if "combined" in filename else {"index_col": 0}
    frame = pd.read_csv(filename, **read_options)
    frame["Date"] = pd.to_datetime(frame["Date"])
    return frame

def create_classification_data(df, lookback, column):
    """Build a lagged-feature table from ``df``.

    Each output row holds the target row's ``Date`` and ``column`` value,
    followed by every column of the ``lookback`` preceding rows, named
    ``<original>_t-<lag>`` for lag 1..lookback. The lagged copies of the
    date column (``Date_t-1`` etc.) are dropped before returning.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; must contain a ``Date`` column and ``column``. Rows are
        addressed by position, so it is expected to be in chronological order.
    lookback : int
        Number of preceding rows to attach to each target row.
    column : str
        Name of the target column copied from the target row itself.

    Returns
    -------
    pandas.DataFrame
        One row per target row. When ``df`` has too few rows to produce any
        target row, an empty frame with the expected columns is returned
        (previously this raised ``ValueError`` on the column assignment).
    """
    source_columns = df.columns.tolist()

    # Target columns first, then one full copy of the source columns per lag.
    columns = ['Date', column]
    for lag in range(1, lookback + 1):
        columns.extend(name + "_t-" + str(lag) for name in source_columns)

    # Target rows start at position lookback + 1, so row 0 is never used,
    # neither as a target nor as a lag.
    # NOTE(review): position `lookback` would also have a full history; it is
    # skipped here to keep the original behaviour -- confirm this is intended.
    rows = []
    for i in range(lookback + 1, len(df)):
        target = df.iloc[i]
        new_row = [target['Date'], target[column]]
        for lag in range(1, lookback + 1):
            new_row.extend(df.iloc[i - lag].tolist())
        rows.append(new_row)

    # Passing `columns` to the constructor keeps the schema even with no rows.
    df2 = pd.DataFrame(rows, columns=columns)

    # Only the target row's date is kept; lagged dates are not features.
    lagged_dates = [name for name in columns if name[:4] == "Date" and name != "Date"]
    return df2.drop(columns=lagged_dates)

def create_train(df, year_val, year_test, column, perc_train=None):
    """Return the training split: all rows dated before ``year_val``.

    ``df["Date"]`` is converted to datetime in place on the caller's frame.
    ``year_test``, ``column`` and ``perc_train`` are accepted for signature
    parity with the other ``create_*`` helpers but are not used.
    """
    # assumes years_train < year_val < year_test
    df["Date"] = pd.to_datetime(df["Date"])
    return df[df['Date'].dt.year < year_val]


def create_val(df, year_val, year_test, column, perc_train=None):
    """Return the validation split: all rows dated within ``year_val``.

    ``df["Date"]`` is converted to datetime in place on the caller's frame.
    ``year_test``, ``column`` and ``perc_train`` are accepted for signature
    parity with the other ``create_*`` helpers but are not used.
    """
    # assumes years_train < year_val < year_test
    df["Date"] = pd.to_datetime(df["Date"])
    return df[df['Date'].dt.year == year_val]

def create_test(df, year_val, year_test, column, perc_train=None):
    """Return the test split: all rows dated within ``year_test``.

    Prints the row count of the incoming frame, then converts ``df["Date"]``
    to datetime in place on the caller's frame. ``year_val``, ``column`` and
    ``perc_train`` are accepted for signature parity with the other
    ``create_*`` helpers but are not used.
    """
    print("test", len(df))
    # assumes years_train < year_val < year_test
    df["Date"] = pd.to_datetime(df["Date"])
    return df[df['Date'].dt.year == year_test]

def create_full(df, year_val, year_test, column, perc_train=None):
    """Return the whole frame, unsplit, with ``Date`` parsed to datetime.

    The conversion is written back into ``df`` itself and that same object is
    returned. The remaining parameters are not used.
    """
    parsed_dates = pd.to_datetime(df["Date"])
    df["Date"] = parsed_dates
    return df

In [6]:
def label_dir(y):
    """Label each value by its sign.

    Returns a list with ``1`` where the value is ``>= 0`` and ``0`` otherwise,
    in the same order as ``y``. Any value for which ``>= 0`` is false
    (including NaN) is labelled ``0``.
    """
    return [1 if dev >= 0 else 0 for dev in y]

def label_mag(y):
    """Label each value as a large (1) or small (0) move.

    The values are split into non-negative and negative groups and the median
    of each group is taken. A value is labelled ``1`` when it is at or above
    the median of the non-negative group, or at or below the median of the
    negative group; otherwise it is labelled ``0``.

    If one group is empty, its threshold is placed at infinity so that side
    never matches (previously ``median`` raised ``StatisticsError``). An empty
    input returns an empty list.
    """
    y = list(y)
    positives = [dev for dev in y if dev >= 0]
    # `not dev >= 0` (rather than `dev < 0`) keeps the original else-branch
    # grouping for values that compare false both ways, such as NaN.
    negatives = [dev for dev in y if not dev >= 0]

    med_pos = median(positives) if positives else float("inf")
    med_neg = median(negatives) if negatives else float("-inf")

    return [1 if dev >= med_pos or dev <= med_neg else 0 for dev in y]

def init_models_data():
    """Load the six prediction CSVs and return ``(instruments, data, models)``.

    ``instruments`` is the list of display names, ``data`` maps
    ``'<INDEX>_mag'`` / ``'<INDEX>_dir'`` keys to the loaded DataFrames (each
    read with its first column as index), and ``models`` is an empty dict.
    """
    instruments = ['S&P500', 'NASDAQ', 'US30']
    models = {}

    # (key, path) pairs, listed in the order the files are read.
    prediction_files = (
        ('SP500_dir', "Models/Models Output/SP500_direction_predictions.csv"),
        ('SP500_mag', "Models/Models Output/SP500_magnitude_predictions.csv"),
        ('US30_dir', "Models/Models Output/US30_direction_predictions.csv"),
        ('US30_mag', "Models/Models Output/US30_magnitude_predictions.csv"),
        ('NASDAQ_dir', "Models/Models Output/NASDAQ_direction_predictions.csv"),
        ('NASDAQ_mag', "Models/Models Output/NASDAQ_magnitude_predictions.csv"),
    )
    loaded = {key: pd.read_csv(path, index_col=0) for key, path in prediction_files}

    # Magnitude tables first, then direction tables.
    key_order = (
        'SP500_mag', 'US30_mag', 'NASDAQ_mag',
        'SP500_dir', 'US30_dir', 'NASDAQ_dir',
    )
    data = {key: loaded[key] for key in key_order}
    return instruments, data, models

In [19]:
def plot_magnitude_labels(l, s, name):
    """Draw, save and show a grouped bar chart of large vs. small label counts.

    ``l`` and ``s`` are the heights of the 'Large' and 'Small' bars. They are
    placed along the module-level ``instruments`` list, so both are expected
    to follow that list's order. ``name`` is interpolated into the chart title
    and into the PNG file name written under ``Plots/``.
    """
    bars = [
        go.Bar(name='Large', x=instruments, y=l),
        go.Bar(name='Small', x=instruments, y=s),
    ]
    fig = go.Figure(data=bars)
    fig.update_layout(
        title=f'Distribution of {name} Data Large/Small Labels',
        xaxis1=dict(title_text=''),
        yaxis1=dict(title_text='Frequency'),
        barmode='group',
    )
    fig.write_image(f"Plots/Label Distribution Magnitude {name}.png")
    fig.show()
    
def plot_direction_labels(u, d, name):
    """Draw, save and show a grouped bar chart of upward vs. downward counts.

    ``u`` and ``d`` are the heights of the 'Upwards' and 'Downwards' bars.
    They are placed along the module-level ``instruments`` list, so both are
    expected to follow that list's order.

    ``name`` is currently unused: the title and the output path are fixed, so
    every call writes to the same PNG file.
    """
    bars = [
        go.Bar(name='Upwards', x=instruments, y=u),
        go.Bar(name='Downwards', x=instruments, y=d),
    ]
    fig = go.Figure(data=bars)
    fig.update_layout(
        title=f'Number of Upwards and Downwards Movements per Index',
        xaxis1=dict(title_text=''),
        yaxis1=dict(title_text='Frequency'),
        barmode='group',
    )
    fig.write_image(f"Plots/Candle Direction Distribution.png")
    fig.show()



instruments, data, models = init_models_data()

# The plot helpers put `instruments` on the x-axis, so counts must be
# collected in that same order or the bars end up under the wrong index name.
# Display name 'S&P500' corresponds to the 'SP500' key prefix in `data`.
instrument_keys = [name.replace('&', '') for name in instruments]

# Magnitude labels over the full data: sum of `label` vs. the remaining rows.
larges = []
smalls = []
for d in [f"{key}_mag" for key in instrument_keys]:
    print(d)
    p = data[d]['label'].sum()
    n = len(data[d]) - p
    larges.append(p)
    smalls.append(n)
plot_magnitude_labels(larges, smalls, "Full")

# Direction labels over the full data, counted the same way.
ups = []
downs = []
for d in [f"{key}_dir" for key in instrument_keys]:
    print(d)
    p = data[d]['label'].sum()
    n = len(data[d]) - p
    ups.append(p)
    downs.append(n)
plot_direction_labels(ups, downs, "Full")

SP500_mag
US30_mag
NASDAQ_mag


SP500_dir
US30_dir
NASDAQ_dir


In [16]:
sets = ["Training", "Validation", "Test"]

# Calendar years bounding the splits: training is everything before VAL_YEAR.
VAL_YEAR = 2018
TEST_YEAR = 2019


def _split_label_counts(kind, in_split):
    """Count labels per instrument inside one date split.

    ``kind`` is the key suffix in ``data`` (``'mag'`` or ``'dir'``) and
    ``in_split`` maps a Series of years to a boolean mask. Returns two lists,
    ordered like ``instruments``: the sum of ``label`` within the split, and
    the number of remaining rows of the split.
    """
    positives = []
    negatives = []
    # Follow `instruments` order because the plot helpers use it as x-axis;
    # display name 'S&P500' corresponds to the 'SP500' key prefix.
    for instrument in instruments:
        key = f"{instrument.replace('&', '')}_{kind}"
        data[key]["Date"] = pd.to_datetime(data[key]["Date"])
        subset = data[key][in_split(data[key]["Date"].dt.year)]
        # Count on the filtered subset, not on the full table.
        n_positive = subset['label'].sum()
        positives.append(n_positive)
        negatives.append(len(subset) - n_positive)
    return positives, negatives


# (magnitude plot name, direction plot name, year mask) for each split.
year_splits = [
    ("Train", "Training", lambda years: years < VAL_YEAR),
    ("Validation", "Validation", lambda years: years == VAL_YEAR),
    ("Test", "Test", lambda years: years == TEST_YEAR),
]

for magnitude_name, _, in_split in year_splits:
    larges, smalls = _split_label_counts('mag', in_split)
    plot_magnitude_labels(larges, smalls, magnitude_name)

for _, direction_name, in_split in year_splits:
    ups, downs = _split_label_counts('dir', in_split)
    plot_direction_labels(ups, downs, direction_name)

In [38]:
# Align the three direction-prediction tables on their shared dates.
df1 = pd.DataFrame(data['SP500_dir'])
df1["Date"] = pd.to_datetime(df1["Date"])
df2 = pd.DataFrame(data['US30_dir'])
df2["Date"] = pd.to_datetime(df2["Date"])
df3 = pd.DataFrame(data['NASDAQ_dir'])
df3["Date"] = pd.to_datetime(df3["Date"])

df = df1.merge(df2, on='Date', suffixes=('_SP500_dir', '_US30_dir'))
# After the first merge no plain 'pred'/'label' columns remain on the left,
# so the NASDAQ columns arrive unsuffixed and are renamed explicitly below.
df = df.merge(df3, on='Date', suffixes=('', '_NASDAQ_dir'))
df['pred_NASDAQ_dir'] = df['pred']
df['label_NASDAQ_dir'] = df['label']
df = df.drop(columns=['pred', 'label'])

# For each day, count how many of the three indices were predicted correctly,
# then tally the days with 0, 1, 2 and 3 correct predictions.
correct_per_day = sum(
    (df[f'pred_{key}'] == df[f'label_{key}']).astype(int)
    for key in ('SP500_dir', 'US30_dir', 'NASDAQ_dir')
)
freqs = [int((correct_per_day == k).sum()) for k in range(4)]

print("Zeros", freqs[0])
print("Ones", freqs[1])
print("Twos", freqs[2])
print("Threes", freqs[3])

Zeros 353
Ones 679
Twos 885
Threes 716


In [39]:
# Bar chart of how many days had 0, 1, 2 or 3 correct direction predictions.
x = [str(k) for k in range(4)]
fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=freqs))
fig.update_layout(
    title=f'Frequency of Number of Correct Predictions of Direction per Day',
    xaxis1=dict(title_text=''),
    yaxis1=dict(title_text='Frequency'),
)
fig.write_image(f"Plots/Number Direction Predictions Correct.png")
fig.show()

In [40]:
# Align the three magnitude-prediction tables on their shared dates.
df1 = pd.DataFrame(data['SP500_mag'])
df1["Date"] = pd.to_datetime(df1["Date"])
df2 = pd.DataFrame(data['US30_mag'])
df2["Date"] = pd.to_datetime(df2["Date"])
df3 = pd.DataFrame(data['NASDAQ_mag'])
df3["Date"] = pd.to_datetime(df3["Date"])

df = df1.merge(df2, on='Date', suffixes=('_SP500_mag', '_US30_mag'))
# After the first merge no plain 'pred'/'label' columns remain on the left,
# so the NASDAQ columns arrive unsuffixed and are renamed explicitly below.
df = df.merge(df3, on='Date', suffixes=('', '_NASDAQ_mag'))
df['pred_NASDAQ_mag'] = df['pred']
df['label_NASDAQ_mag'] = df['label']
df = df.drop(columns=['pred', 'label'])

# For each day, count how many of the three indices were predicted correctly,
# then tally the days with 0, 1, 2 and 3 correct predictions.
correct_per_day = sum(
    (df[f'pred_{key}'] == df[f'label_{key}']).astype(int)
    for key in ('SP500_mag', 'US30_mag', 'NASDAQ_mag')
)
freqs = [int((correct_per_day == k).sum()) for k in range(4)]

print("Zeros", freqs[0])
print("Ones", freqs[1])
print("Twos", freqs[2])
print("Threes", freqs[3])

Zeros 300
Ones 558
Twos 820
Threes 962


In [41]:
# Bar chart of how many days had 0, 1, 2 or 3 correct magnitude predictions.
x = [str(k) for k in range(4)]
fig = go.Figure()
fig.add_trace(go.Bar(x=x, y=freqs))
fig.update_layout(
    title=f'Frequency of Number of Correct Predictions of Magnitude per Day',
    xaxis1=dict(title_text=''),
    yaxis1=dict(title_text='Frequency'),
)
fig.write_image(f"Plots/Number Magnitude Predictions Correct.png")
fig.show()

In [45]:
# Step 1: align the three magnitude tables on their shared dates.
df1 = pd.DataFrame(data['SP500_mag'])
df1["Date"] = pd.to_datetime(df1["Date"])
df2 = pd.DataFrame(data['US30_mag'])
df2["Date"] = pd.to_datetime(df2["Date"])
df3 = pd.DataFrame(data['NASDAQ_mag'])
df3["Date"] = pd.to_datetime(df3["Date"])

df = df1.merge(df2, on='Date', suffixes=('_SP500_mag', '_US30_mag'))
df = df.merge(df3, on='Date', suffixes=('', '_NASDAQ_mag'))
df['pred_NASDAQ_mag'] = df['pred']
df['label_NASDAQ_mag'] = df['label']
df = df.drop(columns=['pred', 'label'])

# Step 2: attach the three direction tables.
df1 = pd.DataFrame(data['SP500_dir'])
df1["Date"] = pd.to_datetime(df1["Date"])
df2 = pd.DataFrame(data['US30_dir'])
df2["Date"] = pd.to_datetime(df2["Date"])
df3 = pd.DataFrame(data['NASDAQ_dir'])
df3["Date"] = pd.to_datetime(df3["Date"])

# Merge order matters here: the S&P500 columns come in first as plain
# 'pred'/'label' (nothing to clash with), so the later US30 and NASDAQ merges
# clash with them and receive their suffixes. The plain columns that remain
# are therefore the S&P500 ones and are renamed afterwards.
df = df.merge(df1, on='Date', suffixes=('', '_SP500_dir'))
df = df.merge(df2, on='Date', suffixes=('', '_US30_dir'))
df = df.merge(df3, on='Date', suffixes=('', '_NASDAQ_dir'))
df['pred_SP500_dir'] = df['pred']
df['label_SP500_dir'] = df['label']
df = df.drop(columns=['pred', 'label'])

# Step 3: pooled over the three indices, count large (magnitude label == 1)
# and small (any other label) moves, and how many of each had the direction
# predicted correctly.
smalls = 0
smalls_correct = 0
larges = 0
larges_correct = 0

for index_name in ('SP500', 'US30', 'NASDAQ'):
    is_large = df[f'label_{index_name}_mag'] == 1
    direction_hit = df[f'pred_{index_name}_dir'] == df[f'label_{index_name}_dir']
    larges += int(is_large.sum())
    larges_correct += int((is_large & direction_hit).sum())
    smalls += int((~is_large).sum())
    smalls_correct += int((~is_large & direction_hit).sum())

print("Larges", larges)
print("Larges Correct", larges_correct)
print("Smalls", smalls)
print("Smalls Correct", smalls_correct)

Larges 3951
Larges Correct 2329
Smalls 3948
Smalls Correct 2268


In [51]:
# Stacked bars: correct vs. incorrect direction predictions, split by whether
# the move was labelled small or large.
x = ['Small Movements', 'Large Movements']
correct_counts = [smalls_correct, larges_correct]
incorrect_counts = [smalls - smalls_correct, larges - larges_correct]
# Share of correct direction predictions, shown as text on the "correct" bars.
correct_share_labels = [
    str(round(smalls_correct / smalls * 100, 2)) + "%",
    str(round(larges_correct / larges * 100, 2)) + "%",
]

fig = go.Figure()
fig.add_trace(go.Bar(name='Correct Direction', x=x, y=correct_counts, text=correct_share_labels))
fig.add_trace(go.Bar(name='Incorrect Direction', x=x, y=incorrect_counts))
fig.update_layout(
    barmode='stack',
    title=f'Frequency of Correct Direction Prediction for Large and Small Movements',
    xaxis1=dict(title_text=''),
    yaxis1=dict(title_text='Frequency'),
)
fig.write_image(f"Plots/Number Direction Predictions Correct per Magnitude.png")
fig.show()